/* $Id: gramgen.h Tue Apr 05 10:20:24 PDT 2008 abehm$ Copyright (C) 2007 by The Regents of the University of California Redistribution of this file is permitted under the terms of BSD license Date: 04/05/2008 Author: Alexander Behm Rares Vernica */ #ifndef _gramgen_h_ #define _gramgen_h_ #include #include #include #include #include #include "typedef.h" typedef enum { GGT_FIXED } GramGenType; // gram generator interface class GramGen { protected: static const std::tr1::hash hashString; static const uchar PREFIXCHAR = 156; // pound static const uchar SUFFIXCHAR = 190; // yen GramGenType gramGenType; public: // pre-and postfix string when generating grams? // needs to be accessible by simmetric for calculating filter bounds (e.g. by jacc and cos) bool prePost; GramGen(bool usePrePost = true): prePost(usePrePost) {} virtual ~GramGen(){} // convert a string to a BAG of grams virtual void decompose( const std::string &s, std::vector &res, uchar st = PREFIXCHAR, uchar en = SUFFIXCHAR) const = 0; // convert a string to a BAG of hashed grams virtual void decompose( const std::string &s, std::vector &res, uchar st = PREFIXCHAR, uchar en = SUFFIXCHAR) const = 0; // convert a string to a BAG (multiset) of grams virtual void decompose( const std::string &s, std::multiset &res, uchar st = PREFIXCHAR, uchar en = SUFFIXCHAR) const = 0; // convert a string to a BAG (multiset) of hashed grams virtual void decompose( const std::string &s, std::multiset &res, uchar st = PREFIXCHAR, uchar en = SUFFIXCHAR) const = 0; // convert a string to a SET of grams virtual void decompose( const std::string &s, std::set &res, uchar st = PREFIXCHAR, uchar en = SUFFIXCHAR) const = 0; // convert a string to a SET of hashed grams virtual void decompose( const std::string &s, std::set &res, uchar st = PREFIXCHAR, uchar en = SUFFIXCHAR) const = 0; // convert a string to a SET of hashed grams with count virtual void decompose( const std::string &s, std::map &res, uchar st = PREFIXCHAR, uchar en = SUFFIXCHAR) const = 0; virtual uint getNumGrams(const std::string& s) const = 0; virtual uint getGramLength() const = 0; virtual void saveGramGenInstance(std::ofstream& fpOut) = 0; GramGenType getType() const { return gramGenType; } static GramGen* loadGramGenInstance(std::ifstream& fpIn); }; class GramGenFixedLen: public GramGen { private: uint q; bool noSpace; bool containsSpace(std::string& s) const; public: GramGenFixedLen( uint gramLength = 3, bool usePrePost = true, bool ignoreSpaces = false): GramGen(usePrePost), q(gramLength), noSpace(ignoreSpaces) { gramGenType = GGT_FIXED; } GramGenFixedLen(std::ifstream& fpIn); void decompose( const std::string &s, std::vector &res, uchar st = PREFIXCHAR, uchar en = SUFFIXCHAR) const; void decompose( const std::string &s, std::vector &res, uchar st = PREFIXCHAR, uchar en = SUFFIXCHAR) const; void decompose( const std::string &s, std::multiset &res, uchar st = PREFIXCHAR, uchar en = SUFFIXCHAR) const; void decompose( const std::string &s, std::multiset &res, uchar st = PREFIXCHAR, uchar en = SUFFIXCHAR) const; void decompose( const std::string &s, std::set &res, uchar st = PREFIXCHAR, uchar en = SUFFIXCHAR) const; void decompose( const std::string &s, std::set &res, uchar st = PREFIXCHAR, uchar en = SUFFIXCHAR) const; void decompose( const std::string &s, std::map &res, uchar st = PREFIXCHAR, uchar en = SUFFIXCHAR) const; uint getNumGrams(const std::string& s) const; uint getGramLength() const { return q; } void saveGramGenInstance(std::ofstream& fpOut); }; #endif