/* $Id: simmetric.h 4025 2008-10-01 00:01:14Z abehm $ Copyright (C) 2007 by The Regents of the University of California Redistribution of this file is permitted under the terms of the BSD license Date: 04/15/2008 Author: Rares Vernica , Alexander Behm */ #ifndef _simmetric_h_ #define _simmetric_h_ #include "typedef.h" #include "gramgen.h" // TODO: discuss this with Ray! // preliminary location of filtertreetypes, just to make everything work enum FilterType { FT_NONE, FT_LENGTH, FT_CHECKSUM }; #define CHECKSUM_ASCII_MAX 127 enum DistanceMeasure { DM_EDIT_DISTANCE, DM_JACCARD, DM_COSINE }; class SimMetric // abstract class { public: const GramGen& gramGen; const std::string name; SimMetric(const GramGen &gramGen, const std::string name): gramGen(gramGen), name(name) {} virtual ~SimMetric() {}; virtual float operator()(const std::string &s1, const std::string &s2) const = 0; virtual bool operator()( const std::string &s1, const std::string &s2, float threshold) const; // compute merging threshold virtual uint getMergeThreshold( const std::string& query, const std::vector& queryGramCodes, const float simThreshold) const = 0; // get filter bounds virtual void getFilterBounds( const std::string& query, const float simThreshold, const FilterType filterType, uint& lbound, uint& ubound) const = 0; // bounds on similarity virtual float getSimMin( uint noGramsQuery, uint noGramsData, uint noGramsCommon) const = 0; virtual float getSimMax( uint lenQuery, uint noGramsQuery, uint noGramsData, uint noGramsCommon) const = 0; // bound on common grams virtual uint getNoGramsMin( uint lenQuery, uint noGramsMin, uint noGramsQuery, float sim) const = 0; }; class SimMetricEd: public SimMetric { public: SimMetricEd(const GramGen &gramGen, const std::string name = "ed"): SimMetric(gramGen, name) {} virtual float operator()(const std::string &s1, const std::string &s2) const; virtual bool operator()( const std::string &s1, const std::string &s2, float threshold) const; virtual uint getMergeThreshold( const std::string& query, const std::vector& queryGramCodes, const float simThreshold) const; virtual void getFilterBounds( const std::string& query, const float simThreshold, const FilterType filterType, uint& lbound, uint& ubound) const; virtual float getSimMin( uint noGramsQuery, uint noGramsData, uint noGramsCommon) const; virtual float getSimMax( uint lenQuery, uint noGramsQuery, uint noGramsData, uint noGramsCommon) const; virtual uint getNoGramsMin( uint lenQuery, uint noGramsMin, uint noGramsQuery, float sim) const; }; class SimMetricEdNorm: public SimMetricEd { public: SimMetricEdNorm(const GramGen &gramGen, const std::string name = "ednorm"): SimMetricEd(gramGen, name) {} virtual float operator()(const std::string &s1, const std::string &s2) const; virtual bool operator()( const std::string &s1, const std::string &s2, float threshold) const; virtual uint getMergeThreshold( const std::string& query, const std::vector& queryGramCodes, const float simThreshold) const; virtual void getFilterBounds( const std::string& query, const float simThreshold, const FilterType filterType, uint& lbound, uint& ubound) const; virtual float getSimMin( uint noGramsQuery, uint noGramsData, uint noGramsCommon) const; virtual float getSimMax( uint lenQuery, uint noGramsQuery, uint noGramsData, uint noGramsCommon) const; virtual uint getNoGramsMin( uint lenQuery, uint noGramsMin, uint noGramsQuery, float sim) const; }; class SimMetricEdSwap: public SimMetric { public: SimMetricEdSwap(const GramGen &gramGen, const std::string name = "edswap"): SimMetric(gramGen, name) {} virtual float operator()(const std::string &s1, const std::string &s2) const; virtual bool operator()( const std::string &s1, const std::string &s2, float threshold) const; virtual uint getMergeThreshold( const std::string& query, const std::vector& queryGramCodes, const float simThreshold) const; virtual void getFilterBounds( const std::string& query, const float simThreshold, const FilterType filterType, uint& lbound, uint& ubound) const; virtual float getSimMin( uint noGramsQuery, uint noGramsData, uint noGramsCommon) const; virtual float getSimMax( uint lenQuery, uint noGramsQuery, uint noGramsData, uint noGramsCommon) const; virtual uint getNoGramsMin( uint lenQuery, uint noGramsMin, uint noGramsQuery, float sim) const; }; class SimMetricGram: public SimMetric // abstract class { public: SimMetricGram(const GramGen &gramGen, const std::string name): SimMetric(gramGen, name) {} virtual float operator()( uint noGramsData, uint noGramsQuery, uint noGramsCommon) const = 0; virtual bool operator()( uint noGramsData, uint noGramsQuery, uint noGramsCommon, float threshold) const { return operator()(noGramsData, noGramsQuery, noGramsCommon) >= threshold; } virtual float getSimMin( uint noGramsQuery, uint noGramsData, uint noGramsCommon) const { return operator()(noGramsData, noGramsQuery, noGramsCommon); } virtual float getSimMax( uint lenQuery, uint noGramsQuery, uint noGramsData, uint noGramsCommon) const { return operator()(noGramsData, noGramsQuery, noGramsCommon); } }; class SimMetricJacc: public SimMetricGram // set semantics ! { public: SimMetricJacc(const GramGen &gramGen, const std::string name = "jc"): SimMetricGram(gramGen, name) {} virtual float operator()(const std::string &s1, const std::string &s2) const; virtual float operator()( uint noGramsData, uint noGramsQuery, uint noGramsCommon) const; // TODO: talk to Ray! // compute merging threshold virtual uint getMergeThreshold( const std::string& query, const std::vector& queryGramCodes, const float simThreshold) const; // TODO: talk to Ray! // get filter bounds virtual void getFilterBounds( const std::string& query, const float simThreshold, const FilterType filterType, uint& lbound, uint& ubound) const; virtual uint getNoGramsMin( uint lenQuery, uint noGramsMin, uint noGramsQuery, float sim) const; }; class SimMetricCos: public SimMetricGram // set semantics ! { public: SimMetricCos(const GramGen &gramGen, const std::string name = "cos"): SimMetricGram(gramGen, name) {} virtual float operator()(const std::string &s1, const std::string &s2) const; virtual float operator()( uint noGramsData, uint noGramsQuery, uint noGramsCommon) const; virtual uint getMergeThreshold( const std::string& query, const std::vector& queryGramCodes, const float simThreshold) const; virtual void getFilterBounds( const std::string& query, const float simThreshold, const FilterType filterType, uint& lbound, uint& ubound) const; virtual uint getNoGramsMin( uint lenQuery, uint noGramsMin, uint noGramsQuery, float sim) const; }; class SimMetricDice: public SimMetricGram // set semantics ! { public: SimMetricDice(const GramGen &gramGen, const std::string name = "dice"): SimMetricGram(gramGen, name) {} virtual float operator()(const std::string &s1, const std::string &s2) const; virtual float operator()( uint noGramsData, uint noGramsQuery, uint noGramsCommon) const; virtual uint getMergeThreshold( const std::string& query, const std::vector& queryGramCodes, const float simThreshold) const; virtual void getFilterBounds( const std::string& query, const float simThreshold, const FilterType filterType, uint& lbound, uint& ubound) const; virtual uint getNoGramsMin( uint lenQuery, uint noGramsMin, uint noGramsQuery, float sim) const; }; class SimMetricGramCount: public SimMetricGram { public: SimMetricGramCount(const GramGen &gramGen, const std::string name = "cnt"): SimMetricGram(gramGen, name) {} virtual float operator()(const std::string &s1, const std::string &s2) const; virtual float operator()( uint noGramsData, uint noGramsQuery, uint noGramsCommon) const; virtual uint getMergeThreshold( const std::string& query, const std::vector& queryGramCodes, const float simThreshold) const; virtual void getFilterBounds( const std::string& query, const float simThreshold, const FilterType filterType, uint& lbound, uint& ubound) const; virtual uint getNoGramsMin( uint lenQuery, uint noGramsMin, uint noGramsQuery, float sim) const; }; #endif