/*
  $Id: example.cc 4056 2008-10-10 22:01:51Z abehm $

  Copyright (C) 2008 by The Regents of the University of California

  Redistribution of this file is permitted under
  the terms of the BSD license

  Date: 02/04/2008
  Author: Alexander Behm
*/

#include "ftsearchermem.h"
#include "common/query.h"
#include "common/simmetric.h"
#include "listmerger/divideskipmerger.h"
#include "listmerger/scancountmerger.h"

// dummy dictionary of strings; filled by initDictionary() and indexed by basicUsage1()
vector<string> dictionary;

void initDictionary();
void basicUsage1();
void basicUsage2();
void basicUsage3();

// NOTE(review): the element type of the result vectors (unsigned) was
// reconstructed; confirm it against the signature of FtSearcherMem::search.
static void printSimilarStrings(StringContainerVector& strContainer,
                                const vector<unsigned>& resultStringIDs);

// Fills the dummy dictionary, then runs the three usage examples in order.
int main() {
  initDictionary();

  basicUsage1();
  basicUsage2();
  basicUsage3();

  return 0;
}

// Prints the "SIMILAR STRINGS: " header followed by one line per result:
// every id in resultStringIDs is looked up in strContainer and the
// retrieved string is written to cout.
static void printSimilarStrings(StringContainerVector& strContainer,
                                const vector<unsigned>& resultStringIDs) {
  cout << "SIMILAR STRINGS: " << endl;
  for(unsigned i = 0; i < resultStringIDs.size(); i++) {
    string tmp;
    strContainer.retrieveString(tmp, resultStringIDs.at(i));
    cout << tmp << endl;
  }
}

// Builds the global dictionary as every prefix + suffix combination
// (7 prefixes x 9 suffixes = 63 strings, grouped by prefix) and prints each
// string as it is added.
void initDictionary() {
  vector<string> prefixes;
  prefixes.push_back("string");
  prefixes.push_back("example");
  prefixes.push_back("test");
  prefixes.push_back("hello");
  prefixes.push_back("world");
  prefixes.push_back("foo");
  prefixes.push_back("bar");

  vector<string> suffixes;
  suffixes.push_back("1");
  suffixes.push_back("10");
  suffixes.push_back("100");
  suffixes.push_back("2");
  suffixes.push_back("20");
  suffixes.push_back("200");
  suffixes.push_back("3");
  suffixes.push_back("30");
  suffixes.push_back("300");

  cout << "---------------------------------------" << endl;
  cout << "STRING DICTIONARY:" << endl;
  for(unsigned j = 0; j < prefixes.size(); j++)
    for(unsigned i = 0; i < suffixes.size(); i++) {
      dictionary.push_back(prefixes.at(j) + suffixes.at(i));
      cout << dictionary.back() << endl;
    }
  cout << "---------------------------------------" << endl << endl;
}

// Example 1: index the in-memory dictionary with 3-grams, search for
// "xample" using the edit distance with threshold 2.0f, then save the index
// to "ExampleIndex.ix", load it into a second indexer and repeat the search.
void basicUsage1() {
  cout << "----- BASIC USAGE 1 ----" << endl;

  // create gramgenerator and similarity metric
  GramGenFixedLen gramGen(3); // using fixed-length grams
  SimMetricEd simMetric(gramGen); // using the edit distance
  //SimMetricJacc simMetric(gramGen); // using jaccard similarity
  //SimMetricCos simMetric(gramGen); // using cosine similarity
  //SimMetricDice simMetric(gramGen); // using dice similarity

  // create simple indexer with default template arguments
  // default: in-memory index using Array as an inverted list container
  // first create a string container and fill it with strings to index
  StringContainerVector strContainer;
  strContainer.fillContainer(dictionary); // fill the container from a vector
  FtIndexerSimple<> indexer(&strContainer, &gramGen);
  // add length filtering with a maximum string length of 50
  // NOTE(review): the filter is allocated with new and never deleted here;
  // presumably the indexer takes ownership - confirm.
  indexer.addFilter(new LengthFilter(50));
  indexer.buildIndex();

  // create merger
  DivideSkipMerger<> merger;
  // create searcher passing merger and indexer with default template arguments
  // default: same as indexer, i.e. assumed simple indexer with Array as
  // inverted lists and DivideSkipMerger as merger type
  FtSearcherMem<> searcher(&merger, &indexer);

  vector<unsigned> resultStringIDs;
  Query query("xample", simMetric, 2.0f); // query string, similarity metric, similarity threshold
  searcher.search(query, resultStringIDs);
  printSimilarStrings(strContainer, resultStringIDs);

  cout << "SAVING INDEX" << endl;
  indexer.saveIndex("ExampleIndex.ix");

  cout << "LOADING INDEX" << endl;
  // the loaded indexer is constructed from the string container only
  FtIndexerSimple<> indexerLoaded(&strContainer);
  indexerLoaded.loadIndex("ExampleIndex.ix");

  // run the same query again, this time against the loaded index
  resultStringIDs.clear();
  searcher.setFtIndexer(&indexerLoaded);
  searcher.search(query, resultStringIDs);
  printSimilarStrings(strContainer, resultStringIDs);

  cout << "----------------------" << endl << endl;
}

// Example 2: same flow as example 1, but the strings come from the data
// file "data/dummy.txt" and the query is "elloorld" with edit-distance
// threshold 3.0f.
void basicUsage2() {
  cout << "----- BASIC USAGE 2 ----" << endl;

  // create gramgenerator and similarity metric
  GramGenFixedLen gramGen(3); // using fixed-length grams
  SimMetricEd simMetric(gramGen); // using the edit distance

  // create simple indexer with default template arguments
  // default: in-memory index using Array as an inverted list container
  // first create a string container and fill it with strings to index
  StringContainerVector strContainer;
  strContainer.fillContainer("data/dummy.txt", 180); // fill the container from a datafile and use the first 180 lines
  FtIndexerSimple<> indexer(&strContainer, &gramGen);
  // add length filtering with a maximum string length of 50
  // NOTE(review): the filter is allocated with new and never deleted here;
  // presumably the indexer takes ownership - confirm.
  indexer.addFilter(new LengthFilter(50));
  indexer.buildIndex();

  // create merger
  DivideSkipMerger<> merger;
  // create searcher passing merger and indexer with default template arguments
  // default: same as indexer, i.e. assumed simple indexer with Array as
  // inverted lists and DivideSkipMerger as merger type
  FtSearcherMem<> searcher(&merger, &indexer);

  vector<unsigned> resultStringIDs;
  Query query("elloorld", simMetric, 3.0f); // query string, similarity metric, similarity threshold
  searcher.search(query, resultStringIDs);
  printSimilarStrings(strContainer, resultStringIDs);

  cout << "SAVING INDEX" << endl;
  indexer.saveIndex("ExampleIndex.ix");

  cout << "LOADING INDEX" << endl;
  // the loaded indexer is constructed from the string container only
  FtIndexerSimple<> indexerLoaded(&strContainer);
  indexerLoaded.loadIndex("ExampleIndex.ix");

  // run the same query again, this time against the loaded index
  resultStringIDs.clear();
  searcher.setFtIndexer(&indexerLoaded);
  searcher.search(query, resultStringIDs);
  printSimilarStrings(strContainer, resultStringIDs);

  cout << "----------------------" << endl << endl;
}

// Example 3: like example 2, but with the jaccard metric, a checksum filter
// instead of a length filter, and a ScanCountMerger passed to the searcher
// as a non-default merger type. The query is "elloworld" with threshold 0.4f.
void basicUsage3() {
  cout << "----- BASIC USAGE 3 ----" << endl;

  // create gramgenerator and similarity metric
  GramGenFixedLen gramGen(3); // using fixed-length grams
  SimMetricJacc simMetric(gramGen); // using the jaccard distance (using set semantics)

  // create simple indexer with default template arguments
  // default: in-memory index using Array as an inverted list container
  // first create a string container and fill it with strings to index
  StringContainerVector strContainer;
  strContainer.fillContainer("data/dummy.txt", 180); // fill the container from a datafile and use the first 180 lines
  FtIndexerSimple<> indexer(&strContainer, &gramGen);
  // add checksum filtering
  // NOTE(review): the original comment described 50 as "a maximum string
  // length", which looks copied from the LengthFilter examples - confirm what
  // this argument means for ChecksumFilter.
  indexer.addFilter(new ChecksumFilter(50));
  indexer.buildIndex();

  // create merger
  // NOTE(review): 180 matches the number of lines read from the data file,
  // so it is presumably the number of indexed strings - confirm.
  ScanCountMerger<> merger(180);
  // create searcher, specifying a non-default merger
  // NOTE(review): the template arguments were reconstructed as
  // <FtIndexerSimple<>, ScanCountMerger<> > - confirm against FtSearcherMem.
  FtSearcherMem<FtIndexerSimple<>, ScanCountMerger<> > searcher(&merger, &indexer);

  vector<unsigned> resultStringIDs;
  Query query("elloworld", simMetric, 0.4f); // query string, similarity metric, similarity threshold
  searcher.search(query, resultStringIDs);
  printSimilarStrings(strContainer, resultStringIDs);

  cout << "SAVING INDEX" << endl;
  indexer.saveIndex("ExampleIndex.ix");

  cout << "LOADING INDEX" << endl;
  // the loaded indexer is constructed from the string container only
  FtIndexerSimple<> indexerLoaded(&strContainer);
  indexerLoaded.loadIndex("ExampleIndex.ix");

  // run the same query again, this time against the loaded index
  resultStringIDs.clear();
  searcher.setFtIndexer(&indexerLoaded);
  searcher.search(query, resultStringIDs);
  printSimilarStrings(strContainer, resultStringIDs);

  cout << "----------------------" << endl << endl;
}