/* $Id$ Copyright (C) 2007 by The Regents of the University of California Redistribution of this file is permitted under the terms of the BSD license Date: 09/17/2007 Author: Alexander Behm */ #ifndef _ftindexermemabs_h_ #define _ftindexermemabs_h_ #include "ftindexerabs.h" #include "gramlistsimple.h" template class FtIndexerMemAbs; template struct FtIndexerTraits > { typedef FtIndexerTraits TraitsType; typedef typename TraitsType::StringContainer StringContainer; typedef typename TraitsType::InvList InvList; }; template class FtIndexerMemAbs : public FtIndexerAbs > { public: typedef FtIndexerTraits TraitsType; typedef typename TraitsType::StringContainer StringContainer; typedef typename TraitsType::InvList InvList; protected: void addStringToNode(FilterTreeNode* node, const string& s, const unsigned stringId); typedef FtIndexerAbs > SuperClass; // used for reading/writing index from/to file, // using logical separation into functions so compressed indexers can use these, too void saveLeavesRec(FilterTreeNode* node, ofstream& fpOut); void loadLeavesRec(FilterTreeNode* node, ifstream& fpIn); public: typedef typename SuperClass::GramMap GramMap; FtIndexerMemAbs(StringContainer* strContainer) : SuperClass(strContainer) {} FtIndexerMemAbs(StringContainer* strContainer, GramGen* gramGenerator, const unsigned maxStrLen = 150, const unsigned ftFanout = 50) : SuperClass(strContainer, gramGenerator, maxStrLen, ftFanout) {} void addStringId(FilterTreeNode* leaf, const vector& gramCodes, unsigned stringId) { static_cast(this)->addStringId_Impl(leaf, gramCodes, stringId); } void saveIndex(const char* indexFileName) { static_cast(this)->saveIndex_Impl(indexFileName); } void loadIndex(const char* indexFileName) { static_cast(this)->loadIndex_Impl(indexFileName); } void buildIndex_Impl(const string& dataFile, const unsigned linesToRead = 0, bool showProgress = true, StatsUtil* sutil = NULL); void buildIndex_Impl(bool showProgress = true, StatsUtil* sutil = NULL); void printFilterTree(FilterTreeNode* node = NULL); }; template void FtIndexerMemAbs:: addStringToNode(FilterTreeNode* node, const string& s, const unsigned stringId) { // add stringId to distinctStringIDs and create the GramListSimple instance if neccessary if(!node->distinctStringIDs) node->distinctStringIDs = new GramListSimple(); node->distinctStringIDs->getArray()->append(stringId); // add stringId to gramMap of leaf vector gramCodes; this->gramGen->decompose(s, gramCodes); addStringId(node, gramCodes, stringId); } template void FtIndexerMemAbs:: buildIndex_Impl(const string& dataFile, const unsigned linesToRead, bool showProgress, StatsUtil* sutil) { this->strContainer->fillContainer(dataFile.c_str(), linesToRead, this->maxStrLen); this->buildIndex(showProgress, sutil); } template void FtIndexerMemAbs:: buildIndex_Impl(bool showProgress, StatsUtil* sutil) { // do we want to gather stats? if(sutil) { // fill stats information known at this time sutil->filterTreeStats.dictionarySize = this->strContainer->size(); sutil->filterTreeStats.maxStrLength = this->maxStrLen; sutil->filterTreeStats.numberFilters = this->filterTypes.size(); sutil->startTimeMeasurement(); } LoopTimer loopTimer; // build hollow tree, i.e. create nodes and leaves without filling inverted lists if(this->filterTypes.size() > 0 && this->ftFanout > 0) { this->buildHollowTreeRecursive(this->filterTreeRoot, 0); // add stringIds to tree if(showProgress) loopTimer.begin("INSERTING INTO INDEX", this->strContainer->size()); for(unsigned stringId = 0; stringId < this->strContainer->size(); stringId++) { FilterTreeNode* currentNode = this->filterTreeRoot; string currentString; this->strContainer->retrieveString(currentString, stringId); if(currentString.empty()) continue; // position currentNode to appropriate leaf node in filter tree for(unsigned filterId = 0; filterId < this->filterTypes.size(); filterId++) { AbstractFilter* filter = this->filterTypes.at(filterId); unsigned key = filter->getKey(currentString); vector*> >& children = currentNode->children; // match key to node for(unsigned childId = 0; childId < children.size(); childId++) { pair*>& child = children.at(childId); if(child.first >= key) { currentNode = child.second; break; } } } addStringToNode(currentNode, currentString, stringId); if(showProgress) loopTimer.next(); } if(showProgress) loopTimer.end(); } else { // no filters selected, so just create one leaf node to store all inverted lists FilterTreeNode* n = new FilterTreeNode(true); pair*> p(0, n); this->filterTreeRoot->children.push_back(p); if(showProgress) loopTimer.begin("INSERTING INTO INDEX", this->strContainer->size()); for(unsigned stringId = 0; stringId < this->strContainer->size(); stringId++) { string currentString; this->strContainer->retrieveString(currentString, stringId); if(currentString.empty()) continue; addStringToNode(n, currentString, stringId); if(showProgress) loopTimer.next(); } if(showProgress) loopTimer.end(); } // set next stringID this->nextStringID = this->strContainer->size(); // do we want to gather stats? if(sutil) { sutil->stopTimeMeasurement(); sutil->filterTreeStats.buildFilterTreeTime = sutil->getTimeMeasurement(); } } template void FtIndexerMemAbs:: saveLeavesRec(FilterTreeNode* node, ofstream& fpOut) { unsigned u; if(node->isLeaf) { if(node->distinctStringIDs) { InvList* distinctArr = node->distinctStringIDs->getArray(); // write distinct string ids u = distinctArr->size(); fpOut.write((const char*)&u, sizeof(unsigned)); for(unsigned i = 0; i < distinctArr->size(); i++) { u = distinctArr->at(i); fpOut.write((const char*)&u, sizeof(unsigned)); } } else { u = 0; fpOut.write((const char*)&u, sizeof(unsigned)); } // write gram map GramMap& gramMap = node->gramMap; u = gramMap.size(); fpOut.write((const char*)&u, sizeof(unsigned)); for(typename GramMap::iterator iter = gramMap.begin(); iter != gramMap.end(); iter++) { u = iter->first; fpOut.write((const char*)&u, sizeof(unsigned)); Array* arr = iter->second->getArray(); if(arr) { u = arr->size(); fpOut.write((const char*)&u, sizeof(unsigned)); for(unsigned i = 0; i < arr->size(); i++) { u = arr->at(i); fpOut.write((const char*)&u, sizeof(unsigned)); } } else { u = 0; fpOut.write((const char*)&u, sizeof(unsigned)); } } } else { vector*> >& children = node->children; for(unsigned i = 0; i < children.size(); i++) saveLeavesRec(children.at(i).second, fpOut); } } template void FtIndexerMemAbs:: loadLeavesRec(FilterTreeNode* node, ifstream& fpIn) { if(node->isLeaf) { if(node->distinctStringIDs) delete node->distinctStringIDs; unsigned distinctStrings; fpIn.read((char*)&distinctStrings, sizeof(unsigned)); if(distinctStrings > 0) node->distinctStringIDs = new GramListSimple(); for(unsigned i = 0; i < distinctStrings; i++) { unsigned tmp; fpIn.read((char*)&tmp, sizeof(unsigned)); node->distinctStringIDs->getArray()->append(tmp); } GramMap& gramMap = node->gramMap; unsigned gramMapSize; unsigned gramCode; unsigned arrSize; fpIn.read((char*)&gramMapSize, sizeof(unsigned)); for(unsigned j = 0; j < gramMapSize; j++) { fpIn.read((char*)&gramCode, sizeof(unsigned)); fpIn.read((char*)&arrSize, sizeof(unsigned)); if(arrSize != 0) { GramListSimple* gramList = new GramListSimple(); gramMap[gramCode] = gramList; InvList* newArr = gramList->getArray(); for(unsigned i = 0; i < arrSize; i++) { unsigned tmp; fpIn.read((char*)&tmp, sizeof(unsigned)); newArr->append(tmp); } } } } else { vector*> >& children = node->children; for(unsigned i = 0; i < children.size(); i++) loadLeavesRec(children.at(i).second, fpIn); } } template void FtIndexerMemAbs:: printFilterTree(FilterTreeNode* node) { if(!node) { printFilterTree(this->filterTreeRoot); return; } if(node->isLeaf()) { GramMap& gramMap = node->gramMap; for(typename GramMap::iterator iter = gramMap.begin(); iter != gramMap.end(); iter++) { Array* arr = iter->second->getArray(); cout << iter->first << " "; if(arr) { for(unsigned i = 0; i < arr->size(); i++) cout << arr->at(i) << " "; } cout << endl; } } else { vector*> >& children = node->children; for(unsigned i = 0; i < children.size(); i++) { cout << children.at(i).first << endl; printFilterTree(children.at(i).second); } } } #endif