/* $Id: ftindexermemabs.h 5788 2010-10-22 10:09:57Z abehm $ Copyright (C) 2010 by The Regents of the University of California Redistribution of this file is permitted under the terms of the BSD license. Date: 04/04/2008 Author: Alexander Behm */ #ifndef _ftindexermemabs_h_ #define _ftindexermemabs_h_ #include "ftindexerabs.h" #include "gramlistsimple.h" #include "util/src/debug.h" #include template class FtIndexerMemAbs; template struct FtIndexerTraits > { typedef FtIndexerTraits TraitsType; typedef typename TraitsType::StringContainer StringContainer; typedef typename TraitsType::InvList InvList; }; template class FtIndexerMemAbs : public FtIndexerAbs > { public: typedef FtIndexerTraits TraitsType; typedef typename TraitsType::StringContainer StringContainer; typedef typename TraitsType::InvList InvList; protected: typedef FtIndexerAbs > SuperClass; // insert string into index (and not into stringcontainer) with a given id void insertStringIntoIndex(const string& str, unsigned stringId); void addStringToLeaf(FilterTreeNode* node, const string& s, const unsigned stringId); // used for reading/writing index from/to file, // using logical separation into functions so compressed indexers can use these, too void saveLeavesRec(FilterTreeNode* node, ofstream& fpOut); void loadLeavesRec(FilterTreeNode* node, ifstream& fpIn); public: typedef typename SuperClass::GramMap GramMap; FtIndexerMemAbs(StringContainer* strContainer) : SuperClass(strContainer) {} FtIndexerMemAbs(StringContainer* strContainer, GramGen* gramGenerator, const unsigned maxStrLen = 150, const unsigned ftFanout = 50) : SuperClass(strContainer, gramGenerator, maxStrLen, ftFanout) {} void buildIndex_Impl(const string& dataFile, const unsigned linesToRead = 0); void buildIndex_Impl(); // insert string into stringcontainer and into index void insertString_Impl(const string& str); // clear all inverted lists void clearInvertedLists(); void integrateUpdates_Impl(); // here are the statically resolved interfaces that a concrete indexer MUST implement // we use the CRTP design pattern void prepare() { static_cast(this)->prepare_Impl(); } void addStringId(FilterTreeNode* leaf, const vector& gramCodes, unsigned stringId) { static_cast(this)->addStringId_Impl(leaf, gramCodes, stringId); } void saveIndex(const char* indexFileName) { static_cast(this)->saveIndex_Impl(indexFileName); } void loadIndex(const char* indexFileName) { static_cast(this)->loadIndex_Impl(indexFileName); } CompressionArgs* getCompressionArgs() { return static_cast(this)->getCompressionArgs_Impl(); } string getName() { return "FtIndexerMemAbs"; } fstream* getInvListsFile_Impl() { return NULL; } void printFilterTree(FilterTreeNode* node = NULL); }; template void FtIndexerMemAbs:: integrateUpdates_Impl() { if(this->deletedStringIds.size() > 0) { unsigned numStrings = this->strContainer->size(); unsigned* stringIdMapping = new unsigned[numStrings]; for(unsigned i = 0; i < numStrings; i++) stringIdMapping[i] = i; this->strContainer->integrateUpdates(this->deletedStringIds, stringIdMapping); // brute force updating, scan all inverted lists and remap the string ids vector* > leaves; FilterTreeNode::getSubTreeLeaves(this->filterTreeRoot, leaves); for(unsigned i = 0; i < leaves.size(); i++) { unordered_set processedInvLists; typename SuperClass::GramMap& gramMap = leaves[i]->gramMap; typename SuperClass::GramMap::iterator iter; vector emptyInvLists; for(iter = gramMap.begin(); iter != gramMap.end(); iter++) { InvList* invList = iter->second->getArray(); uintptr_t invListAddr = (uintptr_t)invList; // account for potentially combined lists (several grams point to same physical list) if(processedInvLists.find(invListAddr) == processedInvLists.end()) { InvList newInvList; for(unsigned j = 0; j < invList->size(); j++) { if(stringIdMapping[(*invList)[j]] != 0xFFFFFFFF) { newInvList.push_back(stringIdMapping[(*invList)[j]]); } } if(newInvList.size() > 0) std::swap(*invList, newInvList); else emptyInvLists.push_back(iter->first); processedInvLists.insert(invListAddr); } } // remove empty inverted lists from gram map for(unsigned i = 0; i < emptyInvLists.size(); i++) { delete gramMap[emptyInvLists[i]]; gramMap.erase(emptyInvLists[i]); } } delete[] stringIdMapping; this->deletedStringIds.clear(); } } template void FtIndexerMemAbs:: insertString_Impl(const string& str) { unsigned stringId = this->strContainer->insertString(str); insertStringIntoIndex(str, stringId); } template void FtIndexerMemAbs:: insertStringIntoIndex(const string& str, unsigned stringId) { if(str.empty()) return; FilterTreeNode* homeLeaf = this->findHomeLeafNode(str); if(homeLeaf) this->addStringToLeaf(homeLeaf, str, stringId); } template void FtIndexerMemAbs:: addStringToLeaf(FilterTreeNode* node, const string& s, const unsigned stringId) { // add stringId to distinctStringIDs and create the GramListSimple instance if neccessary if(!node->distinctStringIDs) node->distinctStringIDs = new GramListSimple(); node->distinctStringIDs->getArray()->push_back(stringId); // add stringId to gramMap of leaf vector gramCodes; this->gramGen->decompose(s, gramCodes); addStringId(node, gramCodes, stringId); } template void FtIndexerMemAbs:: buildIndex_Impl(const string& dataFile, const unsigned linesToRead) { this->strContainer->fillContainer(dataFile.c_str(), linesToRead, this->maxStrLen); this->buildIndex(); } template void FtIndexerMemAbs:: buildIndex_Impl() { #ifdef DEBUG_STAT this->ixStats->dictSize = this->strContainer->size(); this->ixStats->gramLen = this->gramGen->getGramLength(); this->ixStats->maxStrLen = this->maxStrLen; this->ixStats->ftFanout = this->ftFanout; this->ixStats->partFilters = this->filterTypes.size(); this->sutil.startTimeMeasurement(); #endif // prepare compressor for building index prepare(); // build empty filtertree this->buildHollowTreeRecursive(this->filterTreeRoot, 0); TIMER_START("INSERTING INTO INDEX", this->strContainer->size()); for(unsigned stringId = 0; stringId < this->strContainer->size(); stringId++) { string currentString; this->strContainer->retrieveString(currentString, stringId); insertStringIntoIndex(currentString, stringId); TIMER_STEP(); } TIMER_STOP(); #ifdef DEBUG_STAT this->sutil.stopTimeMeasurement(); this->ixStats->buildTime = this->sutil.getTimeMeasurement(); #endif } template void FtIndexerMemAbs:: clearInvertedLists() { vector*> leaves; FilterTreeNode::getSubTreeLeaves(this->filterTreeRoot, leaves); for(unsigned i = 0; i < leaves.size(); i++) { GramMap& gramMap = leaves[i]->gramMap; typename GramMap::iterator iter; for(iter = gramMap.begin(); iter != gramMap.end(); iter++) delete iter->second; gramMap.clear(); } } template void FtIndexerMemAbs:: saveLeavesRec(FilterTreeNode* node, ofstream& fpOut) { unsigned u; if(node->isLeaf()) { if(node->distinctStringIDs) { InvList* distinctArr = node->distinctStringIDs->getArray(); // write distinct string ids u = distinctArr->size(); fpOut.write((const char*)&u, sizeof(unsigned)); for(unsigned i = 0; i < distinctArr->size(); i++) { u = distinctArr->at(i); fpOut.write((const char*)&u, sizeof(unsigned)); } } else { u = 0; fpOut.write((const char*)&u, sizeof(unsigned)); } // write gram map GramMap& gramMap = node->gramMap; u = gramMap.size(); fpOut.write((const char*)&u, sizeof(unsigned)); for(typename GramMap::iterator iter = gramMap.begin(); iter != gramMap.end(); iter++) { u = iter->first; fpOut.write((const char*)&u, sizeof(unsigned)); InvList* arr = iter->second->getArray(); if(arr) { u = arr->size(); fpOut.write((const char*)&u, sizeof(unsigned)); for(unsigned i = 0; i < arr->size(); i++) { u = arr->at(i); fpOut.write((const char*)&u, sizeof(unsigned)); } } else { u = 0; fpOut.write((const char*)&u, sizeof(unsigned)); } } } else { vector >& children = node->children; for(unsigned i = 0; i < children.size(); i++) saveLeavesRec(children[i].node, fpOut); } } template void FtIndexerMemAbs:: loadLeavesRec(FilterTreeNode* node, ifstream& fpIn) { if(node->isLeaf()) { if(node->distinctStringIDs) delete node->distinctStringIDs; unsigned distinctStrings; fpIn.read((char*)&distinctStrings, sizeof(unsigned)); if(distinctStrings > 0) node->distinctStringIDs = new GramListSimple(); for(unsigned i = 0; i < distinctStrings; i++) { unsigned tmp; fpIn.read((char*)&tmp, sizeof(unsigned)); node->distinctStringIDs->getArray()->push_back(tmp); } GramMap& gramMap = node->gramMap; unsigned gramMapSize; unsigned gramCode; unsigned arrSize; fpIn.read((char*)&gramMapSize, sizeof(unsigned)); for(unsigned j = 0; j < gramMapSize; j++) { fpIn.read((char*)&gramCode, sizeof(unsigned)); fpIn.read((char*)&arrSize, sizeof(unsigned)); if(arrSize != 0) { GramListSimple* gramList = new GramListSimple(); gramMap[gramCode] = gramList; InvList* newArr = gramList->getArray(); for(unsigned i = 0; i < arrSize; i++) { unsigned tmp; fpIn.read((char*)&tmp, sizeof(unsigned)); newArr->push_back(tmp); } } } } else { vector >& children = node->children; for(unsigned i = 0; i < children.size(); i++) loadLeavesRec(children[i].node, fpIn); } } template void FtIndexerMemAbs:: printFilterTree(FilterTreeNode* node) { if(!node) { printFilterTree(this->filterTreeRoot); return; } if(node->isLeaf()) { GramMap& gramMap = node->gramMap; for(typename GramMap::iterator iter = gramMap.begin(); iter != gramMap.end(); iter++) { InvList* arr = iter->second->getArray(); cout << iter->first << " "; if(arr) { for(unsigned i = 0; i < arr->size(); i++) cout << arr->at(i) << " "; } cout << endl; } } else { vector*> >& children = node->children; for(unsigned i = 0; i < children.size(); i++) { cout << children[i].first << endl; printFilterTree(children[i].second); } } } #endif