/* $Id: ftindexerdiscardlists.h 5144 2010-03-24 22:06:23Z abehm $ Copyright (C) 2010 by The Regents of the University of California Redistribution of this file is permitted under the terms of the Academic BSD license. Date: 12/27/2007 Author: Alexander Behm */ #ifndef _ftindexerdiscardlists_h_ #define _ftindexerdiscardlists_h_ #include "ftindexermemabs.h" #include "ftindexermem.h" #include "common/simmetric.h" #include "listmerger/divideskipmerger.h" struct ValPair { unsigned gramCode; unsigned listSize; }; struct ValPairCmpFunc { bool operator()(const ValPair& vp1, const ValPair& vp2) const { ; return (vp1.listSize >= vp2.listSize); } }; template class FtIndexerDiscardListsAbs; template struct FtIndexerTraits > { typedef FtIndexerTraits TraitsType; typedef typename TraitsType::StringContainer StringContainer; typedef typename TraitsType::InvList InvList; }; template class FtIndexerDiscardListsAbs : public FtIndexerMemAbs > { public: typedef FtIndexerTraits TraitsType; typedef typename TraitsType::StringContainer StringContainer; typedef typename TraitsType::InvList InvList; #ifdef DEBUG_STAT typedef HolesGlobalIndexStats IxStatsType; #endif private: typedef FtIndexerMemAbs > SuperClass; protected: typedef typename SuperClass::GramMap GramMap; float reductionRatio; set holeGrams; bool duplicateGram(const unsigned gramCode, const vector& gramCodes, const unsigned iterStep); void fillStructsForHoles(set& orderedGramSet, unordered_map& countMap, long unsigned* totalListElements); void saveHoleGrams(ofstream& fpOut); void loadHoleGrams(ifstream& fpIn); public: FtIndexerDiscardListsAbs(StringContainer* strContainer) : SuperClass(strContainer) { #ifdef DEBUG_STAT this->ixStats = new HolesGlobalIndexStats(); #endif } FtIndexerDiscardListsAbs(StringContainer* strContainer, GramGen* gramGenerator, float reductionRatio, unsigned maxStrLen = 150, unsigned ftFanout = 50) : SuperClass(strContainer, gramGenerator, maxStrLen, ftFanout), reductionRatio(reductionRatio) { #ifdef 
DEBUG_STAT this->ixStats = new HolesGlobalIndexStats(); dynamic_cast(this->ixStats)->reductionRatio = reductionRatio; #endif } // statically delegate the prepare implementation to appropriate subclass void prepare_Impl() { static_cast(this)->chooseHoleGrams(); } void addStringId_Impl(FilterTreeNode* leaf, const vector& gramCodes, unsigned stringId); CompressionArgs* getCompressionArgs_Impl(); string getName() { return "FtIndexerDiscardListsAbs"; } void saveIndex_Impl(const char* indexFileName); void loadIndex_Impl(const char* indexFileName); }; template bool FtIndexerDiscardListsAbs:: duplicateGram(const unsigned gramCode, const vector& gramCodes, const unsigned iterStep) { for(signed i = (signed)iterStep - 1; i >= 0; i--) { if(i > 0) if(gramCode == gramCodes.at(i)) return true; } return false; } template void FtIndexerDiscardListsAbs:: fillStructsForHoles(set& orderedGramSet, unordered_map& countMap, long unsigned* totalListElements) { unsigned long tmpTotalListElements = 0; // compute gram counts for(unsigned stringId = 0; stringId < this->strContainer->size(); stringId++) { vector gramCodes; string currentString; this->strContainer->retrieveString(currentString, stringId); this->gramGen->decompose(currentString, gramCodes); for(unsigned i = 0; i < gramCodes.size(); i++) { unsigned gramCode = gramCodes.at(i); if(!duplicateGram(gramCode, gramCodes, i)) { if(countMap.find(gramCode) == countMap.end()) countMap[gramCode] = 1; else countMap[gramCode]++; tmpTotalListElements++; } } } // create set ordered descending by listSize for (unordered_map::iterator iter = countMap.begin(); iter != countMap.end(); iter++ ) { struct ValPair vp; vp.gramCode = iter->first; vp.listSize = iter->second; orderedGramSet.insert(vp); } *totalListElements = tmpTotalListElements; } template void FtIndexerDiscardListsAbs:: addStringId_Impl(FilterTreeNode* leaf, const vector& gramCodes, unsigned stringId) { for(unsigned i = 0; i < gramCodes.size(); i++) { unsigned gramCode = gramCodes.at(i); 
// ignore grams in set of holeGrams if(holeGrams.find(gramCode) != holeGrams.end()) continue; GramMap& gramMap = leaf->gramMap; if (gramMap.find(gramCode) == gramMap.end()) { // a new gram GramList* newGramList = new GramListSimple(); gramMap[gramCode] = newGramList; InvList* array = newGramList->getArray(); array->append(stringId); } else { // an existing gram GramList* gramList = gramMap[gramCode]; InvList* array = gramList->getArray(); //avoid adding duplicate elements if(array->at(array->size()-1) != stringId) array->append(stringId); } } } template CompressionArgs* FtIndexerDiscardListsAbs:: getCompressionArgs_Impl() { this->compressionArgs.blackList = &holeGrams; this->compressionArgs.holesOptimized = true; return &(this->compressionArgs); } template void FtIndexerDiscardListsAbs:: saveHoleGrams(ofstream& fpOut) { unsigned u; u = holeGrams.size(); fpOut.write((const char*)&u, sizeof(unsigned)); for(set::iterator iter = holeGrams.begin(); iter != holeGrams.end(); iter++) { u = *iter; fpOut.write((const char*)&u, sizeof(unsigned)); } } template void FtIndexerDiscardListsAbs:: loadHoleGrams(ifstream& fpIn) { unsigned nGrams; unsigned u; fpIn.read((char*)&nGrams, sizeof(unsigned)); for(unsigned i = 0; i < nGrams; i++) { fpIn.read((char*)&u, sizeof(unsigned)); holeGrams.insert(u); } } template void FtIndexerDiscardListsAbs:: saveIndex_Impl(const char* indexFileName) { ofstream fpOut; fpOut.open(indexFileName, ios::out); if(fpOut.is_open()) { this->saveBasicInfo(fpOut); this->saveLeavesRec(this->filterTreeRoot, fpOut); saveHoleGrams(fpOut); fpOut.close(); } else cout << "ERROR: could not open file " << indexFileName << endl; } template void FtIndexerDiscardListsAbs:: loadIndex_Impl(const char* indexFileName) { ifstream fpIn; fpIn.open(indexFileName, ios::in); if(fpIn.is_open()) { this->filterTypes.clear(); this->loadBasicInfo(fpIn); this->buildHollowTreeRecursive(this->filterTreeRoot, 0); this->loadLeavesRec(this->filterTreeRoot, fpIn); loadHoleGrams(fpIn); 
fpIn.close(); } else cout << "ERROR: could not open file " << indexFileName << endl; } template class FtIndexerDiscardListsLLF; template struct FtIndexerTraits > { typedef TStringContainer StringContainer; typedef TInvList InvList; }; template > class FtIndexerDiscardListsLLF : public FtIndexerDiscardListsAbs > { private: typedef FtIndexerDiscardListsAbs > SuperClass; public: FtIndexerDiscardListsLLF(StringContainer* strContainer) : SuperClass(strContainer) {} FtIndexerDiscardListsLLF(StringContainer* strContainer, GramGen* gramGen, float reductionRatio, unsigned maxStrLength = 150, unsigned ftFanout = 50) : SuperClass(strContainer, gramGen, reductionRatio, maxStrLength, ftFanout) { this->indexerType = HOLES_GLOBAL_LLF; } void chooseHoleGrams(); string getName() { return "FtIndexerDiscardListsLLF"; } }; template void FtIndexerDiscardListsLLF:: chooseHoleGrams() { set orderedGramSet; unordered_map countMap; unsigned long totalListElements = 0; this->fillStructsForHoles(orderedGramSet, countMap, &totalListElements); // create gram blacklist, i.e. 
which grams should be holes unsigned long memmax = (unsigned long)(totalListElements * this->reductionRatio); unsigned memsum = 0; set::iterator iter = orderedGramSet.begin(); while(memsum < memmax && iter != orderedGramSet.end()) { memsum += (*iter).listSize; this->holeGrams.insert((*iter).gramCode); iter++; } #ifdef DEBUG_STAT dynamic_cast(this->ixStats)->numberHoles = this->holeGrams.size(); #endif } template class FtIndexerDiscardListsSLF; template struct FtIndexerTraits > { typedef TStringContainer StringContainer; typedef TInvList InvList; }; template > class FtIndexerDiscardListsSLF : public FtIndexerDiscardListsAbs > { private: typedef FtIndexerDiscardListsAbs > SuperClass; public: FtIndexerDiscardListsSLF(StringContainer* strContainer) : SuperClass(strContainer) {} FtIndexerDiscardListsSLF(StringContainer* strContainer, GramGen* gramGen, float reductionRatio, unsigned maxStrLength = 150, unsigned ftFanout = 50) : SuperClass(strContainer, gramGen, reductionRatio, maxStrLength, ftFanout) { this->indexerType = HOLES_GLOBAL_SLF; } void chooseHoleGrams(); string getName() { return "FtIndexerDiscardListsSLF"; } }; template void FtIndexerDiscardListsSLF:: chooseHoleGrams() { set orderedGramSet; unordered_map countMap; unsigned long totalListElements = 0; this->fillStructsForHoles(orderedGramSet, countMap, &totalListElements); // create gram blacklist, i.e. 
which grams should be holes unsigned long memmax = (unsigned long)(totalListElements * this->reductionRatio); unsigned memsum = 0; set::reverse_iterator iter = orderedGramSet.rbegin(); while(memsum < memmax && iter != orderedGramSet.rend()) { memsum += (*iter).listSize; this->holeGrams.insert((*iter).gramCode); iter++; } #ifdef DEBUG_STAT dynamic_cast(this->ixStats)->numberHoles = this->holeGrams.size(); #endif } template class FtIndexerDiscardListsRandom; template struct FtIndexerTraits > { typedef TStringContainer StringContainer; typedef TInvList InvList; }; template > class FtIndexerDiscardListsRandom : public FtIndexerDiscardListsAbs > { private: typedef FtIndexerDiscardListsAbs > SuperClass; public: FtIndexerDiscardListsRandom(StringContainer* strContainer) : SuperClass(strContainer) {} FtIndexerDiscardListsRandom(StringContainer* strContainer, GramGen* gramGen, float reductionRatio, unsigned maxStrLength = 150, unsigned ftFanout = 50) : SuperClass(strContainer, gramGen, reductionRatio, maxStrLength, ftFanout) { this->indexerType = HOLES_GLOBAL_RANDOM; } void chooseHoleGrams(); string getName() { return "FtIndexerDiscardListsRandom"; } }; template void FtIndexerDiscardListsRandom:: chooseHoleGrams() { set orderedGramSet; unordered_map countMap; unsigned long totalListElements = 0; this->fillStructsForHoles(orderedGramSet, countMap, &totalListElements); // create gram blacklist, i.e. 
which grams should be holes unsigned long memmax = (unsigned long)(totalListElements * this->reductionRatio); unsigned memsum = 0; while(memsum < memmax && orderedGramSet.size() > 0) { unsigned rand = random() % orderedGramSet.size(); unsigned iterfind = 0; set::iterator iter = orderedGramSet.begin(); while(iterfind < rand) { iter++; iterfind++; } memsum += (*iter).listSize; this->holeGrams.insert((*iter).gramCode); orderedGramSet.erase(iter); } #ifdef DEBUG_STAT dynamic_cast(this->ixStats)->numberHoles = this->holeGrams.size(); #endif } // used by the cost based hole choosers typedef struct { unsigned length; vector gramCodes; char holes; unsigned originalCandidates; vector scanCounts; Array* nonZeroScanCounts; unsigned panicCost; unsigned originalThreshold; vector* > affectedLeaves; unordered_map gramOccur; bool isPanic; } QueryInfo; template class FtIndexerDiscardListsCostAbs; template struct FtIndexerTraits > { typedef FtIndexerTraits TraitsType; typedef typename TraitsType::StringContainer StringContainer; typedef typename TraitsType::InvList InvList; }; template class FtIndexerDiscardListsCostAbs : public FtIndexerDiscardListsAbs > { public: typedef FtIndexerTraits TraitsType; typedef typename TraitsType::StringContainer StringContainer; typedef typename TraitsType::InvList InvList; private: typedef FtIndexerDiscardListsAbs > SuperClass; protected: typedef typename SuperClass::GramMap GramMap; const vector* workload; const SimMetric* simMetric; float simThreshold; bool ratioCost; float dictSamplingFrac; float queriesSamplingFrac; void fillSampleSets(const bool trueRand, vector& dictSampleSet, vector& queriesSampleSet); bool cleanseOrderedGramSet(const unsigned long totalElements, const unsigned numberLists, set& orderedGramSet); void getFilterTreeLeaves(const FtIndexerMem<>& ftIndexer, FilterTreeNode >* node, const string& query, const vector& filterTypes, const unsigned filterId, vector >* >& leaves); unsigned initScanCountArray(const vector*> &arrays, const 
unsigned threshold, const unsigned arraySize, unsigned scanCountArray[]); void fillQueryInfo(const FtIndexerMem<>& sampleIndexer, const vector& queriesSampleSet, const unsigned dictSize, QueryInfo queryInfo[], unsigned& totalMergeElements, unsigned& totalCandidates, unsigned& totalPanicCandidates); void freeQueryInfo(QueryInfo queryInfo[], const unsigned size); public: FtIndexerDiscardListsCostAbs(StringContainer* strContainer) : SuperClass(strContainer) {} ; FtIndexerDiscardListsCostAbs(StringContainer* strContainer, GramGen* gramGen, float reductionRatio, const vector* workload, const SimMetric* simMetric, float simThreshold, bool ratioCost = true, float dictSamplingFrac = 0.01f, float queriesSamplingFrac = 0.25f, unsigned maxStrLen = 150, unsigned ftFanout = 50) : SuperClass(strContainer, gramGen, reductionRatio, maxStrLen, ftFanout), workload(workload), simMetric(simMetric), simThreshold(simThreshold), ratioCost(ratioCost), dictSamplingFrac(dictSamplingFrac), queriesSamplingFrac(queriesSamplingFrac) {} // statically delegate the chooseHoleGrams method to appropriate subclass void chooseHoleGrams() { static_cast(this)->chooseHoleGrams_Cost(); } }; template void FtIndexerDiscardListsCostAbs:: fillSampleSets(const bool trueRand, vector& dictSampleSet, vector& queriesSampleSet) { if(trueRand) { srand(time(NULL)); } else srand(150); // choose dictionary sample if(dictSamplingFrac < 1.0) { unsigned dictSampleStrings = (unsigned)ceil((float)this->strContainer->size() * dictSamplingFrac); for(unsigned i = 0; i < dictSampleStrings; i++) { unsigned stringIndex = rand() % this->strContainer->size(); string currentString; this->strContainer->retrieveString(currentString, stringIndex); dictSampleSet.push_back(currentString); } } else { for(unsigned i = 0; i < this->strContainer->size(); i++) { string currentString; this->strContainer->retrieveString(currentString, i); dictSampleSet.push_back(currentString); } } // choose queries if(queriesSamplingFrac < 1.0) { unsigned 
queriesSampleStrings = (unsigned)floor((float)workload->size() * queriesSamplingFrac); for(unsigned i = 0; i < queriesSampleStrings; i++) { unsigned stringIndex = rand() % workload->size(); queriesSampleSet.push_back(workload->at(stringIndex)); } } else { for(unsigned i = 0; i < workload->size(); i++) { queriesSampleSet.push_back(workload->at(i)); } } } template bool FtIndexerDiscardListsCostAbs:: cleanseOrderedGramSet(const unsigned long totalElements, const unsigned numberLists, set& orderedGramSet) { set::iterator del; bool found = false; unsigned avgListSize = (unsigned) floor( (double)totalElements / (double)numberLists ); unsigned threshold = avgListSize; unsigned long sum = 0; unsigned long memmax = (unsigned long)(totalElements * this->reductionRatio); for(set::iterator iter = orderedGramSet.begin(); iter != orderedGramSet.end(); iter++) { sum += iter->listSize; if(iter->listSize <= threshold && sum >= memmax) { del = iter; found = true; break; } } if(found) { orderedGramSet.erase(del, orderedGramSet.end()); return true; } else return false; } template void FtIndexerDiscardListsCostAbs:: getFilterTreeLeaves(const FtIndexerMem<>& ftIndexer, FilterTreeNode >* node, const string& query, const vector& filterTypes, const unsigned filterId, vector >* >& leaves) { // in case no filters were selected just add the leaf of the root if(filterTypes.size() <= 0) { leaves.push_back( node->children.at(0).node ); return; } if(node->isLeaf) leaves.push_back((FilterTreeNode<>*)node); else { AbstractFilter* filter = filterTypes.at(filterId); // get the bounds for this filter unsigned lbound, ubound; simMetric->getFilterBounds(query, simThreshold, filter, lbound, ubound); vector >& children = node->children; // nodeLbound and nodeUbound denote upper and lower bound of node that we are looking at unsigned nodeLbound, nodeUbound; nodeLbound = filter->getFilterLbound(); // for all children check if we need to recurse into them for(unsigned i = 0; i < children.size(); i++) { // 
selectively recurse into children, // i.e. recurse into all nodes that overlap with [lbound, ubound] nodeUbound = children.at(i).key; if(lbound <= nodeUbound && ubound >= nodeLbound) getFilterTreeLeaves(ftIndexer, children.at(i).node, query, filterTypes, filterId+1, leaves); nodeLbound = nodeUbound; } } } template unsigned FtIndexerDiscardListsCostAbs:: initScanCountArray(const vector*> &arrays, const unsigned threshold, const unsigned arraySize, unsigned scanCountArray[]) { unsigned resultSize = 0; for(unsigned i=0;i *v = arrays.at(i); for(unsigned j=0;jsize();j++) { unsigned id = v->at(j); scanCountArray[id]++; if (scanCountArray[id] == threshold ) resultSize++; } } return resultSize; } template void FtIndexerDiscardListsCostAbs:: fillQueryInfo(const FtIndexerMem<>& sampleIndexer, const vector& queriesSampleSet, const unsigned dictSize, QueryInfo queryInfo[], unsigned& totalMergeElements, unsigned& totalCandidates, unsigned& totalPanicCandidates) { totalMergeElements = 0; totalCandidates = 0; totalPanicCandidates = 0; for(unsigned i = 0; i < queriesSampleSet.size(); i++) { string currentString = queriesSampleSet.at(i); // fill gram occurences this->gramGen->decompose(currentString, queryInfo[i].gramCodes); for(unsigned g = 0; g < queryInfo[i].gramCodes.size(); g++) { unsigned gramCode = queryInfo[i].gramCodes.at(g); unordered_map* map = &queryInfo[i].gramOccur; if(map->find(gramCode) == map->end()) (*map)[gramCode] = 1; else ((*map)[gramCode])++; } queryInfo[i].length = currentString.size(); queryInfo[i].holes = 0; queryInfo[i].originalThreshold = simMetric->getMergeThreshold(currentString, simThreshold, (unsigned)0); getFilterTreeLeaves(sampleIndexer, sampleIndexer.filterTreeRoot, currentString, this->filterTypes, 0, queryInfo[i].affectedLeaves); unsigned nLeaves = queryInfo[i].affectedLeaves.size(); // init array of nonZeroScanCounts queryInfo[i].nonZeroScanCounts = new Array[nLeaves]; queryInfo[i].panicCost = 0; queryInfo[i].isPanic = false; 
queryInfo[i].originalCandidates = 0; for(unsigned j = 0; j < nLeaves; j++) { FilterTreeNode >* leaf = queryInfo[i].affectedLeaves.at(j); unsigned* newScanCount = new unsigned[dictSize]; memset(newScanCount, 0, dictSize * sizeof(unsigned)); queryInfo[i].scanCounts.push_back(newScanCount); vector *> invertedLists; leaf->getInvertedLists(currentString, *(this->gramGen), invertedLists); // fill panic cost if(leaf->distinctStringIDs) queryInfo[i].panicCost += leaf->distinctStringIDs->getArray()->size(); // if the query panics set a flag if((signed)queryInfo[i].originalThreshold <= 0) { queryInfo[i].isPanic = true; if(leaf->distinctStringIDs) totalPanicCandidates += leaf->distinctStringIDs->getArray()->size(); } else { // get other costs, i.e. merge and postprocess queryInfo[i].originalCandidates += initScanCountArray(invertedLists, queryInfo[i].originalThreshold, queriesSampleSet.size(), queryInfo[i].scanCounts.at(j)); // fill nonZeroScanCount unsigned* arr = queryInfo[i].scanCounts.at(j); Array* nonZeroScanCounts = &queryInfo[i].nonZeroScanCounts[j]; for(unsigned z = 0; z < dictSize; z++) if(arr[z] > 0) nonZeroScanCounts->append( &arr[z] ); } for(unsigned k = 0; k < invertedLists.size(); k++) totalMergeElements += invertedLists.at(k)->size(); } // sanity check if(queryInfo[i].originalCandidates > queryInfo[i].panicCost) cout << "INSANE " << queryInfo[i].originalCandidates << " " << queryInfo[i].panicCost << endl; totalCandidates += queryInfo[i].originalCandidates; } } template void FtIndexerDiscardListsCostAbs:: freeQueryInfo(QueryInfo queryInfo[], const unsigned size) { for(unsigned i = 0; i < size; i++) { unsigned scanCountSize = queryInfo[i].scanCounts.size(); for(unsigned j = 0; j < scanCountSize; j++) { unsigned* scanCountArr = queryInfo[i].scanCounts.at(j); if(scanCountArr) delete scanCountArr; } Array* nonZeroScanCounts = queryInfo[i].nonZeroScanCounts; if(nonZeroScanCounts) delete[] nonZeroScanCounts; } } template class FtIndexerDiscardListsTimeCost; template 
struct FtIndexerTraits > { typedef TStringContainer StringContainer; typedef TInvList InvList; }; template > class FtIndexerDiscardListsTimeCost : public FtIndexerDiscardListsCostAbs > { private: typedef FtIndexerDiscardListsCostAbs > SuperClass; protected: typedef typename SuperClass::GramMap GramMap; float avgPostprocessTime; float avgMergeTime; void estimateAvgPostprocessTime(const vector& dictSampleSet, const vector& queriesSampleSet, const QueryInfo queryInfo[], const unsigned iterations); void estimateAvgMergeTime(const vector& dictSampleSet, const vector& queriesSampleSet, const QueryInfo queryInfo[], const unsigned iterations); inline float getApproxPostprocessTime(const unsigned totalCandidates) { return avgPostprocessTime * (float)totalCandidates; } inline float getApproxMergeTime(const unsigned totalElements) { return avgMergeTime * (float)totalElements; } unsigned chooseGram(set& orderedGramSet, const vector& queriesSampleSet, const vector& dictSampleSet, GramMap& queriesMap, QueryInfo queryInfo[], unsigned& totalMergeElements, unsigned& totalCandidates, unsigned& totalPanicCandidates, unsigned long& memsum, float& bestRatio); unsigned calculateNewCandidates(const string& query, const unsigned threshold, const unsigned holeOccur, const unsigned arraySize, const Array& nonZeroScanCounts, const Array* affectedStrings, const bool makePermanent, unsigned scanCountArray[]); void simulateHole(const bool makePermanent, const unsigned hole, const unsigned dictSize, const vector& queriesSampleSet, GramMap& queriesMap, QueryInfo queryInfo[], unsigned& oldTotalMergeElements, unsigned& oldTotalCandidates, unsigned& oldTotalPanicCandidates, unsigned& newTotalMergeElements, unsigned& newTotalCandidates, unsigned& newTotalPanicCandidates); public: FtIndexerDiscardListsTimeCost(StringContainer* strContainer) : SuperClass(strContainer) {} ; FtIndexerDiscardListsTimeCost(StringContainer* strContainer, GramGen* gramGen, const float reductionRatio, const vector* workload, 
const SimMetric* simMetric, const float simThreshold, const bool ratioCost = false, const float dictSamplingFrac = 0.01f, const float queriesSamplingFrac = 0.25f, const unsigned maxStrLen = 150, const unsigned ftFanout = 50) : SuperClass(strContainer, gramGen, reductionRatio, workload, simMetric, simThreshold, ratioCost, dictSamplingFrac, queriesSamplingFrac, maxStrLen, ftFanout), avgPostprocessTime(0.0f), avgMergeTime(0.0f) { this->indexerType = HOLES_GLOBAL_TIMECOST; } void chooseHoleGrams_Cost(); string getName() { return "FtIndexerDiscardListsTimeCost"; } }; template void FtIndexerDiscardListsTimeCost:: estimateAvgPostprocessTime(const vector& dictSampleSet, const vector& queriesSampleSet, const QueryInfo queryInfo[], const unsigned iterations) { StatsUtil sutil; srand(150); float totalTime = 0.0; unsigned counter = 0; unsigned rep = 2; // for some reason this estimation is sometimes much better than the one below... // for(unsigned z = 0; z < iterations; z++) { // unsigned q = rand() % queriesSampleSet.size(); // string queryString = queriesSampleSet.at(q); // sutil.startTimeMeasurement(); // distanceMeasure->similar(queryString, queryString); // sutil.stopTimeMeasurement(); // totalTime += sutil.getTimeMeasurement(TFMSEC); // counter++; // } for(unsigned i = 0; i < rep; i++) { for(unsigned z = 0; z < queriesSampleSet.size(); z++) { unsigned q = z; for(unsigned j = 0; j < queryInfo[q].affectedLeaves.size(); j++) { FilterTreeNode<>* leaf = queryInfo[q].affectedLeaves.at(j); if(!leaf->distinctStringIDs) continue; InvList* distinctIDs = leaf->distinctStringIDs->getArray(); for(unsigned k = 0; k < distinctIDs->size(); k++) { string queryString = queriesSampleSet.at(q); string dictString = dictSampleSet.at(distinctIDs->at(k)); sutil.startTimeMeasurement(); (*(this->simMetric))(dictString, queryString); sutil.stopTimeMeasurement(); totalTime += sutil.getTimeMeasurement(TFMSEC); counter++; } } } } avgPostprocessTime = totalTime / (float)counter; } template void 
FtIndexerDiscardListsTimeCost:: estimateAvgMergeTime(const vector& dictSampleSet, const vector& queriesSampleSet, const QueryInfo queryInfo[], const unsigned iterations) { StatsUtil sutil; srand(150); float totalTime = 0.0; float totalElements = 0.0; DivideSkipMerger merger; unsigned rep = 2; for(unsigned i = 0; i < rep; i++) { for(unsigned z = 0; z < queriesSampleSet.size(); z++) { unsigned q = z; string query = queriesSampleSet.at(q); for(unsigned j = 0; j < queryInfo[q].affectedLeaves.size(); j++) { FilterTreeNode* leaf = queryInfo[q].affectedLeaves.at(j); vector invertedLists; leaf->getInvertedLists(query, *(this->gramGen), invertedLists); if(invertedLists.size() <= 0) continue; vector candidates; sutil.startTimeMeasurement(); merger.merge(invertedLists, queryInfo[q].originalThreshold, candidates); sutil.stopTimeMeasurement(); totalTime += sutil.getTimeMeasurement(TFMSEC); for(unsigned k = 0; k < invertedLists.size(); k++) totalElements += invertedLists.at(k)->size(); } } } if(totalElements > 0) avgMergeTime = totalTime / totalElements; else { //cout << "WARNING: forced estimated mergetime to zero! something is wrong. 
check ftixerholesglobal.cc" << endl; // just set some default value avgMergeTime = 1.0e-35; } } template unsigned FtIndexerDiscardListsTimeCost:: chooseGram(set& orderedGramSet, const vector& queriesSampleSet, const vector& dictSampleSet, GramMap& queriesMap, QueryInfo queryInfo[], unsigned& totalMergeElements, unsigned& totalCandidates, unsigned& totalPanicCandidates, unsigned long& memsum, float& bestRatio) { float currentRatio = 0; bestRatio = 0.0; pair bestGram; bestGram.first = 0; bestGram.second = 0; set::iterator remove_best; for(set::iterator iter = orderedGramSet.begin(); iter != orderedGramSet.end(); iter++) { unsigned gramHole = iter->gramCode; // simulate the hole and estimate new number of candidates and number of elements to merge unsigned newTotalMergeElements = totalMergeElements; unsigned newTotalCandidates = totalCandidates; unsigned newTotalPanicCandidates = totalPanicCandidates; simulateHole(false, gramHole, dictSampleSet.size(), queriesSampleSet, queriesMap, queryInfo, totalMergeElements, totalCandidates, totalPanicCandidates, newTotalMergeElements, newTotalCandidates, newTotalPanicCandidates); float approxMergeTime = getApproxMergeTime(newTotalMergeElements); float approxPostprocessTime = getApproxPostprocessTime(newTotalCandidates); float approxPanicTime = getApproxPostprocessTime(newTotalPanicCandidates); float approxTotalTime = approxMergeTime + approxPostprocessTime + approxPanicTime; if(this->ratioCost) currentRatio = (float)iter->listSize / (float)approxTotalTime; else currentRatio = 1.0 / (float)approxTotalTime; if(currentRatio > bestRatio) { bestGram.first = iter->gramCode; bestGram.second = iter->listSize; bestRatio = currentRatio; remove_best = iter; } } memsum += bestGram.second; orderedGramSet.erase(remove_best); return bestGram.first; } template unsigned FtIndexerDiscardListsTimeCost:: calculateNewCandidates(const string& query, const unsigned threshold, const unsigned holeOccur, const unsigned arraySize, const Array& 
nonZeroScanCounts, const Array* affectedStrings, const bool makePermanent, unsigned scanCountArray[]) { if(affectedStrings != NULL) { for(unsigned i = 0; i < affectedStrings->size(); i++) { unsigned id = affectedStrings->at(i); scanCountArray[id] -= holeOccur; //if( (signed)scanCountArray[id] < 0) scanCountArray[id] = 0; } } // use non-zero scan count vector to calculate new candidates unsigned resultSize = 0; unsigned end = nonZeroScanCounts.size(); for(unsigned i = 0; i < end; i++) if( *(nonZeroScanCounts.at(i)) >= threshold && (signed)*(nonZeroScanCounts.at(i)) > 0) resultSize++; // roll back if(!makePermanent) { if(affectedStrings != NULL) { for(unsigned i = 0; i < affectedStrings->size(); i++) { unsigned id = affectedStrings->at(i); scanCountArray[id] += holeOccur; } } } return resultSize; } template void FtIndexerDiscardListsTimeCost:: simulateHole(const bool makePermanent, const unsigned hole, const unsigned dictSize, const vector& queriesSampleSet, GramMap& queriesMap, QueryInfo queryInfo[], unsigned& oldTotalMergeElements, unsigned& oldTotalCandidates, unsigned& oldTotalPanicCandidates, unsigned& newTotalMergeElements, unsigned& newTotalCandidates, unsigned& newTotalPanicCandidates) { newTotalMergeElements = oldTotalMergeElements; newTotalCandidates = oldTotalCandidates; newTotalPanicCandidates = oldTotalPanicCandidates; if(queriesMap.find(hole) != queriesMap.end()) { GramList<>* gramList = queriesMap[hole]; Array* changedQueries = gramList->getArray(); for(unsigned i = 0; i < changedQueries->size(); i++) { unsigned changedQueryId = changedQueries->at(i); string query = queriesSampleSet.at(changedQueryId); unsigned holes = queryInfo[changedQueryId].holes + queryInfo[changedQueryId].gramOccur[hole]; vector* >* affectedLeaves = &queryInfo[changedQueryId].affectedLeaves; // calculate T signed T = (signed)this->simMetric->getMergeThreshold(query, this->simThreshold, holes); // check for panic unsigned currentCandidates = 0; if(T <= 0) { 
// NOTE(review): this copy of the file has lost its template argument lists
// (e.g. "vector dictSampleSet", "set orderedGramSet", "unordered_map countMap")
// and its original line breaks; compare against the pristine Flamingo
// ftindexerdiscardlists.h before attempting to compile. Comments below only
// annotate the visible logic.
//
// --- tail of FtIndexerDiscardListsTimeCost::simulateHole (starts above this chunk) ---
// Non-panic branch: for every affected leaf, re-estimate the candidate count via
// calculateNewCandidates() using that leaf's inverted list for the discarded gram
// (NULL if the leaf has no list), then subtract each such list's length from the
// running merge-element total. The "INSANE" print is a sanity check that a merged
// query never costs more than its panic fallback. When makePermanent is set, the
// per-query originalCandidates/holes counters and the old* running totals are
// committed.
//
// --- FtIndexerDiscardListsTimeCost::chooseHoleGrams_Cost() begins here ---
// Draws dictionary/query samples, then fills orderedGramSet (grams ordered by
// descending list size) and countMap (gram -> list length) plus the total number
// of inverted-list elements, via fillStructsForHoles().
if(!queryInfo[changedQueryId].isPanic) { newTotalPanicCandidates += queryInfo[changedQueryId].panicCost; if(makePermanent) queryInfo[changedQueryId].isPanic = true; } } else { for(unsigned j = 0; j < affectedLeaves->size(); j++) { GramMap& gramMap = affectedLeaves->at(j)->gramMap; Array* invertedList = NULL; if(gramMap.find(hole) != gramMap.end()) invertedList = gramMap[hole]->getArray(); currentCandidates += calculateNewCandidates(query, T, queryInfo[changedQueryId].gramOccur[hole], dictSize, queryInfo[changedQueryId].nonZeroScanCounts[j], invertedList, makePermanent, queryInfo[changedQueryId].scanCounts.at(j)); } for(unsigned j = 0; j < affectedLeaves->size(); j++) { GramMap& gramMap = affectedLeaves->at(j)->gramMap; if(gramMap.find(hole) != gramMap.end()) newTotalMergeElements -= gramMap[hole]->getArray()->size(); } } // sanity check if(currentCandidates > queryInfo[changedQueryId].panicCost) cout << "INSANE " << currentCandidates << " " << queryInfo[changedQueryId].panicCost << endl; newTotalCandidates -= queryInfo[changedQueryId].originalCandidates; newTotalCandidates += currentCandidates; if(makePermanent) { queryInfo[changedQueryId].originalCandidates = currentCandidates; //queryInfo[changedQueryId].holes++; queryInfo[changedQueryId].holes += queryInfo[changedQueryId].gramOccur[hole]; } } if(makePermanent) { oldTotalMergeElements = newTotalMergeElements; oldTotalCandidates = newTotalCandidates; oldTotalPanicCandidates = newTotalPanicCandidates; } } } template void FtIndexerDiscardListsTimeCost:: chooseHoleGrams_Cost() { vector dictSampleSet; vector queriesSampleSet; this->fillSampleSets(false, dictSampleSet, queriesSampleSet); unsigned queriesSampleSize = queriesSampleSet.size(); set orderedGramSet; unordered_map countMap; unsigned long totalListElements = 0; this->fillStructsForHoles(orderedGramSet, countMap, &totalListElements); // ATTENTION: heuristic for speedup // cleanse orderedGramSet of insignificant grams, e.g. 
// If the requested reductionRatio cannot be met even by discarding every gram
// list, blacklist ALL grams, warn on stdout, and return early. Otherwise build
// two throw-away in-memory indexes: one over the dictionary sample (with the
// configured partitioning filters cloned in) and one over the query sample; the
// temporary string containers are deleted once each index is built. queriesMap
// is then the gram map of the query index's single leaf, and fillQueryInfo()
// precomputes per-query merge/candidate/panic costs; the estimateAvg*Time()
// calls scale those costs to the full workload.
grams with listsize 1 bool memCanBeSatisfied = this->cleanseOrderedGramSet(totalListElements, countMap.size(), orderedGramSet); if(!memCanBeSatisfied) { for(set::iterator iter = orderedGramSet.begin(); iter != orderedGramSet.end(); iter++) { this->holeGrams.insert(iter->gramCode); } cout << "MEMORY REQUIREMENT CANNOT BE SATISFIED BY HOLES" << endl; return; } // create index to run the queries on using global holes indexer // create a temp string collection StringContainerVector* tmpDictContainer = new StringContainerVector(); tmpDictContainer->fillContainer(dictSampleSet.begin(), dictSampleSet.end()); FtIndexerMem<> sampleIndexer(tmpDictContainer, this->gramGen, this->maxStrLen, this->ftFanout); for(unsigned i = 0; i < this->filterTypes.size(); i++) sampleIndexer.addPartFilter(this->filterTypes.at(i)->clone()); sampleIndexer.buildIndex(); delete tmpDictContainer; // create index of queries // create a temp string collection StringContainerVector* tmpQueriesContainer = new StringContainerVector(); tmpQueriesContainer->fillContainer(queriesSampleSet.begin(), queriesSampleSet.end()); FtIndexerMem<> queriesIndexer(tmpQueriesContainer, this->gramGen, this->maxStrLen, this->ftFanout); queriesIndexer.buildIndex(); delete tmpQueriesContainer; GramMap& queriesMap = queriesIndexer.filterTreeRoot->children.at(0).node->gramMap; // pre-process queries and store information into queryInfo unsigned totalMergeElements = 0; unsigned totalCandidates = 0; unsigned totalPanicCandidates = 0; QueryInfo* queryInfo = new QueryInfo[queriesSampleSize]; this->fillQueryInfo(sampleIndexer, queriesSampleSet, dictSampleSet.size(), queryInfo, totalMergeElements, totalCandidates, totalPanicCandidates); estimateAvgPostprocessTime(dictSampleSet, queriesSampleSet, queryInfo, this->workload->size()); estimateAvgMergeTime(dictSampleSet, queriesSampleSet, queryInfo, this->workload->size()); unsigned long memmax = (unsigned long)(totalListElements * this->reductionRatio); unsigned long memsum = 0; 
// Greedy selection: until the discarded-element budget memmax is exhausted,
// chooseGram() picks the gram with the best benefit/cost ratio, the change is
// committed with a makePermanent simulateHole() call, and the gram is added to
// this->holeGrams. QueryInfo scratch memory is then released. End of the
// TimeCost chooseHoleGrams_Cost().
// After it: an (argument-stripped) forward declaration and traits
// specialization, then the FtIndexerDiscardListsPanicCost class declaration —
// note its second constructor's initializer list is cut off at the end of this
// chunk and resumes on the next original line of the file.
while(memsum < memmax) { unsigned chosenGram = 0; float bestRatio = 0.0; // choose gram chosenGram = chooseGram(orderedGramSet, queriesSampleSet, dictSampleSet, queriesMap, queryInfo, totalMergeElements, totalCandidates, totalPanicCandidates, memsum, bestRatio); // make change permanent unsigned tmpTotalMergeElements = totalMergeElements; unsigned tmpTotalCandidates = totalCandidates; unsigned tmpTotalPanicCandidates = totalPanicCandidates; simulateHole(true, chosenGram, dictSampleSet.size(), queriesSampleSet, queriesMap, queryInfo, totalMergeElements, totalCandidates, totalPanicCandidates, tmpTotalMergeElements, tmpTotalCandidates, tmpTotalPanicCandidates); // insert gram into blacklist this->holeGrams.insert(chosenGram); } // free QueryInfo memory this->freeQueryInfo(queryInfo, queriesSampleSet.size()); delete[] queryInfo; #ifdef DEBUG_STAT dynamic_cast(this->ixStats)->numberHoles = this->holeGrams.size(); #endif } template class FtIndexerDiscardListsPanicCost; template struct FtIndexerTraits > { typedef TStringContainer StringContainer; typedef TInvList InvList; }; template > class FtIndexerDiscardListsPanicCost : public FtIndexerDiscardListsCostAbs > { private: typedef FtIndexerDiscardListsCostAbs > SuperClass; protected: typedef typename SuperClass::GramMap GramMap; void simulateHole(unsigned hole, vector& queriesSampleSet, GramMap& queriesMap, QueryInfo queryInfo[], unsigned& oldPanics, unsigned& newPanics, bool makePermanent); public: FtIndexerDiscardListsPanicCost(StringContainer* strContainer) : SuperClass(strContainer) {} ; FtIndexerDiscardListsPanicCost(StringContainer* strContainer, GramGen* gramGen, const float reductionRatio, const vector* workload, const SimMetric* simMetric, const float simThreshold, const bool ratioCost = false, const float dictSamplingFrac = 0.01f, const float queriesSamplingFrac = 0.25f, const unsigned maxStrLen = 150, const unsigned ftFanout = 50) : SuperClass(strContainer, gramGen, reductionRatio, workload, simMetric, 
// Remainder of the FtIndexerDiscardListsPanicCost class declaration (its
// constructor's initializer list begins on the previous original line of the
// file); the ctor forwards all tuning parameters to SuperClass and tags the
// indexer as HOLES_GLOBAL_PANICCOST.
//
// --- FtIndexerDiscardListsPanicCost::simulateHole ---
// For each sampled query whose gram list contains `hole`: recompute the merge
// threshold T with the query's hole count increased by the number of times the
// gram occurs in it. T <= 0 means the query can no longer be answered by list
// merging and must panic (full scan), so its panicCost is added to newPanics —
// but only once per query (guarded by isPanic). With makePermanent the per-query
// hole counts and the oldPanics total are committed.
simThreshold, ratioCost, dictSamplingFrac, queriesSamplingFrac, maxStrLen, ftFanout) { this->indexerType = HOLES_GLOBAL_PANICCOST; } string getName() { return "FtIndexerDiscardListsPanicCost"; } void chooseHoleGrams_Cost(); }; template void FtIndexerDiscardListsPanicCost:: simulateHole(unsigned hole, vector& queriesSampleSet, GramMap& queriesMap, QueryInfo queryInfo[], unsigned& oldPanics, unsigned& newPanics, bool makePermanent) { if(queriesMap.find(hole) != queriesMap.end()) { GramList<>* gramList = queriesMap[hole]; Array* changedQueries = gramList->getArray(); for(unsigned i = 0; i < changedQueries->size(); i++) { unsigned changedQueryId = changedQueries->at(i); string query = queriesSampleSet.at(changedQueryId); unsigned holes = queryInfo[changedQueryId].holes + queryInfo[changedQueryId].gramOccur[hole]; // calculate T signed T = (signed)this->simMetric->getMergeThreshold(query, this->simThreshold, holes); // check for panic if(T <= 0) { if(!queryInfo[changedQueryId].isPanic) { newPanics += queryInfo[changedQueryId].panicCost; if(makePermanent) queryInfo[changedQueryId].isPanic = true; } } if(makePermanent) queryInfo[changedQueryId].holes += queryInfo[changedQueryId].gramOccur[hole]; } if(makePermanent) oldPanics = newPanics; } } template void FtIndexerDiscardListsPanicCost:: chooseHoleGrams_Cost() { vector dictSampleSet; vector queriesSampleSet; this->fillSampleSets(false, dictSampleSet, queriesSampleSet); unsigned queriesSampleSize = queriesSampleSet.size(); set orderedGramSet; unordered_map countMap; unsigned long totalListElements = 0; this->fillStructsForHoles(orderedGramSet, countMap, &totalListElements); // ATTENTION: heuristic for speedup // cleanse orderedGramSet of insignificant grams, e.g. 
// Panic-cost variant of chooseHoleGrams_Cost(): identical sampling and
// throw-away-index setup as the time-cost variant above (blacklist everything
// and bail if the reduction ratio is unreachable; build sampled dictionary and
// query indexes, deleting the temporary containers afterwards), but the greedy
// objective below considers only the total panic cost, not merge/postprocess
// time estimates.
grams with listsize 1 bool memCanBeSatisfied = this->cleanseOrderedGramSet(totalListElements, countMap.size(), orderedGramSet); if(!memCanBeSatisfied) { for(set::iterator iter = orderedGramSet.begin(); iter != orderedGramSet.end(); iter++) { this->holeGrams.insert(iter->gramCode); } cout << "MEMORY REQUIREMENT CANNOT BE SATISFIED BY HOLES" << endl; return; } // create index to run the queries on using global holes indexer // create a temp string collection StringContainerVector* tmpDictContainer = new StringContainerVector(); tmpDictContainer->fillContainer(dictSampleSet.begin(), dictSampleSet.end()); FtIndexerMem<> sampleIndexer(tmpDictContainer, this->gramGen, this->maxStrLen, this->ftFanout); for(unsigned i = 0; i < this->filterTypes.size(); i++) sampleIndexer.addPartFilter(this->filterTypes.at(i)->clone()); sampleIndexer.buildIndex(); delete tmpDictContainer; // create index of queries // create a temp string collection StringContainerVector* tmpQueriesContainer = new StringContainerVector(); tmpQueriesContainer->fillContainer(queriesSampleSet.begin(), queriesSampleSet.end()); FtIndexerMem<> queriesIndexer(tmpQueriesContainer, this->gramGen, this->maxStrLen, this->ftFanout); queriesIndexer.buildIndex(); delete tmpQueriesContainer; GramMap& queriesMap = queriesIndexer.filterTreeRoot->children.at(0).node->gramMap; // pre-process queryinfo unsigned totalMergeElements = 0; unsigned totalCandidates = 0; unsigned totalPanicCandidates = 0; QueryInfo* queryInfo = new QueryInfo[queriesSampleSize]; this->fillQueryInfo(sampleIndexer, queriesSampleSet, dictSampleSet.size(), queryInfo, totalMergeElements, totalCandidates, totalPanicCandidates); unsigned totalPanics = totalPanicCandidates; unsigned long memmax = (unsigned long)(totalListElements * this->reductionRatio); unsigned long memsum = 0; while(memsum < memmax) { float currentRatio = 0; float bestRatio = 0; pair bestGram; bestGram.first = 0; bestGram.second = 0; set::iterator remove; for(set::iterator iter = 
// Greedy loop body: for every remaining gram, simulate discarding it (non
// permanent) and score it as listSize/newTotalPanics (ratioCost) or
// 1/newTotalPanics; the best-scoring gram is erased from the set, counted
// against memsum, added to holeGrams, and committed via a makePermanent
// simulateHole().
// NOTE(review): `newTotalPanics` is unsigned, so the `<= 0` guard is
// effectively `== 0` — the clamp to 1 only prevents division by zero.
// NOTE(review): if no gram ever yields currentRatio > bestRatio (e.g. the set
// becomes empty while memsum < memmax, or every score is 0), `remove` stays
// uninitialized and erase(remove)/memsum += 0 give undefined behavior or an
// infinite loop — verify the loop's invariants against the pristine source.
orderedGramSet.begin(); iter != orderedGramSet.end(); iter++) { // simulate the hole and estimate new number of candidates and number of elements to merge unsigned newTotalPanics = totalPanics; simulateHole(iter->gramCode, queriesSampleSet, queriesMap, queryInfo, totalPanics, newTotalPanics, false); if(newTotalPanics <= 0) newTotalPanics = 1; if(this->ratioCost) currentRatio = (float)iter->listSize / (float)newTotalPanics; else currentRatio = 1.0 / (float)newTotalPanics; if(currentRatio > bestRatio) { bestGram.first = iter->gramCode; bestGram.second = iter->listSize; bestRatio = currentRatio; remove = iter; } } memsum += bestGram.second; orderedGramSet.erase(remove); this->holeGrams.insert(bestGram.first); // permanently record the hole for all affected queries unsigned newTotalPanics = totalPanics; simulateHole(bestGram.first, queriesSampleSet, queriesMap, queryInfo, totalPanics, newTotalPanics, true); //cout << "CHOSEN " << memsum << " " << memmax << " " << bestGram.first << " " << bestGram.second << " " << totalPanics << endl; } // free QueryInfo memory this->freeQueryInfo(queryInfo, queriesSampleSet.size()); delete[] queryInfo; #ifdef DEBUG_STAT dynamic_cast(this->ixStats)->numberHoles = this->holeGrams.size(); #endif } #endif