/* $Id: ondiskmergersimple.h 5750 2010-10-01 02:44:26Z abehm $ Copyright (C) 2010 by The Regents of the University of California Redistribution of this file is permitted under the terms of the BSD license. Date: 09/06/2008 Author: Alexander Behm */ #ifndef _ondiskmergersimple_h_ #define _ondiskmergersimple_h_ #include "ondiskmergerabs.h" #include "mergeoptmerger.h" #include #include #include template > class OnDiskMergerSimple : public OnDiskMergerAbs, InvList> { private: typedef OnDiskMergerAbs, InvList> SuperClass; MergeOptMerger* moMerger; // comparison function for ordering set< pair* >, GramListOnDiskCmpFunc > struct QueryGramListCmpFunc { bool operator()(QueryGramList* a, QueryGramList* b) { if(a->gramCode == b->gramCode) return dynamic_cast*>(a->gl)->startOffset < dynamic_cast*>(b->gl)->startOffset; return a->gl < b->gl; } }; public: vector numberStringsInLeaf; OnDiskMergerSimple(bool singleFilterOpt = true, bool hasDuplicateLists = false) : SuperClass(singleFilterOpt) { this->moMerger = new MergeOptMerger(hasDuplicateLists); } bool estimationParamsNeeded_Impl() { return false; } void setEstimationParams_Impl(float readInvListTimeSlope, float readInvListTimeIntercept, float readStringAvgTime) { } void merge_Impl(const Query& query, vector* >* >& arrays, const unsigned threshold, fstream& invListsFile, unsigned numberFilters, vector& results); string getName() { return "OnDiskMergerSimple"; } ~OnDiskMergerSimple() { delete moMerger; } }; template void OnDiskMergerSimple:: merge_Impl(const Query& query, vector* >* >& arrays, const unsigned threshold, fstream& invListsFile, unsigned numberFilters, vector& results) { // special case if one filter is applied if(numberFilters == 1 && this->singleFilterOpt && arrays.size() > 1) { typedef set*, QueryGramListCmpFunc> GLSet; tr1::unordered_map groupedGlists; // fill the groupedGlists for(unsigned i = 0; i < arrays.size(); i++) { vector* >* currentLists = arrays.at(i); for(unsigned j = 0; j < currentLists->size(); j++) { QueryGramList* qgl = currentLists->at(j); if(groupedGlists.find(qgl->gramCode) == groupedGlists.end()) groupedGlists[qgl->gramCode] = new GLSet(); groupedGlists[qgl->gramCode]->insert(qgl); } } // since a filter is applied each gram has several sorted sublists, // we fill all sublists belonging to the same gram with one sequantial I/O typename tr1::unordered_map::iterator iter; for(iter = groupedGlists.begin(); iter != groupedGlists.end(); iter++) { GLSet* currentLists = iter->second; // seek to start offset of first sorted sublist GramListOnDisk* firstGl = dynamic_cast*>( (*currentLists->begin())->gl ); invListsFile.seekg( firstGl->startOffset ); #ifdef DEBUG_STAT this->invListSeeks++; #endif // now fill sorted sublists in order in one sequential I/O // fillArray is implemented NOT to perform a disk seek typename GLSet::iterator setiter; GramListOnDisk* lastGl = dynamic_cast*>( (*currentLists->rbegin())->gl ); unsigned bytes = (unsigned)(lastGl->startOffset - firstGl->startOffset) + (lastGl->listSize * sizeof(typename InvList::value_type)); char* buffer = new char[bytes]; invListsFile.read(buffer, bytes); for(setiter = currentLists->begin(); setiter != currentLists->end(); setiter++) { QueryGramList* qgl = *setiter; GramListOnDisk* gl = dynamic_cast*>(qgl->gl); unsigned offset = (unsigned)(gl->startOffset - firstGl->startOffset); gl->fillArray(&buffer[offset]); } #ifdef DEBUG_STAT this->invListData += bytes; #endif delete[] buffer; } // cleanup groupedGlists; for(iter = groupedGlists.begin(); iter != groupedGlists.end(); iter++) delete iter->second; // search leaves one-by-one, // if one filter was applied the lists are already read from disk in the above code // if no filter or several filters are applied then getArray() will read the list using a random disk I/O for(unsigned i = 0; i < arrays.size(); i++) { vector* >* currentLeafLists = arrays.at(i); vector lists; for(unsigned j = 0; j < currentLeafLists->size(); j++) { InvList* arr = currentLeafLists->at(j)->gl->getArray(&invListsFile); lists.push_back(arr); } switch(this->pmf) { case PMF_CSF_REG: case PMF_CSF_OPT: { vector temp; moMerger->merge(lists, threshold, temp); for(unsigned j = 0; j < temp.size(); j++) if(this->charsumFilter->passesFilter(this->queryCharsumStats, &this->charsumStats[temp.at(j)], (unsigned)query.threshold)) results.push_back(temp.at(j)); } break; case PMF_LC: { vector temp; moMerger->merge(lists, threshold, temp); for(unsigned j = 0; j < temp.size(); j++) if(letterCountFilter(this->queryLcStats, &this->dataLcStats[temp.at(j)], this->lcCharNum, query.threshold)) results.push_back(temp.at(j)); } break; default: { moMerger->merge(lists, threshold, results); } break; } } } else { // search leaves one-by-one, // if one filter was applied the lists are already read from disk in the above code // if no filter or several filters are applied then getArray() will read the list using a random disk I/O for(unsigned i = 0; i < arrays.size(); i++) { vector* >* currentLeafLists = arrays.at(i); vector lists; for(unsigned j = 0; j < currentLeafLists->size(); j++) { InvList* arr = currentLeafLists->at(j)->gl->getArray(&invListsFile); lists.push_back(arr); #ifdef DEBUG_STAT this->invListSeeks++; this->invListData += arr->size() * sizeof(typename InvList::value_type); #endif } switch(this->pmf) { case PMF_CSF_REG: case PMF_CSF_OPT: { vector temp; moMerger->merge(lists, threshold, temp); for(unsigned j = 0; j < temp.size(); j++) if(this->charsumFilter->passesFilter(this->queryCharsumStats, &this->charsumStats[temp.at(j)], (unsigned)query.threshold)) results.push_back(temp.at(j)); } break; case PMF_LC: { vector temp; moMerger->merge(lists, threshold, temp); for(unsigned j = 0; j < temp.size(); j++) if(letterCountFilter(this->queryLcStats, &this->dataLcStats[temp.at(j)], this->lcCharNum, query.threshold)) results.push_back(temp.at(j)); } break; default: { moMerger->merge(lists, threshold, results); } break; } } } } #endif