/*
  $Id: ondiskmergersimple.h 5149 2010-03-24 23:37:18Z abehm $

  Copyright (C) 2010 by The Regents of the University of California

  Redistribution of this file is permitted under
  the terms of the BSD license.

  Date: 09/06/2008
  Author: Alexander Behm
*/

#ifndef _ondiskmergersimple_h_
#define _ondiskmergersimple_h_

#include "ondiskmergerabs.h"
#include "divideskipmerger.h"

#include <fstream>
#include <set>
#include <vector>
#include <tr1/unordered_map>

// Simple on-disk merger: reads the inverted lists of a query's grams from disk
// and delegates the in-memory list merging to DivideSkipMerger.
template <class InvList = vector<unsigned> >
class OnDiskMergerSimple : public OnDiskMergerAbs<OnDiskMergerSimple<InvList>, InvList> {

private:
  typedef OnDiskMergerAbs<OnDiskMergerSimple<InvList>, InvList> SuperClass;

  DivideSkipMerger<InvList>* dsMerger;

  // comparison function for ordering sets of QueryGramList pointers:
  // lists of the same gram are ordered by their start offset on disk
  struct QueryGramListCmpFunc {
    bool operator()(QueryGramList<InvList>* a, QueryGramList<InvList>* b) {
      if(a->gramCode == b->gramCode)
        return dynamic_cast<GramListOnDisk<InvList>*>(a->gl)->startOffset
          < dynamic_cast<GramListOnDisk<InvList>*>(b->gl)->startOffset;
      return a->gl < b->gl;
    }
  };

public:
  vector<unsigned> numberStringsInLeaf;

  OnDiskMergerSimple(bool singleFilterOpt = true, bool hasDuplicateLists = false)
    : SuperClass(singleFilterOpt) {
    this->dsMerger = new DivideSkipMerger<InvList>(hasDuplicateLists);
  }

  bool estimationParamsNeeded_Impl() { return false; }
  void setEstimationParams_Impl(float readInvListTimeSlope, float readInvListTimeIntercept, float readStringAvgTime) { }

  void merge_Impl(const Query& query,
                  vector<vector<QueryGramList<InvList>* >* >& arrays,
                  const unsigned threshold,
                  fstream& invListsFile,
                  unsigned numberFilters,
                  vector<unsigned>& results);

  string getName() { return "OnDiskMergerSimple"; }

  ~OnDiskMergerSimple() { delete dsMerger; }
};

template<class InvList>
void
OnDiskMergerSimple<InvList>::
merge_Impl(const Query& query,
           vector<vector<QueryGramList<InvList>* >* >& arrays,
           const unsigned threshold,
           fstream& invListsFile,
           unsigned numberFilters,
           vector<unsigned>& results) {

  // special case if exactly one filter is applied
  if(numberFilters == 1 && this->singleFilterOpt && arrays.size() > 1) {

    typedef set<QueryGramList<InvList>*, QueryGramListCmpFunc> GLSet;
    tr1::unordered_map<unsigned, GLSet*> groupedGlists;

    // group the lists of all leaves by gram code
    for(unsigned i = 0; i < arrays.size(); i++) {
      vector<QueryGramList<InvList>* >* currentLists = arrays.at(i);
      for(unsigned j = 0; j < currentLists->size(); j++) {
        QueryGramList<InvList>* qgl = currentLists->at(j);
        if(groupedGlists.find(qgl->gramCode) == groupedGlists.end())
          groupedGlists[qgl->gramCode] = new GLSet();
        groupedGlists[qgl->gramCode]->insert(qgl);
      }
    }

    // since a filter is applied, each gram has several sorted sublists;
    // we fill all sublists belonging to the same gram with one sequential I/O
    typename tr1::unordered_map<unsigned, GLSet*>::iterator iter;
    for(iter = groupedGlists.begin(); iter != groupedGlists.end(); iter++) {
      GLSet* currentLists = iter->second;

      // seek to the start offset of the first sorted sublist
      GramListOnDisk<InvList>* firstGl = dynamic_cast<GramListOnDisk<InvList>*>( (*currentLists->begin())->gl );
      invListsFile.seekg( firstGl->startOffset );

#ifdef DEBUG_STAT
      this->numberSeeks++;
#endif

      // now fill the sorted sublists in order with one sequential I/O;
      // fillArray is implemented NOT to perform a disk seek
      typename GLSet::iterator setiter;
      GramListOnDisk<InvList>* lastGl = dynamic_cast<GramListOnDisk<InvList>*>( (*currentLists->rbegin())->gl );
      unsigned bytes = (unsigned)(lastGl->startOffset - firstGl->startOffset) + (lastGl->listSize * InvList::elementSize());
      char* buffer = new char[bytes];
      invListsFile.read(buffer, bytes);
      for(setiter = currentLists->begin(); setiter != currentLists->end(); setiter++) {
        QueryGramList<InvList>* qgl = *setiter;
        GramListOnDisk<InvList>* gl = dynamic_cast<GramListOnDisk<InvList>*>(qgl->gl);
        unsigned offset = (unsigned)(gl->startOffset - firstGl->startOffset);
        gl->fillArray(&buffer[offset]);
      }
      delete[] buffer;
    }

    // clean up groupedGlists
    for(iter = groupedGlists.begin(); iter != groupedGlists.end(); iter++)
      delete iter->second;

    // search the leaves one-by-one;
    // since one filter was applied, the lists were already read from disk above,
    // so getArray() returns the filled lists without additional disk I/O
    for(unsigned i = 0; i < arrays.size(); i++) {
      vector<QueryGramList<InvList>* >* currentLeafLists = arrays.at(i);
      vector<InvList*> lists;
      for(unsigned j = 0; j < currentLeafLists->size(); j++) {
        InvList* arr = currentLeafLists->at(j)->gl->getArray(&invListsFile);
        lists.push_back(arr);
      }

      if(this->charsumStats) {
        vector<unsigned> temp;
        dsMerger->merge(lists, threshold, temp);
        for(unsigned j = 0; j < temp.size(); j++)
          if(this->charsumFilter->passesFilter(this->queryCharsumStats, &this->charsumStats[temp.at(j)], (unsigned)query.threshold))
            results.push_back(temp.at(j));
      }
      else {
        dsMerger->merge(lists, threshold, results);
      }
    }
  }
  else {
    // search the leaves one-by-one;
    // no filter (or several filters) was applied, so getArray() reads each list using a random disk I/O
    for(unsigned i = 0; i < arrays.size(); i++) {
      vector<QueryGramList<InvList>* >* currentLeafLists = arrays.at(i);
      vector<InvList*> lists;
      for(unsigned j = 0; j < currentLeafLists->size(); j++) {
        InvList* arr = currentLeafLists->at(j)->gl->getArray(&invListsFile);
        lists.push_back(arr);
#ifdef DEBUG_STAT
        this->numberSeeks++;
#endif
      }

      if(this->charsumStats) {
        vector<unsigned> temp;
        dsMerger->merge(lists, threshold, temp);
        for(unsigned j = 0; j < temp.size(); j++)
          if(this->charsumFilter->passesFilter(this->queryCharsumStats, &this->charsumStats[temp.at(j)], (unsigned)query.threshold))
            results.push_back(temp.at(j));
      }
      else {
        dsMerger->merge(lists, threshold, results);
      }
    }
  }
}

#endif
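/*
  Illustrative sketch (not part of the original header). The single-filter path in
  merge_Impl above groups all sorted sublists of one gram and fetches them with a
  single seek plus one sequential read, instead of one random I/O per sublist. The
  standalone snippet below demonstrates that buffering idea using only plain
  fstream offsets; the names SubListInfo and readSubListsSequentially are
  hypothetical and do not exist in the Flamingo code. It is kept inside this
  comment so the header itself is unaffected.

  #include <cstddef>
  #include <fstream>
  #include <vector>

  struct SubListInfo {
    std::streamoff startOffset;  // byte offset of the sublist in the file
    std::size_t sizeInBytes;     // length of the sublist in bytes
  };

  // 'subLists' must be sorted by startOffset; the sublists are assumed to be laid
  // out back-to-back on disk, as the sorted sublists of one gram are above.
  // We read from the first offset through the end of the last sublist with one
  // sequential read, then carve the individual sublists out of the buffer.
  std::vector<std::vector<char> >
  readSubListsSequentially(std::fstream& file, const std::vector<SubListInfo>& subLists) {
    std::vector<std::vector<char> > result;
    if(subLists.empty()) return result;
    const SubListInfo& first = subLists.front();
    const SubListInfo& last = subLists.back();
    std::size_t totalBytes = (std::size_t)(last.startOffset - first.startOffset) + last.sizeInBytes;
    std::vector<char> buffer(totalBytes);
    file.seekg(first.startOffset);        // one seek ...
    file.read(&buffer[0], totalBytes);    // ... and one sequential read
    for(std::size_t i = 0; i < subLists.size(); i++) {
      std::size_t offset = (std::size_t)(subLists[i].startOffset - first.startOffset);
      result.push_back(std::vector<char>(buffer.begin() + offset,
                                         buffer.begin() + offset + subLists[i].sizeInBytes));
    }
    return result;
  }
*/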