/* $Id: utilities.cc 5149 2010-03-24 23:37:18Z abehm $ Copyright (C) 2010 by The Regents of the University of California Redistribution of this file is permitted under the terms of the BSD license. Author: Jiaheng Lu Date: 05/11/2007 */ #include "utilities.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "heap.h" #include "showutil.h" using namespace std; typedef set > UnsignedSet; unsigned getTotalLogSize(const vector*> &lists) { unsigned totalSize = 0; unsigned numberOfLists = lists.size(); for(unsigned i=0;i< numberOfLists;i++) totalSize +=(unsigned)ceil(log(lists.at(i)->size())); return totalSize; }//end getTotalSize unsigned getTotalSize(const vector*> &lists) { unsigned totalSize = 0; unsigned numberOfLists = lists.size(); for(unsigned i=0;i< numberOfLists;i++) { totalSize +=lists.at(i)->size(); // cout <<"i is " << i << //" size is "<< lists.at(i)->size() < &result1, const vector &result2) { if (result1.size() != result2.size()) return false; UnsignedSet s1 (result1.begin(),result1.end()); UnsignedSet s2 (result2.begin(),result2.end()); UnsignedSet::iterator ite1 = s1.begin(); UnsignedSet::iterator ite2 = s2.begin(); while (ite1!= s1.end() ) { if (*ite1 != *ite2) return false; ite1++; ite2++; }//end while return true; }// end testConsistent bool binarySearch(Array *v, unsigned value, unsigned start, unsigned end) { int low = start; int high = end - 1; int middle = (low+high+1)/2; bool found = false; do { if (value == v->at(middle)) return true; else if ( value < v->at(middle)) high = middle - 1; else low = middle + 1; middle = (low+high+1)/2; } while ((low <=high) && (found == false) ); return found; }//end binarySearch /* Get statics data frbom running */ void getStatistics(const vector*> &arrays, unsigned threshold, unsigned longListsSize, const vector &partialResults, const vector &results) { cout<<"~~~~~~~~~~~~~~"<*> *longLists, vector*> *shortLists, const unsigned threshold, const vector*> &originalLists, unsigned sortedIndex[], unsigned longListsSize) { unsigned originalListSize = originalLists.size(); for(unsigned i=0;ipush_back(originalLists.at(sortedIndex[originalListSize-i-1])); else shortLists->push_back(originalLists.at(sortedIndex[originalListSize-i-1])); }//end for }//end splitTwoSets void splitTwoSizeSets(vector *longLists, vector *shortLists, const unsigned threshold, const vector &originalLists, unsigned sortedIndex[], unsigned longListsSize) { unsigned originalListSize = originalLists.size(); for(unsigned i=0;ipush_back(originalLists.at(sortedIndex[originalListSize-i-1])); else shortLists->push_back(originalLists.at(sortedIndex[originalListSize-i-1])); }//end for }//end splitTwoSets void splitTwoSetsWithDuplicates(vector*> &longLists, vector*> &shortLists, vector &longListsWeights, vector &shortListsWeights, const vector*> &originalLists, const vector &originalWeights, const unsigned shortListsSize) { unsigned originalListSize = originalLists.size(); for(unsigned i=0;i0) { longLists.push_back(originalLists.at(i)); longListsWeights.push_back(listSizeforLonger); } i++; for(; i*> *longLists, vector*> *shortLists, const unsigned threshold, const vector*> &originalLists, unsigned sortedIndex[]) { unsigned originalListSize = originalLists.size(); for(unsigned i=0;ipush_back(originalLists.at(sortedIndex[originalListSize-i-1])); else shortLists->push_back(originalLists.at(sortedIndex[originalListSize-i-1])); }//end for }//end separateTwoSets /* Use binray search to search the value in search set of lists; change the count */ void binarySearchSet(unsigned &count, vector* > *lists, unsigned data) { vector*>::iterator ite = lists->begin(); while(ite != lists->end()) { Array *v = *ite; if (binarySearch(v,data,0,v->size())) count++; ite++; }//end while }//end binarySearchSet void sortBySize(const vector &allLists, unsigned sortedIndex [] ) { vector::const_iterator ite = allLists.begin(); unsigned sizeHeap[allLists.size()]; unsigned indexHeap[allLists.size()]; unsigned index = 0; unsigned heapSize = 0; while (ite != allLists.end()) { unsigned sizeOfList = *ite; heapInsert(sizeOfList,index++,sizeHeap,indexHeap,heapSize); ite++; }//end while index = 0; while (heapSize > 0) { sortedIndex[index++] = indexHeap[0]; heapDelete(sizeHeap,indexHeap,heapSize); }//end while }//end sortInvertedList /* Sort the inverted list with increasing order according to the their sizes by heapSort This function is used in MergeOpt. */ void sortBySizeOfLists(const vector*> &allLists, unsigned sortedIndex [] ) { vector*>::const_iterator ite = allLists.begin(); unsigned sizeHeap[allLists.size()]; unsigned indexHeap[allLists.size()]; unsigned index = 0; unsigned heapSize = 0; while (ite != allLists.end()) { unsigned sizeOfList = (*ite)->size(); heapInsert(sizeOfList,index++,sizeHeap,indexHeap,heapSize); ite++; }//end while index = 0; while (heapSize > 0) { sortedIndex[index++] = indexHeap[0]; heapDelete(sizeHeap,indexHeap,heapSize); }//end while }//end sortInvertedList /* Insert items to heap This function is used in binarySearchMerger */ void insertToHeaps(unsigned dataHeap[], unsigned indexHeap[], unsigned &heapSize, const vector< Array* > &lists, unsigned pointersIndexList[], unsigned vectorIndexContainer[], unsigned containerSize) { for(unsigned i=0;iat(position); heapInsert(newData,index,dataHeap,indexHeap, heapSize); }//end for }//end insertToHeaps /* This fucntion skips nodes for mergebinarySeach algorithm */ void skipNodes (const vector* > &lists, unsigned vectorIndexContainer[], unsigned containerSize, unsigned pivotData, unsigned pointersIndexList[]) { for(unsigned i=0;ibinarySearch(pivotData,oldPosition); // cout<< "new data" << lists.at(j)->at( pointersIndexList[j]) << // " old data " << pivotData <at( pointersIndexList[j]) >= pivotData); }//end for }//end skipNodes /* This function can be deleted for the final release */ void CountSkipNodes (const vector* > &lists, unsigned vectorIndexContainer[], unsigned containerSize, unsigned pivotData, unsigned pointersIndexList[], unsigned &elementScanned) { for(unsigned i=0;ibinarySearch(pivotData,oldPosition); elementScanned += (unsigned)ceil(0.01*log(lists.at(j)->size())); // cout<< "new data" << lists.at(j)->at( pointersIndexList[j]) << // " old data " << pivotData <at( pointersIndexList[j]) >= pivotData); }//end for }//end skipNodes unsigned max(unsigned s1, unsigned s2) { if (s1>s2) return s1; else return s2; }//end max /* This function selects the shorest string for computing the thresholds in filters. */ unsigned shorestStringSize(const vector &strings ) { unsigned minSize = ~0; for(unsigned i=0;i<(unsigned)strings.size();i++) { unsigned s = strings.at(i).length(); if (s < minSize) minSize = s; }//end for return minSize ; }//end /* This function is only for MergeSkip algorithm. Use MergeSkip algorithm to process short lists in DivideSkip algorithm. */ void mergeSkipShortLists(const vector*> &arrays, const unsigned threshold, vector &results, vector &counters) { //const unsigned maxUnsigned = 0x10000000 - 2; const unsigned maxUnsigned = 0xFFFFFFFF; unsigned numberOfInvertedList = arrays.size(); if (threshold>numberOfInvertedList) return; // no answer unsigned pointersIndex [numberOfInvertedList]; for(unsigned k=0;k= threshold) // we got the result { counters.push_back(containerSize); results.push_back(minData); //cout<< "We get a result, rule ID is " << minData <<", count is " << containerSize <*> &arrays, const vector &weights, const unsigned threshold, vector &results, vector &counters) { const unsigned maxUnsigned = 0xFFFFFFFF; unsigned numberOfInvertedList = arrays.size(); unsigned pointersIndex [numberOfInvertedList]; for(unsigned k=0;k= threshold) // we got the result { counters.push_back(containerWeight); results.push_back(minData); //cout<< "We get a result, rule ID is " << minData <<", count is " << containerSize < pivot) break; vectorIndexContainer[containerSize++] = indexHeap[0]; containerWeight += weights[indexHeap[0]]; heapDelete(dataHeap,indexHeap,sizeOfHeaps); }//end while (containerWeight < pivot ) //printArray( vectorIndexContainer,containerSize); // cout<< "Pivot node is " << dataHeap[0] << endl; skipNodes(arrays, vectorIndexContainer, containerSize, dataHeap[0], pointersIndex); //cout<<"After skip, current nodes are : " <*> &arrays, vector*> &newArrays, vector &newWeights) { unsigned sizeOfInvertedLists = arrays.size(); unsigned sortedIndex[sizeOfInvertedLists]; sortBySizeOfLists(arrays,sortedIndex);//increasing order Array *currentArray=arrays.at(sortedIndex[0]); unsigned currentCount = 1; for(unsigned i=1;i *iArray = arrays.at(sortedIndex[i]); if (iArray == currentArray) currentCount++; else { newArrays.push_back(currentArray); newWeights.push_back(currentCount); currentCount = 1; currentArray = iArray; }//end if }//end for newArrays.push_back(currentArray); newWeights.push_back(currentCount); }//end detectDuplicateLists */ // BUGFIX BY ALEX void detectDuplicateLists(const vector*> &arrays, vector*> &newArrays, vector &newWeights) { unsigned sizeOfInvertedLists = arrays.size(); unsigned sortedIndex[sizeOfInvertedLists]; sortBySizeOfLists(arrays,sortedIndex); // we need to take care of unequal lists with the same size // do exhaustive search within each list length group set arraysAdded; for(unsigned i=0;i *currentArray=arrays.at(sortedIndex[i]); unsigned arrAddr = (unsigned)currentArray; unsigned currentCount = 1; // if the array has not been added previously if(arraysAdded.find(arrAddr) == arraysAdded.end()) { // search all arrays with the same length for identical pointers for(unsigned j = i + 1; j < sizeOfInvertedLists; j++) { Array *iArray = arrays.at(sortedIndex[j]); if(iArray->size() != currentArray->size()) break; if(iArray == currentArray) currentCount++; } // add the array newArrays.push_back(currentArray); newWeights.push_back(currentCount); arraysAdded.insert(arrAddr); } } // ALEX DEBUG /* unsigned weightSum = 0; for(unsigned i = 0; i < newArrays.size(); i++) { Array* tmp = newArrays.at(i); bool contains = false; for(unsigned j = 0; j < tmp->size(); j++) { if(tmp->at(j) == 49) { contains = true; break; } } if(contains) { cout << "LIST: " << (unsigned)newArrays.at(i) << " " << newWeights.at(i) << endl; weightSum += newWeights.at(i); } } cout << "WEIGHTSUM: " << weightSum << endl; */ }