// // $Id: stringmap.cpp 1106 2007-04-16 23:20:29Z chenli $ // // stringmap.cpp // // Copyright (C) 2003 - 2007 by The Regents of the University of // California // // Redistribution of this file is permitted under the terms of the GNU // Public License (GPL). // // Date: March 2002 // // Authors: // Chen Li (chenli@ics.uci.edu) // Liang Jin (liangj@ics.uci.edu) // #include #include #include #include #include "rtreeparams.h" #include "editdistance.h" #include "index.h" #include "mapper.h" #include "stringmap.h" using namespace std; #define DUMMYMAXNUM (1000000) //------------------------------------------------------------ // Constructor //------------------------------------------------------------ StringMap::StringMap(vector &allStrings) { mapper = new Mapper(allStrings, NUMDIMS); rtreeSearch = NULL; } StringMap::StringMap(const string mapperFileName) { mapper = new Mapper(mapperFileName); rtreeSearch = NULL; } vector StringMap::getStrings() { return mapper->getStrings(); } //------------------------------------------------------------ // Destructor //------------------------------------------------------------ StringMap::~StringMap() { if (rtreeSearch != NULL) delete rtreeSearch; if (mapper != NULL) delete mapper; } void StringMap::Map() { cout << "Mapper.map() ... " << endl << endl; mapper->map(); } // save data to a file void StringMap::saveMapperToFile(const string mapperFileName) { mapper->saveToFile(mapperFileName); } float StringMap::getNewThreshold(const vector &stringVector, const double percentage, const int edThreshold) { return getNewThreshold(stringVector, percentage, stringVector, percentage, edThreshold); } float StringMap::getNewThreshold(const vector &stringVector1, const double percentage1, const vector &stringVector2, const double percentage2, const int edThreshold) { cout << "StringMap.getNewThreshold() ... " << endl << endl; float newThreshold = mapper->getNewThreshold(stringVector1, percentage1, stringVector2, percentage2, edThreshold); return newThreshold; } int StringMap::Join(const vector &stringVector1, const vector &stringVector2, const int edThreshold, float newThreshold, const string resultFileName, const bool countingDuplicates) { cout << "Constructing two R-trees ...." << endl << endl; Rtree *rtree1 = new Rtree; ConstructRTree(stringVector1, rtree1, "rtree1.bin"); Rtree *rtree2 = new Rtree; ConstructRTree(stringVector2, rtree2, "rtree2.bin"); cout << "Querying the R-trees ...." << endl << endl; int numberFoundPairs = RTreeJoin(stringVector1, stringVector2, edThreshold, rtree1, rtree2, newThreshold, countingDuplicates, resultFileName ); cout << "Free space ... " << endl; delete rtree2; delete rtree1; cout << "Done. Results are in result.txt." << endl << endl; return numberFoundPairs; } void StringMap::ConstructRect(string s, Rect &rect) { Coordinates coordinates = mapper->mappedCoordinates(s); //cout << "string " << id //<< " (" << stringVector[id] << ") is mapped to:\t"; for (register int i = 0; i < NUMDIMS; i++) { rect.boundary[i] = coordinates[i]; rect.boundary[i + NUMDIMS] = coordinates[i]; //cout << rect.boundary[i] << "\t"; } } void StringMap::ConstructRTree(const vector &stringVector, Rtree *&rtree, string persistentFile) { for (register int id = 0; id < stringVector.size(); id ++) { Rect rect; ConstructRect(stringVector[id], rect); //for (register int i = 0; i < NUMDIMS * 2; i++) //cout << rect.boundary[i] << "\t"; //cout << endl; // for some reason the R-tree implementation doesn't like an ID of // 0, so we have to replace 0 with a dummy maximual number int tid = id; if (id == 0) tid = DUMMYMAXNUM; rtree->RTreeInsertRect(&rect, tid); } // for some reason, this R-tree implementation requires us to dump the R-tree // file and then reload it from disk. Let's do it to make it happy. rtree->RTreeDump((char *)persistentFile.c_str()); // now, make the tree persistent delete rtree; rtree = new Rtree; rtree->RTreeLoad((char *)persistentFile.c_str()); } // ------------------------------------------------------------ // Return the # of similar pairs // ------------------------------------------------------------ int StringMap::RTreeJoin(const vector &stringVector1, const vector &stringVector2, const int edThreshold, Rtree *rtree1, Rtree *rtree2, float newThreshold, const bool countingDuplicates, const string resultFileName) { DistanceFunction df; join_pair_result join_result; int numberFoundPairs = 0; // parameters for collecting performance info: long disk_acc1[10], disk_acc2[10], cpu_comps[10], total_cpus; float distance = 0.0; int result=1; //float delta = DELTA; // maximum mapped distance between pairs // open the join rtree1->SearchJoinOpen(rtree2, &df, &newThreshold, disk_acc1, 10, disk_acc2, 10, cpu_comps, 10, &total_cpus); int falsepositive=0; ofstream out; out.open(resultFileName.c_str()); while (rtree1->SearchJoinNext(distance, join_result) == 1) { int id1 = join_result.first.the_id; int id2 = join_result.second.the_id; // we have to convert a dummy maximal number back to 0 since we have // replaced id 0 with this dummy number when constructing the R-trees if( id1 == DUMMYMAXNUM ) id1 = 0; if( id2 == DUMMYMAXNUM ) id2 = 0; /*int eddist = ed(stringVector1[id1], stringVector2[id2]); if (eddist <= edThreshold && (countingDuplicates || eddist > 0)) // check if need to ignore those identical-string pairs*/ if (SimilarString(stringVector1[id1], stringVector2[id2], edThreshold) && (countingDuplicates || stringVector1[id1] != stringVector2[id2])) // check if need to ignore those identical-string pairs { out << stringVector1[id1] << endl; out << stringVector2[id2] << endl << endl; numberFoundPairs ++; double newDist = mapper->mappedDistance(stringVector1[id1], stringVector2[id2]); //cout << "Found: [" << stringVector1[id1] << "],[" //<< stringVector2[id2] << "] = " << newDist << endl; } else falsepositive++; } out.close(); rtree1->SearchJoinClose(); // cleanup /*out.open(ERRORFILE); out< &stringVector1, const vector &stringVector2, int edThreshold, bool countingDuplicates) { int count = 0; cout << "Computing # of similar pairs. Be patient, since it may take O(n^2) time ... " << endl << endl; for (int i = 0; i < stringVector1.size(); i ++) for (int j = 0; j < stringVector2.size(); j ++) { /*int eddist = ed(stringVector1[i], stringVector2[j]); if (eddist <= edThreshold && (countingDuplicates || eddist > 0)) // check if need to ignore those identical-string pairs*/ if (SimilarString(stringVector1[i], stringVector2[j], edThreshold) && (countingDuplicates || stringVector1[i] != stringVector2[j])) // check if need to ignore those identical-string pairs { //cout << stringVector1[i] << " || " << stringVector2[j] << endl; count ++; } } return count; } void StringMap::ConstructSearchRtree(const vector &searchStrings, string rtreeFileName) { rtreeSearch = new Rtree; ConstructRTree(searchStrings, rtreeSearch, rtreeFileName); this->searchStrings = &searchStrings; } void StringMap::LoadSearchRTree(const vector &searchStrings, string rtreeFileName) { rtreeSearch = new Rtree; rtreeSearch->RTreeLoad((char *)rtreeFileName.c_str()); this->searchStrings = &searchStrings; } int StringMap::Search(const string queryString, int edThreshold, float newThreshold, bool countingDuplicates) { if (rtreeSearch == NULL) { cout << "You need to call ConstructSearchRtree() first to build " << "the RTree before you can do a search." << endl; return 0; } Coordinates coordinates = mapper->mappedCoordinates(queryString); vector answers; struct Point queryPoint; for (register int i = 0; i < NUMDIMS; i++) queryPoint.position[i]= coordinates[i]; rtreeSearch->RTreeSearchSphere(&queryPoint, newThreshold, answers); /*// use the old rectanngle search Rect rect; for (register int i = 0; i < NUMDIMS; i++) { rect.boundary[i] = coordinates[i] - newThreshold; rect.boundary[i + NUMDIMS] = coordinates[i] + newThreshold; } rtreeSearch->RTreeSearch(&rect, answers);*/ int count = 0; for (register int i = 0; i < answers.size(); i ++) { int id = answers[i]; // we have to convert a dummy maximal number back to 0 since we have // replaced id 0 with this dummy number when constructing the R-tree if( id == DUMMYMAXNUM ) id = 0; /*int eddist = ed(queryString, (*searchStrings)[id]); if (eddist <= edThreshold && (countingDuplicates || eddist > 0)) // check if need to ignore those identical-string pairs*/ if (SimilarString(queryString, (*searchStrings)[id], edThreshold) && (countingDuplicates || queryString != (*searchStrings)[id])) // check if need to ignore those identical-string pairs { //cout << "StringMap found:\t [" << (*searchStrings)[id] << "]" << endl; count ++; } } return count; } // compute the real number of similar pairs int StringMap::SimilarStrings(const string queryString, const vector &stringVector, const int edThreshold, bool countingDuplicates) { int count = 0; for (int i = 0; i < stringVector.size(); i ++) { int eddist = ed(queryString, stringVector[i]); if (eddist <= edThreshold && (countingDuplicates || eddist > 0)) // check if need to ignore those identical-string pairs { //cout << "Similar String:\t [" << stringVector[i] << "]" << endl; count ++; } } return count; }