00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
// Forward declaration only: within this header CXMLElement is used solely as a
// `const CXMLElement&` constructor parameter of CAcInvertedFile, so the full
// definition is not required here.
// NOTE(review): this declaration sits before the include guard opens; that is
// harmless for a forward declaration, which may be repeated.
00026 class CXMLElement;
00027
00047 #ifndef _CINVERTEDFILEACCESSOR
00048 #define _CINVERTEDFILEACCESSOR
00049 #include "libGIFTAcInvertedFile/include/uses-declarations.h"
00050 #include <string>
00051 #include "libMRML/include/TID.h"
00052 #include "libMRML/include/CSelfDestroyPointer.h"
00053 #include "libMRML/include/CArraySelfDestroyPointer.h"
00054 #include "libGIFTAcInvertedFile/include/CDocumentFrequencyList.h"
00055 #include "CCollectionFrequencyList.h"
00056 #include "libGIFTAcInvertedFile/include/CADIHash.h"
00057 #include "libGIFTAcURL2FTS/include/CAcURL2FTS.h"
00058 #include <iostream>
00059 #include <fstream>
00060 #include <map>
00061 #include <vector>
// Container selection for the TID-keyed lookup tables declared further down.
// When HAS_HASH_MAP is defined, the <hash_map> header is pulled in.
00062 #ifdef HAS_HASH_MAP
00063 #include <hash_map>
00064 #else
// Fallback: from this point on the preprocessor replaces every `hash_map` token
// with `map`, so each `hash_map<K,V>` written below is compiled as `map<K,V>`
// (<map> is included above). Being a macro, the substitution also affects any
// code that includes this header afterwards.
00065 #define hash_map map
00066 #endif
00067 #include <functional>
00068 #include <algorithm>
00069
00070 #include "libMRML/include/CMagic.h"
00071
00072
// TFeatureID is a plain alias of TID (no distinct type is created). In this
// header it is used for the parameters of FeatureToList() and
// FeatureToCollectionFrequency(), while other members take TID directly.
00073 typedef TID TFeatureID ;
00074
// CAcInvertedFile: accessor class publicly derived from CAcURL2FTS.
//
// This header only declares the class; all member functions are defined
// elsewhere. The data members comprise three input streams with their file
// names (inverted file, offset file, feature description file), three lookup
// tables keyed by TID, and a CADIHash of per-document information.
//
// NOTE(review): unqualified names such as string, istream, ifstream, ostream
// and list are used below; presumably "uses-declarations.h" brings them into
// scope - confirm. No <list> include is visible in this header although
// getAllFeatureIDs() returns list<TID>*; presumably it arrives transitively.
// NOTE(review): the member descriptions below are inferred from names and
// types only; verify them against the implementation file.
00081 class CAcInvertedFile:public CAcURL2FTS{
00082
00083 protected:
// Presumably the largest feature ID present in the collection (see
// getMaximumFeatureID()) - confirm.
00085 TID mMaximumFeatureID;
// Self-destroying char array; presumably an in-memory copy of the inverted
// file contents - TODO confirm when it is filled.
00088 CArraySelfDestroyPointer<char> mInvertedFileBuffer;
// Owning pointer to the input stream of the inverted file. Declared mutable,
// so const member functions are allowed to modify it.
00090 mutable CSelfDestroyPointer<istream> mInvertedFile;
00091
// Input file stream for the offset file; mutable, so usable from const
// member functions.
00093 mutable ifstream mOffsetFile;
00094
// Input file stream for the feature description file (not mutable).
00096 ifstream mFeatureDescriptionFile;
00097
// File name belonging to mInvertedFile (by naming).
00099 string mInvertedFileName;
00100
// File name belonging to mOffsetFile (by naming).
00102 string mOffsetFileName;
00103
// File name belonging to mFeatureDescriptionFile (by naming).
00105 string mFeatureDescriptionFileName;
00106
// Table type mapping a TID to an unsigned int. By its name the value is an
// offset, presumably a position inside the inverted file - confirm.
// Compiled as map<...> when HAS_HASH_MAP is not defined.
00108 typedef hash_map<TID,unsigned int> CIDToOffset;
// The TID -> offset table instance.
00110 CIDToOffset mIDToOffset;
00111
// TID -> double table; by its name, the collection frequency per feature.
// Mutable, so const member functions may modify it.
00113 mutable hash_map<TID,double> mFeatureToCollectionFrequency;
00114
// TID -> unsigned int table; presumably the value returned by
// getFeatureDescription() for a feature - the meaning of the integer is not
// visible in this header.
00118 hash_map<TID,unsigned int> mFeatureDescription;
00119
// Per-document information store of project type CADIHash; presumably the
// source for the DIDTo...() accessors below - confirm.
00123 CADIHash mDocumentInformation;
00125
// Takes a feature ID, an integer position and an output stream passed by
// non-const reference. By its name it writes one entry of the offset file to
// that stream; the on-disk format is defined in the implementation.
00128 void writeOffsetFileElement(TID inFeatureID,
00129 int inPosition,
00130 ostream& inOpenOffsetFile);
// Const helper taking a file name by value and returning a raw
// CDocumentFrequencyList pointer.
// NOTE(review): ownership of the returned pointer is not stated here -
// confirm whether the caller must delete it.
00132 CDocumentFrequencyList* getFeatureFile(string inFileName)const;
00133 public:
// Nullary, const function-call operator returning bool; presumably a
// validity/"is usable" check of the accessor - confirm semantics.
00135 bool operator()()const;
00136
// Constructs the accessor from an XML element (passed by const reference);
// by the parameter name, the element describes a collection.
// NOTE(review): the constructor is not `explicit`, so a CXMLElement converts
// implicitly to CAcInvertedFile.
00151 CAcInvertedFile(const CXMLElement& inCollectionElement);
// Initialisation step returning bool. The meaning of the unnamed bool
// parameter is not visible in this header - see the definition.
00153 bool init(bool);
00154
// Destructor (not declared virtual here; whether it is virtual depends on
// the CAcURL2FTS base class, which is not shown).
00156 ~CAcInvertedFile();
00157
// Const lookup from a TID to a string; by name, document ID -> URL.
00159 string IDToURL(TID inID)const;
00160
// Const lookup from a string to a TID; by name, URL -> document ID.
00162 TID URLToID(const string& inURL)const;
00163
// The three functions below each return a raw CDocumentFrequencyList*.
// NOTE(review): ownership of the returned lists is not stated in this
// header - confirm whether callers are expected to delete them.
//
// Keyed by a feature ID (TFeatureID is an alias of TID).
00167 CDocumentFrequencyList* FeatureToList(TFeatureID)const;
00168
// Keyed by a URL string (taken by value).
00170 CDocumentFrequencyList* URLToFeatureList(string inURL)const;
00171
// Keyed by a TID; by the parameter name, a document ID.
00173 CDocumentFrequencyList* DIDToFeatureList(TID inDID)const;
00174
00176
00177
// Const accessor returning a double for a feature ID; presumably served from
// mFeatureToCollectionFrequency - confirm.
00181 double FeatureToCollectionFrequency(TFeatureID)const;
00182
// Const accessor returning an unsigned int for a feature ID; presumably
// served from mFeatureDescription - confirm.
00184 unsigned int getFeatureDescription(TID inFeatureID)const;
00186
// Per-document statistics: each takes a TID (by name, a document ID) and
// returns a double. The exact formulas are defined in the implementation;
// the names suggest the maximum document frequency, ...
00190 double DIDToMaxDocumentFrequency(TID)const;
00191
// ... the sum of squared document frequencies, ...
00193 double DIDToDFSquareSum(TID)const;
00194
// ... and a sum combining squared document frequency with a log inverse
// collection frequency term - TODO confirm against the definition.
00196 double DIDToSquareDFLogICFSum(TID)const;
00198
00199
00201
// Non-const operation returning bool; by name it builds the inverted file.
// Inputs and outputs are not visible in this header.
00209 bool generateInvertedFile();
00210
// Second generation routine with the same signature; how it differs from
// generateInvertedFile() is not visible here.
00218 bool newGenerateInvertedFile();
00219
// Non-const check returning bool; what is verified is defined in the
// implementation.
00222 bool checkConsistency();
00223
// Const query taking a feature ID, a document ID and a document frequency;
// by name it searches the stream for that combination and reports the
// outcome as bool - confirm exact matching rules in the definition.
00227 bool findWithinStream(TID inFeatureID,
00228 TID inDocumentID,
00229 double inDocumentFrequency)const;
00230
00232
// Const getter returning a TID; presumably the value of mMaximumFeatureID.
00234 TID getMaximumFeatureID()const;
// Returns a raw pointer to a list of TIDs; by name, all feature IDs known to
// this accessor.
// NOTE(review): ownership of the returned list is not stated here - confirm
// whether the caller must delete it.
00242 list<TID>* getAllFeatureIDs()const;
00243 };
00244
00245 #endif