vocabularymerger.h

Go to the documentation of this file.
00001 /*
00002  * Merge different vocabularies together and create the tag and facet indexes
00003  *
00004  * Copyright (C) 2003-2007  Enrico Zini <enrico@debian.org>
00005  *
00006  * This program is free software; you can redistribute it and/or modify
00007  * it under the terms of the GNU General Public License as published by
00008  * the Free Software Foundation; either version 2 of the License, or
00009  * (at your option) any later version.
00010  *
00011  * This program is distributed in the hope that it will be useful,
00012  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00013  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014  * GNU General Public License for more details.
00015  *
00016  * You should have received a copy of the GNU General Public License
00017  * along with this program; if not, write to the Free Software
00018  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
00019  */
00020 
00021 #include <tagcoll/diskindex/mmap.h>
00022 #include <tagcoll/input/base.h>
00023 #include <string>
00024 #include <map>
00025 #include <set>
00026 
00027 #ifndef EPT_DEBTAGS_VOCABULARYMERGER_H
00028 #define EPT_DEBTAGS_VOCABULARYMERGER_H
00029 
00030 namespace ept {
00031 namespace debtags {
00032 
00033 class VocabularyMerger
00034 {
00035 protected:
00036     class FacetIndexer : public tagcoll::diskindex::MMapIndexer
00037     {
00038     protected:
00039         VocabularyMerger& vm;
00040     public:
00041         FacetIndexer(VocabularyMerger& vm) : vm(vm) {}
00042         virtual ~FacetIndexer() {}
00043         virtual int encodedSize() const;
00044         virtual void encode(char* buf) const;
00045     };
00046     class TagIndexer : public tagcoll::diskindex::MMapIndexer
00047     {
00048     protected:
00049         VocabularyMerger& vm;
00050     public:
00051         TagIndexer(VocabularyMerger& vm) : vm(vm) {}
00052         virtual ~TagIndexer() {}
00053         virtual int encodedSize() const;
00054         virtual void encode(char* buf) const;
00055     };
00056     class TagData : public std::map<std::string, std::string>
00057     {
00058     public:
00059         std::string name;
00060         // Offset in the last written file (used for indexing)
00061         long ofs;
00062         int len;
00063         int id;
00064 
00065         TagData() : ofs(0), len(0) {}
00066     };
00067     class FacetData : public std::map<std::string, std::string>
00068     {
00069     public:
00070         std::string name;
00071         std::map<std::string, TagData> tags;
00072         // Offset in the last written file (used for indexing)
00073         long ofs;
00074         int len;
00075         int id;
00076 
00077         FacetData() : ofs(0), len(0) {}
00078 
00079         TagData& obtainTag(const std::string& fullname);
00080     };
00081     std::map<std::string, FacetData> facets;
00082     int tagCount;
00083     FacetIndexer findexer;
00084     TagIndexer tindexer;
00085     
00086     FacetData& obtainFacet(const std::string& name);
00087     TagData& obtainTag(const std::string& fullname);
00088     
00089 public:
00090     VocabularyMerger() : tagCount(0), findexer(*this), tindexer(*this) {}
00091 
00095     bool empty() const { return facets.empty(); }
00096 
00101     void read(tagcoll::input::Input& input);
00102 
00106     void write(const std::string& fname);
00107 
00111     void write(FILE* out);
00112 
00119     const tagcoll::diskindex::MMapIndexer& facetIndexer() const { return findexer; }
00120 
00127     const tagcoll::diskindex::MMapIndexer& tagIndexer() const { return tindexer; }
00128 
00132     bool hasFacet(const std::string& name) const
00133     {
00134         return facets.find(name) != facets.end();
00135     }
00136 
00140     bool hasTag(const std::string& fullname) const;
00141 
00145     int tagID(const std::string& fullname) const;
00146 
00150     std::set<std::string> tagNames() const;
00151 };
00152 
00153 }
00154 }
00155 
00156 // vim:set ts=4 sw=4:
00157 #endif

Generated on Fri Sep 14 23:13:18 2007 for libept by  doxygen 1.5.3