textsearch.h

Go to the documentation of this file.
00001 #ifndef EPT_TEXTSEARCH_TEXTSEARCH_H
00002 #define EPT_TEXTSEARCH_TEXTSEARCH_H
00003 
00009 /*
00010  * Copyright (C) 2007  Enrico Zini <enrico@debian.org>
00011  *
00012  * This program is free software; you can redistribute it and/or modify
00013  * it under the terms of the GNU General Public License as published by
00014  * the Free Software Foundation; either version 2 of the License, or
00015  * (at your option) any later version.
00016  *
00017  * This program is distributed in the hope that it will be useful,
00018  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00019  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00020  * GNU General Public License for more details.
00021  *
00022  * You should have received a copy of the GNU General Public License
00023  * along with this program; if not, write to the Free Software
00024  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
00025  */
00026 
00027 #include <xapian.h>
00028 #include <vector>
00029 #include <string>
00030 
00031 namespace ept {
00032 namespace apt {
      // Forward declaration only: in this header Apt appears solely as a
      // reference parameter type (apt::Apt&), so the full definition is not needed.
00033 class Apt;
00034 }
00035 namespace debtags {
      // Forward declaration only: in this header Debtags appears solely as a
      // const reference parameter type (const debtags::Debtags&).
00036 class Debtags;
00037 }
00038 namespace textsearch {
00039 
00040 /*
00041 Fallback on apt scan searches when index is not present
00042 
00043 Explicitly decide at instantiation (or at any other time) if a rebuild should
00044 be performed.  Just adding a 'rebuildIfNeeded' method would be enough.
00045 
00046 17:14 #xapian < enrico> Hello.  I'm finally in a position of writing a library to maintain
00047                         a xapian index with Debian package descriptions in a Debian system
00048 17:14 #xapian < enrico> I have a question, though
00049 17:14 #xapian < enrico> The descriptions change regularly as people run 'apt-get update'
00050 17:15 #xapian < enrico> I'd need to have a way to update the description index after
00051                         apt-get update, without rebuilding it from scratch
00052 17:15 #xapian < enrico> Is there some documentation on how to do that?  I can't exactly
00053                         tell Xapian "the new description for package foo is this" because
00054                         I'd need the xapian id
00055 19:11 #xapian < omega> you can add a unique term with a boolean prefix?
00056 19:11 #xapian < omega> like Qpackage-name
00057 19:11 #xapian < omega> then you search for it and replace_document
00058 19:24 #xapian < richardb> Or indeed, you use the "replace_document()" form which takes a
00059                           unique_id term.
00060 19:25 #xapian < richardb>         Xapian::docid replace_document(const std::string &
00061                           unique_term,
00062 19:25 #xapian < richardb>                                        const Xapian::Document &
00063                           document);
00064 19:43 #xapian < enrico> unique term
00065 19:43 #xapian < enrico> nice!
00066 19:44 #xapian < enrico> can I use a non-alpha prefix, like :package-name ?
00067 19:45 #xapian < enrico> or pkg:package-name
00068 19:45 #xapian < enrico> I suppose I can
00069 */
00070 
00085 class TextSearch
00086 {
          // Wraps a Xapian::Database together with a stemmer and a timestamp, and
          // offers helpers that build Xapian::Query objects from keywords or
          // package names.  Only the accessors and the iterator-based makeORQuery()
          // are defined in this header; everything else is defined elsewhere, so
          // the notes on those members are limited to what the declarations show.
00087 protected:
          /// Value returned by timestamp(); hasData() treats any value > 0 as
          /// "data is present".  NOTE(review): presumably the timestamp of the
          /// on-disk index -- confirm against the implementation.
00088     time_t m_timestamp;
          /// Xapian database handle exposed through db().
00089     Xapian::Database m_db;
          /// Stemmer applied to lowercased terms in the templated makeORQuery().
00090     Xapian::Stem m_stem;
00091 
          /// Returns a string derived from @p str; makeORQuery() below uses the
          /// result as the lowercased form of a term.  Defined elsewhere.
00093     static std::string toLower(const std::string& str);
00094 
          /// Defined elsewhere.  Takes the document and the position counter by
          /// non-const reference.  NOTE(review): presumably adds the normalised
          /// @p term to @p doc and advances @p pos -- confirm in the implementation.
00101     void normalize_and_add(Xapian::Document& doc, const std::string& term, int& pos) const;
00102 
00103 public:
          /// Default constructor; defined elsewhere.
00104     TextSearch();
00105 
          /// Mutable access to the underlying Xapian database.
00107     Xapian::Database& db() { return m_db; }
00108 
          /// Read-only access to the underlying Xapian database.
00110     const Xapian::Database& db() const { return m_db; }
00111 
          /// Returns the stored timestamp (m_timestamp).
00113     time_t timestamp() const { return m_timestamp; }
00114 
          /// True when the stored timestamp is greater than zero.
00116     bool hasData() const { return m_timestamp > 0; }
00117 
          /// Defined elsewhere.  NOTE(review): presumably reports whether the index
          /// is out of date with respect to @p apt -- confirm.
00119     bool needsRebuild(apt::Apt& apt);
00120 
          /// Defined elsewhere.  The design notes above this class ask for a
          /// 'rebuildIfNeeded' method so callers can decide explicitly when a
          /// rebuild happens.  NOTE(review): meaning of the bool result is not
          /// visible here -- confirm.
00126     bool rebuildIfNeeded(apt::Apt& apt);
00127 
          /// Overload that additionally receives a Debtags object; defined
          /// elsewhere.  NOTE(review): how @p debtags is used is not visible here.
00135     bool rebuildIfNeeded(apt::Apt& apt, const debtags::Debtags& debtags);
00136 
          /// Defined elsewhere.  NOTE(review): presumably looks up the Xapian
          /// document id for the package called @p pkgname -- confirm, including
          /// what is returned when the package is not found.
00140     Xapian::docid docidByName(const std::string& pkgname) const;
00141 
          /// Defined elsewhere.  NOTE(review): presumably tokenises @p keywords and
          /// builds an OR query like the iterator overload below -- confirm.
00145     Xapian::Query makeORQuery(const std::string& keywords) const;
00146 
          /// Defined elsewhere.  NOTE(review): how "partial" differs from
          /// makeORQuery() is not visible in this header -- confirm.
00153     Xapian::Query makePartialORQuery(const std::string& keywords) const;
00154 
          /// Builds an OP_OR query from the terms in [begin, end).
          ///
          /// Each element is passed to toLower(), so *ITER must be usable as a
          /// const std::string&.  For every term the lowercased form is always
          /// added, and the stemmed form is added as well when stemming changed it.
          /// The collected terms are then handed to the Xapian::Query iterator
          /// constructor.  An empty range produces an empty term list; what Xapian
          /// returns for that is not shown here.
00158     template<typename ITER>
00159     Xapian::Query makeORQuery(const ITER& begin, const ITER& end) const
00160     {
00161         std::vector<std::string> terms;
00162         // Insert both the lowercased and the stemmed lowercased query terms
00163         for (ITER i = begin; i != end; ++i)
00164         {
00165             std::string t = toLower(*i);
00166             std::string s = m_stem(t);
00167             terms.push_back(t);
                  // Skip the stem when it equals the plain term, to avoid a duplicate.
00168             if (s != t)
00169                 terms.push_back(s);
00170         }
00171         return Xapian::Query(Xapian::Query::OP_OR, terms.begin(), terms.end());
00172     }
00173 
          /// Defined elsewhere.  Returns a list of strings computed from @p enq.
          /// NOTE(review): presumably suggested expansion terms for the enquiry --
          /// confirm.
00175     std::vector<std::string> expand(Xapian::Enquire& enq) const;
00176 
00177 //  std::vector<std::string> similar(const std::string& pkg);
00178 
          /// Defined elsewhere.  NOTE(review): presumably builds a query for
          /// packages related to @p pkgname -- confirm.
00182     Xapian::Query makeRelatedQuery(const std::string& pkgname) const;
00183 };
00184 
00185 }
00186 }
00187 
00188 // vim:set ts=4 sw=4:
00189 #endif

Generated on Fri Sep 14 23:13:18 2007 for libept by  doxygen 1.5.3