00001
00002 #include <xapian.h>
00003 #include <ept/core/apt.h>
00004 #include <wibble/regexp.h>
00005 #include <wibble/sys/pipe.h>
00006 #include <wibble/sys/exec.h>
00007
00008 #ifndef EPT_XAPIAN_H
00009 #define EPT_XAPIAN_H
00010
00011 namespace ept {
00012 namespace core {
00013 namespace xapian {
00014
00015
00016 const Xapian::valueno VAL_APT_INSTALLED_SIZE = 1;
00017 const Xapian::valueno VAL_APT_PACKAGE_SIZE = 2;
00018 const Xapian::valueno VAL_POPCON = 10;
00019 const Xapian::valueno VAL_ITERATING_RATING = 20;
00020 const Xapian::valueno VAL_ITERATING_FUNCTIONALITY = 21;
00021 const Xapian::valueno VAL_ITERATING_USABILITY = 22;
00022 const Xapian::valueno VAL_ITERATING_SECURITY = 23;
00023 const Xapian::valueno VAL_ITERATING_PERFORMANCE = 24;
00024 const Xapian::valueno VAL_ITERATING_QUALITY = 25;
00025 const Xapian::valueno VAL_ITERATING_SUPPORT = 26;
00026 const Xapian::valueno VAL_ITERATING_ADOPTION = 27;
00027
00028 struct TagFilter : public Xapian::ExpandDecider
00029 {
00030 virtual bool operator()(const std::string &term) const {
00031 return term[0] == 'X' && term[1] == 'T';
00032 }
00033 };
00034
00035 struct List {
00036 char m_enqPlace[sizeof(Xapian::Enquire)];
00037 mutable Xapian::MSet m_matches;
00038 mutable Xapian::MSet::const_iterator m_iter;
00039 mutable int m_pos;
00040 typedef List Type;
00041
00042 static const size_t chunkSize = 20;
00043
00044 List head() const {
00045 seek();
00046 return *this;
00047 }
00048
00049 Token token() const {
00050 Token t;
00051 t._id = m_iter.get_document().get_data();
00052 return t;
00053 }
00054
00055 bool operator<( const List &o ) const {
00056 return token() < o.token();
00057 }
00058
00059 void seek() const {
00060 if ( m_matches.size() == chunkSize && m_iter == m_matches.end() ) {
00061 m_matches = enq().get_mset( m_pos, chunkSize );
00062 m_iter = m_matches.begin();
00063 m_pos += chunkSize;
00064 }
00065 }
00066
00067 bool empty() const {
00068 if ( m_pos == -1 )
00069 return true;
00070 seek();
00071 return m_matches.size() < 30 && m_iter == m_matches.end();
00072 }
00073
00074 List tail() const {
00075 List t = *this;
00076 t.seek();
00077 t.m_iter ++;
00078 return t;
00079 }
00080
00081 Xapian::Enquire const &enq() const {
00082 return *reinterpret_cast< Xapian::Enquire const * >( m_enqPlace );
00083 }
00084
00085 List( Xapian::Enquire _enq )
00086 {
00087 Xapian::Enquire *e = new (m_enqPlace) Xapian::Enquire( _enq );
00088 assert_eq( e, &enq() );
00089 m_matches = enq().get_mset( 0, chunkSize );
00090 m_iter = m_matches.begin();
00091 m_pos = chunkSize;
00092 }
00093
00094 List() {}
00095 };
00096
00097 struct Query {
00098 Xapian::Database *m_db;
00099 Xapian::Enquire m_enq;
00100 Xapian::Stem m_stem;
00101 typedef std::set< std::string > Terms;
00102 Terms m_include, m_exclude, m_secondary;
00103 int m_cutoff;
00104 bool m_expand;
00105
00106 void setQualityCutoff( int c ) {
00107 m_cutoff = c;
00108 }
00109
00110 void setExpand( bool e ) { m_expand = e; }
00111
00112 Query( Xapian::Database &e ) : m_db( &e ), m_enq( e ) {
00113 m_cutoff = 50;
00114 m_expand = true;
00115 }
00116
00117 wibble::Tokenizer queryTokenizer( std::string q ) const {
00118 return wibble::Tokenizer( q, "[A-Za-z0-9._+:-]+", REG_EXTENDED );
00119 }
00120
00121 template< typename Out >
00122 void tokenizeQuery( std::string q, Out o ) const
00123 {
00124 wibble::Tokenizer tok = queryTokenizer( q );
00125 for (wibble::Tokenizer::const_iterator i = tok.begin(); i != tok.end(); ++i )
00126 {
00127 if ( (*i).find( "::" ) != std::string::npos ) {
00128 *o++ = ("XT" + *i);
00129 } else {
00130 std::string t = wibble::str::tolower(*i);
00131 std::string s = m_stem(t);
00132 *o++ = t;
00133 if (s != t)
00134 *o++ = ("Z" + s);
00135 }
00136 }
00137 }
00138
00139 template< typename Out >
00140 void expand( Out o ) const
00141 {
00142 Xapian::RSet rset;
00143
00144 Xapian::MSet mset = m_enq.get_mset(0, 5);
00145 for ( Xapian::MSet::iterator i = mset.begin(); i != mset.end(); ++i )
00146 rset.add_document(i);
00147
00148 TagFilter tagf;
00149 Xapian::ESet eset = m_enq.get_eset(5, rset, &tagf);
00150 for ( Xapian::ESetIterator i = eset.begin(); i != eset.end(); ++i )
00151 *o++ = *i;
00152 }
00153
00154 void updateEnquire() {
00155
00156 Xapian::Query inc( Xapian::Query::OP_OR,
00157 m_include.begin(),
00158 m_include.end() ),
00159 exc( Xapian::Query::OP_OR,
00160 m_exclude.begin(),
00161 m_exclude.end() ),
00162 secondary( Xapian::Query::OP_OR,
00163 m_secondary.begin(),
00164 m_secondary.end() ),
00165 secondary1( Xapian::Query::OP_SCALE_WEIGHT, secondary, 0.02 ),
00166 query1( Xapian::Query::OP_AND_NOT, inc, exc ),
00167 query( Xapian::Query::OP_OR, query1, secondary1 );
00168
00169 m_enq.set_query( query );
00170
00171 if ( m_expand ) {
00172 m_expand = false;
00173 expand( std::inserter( m_include, m_include.begin() ) );
00174 updateEnquire();
00175 m_expand = true;
00176 return;
00177 }
00178
00179 Xapian::MSet first = m_enq.get_mset(0, 1, 0, 0, 0);
00180 Xapian::MSetIterator ifirst = first.begin();
00181 if ( ifirst != first.end() ) {
00182
00183
00184 }
00185 }
00186
00187 List results() {
00188 updateEnquire();
00189 return List( m_enq );
00190 }
00191
00192 std::map< std::string, int > relevantTags( int n = 30 ) {
00193 updateEnquire();
00194 std::map< std::string, int > relev;
00195 Xapian::RSet rset;
00196 Xapian::MSet mset = m_enq.get_mset(0, 100);
00197 for ( Xapian::MSet::iterator i = mset.begin(); i != mset.end(); ++i )
00198 rset.add_document(i);
00199
00200 TagFilter tagf;
00201 Xapian::ESet eset = m_enq.get_eset(n, rset, &tagf);
00202 for ( Xapian::ESetIterator i = eset.begin(); i != eset.end(); ++i )
00203 relev.insert( relev.begin(),
00204 std::make_pair(
00205 std::string( *i, 2, std::string::npos ),
00206 i.get_weight() ) );
00207 return relev;
00208 }
00209
00210 void addTerms( std::string t, bool partial = false, bool exclude = false ) {
00211 if ( t.empty() )
00212 return;
00213 Terms &to = exclude ? m_exclude : m_include;
00214 std::vector< std::string > tok;
00215 tokenizeQuery( t, std::back_inserter( tok ) );
00216 if ( partial ) {
00217 if ( tok.back().size() == 1 ) {
00218 tok.pop_back();
00219 } else {
00220 std::copy(
00221 m_db->allterms_begin( tok.back() ),
00222 m_db->allterms_end( tok.back() ),
00223 std::back_inserter( tok ) );
00224 }
00225 }
00226 std::copy( tok.begin(), tok.end(), std::inserter( to, to.begin() ) );
00227 }
00228
00229 void addTerms( const Terms &t, bool exclude = false ) {
00230 Terms &to = exclude ? m_exclude : m_include;
00231 std::copy( t.begin(), t.end(), std::inserter( to, to.begin() ) );
00232 }
00233
00234 void addSecondaryTerm( const std::string &term, bool partial = false ) {
00235 if ( partial ) {
00236 std::copy(
00237 m_db->allterms_begin( term ),
00238 m_db->allterms_end( term ),
00239 std::inserter( m_secondary, m_secondary.begin() ) );
00240 } else {
00241 m_include.insert( m_secondary.begin(), term );
00242 }
00243 }
00244
00245 };
00246
00247 struct Source
00248 {
00249 protected:
00250 mutable Xapian::Database m_db;
00251 Xapian::Stem m_stem;
00252 mutable bool m_opened;
00253
00255 static std::string toLower(const std::string& str);
00256
00263 void normalize_and_add(Xapian::Document& doc, const std::string& term,
00264 int& pos) const;
00265
00266 public:
00267 Source();
00268
00270 Xapian::Database& db() {
00271 open();
00272 return m_db;
00273 }
00274
00276 const Xapian::Database& db() const {
00277 open();
00278 return m_db;
00279 }
00280
00281 void open() const;
00282 void invalidate() {
00283 m_db = Xapian::Database();
00284 m_opened = false;
00285 }
00286
00288 time_t timestamp() const;
00289
00290 void updateLeniently( AptDatabase &apt, OpProgress *op = 0 ) {
00291 if (apt.timestamp() - timestamp() > 86400 * 8)
00292 update( op );
00293 }
00294
00295 void update( OpProgress *op = 0 ) {
00296 if ( !op )
00297 op = new OpProgress();
00298
00299 wibble::exception::AddContext _ctx( "Rebuilding Xapian database." );
00300 int outfd;
00301 std::string op_str;
00302
00303 wibble::sys::Exec ex( "update-apt-xapian-index" );
00304 ex.args.push_back( "--batch-mode" );
00305 ex.searchInPath = true;
00306 ex.forkAndRedirect( 0, &outfd, 0 );
00307
00308 wibble::sys::Pipe monit( outfd );
00309 while ( !monit.eof() ) {
00310 std::string line = monit.nextLine();
00311 if ( line.empty() ) {
00312 usleep( 100000 );
00313 continue;
00314 }
00315 std::cerr << "got : " << line << std::endl;
00316 if ( wibble::str::startsWith( line, "begin: " ) ) {
00317 op_str = std::string( line, 7, std::string::npos );
00318 op->OverallProgress( 0, 100, 100, op_str );
00319
00320 } else if ( wibble::str::startsWith( line, "done: " ) ) {
00321 op->Done();
00322 } else if ( wibble::str::startsWith( line, "progress: " ) ) {
00323 wibble::ERegexp re( "progress: ([0-9]+)/([0-9]+)", 3 );
00324 if ( re.match( line ) ) {
00325 assert_eq( re[2], "100" );
00326 op->OverallProgress( atoi( re[1].c_str() ), 100, 100, op_str );
00327 }
00328 }
00329 }
00330 ex.waitForSuccess();
00331 invalidate();
00332 }
00333
00335 bool hasData() const { return timestamp() > 0; }
00336
00337 Query query( const std::string &s,
00338 bool expand = true,
00339 int qualityCutoff = 50 )
00340 {
00341 Query q( db() );
00342 q.setQualityCutoff( qualityCutoff );
00343 q.setExpand( expand );
00344 q.addTerms( s );
00345 if ( s.length() > 2 )
00346 q.addSecondaryTerm( "XP" + s, true );
00347 return q;
00348 }
00349
00350 Query partialQuery( const std::string &s ) {
00351 Query q( db() );
00352 q.addTerms( s, true );
00353 return q;
00354 }
00355
00357
00358
00359 Xapian::docid docidByName(const std::string& pkgname) const;
00360
00364 Xapian::Query makeORQuery(const std::string& keywords) const;
00365
00372 Xapian::Query makePartialORQuery(const std::string& keywords) const;
00373
00377 template<typename ITER>
00378 Xapian::Query makeORQuery(const ITER& begin, const ITER& end) const
00379 {
00380 return Xapian::Query(Xapian::Query::OP_OR, begin, end);
00381 }
00382
00384 std::vector<std::string> expand(Xapian::Enquire& enq) const;
00385
00386
00387
00391 Xapian::Query makeRelatedQuery(const std::string& pkgname) const;
00392
00396 double getDoubleValue(const std::string& pkgname,
00397 Xapian::valueno val_id) const;
00398
00402 int getIntValue(const std::string& pkgname, Xapian::valueno val_id) const;
00403 };
00404
00405 }
00406 }
00407 }
00408
00409 #endif