1 : /** \file enquire.h
2 : * \brief API for running queries
3 : */
4 : /* Copyright 1999,2000,2001 BrightStation PLC
5 : * Copyright 2001,2002 Ananova Ltd
6 : * Copyright 2002,2003,2004,2005,2006,2007 Olly Betts
7 : *
8 : * This program is free software; you can redistribute it and/or
9 : * modify it under the terms of the GNU General Public License as
10 : * published by the Free Software Foundation; either version 2 of the
11 : * License, or (at your option) any later version.
12 : *
13 : * This program is distributed in the hope that it will be useful,
14 : * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 : * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 : * GNU General Public License for more details.
17 : *
18 : * You should have received a copy of the GNU General Public License
19 : * along with this program; if not, write to the Free Software
20 : * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
21 : * USA
22 : */
23 :
24 : #ifndef XAPIAN_INCLUDED_ENQUIRE_H
25 : #define XAPIAN_INCLUDED_ENQUIRE_H
26 :
27 : #include <string>
28 :
29 : #include <xapian/base.h>
30 : #include <xapian/deprecated.h>
31 : #include <xapian/sorter.h>
32 : #include <xapian/types.h>
33 : #include <xapian/termiterator.h>
34 : #include <xapian/visibility.h>
35 :
36 : namespace Xapian {
37 :
38 : class Database;
39 : class Document;
40 : class ErrorHandler;
41 : class ExpandDecider;
42 : class MSetIterator;
43 : class Query;
44 : class Weight;
45 :
46 : /** A match set (MSet).
47 : * This class represents (a portion of) the results of a query.
48 : */
49 : class XAPIAN_VISIBILITY_DEFAULT MSet {
50 : public:
51 : class Internal;
52 : /// @internal Reference counted internals.
53 : Xapian::Internal::RefCntPtr<Internal> internal;
54 :
55 : /// @internal Constructor for internal use.
56 : explicit MSet(MSet::Internal * internal_);
57 :
58 : /// Create an empty Xapian::MSet.
59 : MSet();
60 :
61 : /// Destroy a Xapian::MSet.
62 : ~MSet();
63 :
64 : /// Copying is allowed (and is cheap).
65 : MSet(const MSet & other);
66 :
67 : /// Assignment is allowed (and is cheap).
68 : void operator=(const MSet &other);
69 :
70 : /** Fetch the document info for a set of items in the MSet.
71 : *
72 : * This method causes the documents in the range specified by the
73 : * iterators to be fetched from the database, and cached in the
74 : * Xapian::MSet object. This has little effect when performing a
75 : * search across a local database, but will greatly speed up
76 : * subsequent access to the document contents when the documents are
77 : * stored in a remote database.
78 : *
79 : * The iterators must be over this Xapian::MSet - undefined behaviour
80 : * will result otherwise.
81 : *
82 : * @param begin MSetIterator for first item to fetch.
83 : * @param end MSetIterator for item after last item to fetch.
84 : */
85 : void fetch(const MSetIterator &begin, const MSetIterator &end) const;
86 :
87 : /** Fetch the single item specified.
88 : */
89 : void fetch(const MSetIterator &item) const;
90 :
91 : /** Fetch all the items in the MSet.
92 : */
93 : void fetch() const;
94 :
95 : /** This converts the weight supplied to a percentage score.
96 : * The return value will be in the range 0 to 100, and will be 0 if
97 : * and only if the item did not match the query at all.
98 : */
99 : Xapian::percent convert_to_percent(Xapian::weight wt) const;
100 :
101 : /// Return the percentage score for a particular item.
102 : Xapian::percent convert_to_percent(const MSetIterator &it) const;
103 :
104 : /** Return the term frequency of the given query term.
105 : *
106 : * @param tname The term to look for.
107 : *
108 : * @exception Xapian::InvalidArgumentError is thrown if the term was
109 : * not in the query.
110 : */
111 : Xapian::doccount get_termfreq(const std::string &tname) const;
112 :
113 : /** Return the term weight of the given query term.
114 : *
115 : * @param tname The term to look for.
116 : *
117 : * @exception Xapian::InvalidArgumentError is thrown if the term was
118 : * not in the query.
119 : */
120 : Xapian::weight get_termweight(const std::string &tname) const;
121 :
122 : /** The index of the first item in the result which was put into the
123 : * MSet.
124 : *
125 : * This corresponds to the parameter "first" specified in
126 : * Xapian::Enquire::get_mset(). A value of 0 corresponds to the
127 : * highest result being the first item in the MSet.
128 : */
129 : Xapian::doccount get_firstitem() const;
130 :
131 : /** A lower bound on the number of documents in the database which
132 : * match the query.
133 : *
134 : * This figure takes into account collapsing of duplicates,
135 : * and weighting cutoff values.
136 : *
137 : * This number is usually considerably less than the actual number
138 : * of documents which match the query.
139 : */
140 : Xapian::doccount get_matches_lower_bound() const;
141 :
142 : /** An estimate for the number of documents in the database which
143 : * match the query.
144 : *
145 : * This figure takes into account collapsing of duplicates,
146 : * and weighting cutoff values.
147 : *
148 : * This value is returned because there is sometimes a request to
149 : * display such information. However, our experience is that
150 : * presenting this value to users causes them to worry about the
151 : * large number of results, rather than how useful those at the top
152 : * of the result set are, and is thus undesirable.
153 : */
154 : Xapian::doccount get_matches_estimated() const;
155 :
156 : /** An upper bound on the number of documents in the database which
157 : * match the query.
158 : *
159 : * This figure takes into account collapsing of duplicates,
160 : * and weighting cutoff values.
161 : *
162 : * This number is usually considerably greater than the actual
163 : * number of documents which match the query.
164 : */
165 : Xapian::doccount get_matches_upper_bound() const;
166 :
167 : /** The maximum possible weight in the MSet.
168 : * This weight is likely not to be attained in the set of results,
169 : * but represents an upper bound on the weight which a document
170 : * could attain for the given query.
171 : */
172 : Xapian::weight get_max_possible() const;
173 :
174 : /** The greatest weight which is attained by any document in the
175 : * database.
176 : *
177 : * If firstitem == 0, this is the weight of the first entry in
178 : * items.
179 : *
180 : * If no documents are found by the query, this will be 0.
181 : *
182 : * Note that calculation of max_attained requires calculation
183 : * of at least one result item - therefore, if no items were
184 : * requested when the query was performed (by specifying
185 : * maxitems = 0 in Xapian::Enquire::get_mset()), this value will be 0.
186 : */
187 : Xapian::weight get_max_attained() const;
188 :
189 : /** The number of items in this MSet */
190 : Xapian::doccount size() const;
191 :
192 : /** Required to allow use as an STL container. */
193 : Xapian::doccount max_size() const { return size(); }
194 :
195 : /** Test if this MSet is empty */
196 : bool empty() const;
197 :
198 : /** Swap the MSet we point to with another */
199 : void swap(MSet & other);
200 :
201 : /** Iterator for the terms in this MSet */
202 : MSetIterator begin() const;
203 :
204 : /** End iterator corresponding to begin() */
205 : MSetIterator end() const;
206 :
207 : /** Iterator pointing to the last element of this MSet */
208 : MSetIterator back() const;
209 :
210 : /** This returns the document at position i in this MSet object.
211 : *
212 : * Note that this is not the same as the document at rank i in the
213 : * query, unless the "first" parameter to Xapian::Enquire::get_mset
214 : * was 0. Rather, it is the document at rank i + first.
215 : *
216 : * In other words, the offset is into the documents represented by
217 : * this object, not into the set of documents matching the query.
218 : */
219 : MSetIterator operator[](Xapian::doccount i) const;
220 :
221 : /// Allow use as an STL container
222 : //@{
223 : typedef MSetIterator value_type; // FIXME: not assignable...
224 : typedef MSetIterator iterator;
225 : typedef MSetIterator const_iterator;
226 : typedef MSetIterator & reference; // Hmm
227 : typedef MSetIterator & const_reference;
228 : typedef MSetIterator * pointer; // Hmm
229 : typedef Xapian::doccount_diff difference_type;
230 : typedef Xapian::doccount size_type;
231 : //@}
232 :
233 : /// Return a string describing this object.
234 : std::string get_description() const;
235 : };
236 :
237 : /** An iterator pointing to items in an MSet.
238 : * This is used for access to individual results of a match.
239 : */
240 : class XAPIAN_VISIBILITY_DEFAULT MSetIterator {
241 : private:
242 : friend class MSet;
243 : friend bool operator==(const MSetIterator &a, const MSetIterator &b);
244 : friend bool operator!=(const MSetIterator &a, const MSetIterator &b);
245 :
246 : MSetIterator(Xapian::doccount index_, const MSet & mset_)
247 : : index(index_), mset(mset_) { }
248 :
249 : Xapian::doccount index;
250 : MSet mset;
251 :
252 : public:
253 : /** Create an uninitialised iterator; this cannot be used, but is
254 : * convenient syntactically.
255 : */
256 3 : MSetIterator() : index(0), mset() { }
257 :
258 165 : ~MSetIterator() { }
259 :
260 : /// Copying is allowed (and is cheap).
261 58 : MSetIterator(const MSetIterator &other) {
262 58 : index = other.index;
263 58 : mset = other.mset;
264 58 : }
265 :
266 : /// Assignment is allowed (and is cheap).
267 31 : void operator=(const MSetIterator &other) {
268 31 : index = other.index;
269 31 : mset = other.mset;
270 31 : }
271 :
272 : /// Advance the iterator.
273 61 : MSetIterator & operator++() {
274 61 : ++index;
275 61 : return *this;
276 : }
277 :
278 : /// Advance the iterator (postfix variant).
279 28 : MSetIterator operator++(int) {
280 28 : MSetIterator tmp = *this;
281 28 : ++index;
282 : return tmp;
283 : }
284 :
285 : /// Decrement the iterator.
286 : MSetIterator & operator--() {
287 : --index;
288 : return *this;
289 : }
290 :
291 : /// Decrement the iterator (postfix variant).
292 : MSetIterator operator--(int) {
293 : MSetIterator tmp = *this;
294 : --index;
295 : return tmp;
296 : }
297 :
298 : /// Get the document ID for the current position.
299 : Xapian::docid operator*() const;
300 :
301 : /** Get a Xapian::Document object for the current position.
302 : *
303 : * This method returns a Xapian::Document object which provides the
304 : * information about the document pointed to by the MSetIterator.
305 : *
306 : * If the underlying database has suitable support, using this call
307 : * (rather than asking the database for a document based on its
308 : * document ID) will enable the system to ensure that the correct
309 : * data is returned, and that the document has not been deleted
310 : * or changed since the query was performed.
311 : *
312 : * @return A Xapian::Document object containing the document data.
313 : *
314 : * @exception Xapian::DocNotFoundError The document specified could not
315 : * be found in the database.
316 : */
317 : Xapian::Document get_document() const;
318 :
319 : /** Get the rank of the document at the current position.
320 : *
321 : * The rank is the position that this document is at in the ordered
322 : * list of results of the query. The result is 0-based - i.e. the
323 : * top-ranked document has a rank of 0.
324 : */
325 : Xapian::doccount get_rank() const {
326 : return mset.get_firstitem() + index;
327 : }
328 :
329 : /// Get the weight of the document at the current position
330 : Xapian::weight get_weight() const;
331 :
332 : /** Get the collapse key for this document.
333 : */
334 : std::string get_collapse_key() const;
335 :
336 : /** Get an estimate of the number of documents that have been collapsed
337 : * into this one.
338 : *
339 : * The estimate will always be less than or equal to the actual
340 : * number of other documents satisfying the match criteria with the
341 : * same collapse key as this document.
342 : *
343 : * This method may return 0 even though there are other documents with
344 : * the same collapse key which satisfying the match criteria. However
345 : * if this method returns non-zero, there definitely are other such
346 : * documents. So this method may be used to inform the user that
347 : * there are "at least N other matches in this group", or to control
348 : * whether to offer a "show other documents in this group" feature
349 : * (but note that it may not offer it in every case where it would
350 : * show other documents).
351 : */
352 : Xapian::doccount get_collapse_count() const;
353 :
354 : /** This returns the weight of the document as a percentage score.
355 : *
356 : * The return value will be in the range 0 to 100: 0 meaning
357 : * that the item did not match the query at all.
358 : */
359 : Xapian::percent get_percent() const;
360 :
361 : /// Return a string describing this object.
362 : std::string get_description() const;
363 :
364 : /// Allow use as an STL iterator
365 : //@{
366 : typedef std::bidirectional_iterator_tag iterator_category; // FIXME: could enhance to be a randomaccess_iterator
367 : typedef Xapian::docid value_type;
368 : typedef Xapian::doccount_diff difference_type;
369 : typedef Xapian::docid * pointer;
370 : typedef Xapian::docid & reference;
371 : //@}
372 : };
373 :
374 30 : inline bool operator==(const MSetIterator &a, const MSetIterator &b)
375 : {
376 30 : return (a.index == b.index);
377 : }
378 :
379 58 : inline bool operator!=(const MSetIterator &a, const MSetIterator &b)
380 : {
381 58 : return (a.index != b.index);
382 : }
383 :
384 : class ESetIterator;
385 :
386 : /** Class representing an ordered set of expand terms (an ESet).
387 : * This set represents the results of an expand operation, which is
388 : * performed by Xapian::Enquire::get_eset().
389 : */
390 : class XAPIAN_VISIBILITY_DEFAULT ESet {
391 : public:
392 : class Internal;
393 : /// @internal Reference counted internals.
394 : Xapian::Internal::RefCntPtr<Internal> internal;
395 :
396 : /// Construct an empty ESet
397 : ESet();
398 :
399 : /// Destructor.
400 : ~ESet();
401 :
402 : /// Copying is allowed (and is cheap).
403 : ESet(const ESet & other);
404 :
405 : /// Assignment is allowed (and is cheap).
406 : void operator=(const ESet &other);
407 :
408 : /** A lower bound on the number of terms which are in the full
409 : * set of results of the expand. This will be greater than or
410 : * equal to size()
411 : */
412 : Xapian::termcount get_ebound() const;
413 :
414 : /** The number of terms in this E-Set */
415 : Xapian::termcount size() const;
416 :
417 : /** Required to allow use as an STL container. */
418 : Xapian::termcount max_size() const { return size(); }
419 :
420 : /** Test if this E-Set is empty */
421 : bool empty() const;
422 :
423 : /** Swap the E-Set we point to with another */
424 : void swap(ESet & other);
425 :
426 : /** Iterator for the terms in this E-Set */
427 : ESetIterator begin() const;
428 :
429 : /** End iterator corresponding to begin() */
430 : ESetIterator end() const;
431 :
432 : /** Iterator pointing to the last element of this E-Set */
433 : ESetIterator back() const;
434 :
435 : /** This returns the term at position i in this E-Set. */
436 : ESetIterator operator[](Xapian::termcount i) const;
437 :
438 : /// Return a string describing this object.
439 : std::string get_description() const;
440 : };
441 :
442 : /** Iterate through terms in the ESet */
443 : class XAPIAN_VISIBILITY_DEFAULT ESetIterator {
444 : private:
445 : friend class ESet;
446 : friend bool operator==(const ESetIterator &a, const ESetIterator &b);
447 : friend bool operator!=(const ESetIterator &a, const ESetIterator &b);
448 :
449 : ESetIterator(Xapian::termcount index_, const ESet & eset_)
450 : : index(index_), eset(eset_) { }
451 :
452 : Xapian::termcount index;
453 : ESet eset;
454 :
455 : public:
456 : /** Create an uninitialised iterator; this cannot be used, but is
457 : * convenient syntactically.
458 : */
459 : ESetIterator() : index(0), eset() { }
460 :
461 8 : ~ESetIterator() { }
462 :
463 : /// Copying is allowed (and is cheap).
464 : ESetIterator(const ESetIterator &other) {
465 : index = other.index;
466 : eset = other.eset;
467 : }
468 :
469 : /// Assignment is allowed (and is cheap).
470 : void operator=(const ESetIterator &other) {
471 : index = other.index;
472 : eset = other.eset;
473 : }
474 :
475 : /// Advance the iterator.
476 0 : ESetIterator & operator++() {
477 0 : ++index;
478 0 : return *this;
479 : }
480 :
481 : /// Advance the iterator (postfix variant).
482 : ESetIterator operator++(int) {
483 : ESetIterator tmp = *this;
484 : ++index;
485 : return tmp;
486 : }
487 :
488 : /// Decrement the iterator.
489 : ESetIterator & operator--() {
490 : --index;
491 : return *this;
492 : }
493 :
494 : /// Decrement the iterator (postfix variant).
495 : ESetIterator operator--(int) {
496 : ESetIterator tmp = *this;
497 : --index;
498 : return tmp;
499 : }
500 :
501 : /// Get the term for the current position
502 : const std::string & operator *() const;
503 :
504 : /// Get the weight of the term at the current position
505 : Xapian::weight get_weight() const;
506 :
507 : /// Return a string describing this object.
508 : std::string get_description() const;
509 :
510 : /// Allow use as an STL iterator
511 : //@{
512 : typedef std::bidirectional_iterator_tag iterator_category; // FIXME: go for randomaccess_iterator!
513 : typedef std::string value_type;
514 : typedef Xapian::termcount_diff difference_type;
515 : typedef std::string * pointer;
516 : typedef std::string & reference;
517 : //@}
518 : };
519 :
520 : inline bool operator==(const ESetIterator &a, const ESetIterator &b)
521 : {
522 : return (a.index == b.index);
523 : }
524 :
525 4 : inline bool operator!=(const ESetIterator &a, const ESetIterator &b)
526 : {
527 4 : return (a.index != b.index);
528 : }
529 :
530 : /** A relevance set (R-Set).
531 : * This is the set of documents which are marked as relevant, for use
532 : * in modifying the term weights, and in performing query expansion.
533 : */
534 : class XAPIAN_VISIBILITY_DEFAULT RSet {
535 : public:
536 : /// Class holding details of RSet
537 : class Internal;
538 :
539 : /// @internal Reference counted internals.
540 : Xapian::Internal::RefCntPtr<Internal> internal;
541 :
542 : /// Copy constructor
543 : RSet(const RSet &rset);
544 :
545 : /// Assignment operator
546 : void operator=(const RSet &rset);
547 :
548 : /// Default constructor
549 : RSet();
550 :
551 : /// Destructor
552 : ~RSet();
553 :
554 : /** The number of documents in this R-Set */
555 : Xapian::doccount size() const;
556 :
557 : /** Test if this R-Set is empty */
558 : bool empty() const;
559 :
560 : /// Add a document to the relevance set.
561 : void add_document(Xapian::docid did);
562 :
563 : /// Add a document to the relevance set.
564 20 : void add_document(const Xapian::MSetIterator & i) { add_document(*i); }
565 :
566 : /// Remove a document from the relevance set.
567 : void remove_document(Xapian::docid did);
568 :
569 : /// Remove a document from the relevance set.
570 : void remove_document(const Xapian::MSetIterator & i) { remove_document(*i); }
571 :
572 : /// Test if a given document in the relevance set.
573 : bool contains(Xapian::docid did) const;
574 :
575 : /// Test if a given document in the relevance set.
576 : bool contains(const Xapian::MSetIterator & i) const { return contains(*i); }
577 :
578 : /// Return a string describing this object.
579 : std::string get_description() const;
580 : };
581 :
582 : /** Base class for matcher decision functor.
583 : */
584 : class XAPIAN_VISIBILITY_DEFAULT MatchDecider {
585 : public:
586 : /** Decide whether we want this document to be in the MSet.
587 : *
588 : * Return true if the document is acceptable, or false if the document
589 : * should be excluded from the MSet.
590 : */
591 : virtual bool operator()(const Xapian::Document &doc) const = 0;
592 :
593 : /// Destructor.
594 : virtual ~MatchDecider();
595 : };
596 :
597 : /** This class provides an interface to the information retrieval
598 : * system for the purpose of searching.
599 : *
600 : * Databases are usually opened lazily, so exceptions may not be
601 : * thrown where you would expect them to be. You should catch
602 : * Xapian::Error exceptions when calling any method in Xapian::Enquire.
603 : *
604 : * @exception Xapian::InvalidArgumentError will be thrown if an invalid
605 : * argument is supplied, for example, an unknown database type.
606 : */
607 : class XAPIAN_VISIBILITY_DEFAULT Enquire {
608 : public:
609 : /// Copying is allowed (and is cheap).
610 : Enquire(const Enquire & other);
611 :
612 : /// Assignment is allowed (and is cheap).
613 : void operator=(const Enquire & other);
614 :
615 : class Internal;
616 : /// @internal Reference counted internals.
617 : Xapian::Internal::RefCntPtr<Internal> internal;
618 :
619 : /** Create a Xapian::Enquire object.
620 : *
621 : * This specification cannot be changed once the Xapian::Enquire is
622 : * opened: you must create a new Xapian::Enquire object to access a
623 : * different database, or set of databases.
624 : *
625 : * The database supplied must have been initialised (ie, must not be
626 : * the result of calling the Database::Database() constructor). If
627 : * you need to handle a situation where you have no index gracefully,
628 : * a database created with InMemory::open() can be passed here,
629 : * which represents a completely empty database.
630 : *
631 : * @param database Specification of the database or databases to
632 : * use.
633 : * @param errorhandler_ A pointer to the error handler to use.
634 : * Ownership of the object pointed to is not assumed by the
635 : * Xapian::Enquire object - the user should delete the
636 : * Xapian::ErrorHandler object after the Xapian::Enquire object
637 : * is deleted. To use no error handler, this parameter
638 : * should be 0.
639 : *
640 : * @exception Xapian::InvalidArgumentError will be thrown if an
641 : * uninitialised Database object is supplied.
642 : */
643 : explicit Enquire(const Database &database, ErrorHandler * errorhandler_ = 0);
644 :
645 : /** Close the Xapian::Enquire object.
646 : */
647 : ~Enquire();
648 :
649 : /** Set the query to run.
650 : *
651 : * @param query the new query to run.
652 : * @param qlen the query length to use in weight calculations -
653 : * by default the sum of the wqf of all terms is used.
654 : */
655 : void set_query(const Xapian::Query & query, Xapian::termcount qlen = 0);
656 :
657 : /** Get the query which has been set.
658 : * This is only valid after set_query() has been called.
659 : *
660 : * @exception Xapian::InvalidArgumentError will be thrown if query has
661 : * not yet been set.
662 : */
663 : const Xapian::Query & get_query() const;
664 :
665 : /** Set the weighting scheme to use for queries.
666 : *
667 : * @param weight_ the new weighting scheme. If no weighting scheme
668 : * is specified, the default is BM25 with the
669 : * default parameters.
670 : */
671 : void set_weighting_scheme(const Weight &weight_);
672 :
673 : /** Set the collapse key to use for queries.
674 : *
675 : * @param collapse_key value number to collapse on - at most one MSet
676 : * entry with each particular value will be returned.
677 : *
678 : * The entry returned will be the best entry with that particular
679 : * value (highest weight or highest sorting key).
680 : *
681 : * An example use might be to create a value for each document
682 : * containing an MD5 hash of the document contents. Then
683 : * duplicate documents from different sources can be eliminated at
684 : * search time (it's better to eliminate duplicates at index time,
685 : * but this may not always be possible - for example the search
686 : * may be over more than one Xapian database).
687 : *
688 : * Another use is to group matches in a particular category (e.g.
689 : * you might collapse a mailing list search on the Subject: so
690 : * that there's only one result per discussion thread). In this
691 : * case you can use get_collapse_count() to give the user some
692 : * idea how many other results there are. And if you index the
693 : * Subject: as a boolean term as well as putting it in a value,
694 : * you can offer a link to a non-collapsed search restricted to
695 : * that thread using a boolean filter.
696 : *
697 : * (default is Xapian::BAD_VALUENO which means no collapsing).
698 : */
699 : void set_collapse_key(Xapian::valueno collapse_key);
700 :
701 : typedef enum {
702 : ASCENDING = 1,
703 : DESCENDING = 0,
704 : DONT_CARE = 2
705 : } docid_order;
706 :
707 : /** Set the direction in which documents are ordered by document id
708 : * in the returned MSet.
709 : *
710 : * This order only has an effect on documents which would otherwise
711 : * have equal rank. For a weighted probabilistic match with no sort
712 : * value, this means documents with equal weight. For a boolean match,
713 : * with no sort value, this means all documents. And if a sort value
714 : * is used, this means documents with equal sort value (and also equal
715 : * weight if ordering on relevance after the sort).
716 : *
717 : * @param order This can be:
718 : * - Xapian::Enquire::ASCENDING
719 : * docids sort in ascending order (default)
720 : * - Xapian::Enquire::DESCENDING
721 : * docids sort in descending order
722 : * - Xapian::Enquire::DONT_CARE
723 : * docids sort in whatever order is most efficient for the backend
724 : *
725 : * Note: If you add documents in strict date order, then a boolean
726 : * search - i.e. set_weighting_scheme(Xapian::BoolWeight()) - with
727 : * set_docid_order(Xapian::Enquire::DESCENDING) is a very efficient
728 : * way to perform "sort by date, newest first".
729 : */
730 : void set_docid_order(docid_order order);
731 :
732 : /** Set the percentage and/or weight cutoffs.
733 : *
734 : * @param percent_cutoff Minimum percentage score for returned
735 : * documents. If a document has a lower percentage score than this,
736 : * it will not appear in the MSet. If your intention is to return
737 : * only matches which contain all the terms in the query, then
738 : * it's more efficient to use Xapian::Query::OP_AND instead of
739 : * Xapian::Query::OP_OR in the query than to use set_cutoff(100).
740 : * (default 0 => no percentage cut-off).
741 : * @param weight_cutoff Minimum weight for a document to be returned.
742 : * If a document has a lower score than this, it will not appear
743 : * in the MSet. It is usually only possible to choose an
744 : * appropriate weight for cutoff based on the results of a
745 : * previous run of the same query; this is thus mainly useful for
746 : * alerting operations. The other potential use is with a user
747 : * specified weighting scheme.
748 : * (default 0 => no weight cut-off).
749 : */
750 : void set_cutoff(Xapian::percent percent_cutoff, Xapian::weight weight_cutoff = 0);
751 :
752 : /** Set the sorting to be by relevance only.
753 : *
754 : * This is the default.
755 : */
756 : void set_sort_by_relevance();
757 :
758 : /** Set the sorting to be by value only.
759 : *
760 : * NB sorting of values uses a string comparison, so you'll need to
761 : * store numbers padded with leading zeros or spaces, or with the
762 : * number of digits prepended.
763 : *
764 : * @param sort_key value number to sort on.
765 : *
766 : * @param ascending If true, documents values which sort higher by
767 : * string compare are better. If false, the sort order
768 : * is reversed. (default true)
769 : */
770 : void set_sort_by_value(Xapian::valueno sort_key, bool ascending = true);
771 :
772 : /** Set the sorting to be by key generated from values only.
773 : *
774 : * @param sorter The functor to use for generating keys.
775 : *
776 : * @param ascending If true, documents values which sort higher by
777 : * string compare are better. If false, the sort order
778 : * is reversed. (default true)
779 : */
780 : void set_sort_by_key(Xapian::Sorter * sorter, bool ascending = true);
781 :
782 : /** Set the sorting to be by value, then by relevance for documents
783 : * with the same value.
784 : *
785 : * NB sorting of values uses a string comparison, so you'll need to
786 : * store numbers padded with leading zeros or spaces, or with the
787 : * number of digits prepended.
788 : *
789 : * @param sort_key value number to sort on.
790 : *
791 : * @param ascending If true, documents values which sort higher by
792 : * string compare are better. If false, the sort order
793 : * is reversed. (default true)
794 : */
795 : void set_sort_by_value_then_relevance(Xapian::valueno sort_key,
796 : bool ascending = true);
797 :
798 : /** Set the sorting to be by keys generated from values, then by
799 : * relevance for documents with identical keys.
800 : *
801 : * @param sorter The functor to use for generating keys.
802 : *
803 : * @param ascending If true, keys which sort higher by
804 : * string compare are better. If false, the sort order
805 : * is reversed. (default true)
806 : */
807 : void set_sort_by_key_then_relevance(Xapian::Sorter * sorter,
808 : bool ascending = true);
809 :
810 : /** Set the sorting to be by relevance then value.
811 : *
812 : * NB sorting of values uses a string comparison, so you'll need to
813 : * store numbers padded with leading zeros or spaces, or with the
814 : * number of digits prepended.
815 : *
816 : * Note that with the default BM25 weighting scheme parameters,
817 : * non-identical documents will rarely have the same weight, so
818 : * this setting will give very similar results to
819 : * set_sort_by_relevance(). It becomes more useful with particular
820 : * BM25 parameter settings (e.g. BM25Weight(1,0,1,0,0)) or custom
821 : * weighting schemes.
822 : *
823 : * @param sort_key value number to sort on.
824 : *
825 : * @param ascending If true, documents values which sort higher by
826 : * string compare are better. If false, the sort order
827 : * is reversed. (default true)
828 : */
829 : void set_sort_by_relevance_then_value(Xapian::valueno sort_key,
830 : bool ascending = true);
831 :
832 : /** Set the sorting to be by relevance, then by keys generated from
833 : * values.
834 : *
835 : * Note that with the default BM25 weighting scheme parameters,
836 : * non-identical documents will rarely have the same weight, so
837 : * this setting will give very similar results to
838 : * set_sort_by_relevance(). It becomes more useful with particular
839 : * BM25 parameter settings (e.g. BM25Weight(1,0,1,0,0)) or custom
840 : * weighting schemes.
841 : *
842 : * @param sorter The functor to use for generating keys.
843 : *
844 : * @param ascending If true, keys which sort higher by
845 : * string compare are better. If false, the sort order
846 : * is reversed. (default true)
847 : */
848 : void set_sort_by_relevance_then_key(Xapian::Sorter * sorter,
849 : bool ascending = true);
850 :
851 : /** Get (a portion of) the match set for the current query.
852 : *
853 : * The three overloads below share this documentation: the first takes
854 : * the parameters up to mdecider, the second additionally takes matchspy,
855 : * and the third omits checkatleast (it calls the first, passing 0).
856 : *
857 : * @param first the first item in the result set to return. Zero starts
858 : * at the highest scoring item; 10 skips the first 10 items.
859 : * @param maxitems the maximum number of items to return.
860 : * @param checkatleast the minimum number of items to check. Because
861 : * the matcher optimises, it won't consider every
862 : * document which might match, so the total number
863 : * of matches is estimated. Setting checkatleast
864 : * forces it to consider at least this many matches
865 : * and so allows for reliable paging links.
866 : * @param omrset the relevance set to use when performing the query.
867 : * @param mdecider a decision functor to use to decide whether a
868 : * given document should be put in the MSet.
869 : * @param matchspy (second overload only) a decision functor to use to
870 : * decide whether a given document should be put in the MSet. The
871 : * matchspy is applied to every document which is
872 : * a potential candidate for the MSet, so if there are
873 : * checkatleast or more such documents, the matchspy
874 : * will see at least checkatleast. The mdecider is
875 : * assumed to be a relatively expensive test so may
876 : * be applied in a lazier fashion.
877 : *
878 : * @return A Xapian::MSet object containing the results of the
879 : * query.
880 : *
881 : * @exception Xapian::InvalidArgumentError See class documentation.
882 : */
883 : MSet get_mset(Xapian::doccount first, Xapian::doccount maxitems,
884 : Xapian::doccount checkatleast = 0,
885 : const RSet * omrset = 0,
886 : const MatchDecider * mdecider = 0) const;
887 : MSet get_mset(Xapian::doccount first, Xapian::doccount maxitems,
888 : Xapian::doccount checkatleast,
889 : const RSet * omrset,
890 : const MatchDecider * mdecider,
891 : const MatchDecider * matchspy) const;
892 : MSet get_mset(Xapian::doccount first, Xapian::doccount maxitems,
893 : const RSet * omrset,
894 : const MatchDecider * mdecider = 0) const {
895 : return get_mset(first, maxitems, 0, omrset, mdecider);
896 : }
897 :
898 : static const int INCLUDE_QUERY_TERMS = 1; ///< get_eset() flag: query terms may be returned from expand.
899 : static const int USE_EXACT_TERMFREQ = 2; ///< get_eset() flag: calculate the exact termfreq for multi dbs.
900 : #ifndef _MSC_VER
901 : /// Deprecated in Xapian 1.0.0, use INCLUDE_QUERY_TERMS instead.
902 : XAPIAN_DEPRECATED(static const int include_query_terms) = 1;
903 : /// Deprecated in Xapian 1.0.0, use USE_EXACT_TERMFREQ instead.
904 : XAPIAN_DEPRECATED(static const int use_exact_termfreq) = 2;
905 : #else
906 : // MSVC warns when a declaration itself is marked deprecated, so declare
907 : // the old names plainly and deprecate them with a pragma instead.
908 : static const int include_query_terms = 1;
909 : static const int use_exact_termfreq = 2;
910 : #pragma deprecated("Xapian::Enquire::include_query_terms", "Xapian::Enquire::use_exact_termfreq")
911 : #endif
912 :
913 : /** Get the expand set (ESet) for the given relevance set (RSet).
914 : *
915 : * @param maxitems the maximum number of items to return.
916 : * @param omrset the relevance set to use when performing
917 : * the expand operation.
918 : * @param flags bitwise OR of zero or more of these values (default 0):
919 : * - Xapian::Enquire::INCLUDE_QUERY_TERMS: query
920 : * terms may be returned from expand
921 : * - Xapian::Enquire::USE_EXACT_TERMFREQ: for multi
922 : * dbs, calculate the exact termfreq; otherwise an
923 : * approximation is used which can greatly improve
924 : * efficiency, but still returns good results.
925 : * @param k the parameter k in the query expansion algorithm
926 : * (default is 1.0)
927 : * @param edecider a decision functor to use to decide whether a
928 : * given term should be put in the ESet (default is a null pointer)
929 : *
930 : * @return An ESet object containing the results of the
931 : * expand.
932 : *
933 : * @exception Xapian::InvalidArgumentError See class documentation.
934 : */
935 : ESet get_eset(Xapian::termcount maxitems,
936 : const RSet & omrset,
937 : int flags = 0,
938 : double k = 1.0,
939 : const Xapian::ExpandDecider * edecider = 0) const;
940 :
941 : /** Get the expand set for the given rset, using no flags and k = 1.0.
942 : * This overload simply forwards to the five-argument get_eset().
943 : * @param maxitems the maximum number of items to return.
944 : * @param omrset the relevance set to use when performing
945 : * the expand operation.
946 : * @param edecider a decision functor to use to decide whether a
947 : * given term should be put in the ESet
948 : *
949 : * @return An ESet object containing the results of the
950 : * expand.
951 : *
952 : * @exception Xapian::InvalidArgumentError See class documentation.
953 : */
954 : inline ESet get_eset(Xapian::termcount maxitems, const RSet & omrset,
955 4 : const Xapian::ExpandDecider * edecider) const {
956 4 : return get_eset(maxitems, omrset, 0, 1.0, edecider);
957 : }
958 :
959 : /** Get terms which match a given document, by document id.
960 : *
961 : * This method returns an iterator over the terms in the current query
962 : * which match the given document; get_matching_terms_end() gives the end.
963 : *
964 : * It is possible for the document to have been removed from the
965 : * database between the time it is returned in an MSet, and the
966 : * time that this call is made. If possible, you should specify
967 : * an MSetIterator instead of a Xapian::docid, since this will enable
968 : * database backends with suitable support to prevent this
969 : * occurring.
970 : *
971 : * Note that a query does not need to have been run in order to
972 : * make this call.
973 : *
974 : * @param did The document id for which to retrieve the matching
975 : * terms.
976 : *
977 : * @return An iterator returning the terms which match the
978 : * document. The terms will be returned (as far as this
979 : * makes any sense) in the same order as the terms
980 : * in the query. Terms will not occur more than once,
981 : * even if they do in the query.
982 : *
983 : * @exception Xapian::InvalidArgumentError See class documentation.
984 : * @exception Xapian::DocNotFoundError The document specified
985 : * could not be found in the database.
986 : */
987 : TermIterator get_matching_terms_begin(Xapian::docid did) const;
988 :
989 : /** End iterator corresponding to get_matching_terms_begin(did); the argument is ignored. */
990 : TermIterator get_matching_terms_end(Xapian::docid /*did*/) const {
991 : return TermIterator(NULL);
992 : }
993 :
994 : /** Get terms which match a given document, by match set item.
995 : *
996 : * This method returns an iterator over the terms in the current query
997 : * which match the given document; get_matching_terms_end() gives the end.
998 : *
999 : * If the underlying database has suitable support, using this call
1000 : * (rather than passing a Xapian::docid) will enable the system to
1001 : * ensure that the correct data is returned, and that the document
1002 : * has not been deleted or changed since the query was performed.
1003 : *
1004 : * @param it The MSetIterator for which to retrieve the matching terms.
1005 : *
1006 : * @return An iterator returning the terms which match the
1007 : * document. The terms will be returned (as far as this
1008 : * makes any sense) in the same order as the terms
1009 : * in the query. Terms will not occur more than once,
1010 : * even if they do in the query.
1011 : *
1012 : * @exception Xapian::InvalidArgumentError See class documentation.
1013 : * @exception Xapian::DocNotFoundError The document specified
1014 : * could not be found in the database.
1015 : */
1016 : TermIterator get_matching_terms_begin(const MSetIterator &it) const;
1017 :
1018 : /** End iterator corresponding to get_matching_terms_begin(it); the argument is ignored. */
1019 : TermIterator get_matching_terms_end(const MSetIterator &/*it*/) const {
1020 : return TermIterator(NULL);
1021 : }
1022 :
1023 : /** Register a MatchDecider under a name.
1024 : *
1025 : * This is used to associate a name with a matchdecider.
1026 : *
1027 : * @deprecated This method is deprecated. It was added long ago with
1028 : * the intention that it would allow the remote backend to support
1029 : * use of MatchDecider objects, but there's a better approach.
1030 : *
1031 : * @param name The name to register this matchdecider as.
1032 : * @param mdecider The matchdecider. If omitted (the default is NULL),
1033 : * then remove any matchdecider registered with this name.
1034 : */
1035 : XAPIAN_DEPRECATED(
1036 : void register_match_decider(const std::string &name,
1037 : const MatchDecider *mdecider = NULL));
1038 :
1039 : /// Return a string describing this Xapian::Enquire object.
1040 : std::string get_description() const;
1041 : };
1042 :
1043 : }
1044 :
1045 : class RemoteServer; // Forward declared in the global namespace so Xapian::Weight can name it as a friend.
1046 : class ScaleWeight; // Likewise forward declared for Xapian::Weight's friend declaration.
1047 :
1048 : namespace Xapian {
1049 :
1050 : /// Abstract base class for weighting schemes (subclasses implement the pure virtual methods).
1051 : class XAPIAN_VISIBILITY_DEFAULT Weight {
1052 : friend class Enquire; // So Enquire can clone us
1053 : friend class ::RemoteServer; // So RemoteServer can clone us - FIXME
1054 : friend class ::ScaleWeight;
1055 : public:
1056 : class Internal;
1057 : protected:
1058 : Weight(const Weight &); // Copying is only available to subclasses.
1059 : private:
1060 : void operator=(Weight &); // Private, so ordinary client code can't assign Weight objects.
1061 :
1062 : /** Return a new weight object of this type (private: only Weight and its friends can call it).
1063 : *
1064 : * A subclass called FooWeight taking parameters param1 and param2
1065 : * should implement this as:
1066 : *
1067 : * virtual FooWeight * clone() const {
1068 : * return new FooWeight(param1, param2);
1069 : * }
1070 : */
1071 : virtual Weight * clone() const = 0;
1072 :
1073 : protected:
1074 : const Internal * internal; // Weight::Internal == Stats
1075 : Xapian::doclength querysize; // Query size - see create()'s querysize_ parameter.
1076 : Xapian::termcount wqf; // Within query frequency - see create()'s wqf_ parameter.
1077 : std::string tname; // Term name - see create()'s tname_ parameter.
1078 :
1079 : public:
1080 : // Give every member a defined value; create() is documented to supply the real statistics.
1081 : Weight() : internal(0), querysize(0), wqf(0) { }
1082 : virtual ~Weight();
1083 :
1084 : /** Create a new weight object of the same type as this and initialise
1085 : * it with the specified statistics.
1086 : *
1087 : * You shouldn't call this method yourself - it's called by
1088 : * Enquire.
1089 : *
1090 : * @param internal_ Object to ask for collection statistics.
1091 : * @param querysize_ Query size.
1092 : * @param wqf_ Within query frequency of term this object is
1093 : * associated with.
1094 : * @param tname_ Term which this object is associated with.
1095 : */
1096 : Weight * create(const Internal * internal_, Xapian::doclength querysize_,
1097 : Xapian::termcount wqf_, const std::string & tname_) const;
1098 :
1099 : /** Name of the weighting scheme.
1100 : *
1101 : * If the subclass is called FooWeight, this should return "Foo".
1102 : */
1103 : virtual std::string name() const = 0;
1104 :
1105 : /// Serialise object parameters into a string.
1106 : virtual std::string serialise() const = 0;
1107 :
1108 : /// Create object given string serialisation returned by serialise().
1109 : virtual Weight * unserialise(const std::string &s) const = 0;
1110 :
1111 : /** Get a weight which is part of the sum over terms being performed.
1112 : * This returns a weight for a given term and document. These
1113 : * weights are summed to give a total weight for the document.
1114 : *
1115 : * @param wdf the within document frequency of the term.
1116 : * @param len the (unnormalised) document length.
1117 : */
1118 : virtual Xapian::weight get_sumpart(Xapian::termcount wdf,
1119 : Xapian::doclength len) const = 0;
1120 :
1121 : /** Gets the maximum value that get_sumpart() may return. This
1122 : * is used in optimising searches, by having the postlist tree
1123 : * decay appropriately when parts of it can have limited, or no,
1124 : * further effect.
1125 : */
1126 : virtual Xapian::weight get_maxpart() const = 0;
1127 :
1128 : /** Get an extra weight for a document to add to the sum calculated
1129 : * over the query terms.
1130 : * This returns a weight for a given document, and is used by some
1131 : * weighting schemes to account for influence such as document
1132 : * length.
1133 : *
1134 : * @param len the (unnormalised) document length.
1135 : */
1136 : virtual Xapian::weight get_sumextra(Xapian::doclength len) const = 0;
1137 :
1138 : /** Gets the maximum value that get_sumextra() may return. This
1139 : * is used in optimising searches.
1140 : */
1141 : virtual Xapian::weight get_maxextra() const = 0;
1142 :
1143 : /// Return false if the weight object doesn't need doclength (the base version presumably returns true - confirm).
1144 : virtual bool get_sumpart_needs_doclength() const; /* { return true; } */
1145 : };
1146 :
1147 : /// Boolean weighting scheme: every document is given a weight of 0.
1148 : class XAPIAN_VISIBILITY_DEFAULT BoolWeight : public Weight {
1149 : public:
1150 : BoolWeight * clone() const; // Covariant override of Weight::clone().
1151 : BoolWeight() { }
1152 : ~BoolWeight();
1153 : std::string name() const;
1154 : std::string serialise() const;
1155 : BoolWeight * unserialise(const std::string & s) const; // Covariant override of Weight::unserialise().
1156 : Xapian::weight get_sumpart(Xapian::termcount wdf, Xapian::doclength len) const;
1157 : Xapian::weight get_maxpart() const;
1158 :
1159 : Xapian::weight get_sumextra(Xapian::doclength len) const;
1160 : Xapian::weight get_maxextra() const;
1161 :
1162 : bool get_sumpart_needs_doclength() const;
1163 : };
1164 :
1165 : /** BM25 weighting scheme
1166 : *
1167 : * BM25 weighting options : The BM25 formula is \f[
1168 : * \frac{k_{2}.n_{q}}{1+L_{d}}+\sum_{t}\frac{(k_{3}+1)q_{t}}{k_{3}+q_{t}}.\frac{(k_{1}+1)f_{t,d}}{k_{1}((1-b)+bL_{d})+f_{t,d}}.w_{t}
1169 : * \f] where
1170 : * - \f$w_{t}\f$ is the termweight of term t
1171 : * - \f$f_{t,d}\f$ is the within document frequency of term t in document d
1172 : * - \f$q_{t}\f$ is the within query frequency of term t
1173 : * - \f$L_{d}\f$ is the normalised length of document d
1174 : * - \f$n_{q}\f$ is the size of the query
1175 : * - \f$k_{1}\f$, \f$k_{2}\f$, \f$k_{3}\f$ and \f$b\f$ are user specified parameters
1176 : */
1177 : class XAPIAN_VISIBILITY_DEFAULT BM25Weight : public Weight {
1178 : private:
1179 : mutable Xapian::weight termweight; // mutable: may be updated from const methods.
1180 : mutable Xapian::doclength lenpart; // mutable: may be updated from const methods.
1181 :
1182 : double k1, k2, k3, b;
1183 : Xapian::doclength min_normlen;
1184 :
1185 : mutable bool weight_calculated; // Starts false; presumably set once calc_termweight() has run (confirm).
1186 :
1187 : void calc_termweight() const;
1188 :
1189 : public:
1190 : /** Construct a BM25 weight with explicit parameters.
1191 : * ("Default" below means the value which the no-argument constructor uses.)
1192 : * @param k1_ governs the importance of within document frequency.
1193 : * Negative values are treated as 0. 0 means ignore wdf. Default is 1.
1194 : * @param k2_ compensation factor for the high wdf values in
1195 : * large documents. Negative values are treated as 0. 0 means no
1196 : * compensation. Default is 0.
1197 : * @param k3_ governs the importance of within query frequency.
1198 : * Negative values are treated as 0. 0 means ignore wqf. Default is 1.
1199 : * @param b_ Relative importance of within document frequency and
1200 : * document length. Values below 0 are treated as 0 and values
1201 : * above 1 as 1. Default is 0.5.
1202 : * @param min_normlen_ specifies a cutoff on the minimum value that
1203 : * can be used for a normalised document length -
1204 : * smaller values will be forced up to this cutoff.
1205 : * This prevents very small documents getting a huge
1206 : * bonus weight. Stored as given (not range checked here). Default is 0.5.
1207 : */
1208 : BM25Weight(double k1_, double k2_, double k3_, double b_,
1209 : double min_normlen_)
1210 : : k1(k1_), k2(k2_), k3(k3_), b(b_), min_normlen(min_normlen_),
1211 : weight_calculated(false)
1212 : {
1213 : if (k1 < 0) k1 = 0;
1214 : if (k2 < 0) k2 = 0;
1215 : if (k3 < 0) k3 = 0;
1216 : if (b < 0) b = 0; else if (b > 1) b = 1;
1217 : }
1218 : BM25Weight() : k1(1), k2(0), k3(1), b(0.5), min_normlen(0.5),
1219 : weight_calculated(false) { }
1220 :
1221 : BM25Weight * clone() const;
1222 : ~BM25Weight() { }
1223 : std::string name() const;
1224 : std::string serialise() const;
1225 : BM25Weight * unserialise(const std::string & s) const;
1226 : Xapian::weight get_sumpart(Xapian::termcount wdf, Xapian::doclength len) const;
1227 : Xapian::weight get_maxpart() const;
1228 :
1229 : Xapian::weight get_sumextra(Xapian::doclength len) const;
1230 : Xapian::weight get_maxextra() const;
1231 :
1232 : bool get_sumpart_needs_doclength() const;
1233 : };
1234 :
1235 : /** Traditional probabilistic weighting scheme.
1236 : *
1237 : * This class implements the Traditional Probabilistic Weighting scheme, as
1238 : * described by the early papers on Probabilistic Retrieval. BM25 generally
1239 : * gives better results.
1240 : *
1241 : * The Traditional weighting scheme formula is \f[
1242 : * \sum_{t}\frac{f_{t,d}}{k.L_{d}+f_{t,d}}.w_{t}
1243 : * \f] where
1244 : * - \f$w_{t}\f$ is the termweight of term t
1245 : * - \f$f_{t,d}\f$ is the within document frequency of term t in document d
1246 : * - \f$L_{d}\f$ is the normalised length of document d
1247 : * - \f$k\f$ is a user specifiable parameter
1248 : *
1249 : * TradWeight(k) is equivalent to BM25Weight(k, 0, 0, 1, 0), except that
1250 : * the latter returns weights (k+1) times larger.
1251 : */
1252 : class XAPIAN_VISIBILITY_DEFAULT TradWeight : public Weight {
1253 : private:
1254 : mutable Xapian::weight termweight; // mutable: may be updated from const methods.
1255 : mutable Xapian::doclength lenpart; // mutable: may be updated from const methods.
1256 :
1257 : double param_k;
1258 :
1259 : mutable bool weight_calculated; // Starts false; presumably set once calc_termweight() has run (confirm).
1260 :
1261 : void calc_termweight() const;
1262 :
1263 : public:
1264 : /** Construct a TradWeight
1265 : *
1266 : * @param k parameter governing the importance of within
1267 : * document frequency and document length - any non-negative
1268 : * number (0 meaning to ignore wdf and doc length when
1269 : * calculating weights); negative values are treated as 0. Default is 1.
1270 : */
1271 : explicit TradWeight(double k) : param_k(k), weight_calculated(false) {
1272 : if (param_k < 0) param_k = 0;
1273 : }
1274 :
1275 : TradWeight() : param_k(1.0), weight_calculated(false) { }
1276 :
1277 : TradWeight * clone() const;
1278 : ~TradWeight() { }
1279 : std::string name() const;
1280 : std::string serialise() const;
1281 : TradWeight * unserialise(const std::string & s) const;
1282 :
1283 : Xapian::weight get_sumpart(Xapian::termcount wdf, Xapian::doclength len) const;
1284 : Xapian::weight get_maxpart() const;
1285 :
1286 : Xapian::weight get_sumextra(Xapian::doclength len) const;
1287 : Xapian::weight get_maxextra() const;
1288 :
1289 : bool get_sumpart_needs_doclength() const;
1290 : };
1291 :
1292 : }
1293 :
1294 : #endif /* XAPIAN_INCLUDED_ENQUIRE_H */
|