9#include <boost/tokenizer.hpp>
26 const Xapian::Database& iDatabase)
27 : _resultHolder (NULL), _database (iDatabase),
28 _queryString (iQueryString), _hasFullTextMatched (false),
29 _bestDocData (RawDataString_T (
"")) {
42 std::string Result::describeShortKey()
const {
43 std::ostringstream oStr;
49 std::string Result::describeKey()
const {
50 std::ostringstream oStr;
51 oStr <<
"'" << describeShortKey() <<
"' ";
52 if (_correctedQueryString.empty() ==
false
53 && _correctedQueryString != _queryString) {
54 oStr <<
"(corrected into '" << _correctedQueryString
55 <<
"' with an edit distance/error of " << _editDistance
56 <<
" over an allowable distance of " << _allowableEditDistance
65 std::string Result::toString()
const {
66 std::ostringstream oStr;
67 oStr << describeKey();
69 if (_documentList.empty() ==
true) {
70 oStr <<
"No match" << std::endl;
73 assert (_hasFullTextMatched ==
true);
75 unsigned short idx = 0;
76 for (DocumentList_T::const_iterator itDoc = _documentList.begin();
77 itDoc != _documentList.end(); ++itDoc, ++idx) {
80 const Xapian::Document& lXapianDoc = lDocumentPair.first;
81 const Xapian::docid& lDocID = lXapianDoc.get_docid();
83 const ScoreBoard& lScoreBoard = lDocumentPair.second;
88 oStr <<
"Doc ID: " << lDocID <<
", matching with ("
89 << lScoreBoard.
describe() <<
"), containing: '"
90 << lXapianDoc.get_data() <<
"'";
97 void Result::toStream (std::ostream& ioOut)
const {
102 void Result::fromStream (std::istream& ioIn) {
107 getDocumentPair (
const Xapian::docid& iDocID)
const {
110 DocumentMap_T::const_iterator itDoc = _documentMap.find (iDocID);
112 if (itDoc == _documentMap.end()) {
114 <<
") can not be found in the Result object "
117 assert (itDoc != _documentMap.end());
123 return oDocumentPair;
127 const Xapian::Document& Result::
128 getDocument (
const Xapian::docid& iDocID)
const {
134 const Xapian::Document& oXapianDocument = lDocumentPair.first;
137 return oXapianDocument;
141 void Result::addDocument (
const Xapian::Document& iDocument,
151 Score_T lCorrectedScore = iScore;
152 if (_editDistance > 0) {
153 lCorrectedScore = iScore / (_editDistance * _editDistance * _editDistance);
157 const ScoreType lXapianScoreType (ScoreType::XAPIAN_PCT);
161 lXapianScoreType, lCorrectedScore);
164 const Xapian::docid& lDocID = iDocument.get_docid();
180 _documentList.push_back (lDocumentPair);
183 const bool hasInsertBeenSuccessful =
184 _documentMap.insert (DocumentMap_T::value_type (lDocID,
185 lDocumentPair)).second;
187 if (hasInsertBeenSuccessful ==
false) {
188 std::ostringstream errorStr;
189 errorStr <<
"Error while inserting the Xapian Document pair into "
190 <<
"the internal STL map";
193 assert (hasInsertBeenSuccessful ==
true);
197 void Result::fillResult (
const Xapian::MSet& iMatchingSet) {
202 for (Xapian::MSetIterator itDoc = iMatchingSet.begin();
203 itDoc != iMatchingSet.end(); ++itDoc) {
204 const int& lXapianPercentage = itDoc.get_percent();
205 const Xapian::Document& lDocument = itDoc.get_document();
206 addDocument (lDocument, lXapianPercentage);
211 void Result::fillPlace (
Place& ioPlace)
const {
232 <<
", " << _bestCombinedWeight <<
"% [" << _bestDocData
254 return oEditDistance;
272 Location Result::retrieveLocation (
const Xapian::Document& iDocument) {
274 const std::string& lDocumentDataStr = iDocument.get_data();
278 const Location& oLocation = retrieveLocation (lDocumentData);
284 LocationKey Result::getPrimaryKey (
const Xapian::Document& iDocument) {
286 const Location& lLocation = retrieveLocation (iDocument);
295 Score_T Result::getEnvelopeID (
const Xapian::Document& iDocument) {
297 const Location& lLocation = retrieveLocation (iDocument);
303 const Score_T oEnvelopeID =
static_cast<const Score_T> (lEnvelopeIDInt);
309 PageRank_T Result::getPageRank (
const Xapian::Document& iDocument) {
311 const Location& lLocation = retrieveLocation (iDocument);
320 std::string Result::fullTextMatch (
const Xapian::Database& iDatabase,
322 Xapian::MSet& ioMatchingSet) {
323 std::string oMatchedString;
329 Xapian::QueryParser lQueryParser;
330 lQueryParser.set_database (iDatabase);
338 lQueryParser.set_default_op (Xapian::Query::OP_PHRASE);
350 Xapian::Enquire enquire (iDatabase);
358 const Xapian::Query& lXapianQuery =
359 lQueryParser.parse_query (iQueryString,
360 Xapian::QueryParser::FLAG_BOOLEAN
361 | Xapian::QueryParser::FLAG_PHRASE
362 | Xapian::QueryParser::FLAG_LOVEHATE);
365 enquire.set_query (lXapianQuery);
369 ioMatchingSet = enquire.get_mset (0, K_DEFAULT_XAPIAN_MATCHING_SET_SIZE);
372 int nbMatches = ioMatchingSet.size();
376 <<
"', i.e.: `" << lXapianQuery.get_description()
377 <<
"' => " << nbMatches <<
" result(s) found");
379 if (nbMatches != 0) {
382 setEditDistance (lEditDistance);
385 setAllowableEditDistance (lEditDistance);
388 oMatchedString = iQueryString;
391 setHasFullTextMatched (
true);
395 setCorrectedQueryString (oMatchedString);
399 <<
"' provides " << nbMatches <<
" exact matches.");
401 return oMatchedString;
403 assert (ioMatchingSet.empty() ==
true);
410 const NbOfErrors_T& lAllowableEditDistance =
411 calculateEditDistance (iQueryString);
414 const std::string& lCorrectedString =
415 iDatabase.get_spelling_suggestion (iQueryString, lAllowableEditDistance);
419 if (lCorrectedString.empty() ==
true || lCorrectedString == iQueryString) {
422 << iQueryString <<
"' provides no match, "
423 <<
"and there is no spelling suggestion, "
424 <<
"even with an edit distance of "
425 << lAllowableEditDistance);
428 setHasFullTextMatched (
false);
431 return oMatchedString;
433 assert (lCorrectedString.empty() ==
false
434 && lCorrectedString != iQueryString);
437 const NbOfErrors_T& lEditDistance =
438 Levenshtein::getDistance (iQueryString, lCorrectedString);
447 const Xapian::Query& lCorrectedXapianQuery =
448 lQueryParser.parse_query (lCorrectedString,
449 Xapian::QueryParser::FLAG_BOOLEAN
450 | Xapian::QueryParser::FLAG_PHRASE
451 | Xapian::QueryParser::FLAG_LOVEHATE);
455 enquire.set_query (lCorrectedXapianQuery);
456 ioMatchingSet = enquire.get_mset (0, K_DEFAULT_XAPIAN_MATCHING_SET_SIZE);
459 nbMatches = ioMatchingSet.size();
464 << lCorrectedXapianQuery.get_description()
465 <<
"' => " << nbMatches <<
" result(s) found");
467 if (nbMatches != 0) {
469 setEditDistance (lEditDistance);
472 setAllowableEditDistance (lAllowableEditDistance);
475 oMatchedString = lCorrectedString;
478 setHasFullTextMatched (
true);
481 setCorrectedQueryString (oMatchedString);
485 << iQueryString <<
"', spelling suggestion: `"
487 <<
"', with a Levenshtein edit distance of "
489 <<
" over an allowable edit distance of "
490 << lAllowableEditDistance <<
", provides "
491 << nbMatches <<
" matches.");
494 return oMatchedString;
499 << iQueryString <<
"', spelling suggestion: `"
501 <<
"', with a Levenshtein edit distance of "
503 <<
" over an allowable edit distance of "
504 << lAllowableEditDistance <<
", provides no match, "
505 <<
"which is not consistent with the existence of "
506 <<
"the spelling correction.");
509 }
catch (
const Xapian::Error& error) {
511 throw XapianException (error.get_msg());
515 setHasFullTextMatched (
false);
517 return oMatchedString;
521 std::string Result::fullTextMatch (
const Xapian::Database& iDatabase,
523 std::string oMatchedString;
533 const bool isToBeAdded = Filter::shouldKeep (
"", iQueryString);
536 Xapian::MSet lMatchingSet;
537 if (isToBeAdded ==
true) {
538 oMatchedString = fullTextMatch (iDatabase, iQueryString, lMatchingSet);
542 fillResult (lMatchingSet);
545 if (isToBeAdded ==
false) {
548 <<
"' is not made of searchable words");
553 }
catch (
const Xapian::Error& error) {
558 return oMatchedString;
562 void Result::displayXapianPercentages()
const {
564 for (DocumentList_T::const_iterator itDoc = _documentList.begin();
565 itDoc != _documentList.end(); ++itDoc) {
569 const Xapian::Document& lXapianDoc = lDocumentPair.first;
572 const Xapian::docid& lDocID = lXapianDoc.get_docid();
575 const LocationKey& lLocationKey = getPrimaryKey (lXapianDoc);
578 const ScoreBoard& lScoreBoard = lDocumentPair.second;
581 const Score_T& lXapianPct = lScoreBoard.
getScore (ScoreType::XAPIAN_PCT);
585 <<
"' with (" << lLocationKey <<
", doc ID = "
586 << lDocID <<
") matches at " << lXapianPct
592 void Result::setScoreOnDocMap (
const Xapian::docid& iDocID,
596 DocumentMap_T::iterator itDoc = _documentMap.find (iDocID);
598 if (itDoc == _documentMap.end()) {
600 <<
") can not be found in the Result object "
603 assert (itDoc != _documentMap.end());
607 ScoreBoard& lScoreBoard = lXapianDocPair.second;
610 lScoreBoard.
setScore (iType, iScore);
614 void Result::calculateEnvelopeWeights() {
616 for (DocumentList_T::iterator itDoc = _documentList.begin();
617 itDoc != _documentList.end(); ++itDoc) {
621 const Xapian::Document& lXapianDoc = lDocumentPair.first;
624 const Xapian::docid& lDocID = lXapianDoc.get_docid();
627 const LocationKey& lLocationKey = getPrimaryKey (lXapianDoc);
630 const EnvelopeID_T& lEnvelopeIDInt = getEnvelopeID (lXapianDoc);
633 if (lEnvelopeIDInt != 0) {
635 <<
"] (" << lLocationKey <<
", doc ID = "
636 << lDocID <<
") has a non-null envelope ID ("
637 << lEnvelopeIDInt <<
") => match of 0.10%");
641 const Score_T lEnvelopeID =
static_cast<const Score_T> (lEnvelopeIDInt);
644 ScoreBoard& lScoreBoard = lDocumentPair.second;
647 lScoreBoard.
setScore (ScoreType::ENV_ID, lEnvelopeID);
648 setScoreOnDocMap (lDocID, ScoreType::ENV_ID, lEnvelopeID);
653 void Result::calculateCodeMatches() {
655 for (DocumentList_T::iterator itDoc = _documentList.begin();
656 itDoc != _documentList.end(); ++itDoc) {
660 const Xapian::Document& lXapianDoc = lDocumentPair.first;
663 const Xapian::docid& lDocID = lXapianDoc.get_docid();
666 const LocationKey& lLocationKey = getPrimaryKey (lXapianDoc);
670 bool hasCodeFullyMatched =
false;
675 std::string lFilteredString (_queryString);
677 Filter::trim (lFilteredString, kMinWordLength);
682 WordHolder::tokeniseStringIntoWordList (lFilteredString,
683 lFilteredQueryWordList);
684 const NbOfWords_T nbOfFilteredQueryWords = lFilteredQueryWordList.size();
687 if (_hasFullTextMatched ==
true) {
694 const size_t lNbOfLetters = lFilteredString.size();
695 if (nbOfFilteredQueryWords == 1
696 && lNbOfLetters >= 3 && lNbOfLetters <= 4
697 && _correctedQueryString == _queryString) {
700 std::string lUpperQueryWord;
701 lUpperQueryWord.resize (lNbOfLetters);
702 std::transform (lFilteredString.begin(), lFilteredString.end(),
703 lUpperQueryWord.begin(), ::toupper);
710 if (lUpperQueryWord == lIataCode) {
715 hasCodeFullyMatched =
true;
719 if (hasCodeFullyMatched ==
true) {
722 <<
"' matches the IATA/ICAO code ("
723 << lLocationKey <<
", doc ID = "
724 << lDocID <<
") => match of "
729 <<
"' does not match with the IATA/ICAO "
730 <<
"code (" << lLocationKey <<
", doc ID = "
731 << lDocID <<
") => match of "
737 ScoreBoard& lScoreBoard = lDocumentPair.second;
740 lScoreBoard.
setScore (ScoreType::CODE_FULL_MATCH, lCodeMatchPct);
741 setScoreOnDocMap (lDocID, ScoreType::CODE_FULL_MATCH, lCodeMatchPct);
746 void Result::calculatePageRanks() {
748 for (DocumentList_T::iterator itDoc = _documentList.begin();
749 itDoc != _documentList.end(); ++itDoc) {
753 const Xapian::Document& lXapianDoc = lDocumentPair.first;
756 const Xapian::docid& lDocID = lXapianDoc.get_docid();
759 const LocationKey& lLocationKey = getPrimaryKey (lXapianDoc);
762 const Score_T& lPageRank = getPageRank (lXapianDoc);
766 <<
"] (" << lLocationKey <<
", doc ID = "
767 << lDocID <<
") has a PageRank of "
768 << lPageRank <<
"%");
771 ScoreBoard& lScoreBoard = lDocumentPair.second;
774 lScoreBoard.
setScore (ScoreType::PAGE_RANK, lPageRank);
775 setScoreOnDocMap (lDocID, ScoreType::PAGE_RANK, lPageRank);
780 void Result::calculateHeuristicWeights() {
789 void Result::calculateCombinedWeights() {
791 std::string lBestDocData;
794 Xapian::docid lBestDocID = 0;
795 for (DocumentList_T::iterator itDoc = _documentList.begin();
796 itDoc != _documentList.end(); ++itDoc) {
800 const Xapian::Document& lXapianDoc = lDocumentPair.first;
801 const Xapian::docid& lDocID = lXapianDoc.get_docid();
802 const std::string& lDocData = lXapianDoc.get_data();
808 ScoreBoard& lScoreBoard = lDocumentPair.second;
821 if (lPercentage > lMaxPercentage) {
822 lMaxPercentage = lPercentage;
824 lBestDocData = lDocData;
830 WordHolder::tokeniseStringIntoWordList (_queryString,
831 lOriginalQueryWordList);
832 const NbOfWords_T nbOfOriginalQueryWords = lOriginalQueryWordList.size();
835 if (_hasFullTextMatched ==
true) {
839 const Xapian::Document& lXapianDoc = lXapianDocPair.first;
840 const ScoreBoard& lScoreBoard = lXapianDocPair.second;
841 const LocationKey& lLocationKey = getPrimaryKey (lXapianDoc);
845 <<
"' matches at " << lMaxPercentage
846 <<
"% for " << lLocationKey <<
" (doc ID = "
847 << lBestDocID <<
"). Score calculation: "
855 const bool shouldBeKept = Filter::shouldKeep (
"", _queryString);
857 if (nbOfOriginalQueryWords == 1 && shouldBeKept ==
true) {
863 lMaxPercentage = 100.0;
867 <<
"' does not match, but it is a non black-listed "
868 <<
"single-word string; hence, the weight is "
869 << lMaxPercentage <<
"%");
880 lMaxPercentage = std::pow (10.0, -3*nbOfOriginalQueryWords);
884 <<
"' does not match, and is either a multiple-word "
885 <<
"string or black-listed; hence, the weight is "
886 << lMaxPercentage <<
"%");
891 setBestDocID (lBestDocID);
894 setBestCombinedWeight (lMaxPercentage);
897 setBestDocData (lBestDocData);
#define OPENTREP_LOG_ERROR(iToBeLogged)
#define OPENTREP_LOG_DEBUG(iToBeLogged)
#define OPENTREP_LOG_NOTIFICATION(iToBeLogged)
const Location & generateLocation()
Class modelling a place/POR (point of reference).
void setEditDistance(const NbOfErrors_T &iEditDistance)
void setDocID(const XapianDocID_T &iDocID)
void setAllowableEditDistance(const NbOfErrors_T &iAllowableEditDistance)
void setPercentage(const MatchingPercentage_T &iPercentage)
void setOriginalKeywords(const std::string &iOriginalKeywords)
const LocationKey & getKey() const
void setCorrectedKeywords(const std::string &iCorrectedKeywords)
std::list< Word_T > WordList_T
unsigned int NbOfLetters_T
unsigned short NbOfErrors_T
const Percentage_T K_DEFAULT_MODIFIED_MATCHING_PCT
std::string TravelQuery_T
static unsigned int calculateEditDistance(const TravelQuery_T &iPhrase)
Helper function.
unsigned int EnvelopeID_T
const Percentage_T K_DEFAULT_FULL_CODE_MATCH_PCT
std::pair< Xapian::Document, ScoreBoard > XapianDocumentPair_T
const NbOfErrors_T K_DEFAULT_SIZE_FOR_SPELLING_ERROR_UNIT
unsigned short NbOfWords_T
std::string toString(const TokenList_T &iTokenList)
Class modelling the primary key of a location/POR (point of reference).
const IATACode_T & getIataCode() const
Structure modelling a (geographical) location.
const LocationKey & getKey() const
const EnvelopeID_T & getEnvelopeID() const
const PageRank_T & getPageRank() const
Structure holding a board for all the types of score/matching having been performed.
Percentage_T calculateCombinedWeight()
std::string describe() const
void setScore(const ScoreType &, const Score_T &)
Score_T getScore(const ScoreType &) const
Enumeration of score types.