Go to the documentation of this file.00001 #ifndef VOCABULARY_TREE_DATABASE_H
00002 #define VOCABULARY_TREE_DATABASE_H
00003
00005 #include <vocabulary_tree/vocabulary_tree.h>
00006 #include <map>
00007
00008 namespace vt {
00009
/// Identifier type for a document; `Database::insert()` returns one of these.
typedef uint32_t DocId;

/**
 * \brief One query result: a document ID paired with its score.
 *
 * NOTE(review): `Database` computes scores through a helper named
 * `sparseDistance`, so a lower score presumably means a closer match --
 * confirm against the implementation.
 */
struct Match
{
  DocId id;     ///< ID of the matched document.
  float score;  ///< Score attached to this match.

  /// Default constructor. Zero-initializes both members so that a
  /// default-constructed Match (e.g. an element created by
  /// `std::vector<Match>(n)`) never holds indeterminate values.
  Match() : id(0), score(0.0f) {}

  /// Construct a match from a document ID and its score.
  Match(DocId _id, float _score) : id(_id), score(_score) {}

  /// Strict ordering by ascending score, which makes Match usable directly
  /// with std::sort and the other ordering algorithms.
  bool operator<(const Match& other) const
  {
    return score < other.score;
  }
};
00031
00032
00033 typedef std::vector<Word> Document;
00034 typedef std::vector<Match> Matches;
00035
00040 class Database
00041 {
00042 public:
00049 Database(uint32_t num_words = 0);
00050
00057 DocId insert(const std::vector<Word>& document);
00058
00066 void find(const std::vector<Word>& document, size_t N, std::vector<Match>& matches) const;
00067
00077 DocId findAndInsert(const std::vector<Word>& document, size_t N, std::vector<Match>& matches);
00078
00085 void computeTfIdfWeights(float default_weight = 1.0f);
00086
00088 void saveWeights(const std::string& file) const;
00090 void loadWeights(const std::string& file);
00091
00092
00093
00094
00095
00096 private:
00097 struct WordFrequency
00098 {
00099 DocId id;
00100 uint32_t count;
00101
00102 WordFrequency(DocId _id, uint32_t _count) : id(_id), count(_count) {}
00103 };
00104
00105
00106 typedef std::vector<WordFrequency> InvertedFile;
00107
00109
00110 typedef std::map<Word, float> DocumentVector;
00111
00112 std::vector<InvertedFile> word_files_;
00113 std::vector<float> word_weights_;
00114 std::vector<DocumentVector> database_vectors_;
00115
00116 void computeVector(const std::vector<Word>& document, DocumentVector& v) const;
00117
00118 static void normalize(DocumentVector& v);
00119 static float sparseDistance(const DocumentVector& v1, const DocumentVector& v2);
00120 };
00121
00122 }
00123
00124 #endif