17 #ifndef __D_T_TEMPLATED_VOCABULARY__ 18 #define __D_T_TEMPLATED_VOCABULARY__ 27 #include <opencv2/core/core.hpp> 34 #include "../DUtils/Random.h" 42 template<
class TDescriptor,
class F>
96 (
const std::vector<std::vector<TDescriptor> > &training_features);
106 (
const std::vector<std::vector<TDescriptor> > &training_features,
115 (
const std::vector<std::vector<TDescriptor> > &training_features,
122 virtual inline unsigned int size()
const;
128 virtual inline bool empty()
const;
135 virtual void transform(
const std::vector<TDescriptor>& features,
BowVector &v)
145 virtual void transform(
const std::vector<TDescriptor>& features,
153 virtual WordId transform(
const TDescriptor& feature)
const;
171 virtual NodeId getParentNode(
WordId wid,
int levelsup)
const;
179 void getWordsFromNode(
NodeId nid, std::vector<WordId> &words)
const;
197 float getEffectiveLevels()
const;
204 virtual inline TDescriptor getWord(
WordId wid)
const;
241 bool loadFromTextFile(
const std::string &filename);
247 void saveToTextFile(
const std::string &filename)
const;
253 void save(
const std::string &filename)
const;
259 void load(
const std::string &filename);
265 virtual void save(cv::FileStorage &fs,
266 const std::string &name =
"vocabulary")
const;
274 virtual void load(
const cv::FileStorage &fs,
275 const std::string &name =
"vocabulary");
289 virtual int stopWords(
double minWeight);
316 Node(): id(0), weight(0), parent(0), word_id(0){}
322 Node(
NodeId _id): id(_id), weight(0), parent(0), word_id(0){}
328 inline bool isLeaf()
const {
return children.empty(); }
336 void createScoringObject();
344 const vector<vector<TDescriptor> > &training_features,
345 vector<pDescriptor> &features)
const;
355 virtual void transform(
const TDescriptor &feature,
363 virtual void transform(
const TDescriptor &feature,
WordId &
id)
const;
372 void HKmeansStep(
NodeId parent_id,
const vector<pDescriptor> &descriptors,
380 virtual void initiateClusters(
const vector<pDescriptor> &descriptors,
381 vector<TDescriptor> &clusters)
const;
389 void initiateClustersKMpp(
const vector<pDescriptor> &descriptors,
390 vector<TDescriptor> &clusters)
const;
403 void setNodeWeights(
const vector<vector<TDescriptor> > &features);
433 template<
class TDescriptor,
class F>
436 : m_k(k), m_L(L), m_weighting(weighting), m_scoring(scoring),
437 m_scoring_object(NULL)
439 createScoringObject();
444 template<
class TDescriptor,
class F>
446 (
const std::string &filename): m_scoring_object(NULL)
453 template<
class TDescriptor,
class F>
455 (
const char *filename): m_scoring_object(NULL)
462 template<
class TDescriptor,
class F>
465 delete m_scoring_object;
466 m_scoring_object = NULL;
471 m_scoring_object =
new L1Scoring;
475 m_scoring_object =
new L2Scoring;
479 m_scoring_object =
new ChiSquareScoring;
483 m_scoring_object =
new KLScoring;
487 m_scoring_object =
new BhattacharyyaScoring;
491 m_scoring_object =
new DotProductScoring;
499 template<
class TDescriptor,
class F>
503 createScoringObject();
508 template<
class TDescriptor,
class F>
511 this->m_weighting = type;
516 template<
class TDescriptor,
class F>
519 : m_scoring_object(NULL)
526 template<
class TDescriptor,
class F>
534 template<
class TDescriptor,
class F>
557 template<
class TDescriptor,
class F>
559 const std::vector<std::vector<TDescriptor> > &training_features)
566 (int)((pow((
double)
m_k, (
double)
m_L + 1) - 1)/(m_k - 1));
568 m_nodes.reserve(expected_nodes);
571 vector<pDescriptor> features;
591 template<
class TDescriptor,
class F>
593 const std::vector<std::vector<TDescriptor> > &training_features,
599 create(training_features);
604 template<
class TDescriptor,
class F>
606 const std::vector<std::vector<TDescriptor> > &training_features,
615 create(training_features);
620 template<
class TDescriptor,
class F>
622 const vector<vector<TDescriptor> > &training_features,
623 vector<pDescriptor> &features)
const 627 typename vector<vector<TDescriptor> >::const_iterator vvit;
628 typename vector<TDescriptor>::const_iterator vit;
629 for(vvit = training_features.begin(); vvit != training_features.end(); ++vvit)
631 features.reserve(features.size() + vvit->size());
632 for(vit = vvit->begin(); vit != vvit->end(); ++vit)
634 features.push_back(&(*vit));
641 template<
class TDescriptor,
class F>
643 const vector<pDescriptor> &descriptors,
int current_level)
645 if(descriptors.empty())
return;
648 vector<TDescriptor> clusters;
649 vector<vector<unsigned int> > groups;
652 clusters.reserve(
m_k);
660 if((
int)descriptors.size() <=
m_k)
663 groups.resize(descriptors.size());
665 for(
unsigned int i = 0; i < descriptors.size(); i++)
667 groups[i].push_back(i);
668 clusters.push_back(*descriptors[i]);
675 bool first_time =
true;
679 vector<int> last_association, current_association;
694 for(
unsigned int c = 0; c < clusters.size(); ++c)
696 vector<pDescriptor> cluster_descriptors;
697 cluster_descriptors.reserve(groups[c].
size());
709 vector<unsigned int>::const_iterator vit;
710 for(vit = groups[c].begin(); vit != groups[c].end(); ++vit)
712 cluster_descriptors.push_back(descriptors[*vit]);
716 F::meanValue(cluster_descriptors, clusters[c]);
725 groups.resize(clusters.size(), vector<unsigned int>());
726 current_association.resize(descriptors.size());
730 typename vector<pDescriptor>::const_iterator fit;
732 for(fit = descriptors.begin(); fit != descriptors.end(); ++fit)
734 double best_dist = F::distance(*(*fit), clusters[0]);
735 unsigned int icluster = 0;
737 for(
unsigned int c = 1; c < clusters.size(); ++c)
739 double dist = F::distance(*(*fit), clusters[c]);
749 groups[icluster].push_back(fit - descriptors.begin());
750 current_association[ fit - descriptors.begin() ] = icluster;
765 for(
unsigned int i = 0; i < current_association.size(); i++)
767 if(current_association[i] != last_association[i]){
777 last_association = current_association;
786 for(
unsigned int i = 0; i < clusters.size(); ++i)
790 m_nodes.back().descriptor = clusters[i];
791 m_nodes.back().parent = parent_id;
792 m_nodes[parent_id].children.push_back(
id);
796 if(current_level <
m_L)
799 const vector<NodeId> &children_ids =
m_nodes[parent_id].children;
800 for(
unsigned int i = 0; i < clusters.size(); ++i)
802 NodeId id = children_ids[i];
804 vector<pDescriptor> child_features;
805 child_features.reserve(groups[i].
size());
807 vector<unsigned int>::const_iterator vit;
808 for(vit = groups[i].begin(); vit != groups[i].end(); ++vit)
810 child_features.push_back(descriptors[*vit]);
813 if(child_features.size() > 1)
815 HKmeansStep(
id, child_features, current_level + 1);
823 template<
class TDescriptor,
class F>
825 (
const vector<pDescriptor> &descriptors, vector<TDescriptor> &clusters)
const 832 template<
class TDescriptor,
class F>
834 const vector<pDescriptor> &pfeatures, vector<TDescriptor> &clusters)
const 850 clusters.reserve(
m_k);
851 vector<double> min_dists(pfeatures.size(), std::numeric_limits<double>::max());
858 clusters.push_back(*pfeatures[ifeature]);
861 typename vector<pDescriptor>::const_iterator fit;
862 vector<double>::iterator dit;
863 dit = min_dists.begin();
864 for(fit = pfeatures.begin(); fit != pfeatures.end(); ++fit, ++dit)
866 *dit = F::distance(*(*fit), clusters.back());
869 while((
int)clusters.size() <
m_k)
872 dit = min_dists.begin();
873 for(fit = pfeatures.begin(); fit != pfeatures.end(); ++fit, ++dit)
877 double dist = F::distance(*(*fit), clusters.back());
878 if(dist < *dit) *dit = dist;
883 double dist_sum = std::accumulate(min_dists.begin(), min_dists.end(), 0.0);
890 cut_d = DUtils::Random::RandomValue<double>(0, dist_sum);
891 }
while(cut_d == 0.0);
894 for(dit = min_dists.begin(); dit != min_dists.end(); ++dit)
897 if(d_up_now >= cut_d)
break;
900 if(dit == min_dists.end())
901 ifeature = pfeatures.size()-1;
903 ifeature = dit - min_dists.begin();
905 clusters.push_back(*pfeatures[ifeature]);
917 template<
class TDescriptor,
class F>
926 typename vector<Node>::iterator nit;
929 for(++nit; nit !=
m_nodes.end(); ++nit)
942 template<
class TDescriptor,
class F>
944 (
const vector<vector<TDescriptor> > &training_features)
946 const unsigned int NWords =
m_words.size();
947 const unsigned int NDocs = training_features.size();
952 for(
unsigned int i = 0; i < NWords; i++)
962 vector<unsigned int> Ni(NWords, 0);
963 vector<bool> counted(NWords,
false);
965 typename vector<vector<TDescriptor> >::const_iterator mit;
966 typename vector<TDescriptor>::const_iterator fit;
968 for(mit = training_features.begin(); mit != training_features.end(); ++mit)
970 fill(counted.begin(), counted.end(),
false);
972 for(fit = mit->begin(); fit < mit->end(); ++fit)
977 if(!counted[word_id])
980 counted[word_id] =
true;
986 for(
unsigned int i = 0; i < NWords; i++)
990 m_words[i]->weight = log((
double)NDocs / (
double)Ni[i]);
1000 template<
class TDescriptor,
class F>
1008 template<
class TDescriptor,
class F>
1016 template<
class TDescriptor,
class F>
1020 typename std::vector<Node*>::const_iterator wit;
1023 const Node *p = *wit;
1028 return (
float)((double)sum / (
double)
m_words.size());
1033 template<
class TDescriptor,
class F>
1036 return m_words[wid]->descriptor;
1041 template<
class TDescriptor,
class F>
1049 template<
class TDescriptor,
class F>
1051 (
const TDescriptor& feature)
const 1065 template<
class TDescriptor,
class F>
1067 const std::vector<TDescriptor>& features,
BowVector &v)
const 1080 typename vector<TDescriptor>::const_iterator fit;
1084 for(fit = features.begin(); fit < features.end(); ++fit)
1096 if(!v.empty() && !must)
1099 const double nd = v.size();
1100 for(BowVector::iterator vit = v.begin(); vit != v.end(); vit++)
1107 for(fit = features.begin(); fit < features.end(); ++fit)
1126 template<
class TDescriptor,
class F>
1128 const std::vector<TDescriptor>& features,
1143 typename vector<TDescriptor>::const_iterator fit;
1147 unsigned int i_feature = 0;
1148 for(fit = features.begin(); fit < features.end(); ++fit, ++i_feature)
1164 if(!v.empty() && !must)
1167 const double nd = v.size();
1168 for(BowVector::iterator vit = v.begin(); vit != v.end(); vit++)
1175 unsigned int i_feature = 0;
1176 for(fit = features.begin(); fit < features.end(); ++fit, ++i_feature)
1198 template<
class TDescriptor,
class F>
1207 template<
class TDescriptor,
class F>
1209 (
const TDescriptor &feature,
WordId &
id)
const 1217 template<
class TDescriptor,
class F>
1222 vector<NodeId> nodes;
1223 typename vector<NodeId>::const_iterator nit;
1226 const int nid_level =
m_L - levelsup;
1227 if(nid_level <= 0 && nid != NULL) *nid = 0;
1230 int current_level = 0;
1235 nodes =
m_nodes[final_id].children;
1236 final_id = nodes[0];
1238 double best_d = F::distance(feature,
m_nodes[final_id].descriptor);
1240 for(nit = nodes.begin() + 1; nit != nodes.end(); ++nit)
1243 double d = F::distance(feature,
m_nodes[
id].descriptor);
1251 if(nid != NULL && current_level == nid_level)
1254 }
while( !
m_nodes[final_id].isLeaf() );
1257 word_id =
m_nodes[final_id].word_id;
1258 weight =
m_nodes[final_id].weight;
1263 template<
class TDescriptor,
class F>
1268 while(levelsup > 0 && ret != 0)
1278 template<
class TDescriptor,
class F>
1280 (
NodeId nid, std::vector<WordId> &words)
const 1286 words.push_back(
m_nodes[nid].word_id);
1292 vector<NodeId> parents;
1293 parents.push_back(nid);
1295 while(!parents.empty())
1297 NodeId parentid = parents.back();
1300 const vector<NodeId> &child_ids =
m_nodes[parentid].children;
1301 vector<NodeId>::const_iterator cit;
1303 for(cit = child_ids.begin(); cit != child_ids.end(); ++cit)
1308 words.push_back(child_node.
word_id);
1310 parents.push_back(*cit);
1319 template<
class TDescriptor,
class F>
1323 typename vector<Node*>::iterator wit;
1326 if((*wit)->weight < minWeight)
1337 template<
class TDescriptor,
class F>
1341 f.open(filename.c_str());
1359 if(m_k<0 || m_k>20 || m_L<1 || m_L>10 || n1<0 || n1>5 || n2<0 || n2>3)
1361 std::cerr <<
"Vocabulary loading failure: This is not a correct text file!" << endl;
1370 int expected_nodes =
1371 (int)((pow((
double)m_k, (
double)m_L + 1) - 1)/(m_k - 1));
1372 m_nodes.reserve(expected_nodes);
1374 m_words.reserve(pow((
double)m_k, (
double)m_L + 1));
1382 stringstream ssnode;
1392 m_nodes[pid].children.push_back(nid);
1398 for(
int iD=0;iD<F::L;iD++)
1402 ssd << sElement <<
" ";
1404 F::fromString(
m_nodes[nid].descriptor, ssd.str());
1406 ssnode >>
m_nodes[nid].weight;
1418 m_nodes[nid].children.reserve(m_k);
1428 template<
class TDescriptor,
class F>
1432 f.open(filename.c_str(),ios_base::out);
1435 for(
size_t i=1; i<
m_nodes.size();i++)
1453 template<
class TDescriptor,
class F>
1456 cv::FileStorage fs(filename.c_str(), cv::FileStorage::WRITE);
1457 if(!fs.isOpened())
throw string(
"Could not open file ") + filename;
1464 template<
class TDescriptor,
class F>
1467 cv::FileStorage fs(filename.c_str(), cv::FileStorage::READ);
1468 if(!fs.isOpened())
throw string(
"Could not open file ") + filename;
1475 template<
class TDescriptor,
class F>
1477 const std::string &name)
const 1515 f <<
"nodes" <<
"[";
1516 vector<NodeId> parents, children;
1517 vector<NodeId>::const_iterator pit;
1519 parents.push_back(0);
1521 while(!parents.empty())
1523 NodeId pid = parents.back();
1529 for(pit = children.begin(); pit != children.end(); pit++)
1535 f <<
"nodeId" << (int)child.
id;
1536 f <<
"parentId" << (
int)pid;
1537 f <<
"weight" << (double)child.
weight;
1538 f <<
"descriptor" << F::toString(child.
descriptor);
1544 parents.push_back(*pit);
1552 f <<
"words" <<
"[";
1554 typename vector<Node*>::const_iterator wit;
1559 f <<
"wordId" << (int)
id;
1560 f <<
"nodeId" << (int)(*wit)->id;
1572 template<
class TDescriptor,
class F>
1574 const std::string &name)
1579 cv::FileNode fvoc = fs[name];
1581 m_k = (int)fvoc[
"k"];
1582 m_L = (int)fvoc[
"L"];
1589 cv::FileNode fn = fvoc[
"nodes"];
1591 m_nodes.resize(fn.size() + 1);
1594 for(
unsigned int i = 0; i < fn.size(); ++i)
1596 NodeId nid = (int)fn[i][
"nodeId"];
1597 NodeId pid = (int)fn[i][
"parentId"];
1599 string d = (string)fn[i][
"descriptor"];
1604 m_nodes[pid].children.push_back(nid);
1606 F::fromString(
m_nodes[nid].descriptor, d);
1614 for(
unsigned int i = 0; i < fn.size(); ++i)
1616 NodeId wid = (int)fn[i][
"wordId"];
1617 NodeId nid = (int)fn[i][
"nodeId"];
1631 template<
class TDescriptor,
class F>
1637 <<
", Weighting = ";
1641 case TF_IDF: os <<
"tf-idf";
break;
1642 case TF: os <<
"tf";
break;
1643 case IDF: os <<
"idf";
break;
1644 case BINARY: os <<
"binary";
break;
1647 os <<
", Scoring = ";
1650 case L1_NORM: os <<
"L1-norm";
break;
1651 case L2_NORM: os <<
"L2-norm";
break;
1652 case CHI_SQUARE: os <<
"Chi square distance";
break;
1653 case KL: os <<
"KL-divergence";
break;
1654 case BHATTACHARYYA: os <<
"Bhattacharyya coefficient";
break;
1658 os <<
", Number of words = " << voc.
size();
double score(const BowVector &a, const BowVector &b) const
virtual void initiateClusters(const vector< pDescriptor > &descriptors, vector< TDescriptor > &clusters) const
std::vector< Node > m_nodes
Tree nodes.
ScoringType getScoringType() const
void getFeatures(const vector< vector< TDescriptor > > &training_features, vector< pDescriptor > &features) const
WeightingType
Weighting type.
virtual TDescriptor getWord(WordId wid) const
WeightingType m_weighting
Weighting method.
static void SeedRandOnce()
void load(const std::string &filename)
TDescriptor descriptor
Node descriptor.
float getEffectiveLevels() const
WordValue weight
Weight if the node is a word.
int getDepthLevels() const
void saveToTextFile(const std::string &filename) const
void getWordsFromNode(NodeId nid, std::vector< WordId > &words) const
virtual NodeId getParentNode(WordId wid, int levelsup) const
void addWeight(WordId id, WordValue v)
void initiateClustersKMpp(const vector< pDescriptor > &descriptors, vector< TDescriptor > &clusters) const
GeneralScoring * m_scoring_object
Object for computing scores.
virtual double score(const BowVector &v, const BowVector &w) const =0
void HKmeansStep(NodeId parent_id, const vector< pDescriptor > &descriptors, int current_level)
virtual bool mustNormalize(LNorm &norm) const =0
void normalize(LNorm norm_type)
vector< NodeId > children
Children.
virtual void transform(const std::vector< TDescriptor > &features, BowVector &v) const
Base class of scoring functions.
NodeId parent
Parent node (undefined in case of root)
virtual unsigned int size() const
WeightingType getWeightingType() const
ScoringType m_scoring
Scoring method.
LNorm
L-norms for normalization.
std::vector< Node * > m_words
bool loadFromTextFile(const std::string &filename)
Vector of words to represent images.
int getBranchingFactor() const
double WordValue
Value of a word.
Vector of nodes with indexes of local features.
void createScoringObject()
static int RandomInt(int min, int max)
virtual bool empty() const
unsigned int WordId
Id of words.
std::ostream & operator<<(std::ostream &out, const BowVector &v)
TFSIMD_FORCE_INLINE const tfScalar & w() const
void addIfNotExist(WordId id, WordValue v)
const TDescriptor * pDescriptor
Pointer to descriptor.
void save(const std::string &filename) const
WordId word_id
Word id if the node is a word.
virtual int stopWords(double minWeight)
virtual void create(const std::vector< std::vector< TDescriptor > > &training_features)
virtual WordValue getWordWeight(WordId wid) const
virtual ~TemplatedVocabulary()
void setNodeWeights(const vector< vector< TDescriptor > > &features)
void addFeature(NodeId id, unsigned int i_feature)
unsigned int NodeId
Id of nodes in the vocabulary tree.