lsh_table.h
Go to the documentation of this file.
00001 /***********************************************************************
00002  * Software License Agreement (BSD License)
00003  *
00004  * Copyright 2008-2009  Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
00005  * Copyright 2008-2009  David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
00006  *
00007  * THE BSD LICENSE
00008  *
00009  * Redistribution and use in source and binary forms, with or without
00010  * modification, are permitted provided that the following conditions
00011  * are met:
00012  *
00013  * 1. Redistributions of source code must retain the above copyright
00014  *    notice, this list of conditions and the following disclaimer.
00015  * 2. Redistributions in binary form must reproduce the above copyright
00016  *    notice, this list of conditions and the following disclaimer in the
00017  *    documentation and/or other materials provided with the distribution.
00018  *
00019  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
00020  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
00021  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
00022  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
00023  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
00024  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
00025  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
00026  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
00027  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
00028  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00029  *************************************************************************/
00030 
00031 /***********************************************************************
00032  * Author: Vincent Rabaud
00033  *************************************************************************/
00034 
00035 #ifndef RTABMAP_FLANN_LSH_TABLE_H_
00036 #define RTABMAP_FLANN_LSH_TABLE_H_
00037 
00038 #include <algorithm>
00039 #include <iostream>
00040 #include <iomanip>
00041 #include <limits.h>
00042 // TODO as soon as we use C++0x, use the code in USE_UNORDERED_MAP
00043 #if RTFLANN_USE_UNORDERED_MAP
00044 #include <unordered_map>
00045 #else
00046 #include <map>
00047 #endif
00048 #include <math.h>
00049 #include <stddef.h>
00050 
00051 #include "rtflann/util/dynamic_bitset.h"
00052 #include "rtflann/util/matrix.h"
00053 
00054 namespace rtflann
00055 {
00056 
00057 namespace lsh
00058 {
00059 
00061 
00064 typedef uint32_t FeatureIndex;
00067 typedef unsigned int BucketKey;
00068 
00071 typedef std::vector<FeatureIndex> Bucket;
00072 
00074 
00077 struct LshStats
00078 {
00079     std::vector<unsigned int> bucket_sizes_;
00080     size_t n_buckets_;
00081     size_t bucket_size_mean_;
00082     size_t bucket_size_median_;
00083     size_t bucket_size_min_;
00084     size_t bucket_size_max_;
00085     size_t bucket_size_std_dev;
00088     std::vector<std::vector<unsigned int> > size_histogram_;
00089 };
00090 
00096 inline std::ostream& operator <<(std::ostream& out, const LshStats& stats)
00097 {
00098     size_t w = 20;
00099     out << "Lsh Table Stats:\n" << std::setw(w) << std::setiosflags(std::ios::right) << "N buckets : "
00100     << stats.n_buckets_ << "\n" << std::setw(w) << std::setiosflags(std::ios::right) << "mean size : "
00101     << std::setiosflags(std::ios::left) << stats.bucket_size_mean_ << "\n" << std::setw(w)
00102     << std::setiosflags(std::ios::right) << "median size : " << stats.bucket_size_median_ << "\n" << std::setw(w)
00103     << std::setiosflags(std::ios::right) << "min size : " << std::setiosflags(std::ios::left)
00104     << stats.bucket_size_min_ << "\n" << std::setw(w) << std::setiosflags(std::ios::right) << "max size : "
00105     << std::setiosflags(std::ios::left) << stats.bucket_size_max_;
00106 
00107     // Display the histogram
00108     out << std::endl << std::setw(w) << std::setiosflags(std::ios::right) << "histogram : "
00109     << std::setiosflags(std::ios::left);
00110     for (std::vector<std::vector<unsigned int> >::const_iterator iterator = stats.size_histogram_.begin(), end =
00111              stats.size_histogram_.end(); iterator != end; ++iterator) out << (*iterator)[0] << "-" << (*iterator)[1] << ": " << (*iterator)[2] << ",  ";
00112 
00113     return out;
00114 }
00115 
00116 
00118 
00124 template<typename ElementType>
00125 class LshTable
00126 {
00127 public:
00130 #if RTFLANN_USE_UNORDERED_MAP
00131     typedef std::unordered_map<BucketKey, Bucket> BucketsSpace;
00132 #else
00133     typedef std::map<BucketKey, Bucket> BucketsSpace;
00134 #endif
00135 
00138     typedef std::vector<Bucket> BucketsSpeed;
00139 
00142     LshTable()
00143     {
00144     }
00145 
00151     LshTable(unsigned int /*feature_size*/, unsigned int /*key_size*/)
00152     {
00153         std::cerr << "LSH is not implemented for that type" << std::endl;
00154         throw;
00155     }
00156 
00161     void add(unsigned int value, const ElementType* feature)
00162     {
00163         // Add the value to the corresponding bucket
00164         BucketKey key = getKey(feature);
00165 
00166         switch (speed_level_) {
00167         case kArray:
00168             // That means we get the buckets from an array
00169             buckets_speed_[key].push_back(value);
00170             break;
00171         case kBitsetHash:
00172             // That means we can check the bitset for the presence of a key
00173             key_bitset_.set(key);
00174             buckets_space_[key].push_back(value);
00175             break;
00176         case kHash:
00177         {
00178             // That means we have to check for the hash table for the presence of a key
00179             buckets_space_[key].push_back(value);
00180             break;
00181         }
00182         }
00183     }
00184 
00188     void add(const std::vector< std::pair<size_t, ElementType*> >& features)
00189     {
00190 #if RTFLANN_USE_UNORDERED_MAP
00191         buckets_space_.rehash((buckets_space_.size() + features.size()) * 1.2);
00192 #endif
00193         // Add the features to the table
00194         for (size_t i = 0; i < features.size(); ++i) {
00195                 add(features[i].first, features[i].second);
00196         }
00197         // Now that the table is full, optimize it for speed/space
00198         optimize();
00199     }
00200 
00205     inline const Bucket* getBucketFromKey(BucketKey key) const
00206     {
00207         // Generate other buckets
00208         switch (speed_level_) {
00209         case kArray:
00210             // That means we get the buckets from an array
00211             return &buckets_speed_[key];
00212             break;
00213         case kBitsetHash:
00214             // That means we can check the bitset for the presence of a key
00215             if (key_bitset_.test(key)) return &buckets_space_.find(key)->second;
00216             else return 0;
00217             break;
00218         case kHash:
00219         {
00220             // That means we have to check for the hash table for the presence of a key
00221             BucketsSpace::const_iterator bucket_it, bucket_end = buckets_space_.end();
00222             bucket_it = buckets_space_.find(key);
00223             // Stop here if that bucket does not exist
00224             if (bucket_it == bucket_end) return 0;
00225             else return &bucket_it->second;
00226             break;
00227         }
00228         }
00229         return 0;
00230     }
00231 
00234     size_t getKey(const ElementType* /*feature*/) const
00235     {
00236         std::cerr << "LSH is not implemented for that type" << std::endl;
00237         throw;
00238         return 1;
00239     }
00240 
00244     LshStats getStats() const;
00245 
00246 private:
00252     enum SpeedLevel
00253     {
00254         kArray, kBitsetHash, kHash
00255     };
00256 
00259     void initialize(size_t key_size)
00260     {
00261         speed_level_ = kHash;
00262         key_size_ = key_size;
00263     }
00264 
00267     void optimize()
00268     {
00269         // If we are already using the fast storage, no need to do anything
00270         if (speed_level_ == kArray) return;
00271 
00272         // Use an array if it will be more than half full
00273         if (buckets_space_.size() > ((size_t(1) << key_size_) / 2)) {
00274             speed_level_ = kArray;
00275             // Fill the array version of it
00276             buckets_speed_.resize(size_t(1) << key_size_);
00277             for (BucketsSpace::const_iterator key_bucket = buckets_space_.begin(); key_bucket != buckets_space_.end(); ++key_bucket) buckets_speed_[key_bucket->first] = key_bucket->second;
00278 
00279             // Empty the hash table
00280             buckets_space_.clear();
00281             return;
00282         }
00283 
00284         // If the bitset is going to use less than 10% of the RAM of the hash map (at least 1 size_t for the key and two
00285         // for the vector) or less than 512MB (key_size_ <= 30)
00286         if (((std::max(buckets_space_.size(), buckets_speed_.size()) * CHAR_BIT * 3 * sizeof(BucketKey)) / 10
00287              >= size_t(size_t(1) << key_size_)) || (key_size_ <= 32)) {
00288             speed_level_ = kBitsetHash;
00289             key_bitset_.resize(size_t(1) << key_size_);
00290             key_bitset_.reset();
00291             // Try with the BucketsSpace
00292             for (BucketsSpace::const_iterator key_bucket = buckets_space_.begin(); key_bucket != buckets_space_.end(); ++key_bucket) key_bitset_.set(key_bucket->first);
00293         }
00294         else {
00295             speed_level_ = kHash;
00296             key_bitset_.clear();
00297         }
00298     }
00299 
00300     template<typename Archive>
00301     void serialize(Archive& ar)
00302     {
00303         int val;
00304         if (Archive::is_saving::value) {
00305                 val = (int)speed_level_;
00306         }
00307         ar & val;
00308         if (Archive::is_loading::value) {
00309                 speed_level_ = (SpeedLevel) val;
00310         }
00311 
00312         ar & key_size_;
00313         ar & mask_;
00314 
00315         if (speed_level_==kArray) {
00316                 ar & buckets_speed_;
00317         }
00318         if (speed_level_==kBitsetHash || speed_level_==kHash) {
00319                 ar & buckets_space_;
00320         }
00321                 if (speed_level_==kBitsetHash) {
00322                         ar & key_bitset_;
00323                 }
00324     }
00325     friend struct serialization::access;
00326 
00329     BucketsSpeed buckets_speed_;
00330 
00333     BucketsSpace buckets_space_;
00334 
00336     SpeedLevel speed_level_;
00337 
00341     DynamicBitset key_bitset_;
00342 
00345     unsigned int key_size_;
00346 
00347     // Members only used for the unsigned char specialization
00351     std::vector<size_t> mask_;
00352 };
00353 
00355 // Specialization for unsigned char
00356 
00357 template<>
00358 inline LshTable<unsigned char>::LshTable(unsigned int feature_size, unsigned int subsignature_size)
00359 {
00360     initialize(subsignature_size);
00361     // Allocate the mask
00362     mask_ = std::vector<size_t>((size_t)ceil((float)(feature_size * sizeof(char)) / (float)sizeof(size_t)), 0);
00363 
00364     // A bit brutal but fast to code
00365     std::vector<size_t> indices(feature_size * CHAR_BIT);
00366     for (size_t i = 0; i < feature_size * CHAR_BIT; ++i) indices[i] = i;
00367     std::random_shuffle(indices.begin(), indices.end());
00368 
00369     // Generate a random set of order of subsignature_size_ bits
00370     for (unsigned int i = 0; i < key_size_; ++i) {
00371         size_t index = indices[i];
00372 
00373         // Set that bit in the mask
00374         size_t divisor = CHAR_BIT * sizeof(size_t);
00375         size_t idx = index / divisor; //pick the right size_t index
00376         mask_[idx] |= size_t(1) << (index % divisor); //use modulo to find the bit offset
00377     }
00378 
00379     // Set to 1 if you want to display the mask for debug
00380 #if 0
00381     {
00382         size_t bcount = 0;
00383         BOOST_FOREACH(size_t mask_block, mask_){
00384             out << std::setw(sizeof(size_t) * CHAR_BIT / 4) << std::setfill('0') << std::hex << mask_block
00385                 << std::endl;
00386             bcount += __builtin_popcountll(mask_block);
00387         }
00388         out << "bit count : " << std::dec << bcount << std::endl;
00389         out << "mask size : " << mask_.size() << std::endl;
00390         return out;
00391     }
00392 #endif
00393 }
00394 
00398 template<>
00399 inline size_t LshTable<unsigned char>::getKey(const unsigned char* feature) const
00400 {
00401     // no need to check if T is dividable by sizeof(size_t) like in the Hamming
00402     // distance computation as we have a mask
00403     const size_t* feature_block_ptr = reinterpret_cast<const size_t*> (feature);
00404 
00405     // Figure out the subsignature of the feature
00406     // Given the feature ABCDEF, and the mask 001011, the output will be
00407     // 000CEF
00408     size_t subsignature = 0;
00409     size_t bit_index = 1;
00410 
00411     for (std::vector<size_t>::const_iterator pmask_block = mask_.begin(); pmask_block != mask_.end(); ++pmask_block) {
00412         // get the mask and signature blocks
00413         size_t feature_block = *feature_block_ptr;
00414         size_t mask_block = *pmask_block;
00415         while (mask_block) {
00416             // Get the lowest set bit in the mask block
00417             size_t lowest_bit = mask_block & (-(ptrdiff_t)mask_block);
00418             // Add it to the current subsignature if necessary
00419             subsignature += (feature_block & lowest_bit) ? bit_index : 0;
00420             // Reset the bit in the mask block
00421             mask_block ^= lowest_bit;
00422             // increment the bit index for the subsignature
00423             bit_index <<= 1;
00424         }
00425         // Check the next feature block
00426         ++feature_block_ptr;
00427     }
00428     return subsignature;
00429 }
00430 
00431 template<>
00432 inline LshStats LshTable<unsigned char>::getStats() const
00433 {
00434     LshStats stats;
00435     stats.bucket_size_mean_ = 0;
00436     if ((buckets_speed_.empty()) && (buckets_space_.empty())) {
00437         stats.n_buckets_ = 0;
00438         stats.bucket_size_median_ = 0;
00439         stats.bucket_size_min_ = 0;
00440         stats.bucket_size_max_ = 0;
00441         return stats;
00442     }
00443 
00444     if (!buckets_speed_.empty()) {
00445         for (BucketsSpeed::const_iterator pbucket = buckets_speed_.begin(); pbucket != buckets_speed_.end(); ++pbucket) {
00446             stats.bucket_sizes_.push_back(pbucket->size());
00447             stats.bucket_size_mean_ += pbucket->size();
00448         }
00449         stats.bucket_size_mean_ /= buckets_speed_.size();
00450         stats.n_buckets_ = buckets_speed_.size();
00451     }
00452     else {
00453         for (BucketsSpace::const_iterator x = buckets_space_.begin(); x != buckets_space_.end(); ++x) {
00454             stats.bucket_sizes_.push_back(x->second.size());
00455             stats.bucket_size_mean_ += x->second.size();
00456         }
00457         stats.bucket_size_mean_ /= buckets_space_.size();
00458         stats.n_buckets_ = buckets_space_.size();
00459     }
00460 
00461     std::sort(stats.bucket_sizes_.begin(), stats.bucket_sizes_.end());
00462 
00463     //  BOOST_FOREACH(int size, stats.bucket_sizes_)
00464     //          std::cout << size << " ";
00465     //  std::cout << std::endl;
00466     stats.bucket_size_median_ = stats.bucket_sizes_[stats.bucket_sizes_.size() / 2];
00467     stats.bucket_size_min_ = stats.bucket_sizes_.front();
00468     stats.bucket_size_max_ = stats.bucket_sizes_.back();
00469 
00470     // TODO compute mean and std
00471     /*float mean, stddev;
00472        stats.bucket_size_mean_ = mean;
00473        stats.bucket_size_std_dev = stddev;*/
00474 
00475     // Include a histogram of the buckets
00476     unsigned int bin_start = 0;
00477     unsigned int bin_end = 20;
00478     bool is_new_bin = true;
00479     for (std::vector<unsigned int>::iterator iterator = stats.bucket_sizes_.begin(), end = stats.bucket_sizes_.end(); iterator
00480          != end; )
00481         if (*iterator < bin_end) {
00482             if (is_new_bin) {
00483                 stats.size_histogram_.push_back(std::vector<unsigned int>(3, 0));
00484                 stats.size_histogram_.back()[0] = bin_start;
00485                 stats.size_histogram_.back()[1] = bin_end - 1;
00486                 is_new_bin = false;
00487             }
00488             ++stats.size_histogram_.back()[2];
00489             ++iterator;
00490         }
00491         else {
00492             bin_start += 20;
00493             bin_end += 20;
00494             is_new_bin = true;
00495         }
00496 
00497     return stats;
00498 }
00499 
00500 // End the two namespaces
00501 }
00502 }
00503 
00505 
00506 #endif /* FLANN_LSH_TABLE_H_ */


rtabmap
Author(s): Mathieu Labbe
autogenerated on Thu Jun 6 2019 21:59:20