simple_kmeans.h
Go to the documentation of this file.
00001 #ifndef VOCABULARY_TREE_SIMPLE_KMEANS_H
00002 #define VOCABULARY_TREE_SIMPLE_KMEANS_H
00003 
00004 #include "vocabulary_tree/distance.h"
00005 #include "vocabulary_tree/feature_allocator.h"
00006 #include <boost/function.hpp>
00007 #include <boost/foreach.hpp>
00008 #include <algorithm>
00009 #include <vector>
00010 #include <limits>
00011 
00012 namespace vt {
00013 
00014 // Forward declare function objects for choosing the initial centers
00015 struct InitRandom;
00016 struct InitGiven;
00018 
00024 template<class Feature, class Distance = distance::L2<Feature>,
00025          class FeatureAllocator = typename DefaultAllocator<Feature>::type>
00026 class SimpleKmeans
00027 {
00028 public:
00029   typedef typename Distance::result_type squared_distance_type;
00030   typedef boost::function<void(const std::vector<Feature*>&, size_t, std::vector<Feature, FeatureAllocator>&, Distance)> Initializer;
00031 
00040   SimpleKmeans(const Feature& zero = Feature(), Distance d = Distance());
00041 
00043   void setInitMethod(const Initializer& init) { choose_centers_ = init; }
00044 
00045   size_t getMaxIterations() const { return max_iterations_; }
00046   void setMaxIterations(size_t iters) { max_iterations_ = iters; }
00047 
00048   size_t getRestarts() const { return restarts_; }
00049   void setRestarts(size_t restarts) { restarts_ = restarts; }
00050 
00059   squared_distance_type cluster(const std::vector<Feature, FeatureAllocator>& features, size_t k,
00060                                 std::vector<Feature, FeatureAllocator>& centers,
00061                                 std::vector<unsigned int>& membership) const;
00062 
00074   squared_distance_type clusterPointers(const std::vector<Feature*>& features, size_t k,
00075                                         std::vector<Feature, FeatureAllocator>& centers,
00076                                         std::vector<unsigned int>& membership) const;
00077   
00078 private:
00079   
00080   squared_distance_type clusterOnce(const std::vector<Feature*>& features, size_t k,
00081                                     std::vector<Feature, FeatureAllocator>& centers,
00082                                     std::vector<unsigned int>& membership) const;
00083   
00084   Feature zero_;
00085   Distance distance_;
00086   Initializer choose_centers_;
00087   size_t max_iterations_;
00088   size_t restarts_;
00089 };
00090 
00091 
00092 template < class Feature, class Distance, class FeatureAllocator >
00093 SimpleKmeans<Feature, Distance, FeatureAllocator>::SimpleKmeans(const Feature& zero, Distance d)
00094   : zero_(zero),
00095     distance_(d),
00096     choose_centers_(InitRandom()),
00097     max_iterations_(100),
00098     restarts_(1)
00099 {
00100 }
00101 
00102 template < class Feature, class Distance, class FeatureAllocator >
00103 typename SimpleKmeans<Feature, Distance, FeatureAllocator>::squared_distance_type
00104 SimpleKmeans<Feature, Distance, FeatureAllocator>::cluster(const std::vector<Feature, FeatureAllocator>& features, size_t k,
00105                                                            std::vector<Feature, FeatureAllocator>& centers,
00106                                                            std::vector<unsigned int>& membership) const
00107 {
00108   std::vector<Feature*> feature_ptrs;
00109   feature_ptrs.reserve(features.size());
00110   BOOST_FOREACH(const Feature& f, features)
00111     feature_ptrs.push_back(const_cast<Feature*>(&f));
00112   return clusterPointers(feature_ptrs, k, centers, membership);
00113 }
00114 
00115 template < class Feature, class Distance, class FeatureAllocator >
00116 typename SimpleKmeans<Feature, Distance, FeatureAllocator>::squared_distance_type
00117 SimpleKmeans<Feature, Distance, FeatureAllocator>::clusterPointers(const std::vector<Feature*>& features, size_t k,
00118                                                                    std::vector<Feature, FeatureAllocator>& centers,
00119                                                                    std::vector<unsigned int>& membership) const
00120 {
00121   std::vector<Feature, FeatureAllocator> new_centers(centers);
00122   new_centers.resize(k);
00123   std::vector<unsigned int> new_membership(features.size());
00124 
00125   squared_distance_type least_sse = std::numeric_limits<squared_distance_type>::max();
00126   for (size_t starts = 0; starts < restarts_; ++starts) {
00127     choose_centers_(features, k, new_centers, distance_);
00128     
00129     squared_distance_type sse = clusterOnce(features, k, new_centers, new_membership);
00130     if (sse < least_sse) {
00131       least_sse = sse;
00132       centers = new_centers;
00133       membership = new_membership;
00134     }
00135   }
00136 
00137   return least_sse;
00138 }
00139 
00140 template < class Feature, class Distance, class FeatureAllocator >
00141 typename SimpleKmeans<Feature, Distance, FeatureAllocator>::squared_distance_type
00142 SimpleKmeans<Feature, Distance, FeatureAllocator>::clusterOnce(const std::vector<Feature*>& features, size_t k,
00143                                                                std::vector<Feature, FeatureAllocator>& centers,
00144                                                                std::vector<unsigned int>& membership) const
00145 {
00146   std::vector<size_t> new_center_counts(k);
00147   std::vector<Feature, FeatureAllocator> new_centers(k);
00148   
00149   for (size_t iter = 0; iter < max_iterations_; ++iter) {
00150     // Zero out new centers and counts
00151     std::fill(new_center_counts.begin(), new_center_counts.end(), 0);
00152     std::fill(new_centers.begin(), new_centers.end(), zero_);
00153     bool is_stable = true;
00154 
00155     // Assign data objects to current centers
00156     for (size_t i = 0; i < features.size(); ++i) {
00157       squared_distance_type d_min = std::numeric_limits<squared_distance_type>::max();
00158       unsigned int nearest = -1;
00159       // Find the nearest cluster center to feature i
00160       for (unsigned int j = 0; j < k; ++j) {
00161         squared_distance_type distance = distance_(*features[i], centers[j]);
00162         if (distance < d_min) {
00163           d_min = distance;
00164           nearest = j;
00165         }
00166       }
00167 
00168       // Assign feature i to the cluster it is nearest to
00169       if (membership[i] != nearest) {
00170         is_stable = false;
00171         membership[i] = nearest;
00172       }
00173 
00174       // Accumulate the cluster center and its membership count
00175       new_centers[nearest] += *features[i];
00176       ++new_center_counts[nearest];
00177     }
00178     if (is_stable) break;
00179     
00180     // Assign new centers
00181     for (size_t i = 0; i < k; ++i) {
00182       if (new_center_counts[i] > 0) {
00183         centers[i] = new_centers[i] / new_center_counts[i];
00184       }
00185       else {
00186         // Choose a new center randomly from the input features
00187         unsigned int index = rand() % features.size();
00188         centers[i] = *features[index];
00189       }
00190     }
00191   }
00192 
00193   // Return the sum squared error
00195   squared_distance_type sse = squared_distance_type();
00196   for (size_t i = 0; i < features.size(); ++i) {
00197     sse += distance_(*features[i], centers[membership[i]]);
00198   }
00199   return sse;
00200 }
00201 
00202 
00206 struct InitRandom
00207 {
00208   template<class Feature, class Distance, class FeatureAllocator>
00209   void operator()(const std::vector<Feature*>& features, size_t k, std::vector<Feature, FeatureAllocator>& centers, Distance distance)
00210   {
00211     // Construct a random permutation of the features using a Fisher-Yates shuffle
00212     std::vector<Feature*> features_perm = features;
00213     for (size_t i = features.size(); i > 1; --i) {
00214       size_t k = rand() % i;
00215       std::swap(features_perm[i-1], features_perm[k]);
00216     }
00217     // Take the first k permuted features as the initial centers
00218     for (size_t i = 0; i < centers.size(); ++i)
00219       centers[i] = *features_perm[i];
00220   }
00221 };
00222 
00226 struct InitGiven
00227 {
00228   template<class Feature, class Distance, class FeatureAllocator>
00229   void operator()(const std::vector<Feature*>& features, size_t k, std::vector<Feature, FeatureAllocator>& centers, Distance distance)
00230   {
00231     // Do nothing!
00232   }
00233 };
00234 
00235 } //namespace vt
00236 
00237 #endif


vocabulary_tree
Author(s): Patrick Mihelich
autogenerated on Thu Jan 2 2014 12:12:26