00001 #ifndef VOCABULARY_TREE_SIMPLE_KMEANS_H
00002 #define VOCABULARY_TREE_SIMPLE_KMEANS_H
00003
00004 #include "vocabulary_tree/distance.h"
00005 #include "vocabulary_tree/feature_allocator.h"
00006 #include <boost/function.hpp>
00007 #include <boost/foreach.hpp>
00008 #include <algorithm>
00009 #include <vector>
00010 #include <limits>
00011
00012 namespace vt {
00013
00014
00015 struct InitRandom;
00016 struct InitGiven;
00018
00024 template<class Feature, class Distance = distance::L2<Feature>,
00025 class FeatureAllocator = typename DefaultAllocator<Feature>::type>
00026 class SimpleKmeans
00027 {
00028 public:
00029 typedef typename Distance::result_type squared_distance_type;
00030 typedef boost::function<void(const std::vector<Feature*>&, size_t, std::vector<Feature, FeatureAllocator>&, Distance)> Initializer;
00031
00040 SimpleKmeans(const Feature& zero = Feature(), Distance d = Distance());
00041
00043 void setInitMethod(const Initializer& init) { choose_centers_ = init; }
00044
00045 size_t getMaxIterations() const { return max_iterations_; }
00046 void setMaxIterations(size_t iters) { max_iterations_ = iters; }
00047
00048 size_t getRestarts() const { return restarts_; }
00049 void setRestarts(size_t restarts) { restarts_ = restarts; }
00050
00059 squared_distance_type cluster(const std::vector<Feature, FeatureAllocator>& features, size_t k,
00060 std::vector<Feature, FeatureAllocator>& centers,
00061 std::vector<unsigned int>& membership) const;
00062
00074 squared_distance_type clusterPointers(const std::vector<Feature*>& features, size_t k,
00075 std::vector<Feature, FeatureAllocator>& centers,
00076 std::vector<unsigned int>& membership) const;
00077
00078 private:
00079
00080 squared_distance_type clusterOnce(const std::vector<Feature*>& features, size_t k,
00081 std::vector<Feature, FeatureAllocator>& centers,
00082 std::vector<unsigned int>& membership) const;
00083
00084 Feature zero_;
00085 Distance distance_;
00086 Initializer choose_centers_;
00087 size_t max_iterations_;
00088 size_t restarts_;
00089 };
00090
00091
00092 template < class Feature, class Distance, class FeatureAllocator >
00093 SimpleKmeans<Feature, Distance, FeatureAllocator>::SimpleKmeans(const Feature& zero, Distance d)
00094 : zero_(zero),
00095 distance_(d),
00096 choose_centers_(InitRandom()),
00097 max_iterations_(100),
00098 restarts_(1)
00099 {
00100 }
00101
00102 template < class Feature, class Distance, class FeatureAllocator >
00103 typename SimpleKmeans<Feature, Distance, FeatureAllocator>::squared_distance_type
00104 SimpleKmeans<Feature, Distance, FeatureAllocator>::cluster(const std::vector<Feature, FeatureAllocator>& features, size_t k,
00105 std::vector<Feature, FeatureAllocator>& centers,
00106 std::vector<unsigned int>& membership) const
00107 {
00108 std::vector<Feature*> feature_ptrs;
00109 feature_ptrs.reserve(features.size());
00110 BOOST_FOREACH(const Feature& f, features)
00111 feature_ptrs.push_back(const_cast<Feature*>(&f));
00112 return clusterPointers(feature_ptrs, k, centers, membership);
00113 }
00114
00115 template < class Feature, class Distance, class FeatureAllocator >
00116 typename SimpleKmeans<Feature, Distance, FeatureAllocator>::squared_distance_type
00117 SimpleKmeans<Feature, Distance, FeatureAllocator>::clusterPointers(const std::vector<Feature*>& features, size_t k,
00118 std::vector<Feature, FeatureAllocator>& centers,
00119 std::vector<unsigned int>& membership) const
00120 {
00121 std::vector<Feature, FeatureAllocator> new_centers(centers);
00122 new_centers.resize(k);
00123 std::vector<unsigned int> new_membership(features.size());
00124
00125 squared_distance_type least_sse = std::numeric_limits<squared_distance_type>::max();
00126 for (size_t starts = 0; starts < restarts_; ++starts) {
00127 choose_centers_(features, k, new_centers, distance_);
00128
00129 squared_distance_type sse = clusterOnce(features, k, new_centers, new_membership);
00130 if (sse < least_sse) {
00131 least_sse = sse;
00132 centers = new_centers;
00133 membership = new_membership;
00134 }
00135 }
00136
00137 return least_sse;
00138 }
00139
00140 template < class Feature, class Distance, class FeatureAllocator >
00141 typename SimpleKmeans<Feature, Distance, FeatureAllocator>::squared_distance_type
00142 SimpleKmeans<Feature, Distance, FeatureAllocator>::clusterOnce(const std::vector<Feature*>& features, size_t k,
00143 std::vector<Feature, FeatureAllocator>& centers,
00144 std::vector<unsigned int>& membership) const
00145 {
00146 std::vector<size_t> new_center_counts(k);
00147 std::vector<Feature, FeatureAllocator> new_centers(k);
00148
00149 for (size_t iter = 0; iter < max_iterations_; ++iter) {
00150
00151 std::fill(new_center_counts.begin(), new_center_counts.end(), 0);
00152 std::fill(new_centers.begin(), new_centers.end(), zero_);
00153 bool is_stable = true;
00154
00155
00156 for (size_t i = 0; i < features.size(); ++i) {
00157 squared_distance_type d_min = std::numeric_limits<squared_distance_type>::max();
00158 unsigned int nearest = -1;
00159
00160 for (unsigned int j = 0; j < k; ++j) {
00161 squared_distance_type distance = distance_(*features[i], centers[j]);
00162 if (distance < d_min) {
00163 d_min = distance;
00164 nearest = j;
00165 }
00166 }
00167
00168
00169 if (membership[i] != nearest) {
00170 is_stable = false;
00171 membership[i] = nearest;
00172 }
00173
00174
00175 new_centers[nearest] += *features[i];
00176 ++new_center_counts[nearest];
00177 }
00178 if (is_stable) break;
00179
00180
00181 for (size_t i = 0; i < k; ++i) {
00182 if (new_center_counts[i] > 0) {
00183 centers[i] = new_centers[i] / new_center_counts[i];
00184 }
00185 else {
00186
00187 unsigned int index = rand() % features.size();
00188 centers[i] = *features[index];
00189 }
00190 }
00191 }
00192
00193
00195 squared_distance_type sse = squared_distance_type();
00196 for (size_t i = 0; i < features.size(); ++i) {
00197 sse += distance_(*features[i], centers[membership[i]]);
00198 }
00199 return sse;
00200 }
00201
00202
00206 struct InitRandom
00207 {
00208 template<class Feature, class Distance, class FeatureAllocator>
00209 void operator()(const std::vector<Feature*>& features, size_t k, std::vector<Feature, FeatureAllocator>& centers, Distance distance)
00210 {
00211
00212 std::vector<Feature*> features_perm = features;
00213 for (size_t i = features.size(); i > 1; --i) {
00214 size_t k = rand() % i;
00215 std::swap(features_perm[i-1], features_perm[k]);
00216 }
00217
00218 for (size_t i = 0; i < centers.size(); ++i)
00219 centers[i] = *features_perm[i];
00220 }
00221 };
00222
00226 struct InitGiven
00227 {
00228 template<class Feature, class Distance, class FeatureAllocator>
00229 void operator()(const std::vector<Feature*>& features, size_t k, std::vector<Feature, FeatureAllocator>& centers, Distance distance)
00230 {
00231
00232 }
00233 };
00234
00235 }
00236
00237 #endif