kmeans.c
Go to the documentation of this file.
00001 
00006 /*
00007 Copyright (C) 2007-12 Andrea Vedaldi and Brian Fulkerson.
00008 Copyright (C) 2013 Andrea Vedaldi and David Novotny.
00009 All rights reserved.
00010 
00011 This file is part of the VLFeat library and is made available under
00012 the terms of the BSD license (see the COPYING file).
00013 */
00014 
00363 #include "kmeans.h"
00364 #include "generic.h"
00365 #include "mathop.h"
00366 #include <string.h>
00367 
00368 #ifdef _OPENMP
00369 #include <omp.h>
00370 #endif
00371 
00372 /* ================================================================ */
00373 #ifndef VL_KMEANS_INSTANTIATING
00374 
00375 
00385 VL_EXPORT void
00386 vl_kmeans_reset (VlKMeans * self)
00387 {
00388   self->numCenters = 0 ;
00389   self->dimension = 0 ;
00390 
00391   if (self->centers) vl_free(self->centers) ;
00392   if (self->centerDistances) vl_free(self->centerDistances) ;
00393 
00394   self->centers = NULL ;
00395   self->centerDistances = NULL ;
00396 }
00397 
00405 VL_EXPORT VlKMeans *
00406 vl_kmeans_new (vl_type dataType,
00407                VlVectorComparisonType distance)
00408 {
00409   VlKMeans * self = vl_calloc(1, sizeof(VlKMeans)) ;
00410 
00411   self->algorithm = VlKMeansLloyd ;
00412   self->distance = distance ;
00413   self->dataType = dataType ;
00414   self->verbosity = 0 ;
00415   self->maxNumIterations = 100 ;
00416   self->minEnergyVariation = 1e-4 ;
00417   self->numRepetitions = 1 ;
00418   self->centers = NULL ;
00419   self->centerDistances = NULL ;
00420   self->numTrees = 3;
00421   self->maxNumComparisons = 100;
00422 
00423   vl_kmeans_reset (self) ;
00424   return self ;
00425 }
00426 
00433 VL_EXPORT VlKMeans *
00434 vl_kmeans_new_copy (VlKMeans const * kmeans)
00435 {
00436   VlKMeans * self = vl_malloc(sizeof(VlKMeans)) ;
00437 
00438   self->algorithm = kmeans->algorithm ;
00439   self->distance = kmeans->distance ;
00440   self->dataType = kmeans->dataType ;
00441 
00442   self->verbosity = kmeans->verbosity ;
00443   self->maxNumIterations = kmeans->maxNumIterations ;
00444   self->numRepetitions = kmeans->numRepetitions ;
00445 
00446   self->dimension = kmeans->dimension ;
00447   self->numCenters = kmeans->numCenters ;
00448   self->centers = NULL ;
00449   self->centerDistances = NULL ;
00450 
00451   self->numTrees = kmeans->numTrees;
00452   self->maxNumComparisons = kmeans->maxNumComparisons;
00453 
00454   if (kmeans->centers) {
00455     vl_size dataSize = vl_get_type_size(self->dataType) * self->dimension * self->numCenters ;
00456     self->centers = vl_malloc(dataSize) ;
00457     memcpy (self->centers, kmeans->centers, dataSize) ;
00458   }
00459 
00460   if (kmeans->centerDistances) {
00461     vl_size dataSize = vl_get_type_size(self->dataType) * self->numCenters * self->numCenters ;
00462     self->centerDistances = vl_malloc(dataSize) ;
00463     memcpy (self->centerDistances, kmeans->centerDistances, dataSize) ;
00464   }
00465 
00466   return self ;
00467 }
00468 
00477 VL_EXPORT void
00478 vl_kmeans_delete (VlKMeans * self)
00479 {
00480   vl_kmeans_reset (self) ;
00481   vl_free (self) ;
00482 }
00483 
00484 /* an helper structure */
00485 typedef struct _VlKMeansSortWrapper {
00486   vl_uint32 * permutation ;
00487   void const * data ;
00488   vl_size stride ;
00489 } VlKMeansSortWrapper ;
00490 
00491 
00492 /* ---------------------------------------------------------------- */
00493 /* Instantiate shuffle algorithm */
00494 
00495 #define VL_SHUFFLE_type vl_uindex
00496 #define VL_SHUFFLE_prefix _vl_kmeans
00497 #include "shuffle-def.h"
00498 
00499 /* #ifdef VL_KMEANS_INSTANTITATING */
00500 #endif
00501 
00502 /* ================================================================ */
00503 #ifdef VL_KMEANS_INSTANTIATING
00504 
00505 /* ---------------------------------------------------------------- */
00506 /*                                                      Set centers */
00507 /* ---------------------------------------------------------------- */
00508 
00509 static void
00510 VL_XCAT(_vl_kmeans_set_centers_, SFX)
00511 (VlKMeans * self,
00512  TYPE const * centers,
00513  vl_size dimension,
00514  vl_size numCenters)
00515 {
00516   self->dimension = dimension ;
00517   self->numCenters = numCenters ;
00518   self->centers = vl_malloc (sizeof(TYPE) * dimension * numCenters) ;
00519   memcpy ((TYPE*)self->centers, centers,
00520           sizeof(TYPE) * dimension * numCenters) ;
00521 }
00522 
00523 /* ---------------------------------------------------------------- */
00524 /*                                                   Random seeding */
00525 /* ---------------------------------------------------------------- */
00526 
00527 static void
00528 VL_XCAT(_vl_kmeans_init_centers_with_rand_data_, SFX)
00529 (VlKMeans * self,
00530  TYPE const * data,
00531  vl_size dimension,
00532  vl_size numData,
00533  vl_size numCenters)
00534 {
00535   vl_uindex i, j, k ;
00536   VlRand * rand = vl_get_rand () ;
00537 
00538   self->dimension = dimension ;
00539   self->numCenters = numCenters ;
00540   self->centers = vl_malloc (sizeof(TYPE) * dimension * numCenters) ;
00541 
00542   {
00543     vl_uindex * perm = vl_malloc (sizeof(vl_uindex) * numData) ;
00544 #if (FLT == VL_TYPE_FLOAT)
00545     VlFloatVectorComparisonFunction distFn = vl_get_vector_comparison_function_f(self->distance) ;
00546 #else
00547     VlDoubleVectorComparisonFunction distFn = vl_get_vector_comparison_function_d(self->distance) ;
00548 #endif
00549     TYPE * distances = vl_malloc (sizeof(TYPE) * numCenters) ;
00550 
00551     /* get a random permutation of the data point */
00552     for (i = 0 ; i < numData ; ++i) perm[i] = i ;
00553     _vl_kmeans_shuffle (perm, numData, rand) ;
00554 
00555     for (k = 0, i = 0 ; k < numCenters ; ++ i) {
00556 
00557       /* compare the next data point to all centers collected so far
00558        to detect duplicates (if there are enough left)
00559        */
00560       if (numCenters - k < numData - i) {
00561         vl_bool duplicateDetected = VL_FALSE ;
00562         VL_XCAT(vl_eval_vector_comparison_on_all_pairs_, SFX)(distances,
00563             dimension,
00564             data + dimension * perm[i], 1,
00565             (TYPE*)self->centers, k,
00566             distFn) ;
00567         for (j = 0 ; j < k ; ++j) {
00568           duplicateDetected |= (distances[j] == 0) ;
00569         }
00570         if (duplicateDetected) continue ;
00571       }
00572 
00573       /* ok, it is not a duplicate so we can accept it! */
00574       memcpy ((TYPE*)self->centers + dimension * k,
00575               data + dimension * perm[i],
00576               sizeof(TYPE) * dimension) ;
00577       k ++ ;
00578     }
00579     vl_free(distances) ;
00580     vl_free(perm) ;
00581   }
00582 }
00583 
00584 /* ---------------------------------------------------------------- */
00585 /*                                                 kmeans++ seeding */
00586 /* ---------------------------------------------------------------- */
00587 
00588 static void
00589 VL_XCAT(_vl_kmeans_init_centers_plus_plus_, SFX)
00590 (VlKMeans * self,
00591  TYPE const * data,
00592  vl_size dimension,
00593  vl_size numData,
00594  vl_size numCenters)
00595 {
00596   vl_uindex x, c ;
00597   VlRand * rand = vl_get_rand () ;
00598   TYPE * distances = vl_malloc (sizeof(TYPE) * numData) ;
00599   TYPE * minDistances = vl_malloc (sizeof(TYPE) * numData) ;
00600 #if (FLT == VL_TYPE_FLOAT)
00601   VlFloatVectorComparisonFunction distFn = vl_get_vector_comparison_function_f(self->distance) ;
00602 #else
00603   VlDoubleVectorComparisonFunction distFn = vl_get_vector_comparison_function_d(self->distance) ;
00604 #endif
00605 
00606   self->dimension = dimension ;
00607   self->numCenters = numCenters ;
00608   self->centers = vl_malloc (sizeof(TYPE) * dimension * numCenters) ;
00609 
00610   for (x = 0 ; x < numData ; ++x) {
00611     minDistances[x] = (TYPE) VL_INFINITY_D ;
00612   }
00613 
00614   /* select the first point at random */
00615   x = vl_rand_uindex (rand, numData) ;
00616   c = 0 ;
00617   while (1) {
00618     TYPE energy = 0 ;
00619     TYPE acc = 0 ;
00620     TYPE thresh = (TYPE) vl_rand_real1 (rand) ;
00621 
00622     memcpy ((TYPE*)self->centers + c * dimension,
00623             data + x * dimension,
00624             sizeof(TYPE) * dimension) ;
00625 
00626     c ++ ;
00627     if (c == numCenters) break ;
00628 
00629     VL_XCAT(vl_eval_vector_comparison_on_all_pairs_, SFX)
00630     (distances,
00631      dimension,
00632      (TYPE*)self->centers + (c - 1) * dimension, 1,
00633      data, numData,
00634      distFn) ;
00635 
00636     for (x = 0 ; x < numData ; ++x) {
00637       minDistances[x] = VL_MIN(minDistances[x], distances[x]) ;
00638       energy += minDistances[x] ;
00639     }
00640 
00641     for (x = 0 ; x < numData - 1 ; ++x) {
00642       acc += minDistances[x] ;
00643       if (acc >= thresh * energy) break ;
00644     }
00645   }
00646 
00647   vl_free(distances) ;
00648   vl_free(minDistances) ;
00649 }
00650 
00651 /* ---------------------------------------------------------------- */
00652 /*                                                     Quantization */
00653 /* ---------------------------------------------------------------- */
00654 
00655 static void
00656 VL_XCAT(_vl_kmeans_quantize_, SFX)
00657 (VlKMeans * self,
00658  vl_uint32 * assignments,
00659  TYPE * distances,
00660  TYPE const * data,
00661  vl_size numData)
00662 {
00663   vl_index i ;
00664 
00665 #if (FLT == VL_TYPE_FLOAT)
00666   VlFloatVectorComparisonFunction distFn = vl_get_vector_comparison_function_f(self->distance) ;
00667 #else
00668   VlDoubleVectorComparisonFunction distFn = vl_get_vector_comparison_function_d(self->distance) ;
00669 #endif
00670 
00671 #ifdef _OPENMP
00672 #pragma omp parallel default(none) \
00673             shared(self, distances, assignments, numData, distFn, data) \
00674             num_threads(vl_get_max_threads())
00675 #endif
00676   {
00677     /* vl_malloc cannot be used here if mapped to MATLAB malloc */
00678     TYPE * distanceToCenters = malloc(sizeof(TYPE) * self->numCenters) ;
00679 
00680 #ifdef _OPENMP
00681 #pragma omp for
00682 #endif
00683     for (i = 0 ; i < (signed)numData ; ++i) {
00684       vl_uindex k ;
00685       TYPE bestDistance = (TYPE) VL_INFINITY_D ;
00686       VL_XCAT(vl_eval_vector_comparison_on_all_pairs_, SFX)(distanceToCenters,
00687                                                             self->dimension,
00688                                                             data + self->dimension * i, 1,
00689                                                             (TYPE*)self->centers, self->numCenters,
00690                                                             distFn) ;
00691       for (k = 0 ; k < self->numCenters ; ++k) {
00692         if (distanceToCenters[k] < bestDistance) {
00693           bestDistance = distanceToCenters[k] ;
00694           assignments[i] = (vl_uint32)k ;
00695         }
00696       }
00697       if (distances) distances[i] = bestDistance ;
00698     }
00699 
00700     free(distanceToCenters) ;
00701   }
00702 }
00703 
00704 /* ---------------------------------------------------------------- */
00705 /*                                                 ANN quantization */
00706 /* ---------------------------------------------------------------- */
00707 
00708 static void
00709 VL_XCAT(_vl_kmeans_quantize_ann_, SFX)
00710 (VlKMeans * self,
00711  vl_uint32 * assignments,
00712  TYPE * distances,
00713  TYPE const * data,
00714  vl_size numData,
00715  vl_bool update)
00716 {
00717 #if (FLT == VL_TYPE_FLOAT)
00718   VlFloatVectorComparisonFunction distFn = vl_get_vector_comparison_function_f(self->distance) ;
00719 #else
00720   VlDoubleVectorComparisonFunction distFn = vl_get_vector_comparison_function_d(self->distance) ;
00721 #endif
00722 
00723   VlKDForest * forest = vl_kdforest_new(self->dataType,self->dimension,self->numTrees, self->distance) ;
00724   vl_kdforest_set_max_num_comparisons(forest,self->maxNumComparisons);
00725   vl_kdforest_set_thresholding_method(forest,VL_KDTREE_MEDIAN);
00726   vl_kdforest_build(forest,self->numCenters,self->centers);
00727 
00728 #ifdef _OPENMP
00729 #pragma omp parallel default(none) \
00730   num_threads(vl_get_max_threads()) \
00731   shared(self, forest, update, assignments, distances, data, numData, distFn)
00732 #endif
00733   {
00734     VlKDForestNeighbor neighbor ;
00735     VlKDForestSearcher * searcher ;
00736     vl_index x;
00737 
00738 #ifdef _OPENMP
00739 #pragma omp critical
00740 #endif
00741     searcher = vl_kdforest_new_searcher (forest) ;
00742 
00743 #ifdef _OPENMP
00744 #pragma omp for
00745 #endif
00746     for(x = 0 ; x < (signed)numData ; ++x) {
00747       vl_kdforestsearcher_query (searcher, &neighbor, 1, (TYPE const *) (data + x*self->dimension));
00748 
00749       if (distances) {
00750         if(!update) {
00751           distances[x] = (TYPE) neighbor.distance;
00752           assignments[x] = (vl_uint32) neighbor.index ;
00753         } else {
00754           TYPE prevDist = (TYPE) distFn(self->dimension,
00755                                         data + self->dimension * x,
00756                                         (TYPE*)self->centers + self->dimension *assignments[x]);
00757           if (prevDist > (TYPE) neighbor.distance) {
00758             distances[x] = (TYPE) neighbor.distance ;
00759             assignments[x] = (vl_uint32) neighbor.index ;
00760           } else {
00761             distances[x] = prevDist ;
00762           }
00763         }
00764       } else {
00765         assignments[x] = (vl_uint32) neighbor.index ;
00766       }
00767     } /* end for */
00768   } /* end of parallel region */
00769 
00770   vl_kdforest_delete(forest);
00771 }
00772 
00773 /* ---------------------------------------------------------------- */
00774 /*                                                 Helper functions */
00775 /* ---------------------------------------------------------------- */
00776 
00777 /* The sorting routine is used to find increasing permutation of each
00778  * data dimension. This is used to quickly find the median for l1
00779  * distance clustering. */
00780 
00781 VL_INLINE TYPE
00782 VL_XCAT3(_vl_kmeans_, SFX, _qsort_cmp)
00783 (VlKMeansSortWrapper * array, vl_uindex indexA, vl_uindex indexB)
00784 {
00785   return
00786     ((TYPE*)array->data) [array->permutation[indexA] * array->stride]
00787     -
00788     ((TYPE*)array->data) [array->permutation[indexB] * array->stride] ;
00789 }
00790 
00791 VL_INLINE void
00792 VL_XCAT3(_vl_kmeans_, SFX, _qsort_swap)
00793 (VlKMeansSortWrapper * array, vl_uindex indexA, vl_uindex indexB)
00794 {
00795   vl_uint32 tmp = array->permutation[indexA] ;
00796   array->permutation[indexA] = array->permutation[indexB] ;
00797   array->permutation[indexB] = tmp ;
00798 }
00799 
00800 #define VL_QSORT_prefix  VL_XCAT3(_vl_kmeans_, SFX, _qsort)
00801 #define VL_QSORT_array   VlKMeansSortWrapper*
00802 #define VL_QSORT_cmp     VL_XCAT3(_vl_kmeans_, SFX, _qsort_cmp)
00803 #define VL_QSORT_swap    VL_XCAT3(_vl_kmeans_, SFX, _qsort_swap)
00804 #include "qsort-def.h"
00805 
00806 static void
00807 VL_XCAT(_vl_kmeans_sort_data_helper_, SFX)
00808 (VlKMeans * self, vl_uint32 * permutations, TYPE const * data, vl_size numData)
00809 {
00810   vl_uindex d, x ;
00811 
00812   for (d = 0 ; d < self->dimension ; ++d) {
00813     VlKMeansSortWrapper array ;
00814     array.permutation = permutations + d * numData ;
00815     array.data = data + d ;
00816     array.stride = self->dimension ;
00817     for (x = 0 ; x < numData ; ++x) {
00818       array.permutation[x] = (vl_uint32)x ;
00819     }
00820     VL_XCAT3(_vl_kmeans_, SFX, _qsort_sort)(&array, numData) ;
00821   }
00822 }
00823 
00824 /* ---------------------------------------------------------------- */
00825 /*                                                 Lloyd refinement */
00826 /* ---------------------------------------------------------------- */
00827 
00828 static double
00829 VL_XCAT(_vl_kmeans_refine_centers_lloyd_, SFX)
00830 (VlKMeans * self,
00831  TYPE const * data,
00832  vl_size numData)
00833 {
00834   vl_size c, d, x, iteration ;
00835   double previousEnergy = VL_INFINITY_D ;
00836   double initialEnergy = VL_INFINITY_D ;
00837   double energy ;
00838   TYPE * distances = vl_malloc (sizeof(TYPE) * numData) ;
00839 
00840   vl_uint32 * assignments = vl_malloc (sizeof(vl_uint32) * numData) ;
00841   vl_size * clusterMasses = vl_malloc (sizeof(vl_size) * numData) ;
00842   vl_uint32 * permutations = NULL ;
00843   vl_size * numSeenSoFar = NULL ;
00844   VlRand * rand = vl_get_rand () ;
00845   vl_size totNumRestartedCenters = 0 ;
00846   vl_size numRestartedCenters = 0 ;
00847 
00848   if (self->distance == VlDistanceL1) {
00849     permutations = vl_malloc(sizeof(vl_uint32) * numData * self->dimension) ;
00850     numSeenSoFar = vl_malloc(sizeof(vl_size) * self->numCenters) ;
00851     VL_XCAT(_vl_kmeans_sort_data_helper_, SFX)(self, permutations, data, numData) ;
00852   }
00853 
00854   for (energy = VL_INFINITY_D,
00855        iteration = 0;
00856        1 ;
00857        ++ iteration) {
00858 
00859     /* assign data to cluters */
00860     VL_XCAT(_vl_kmeans_quantize_, SFX)(self, assignments, distances, data, numData) ;
00861 
00862     /* compute energy */
00863     energy = 0 ;
00864     for (x = 0 ; x < numData ; ++x) energy += distances[x] ;
00865     if (self->verbosity) {
00866       VL_PRINTF("kmeans: Lloyd iter %d: energy = %g\n", iteration,
00867                 energy) ;
00868     }
00869 
00870     /* check termination conditions */
00871     if (iteration >= self->maxNumIterations) {
00872       if (self->verbosity) {
00873         VL_PRINTF("kmeans: Lloyd terminating because maximum number of iterations reached\n") ;
00874       }
00875       break ;
00876     }
00877     if (energy == previousEnergy) {
00878       if (self->verbosity) {
00879         VL_PRINTF("kmeans: Lloyd terminating because the algorithm fully converged\n") ;
00880       }
00881       break ;
00882     }
00883     
00884     if (iteration == 0) {
00885       initialEnergy = energy ;
00886     } else {
00887       double eps = (previousEnergy - energy) / (initialEnergy - energy) ;
00888       if (eps < self->minEnergyVariation) {
00889         if (self->verbosity) {
00890           VL_PRINTF("kmeans: ANN terminating because the energy relative variation was less than %f\n", self->minEnergyVariation) ;
00891         }
00892         break ;
00893       }
00894     }
00895     
00896     /* begin next iteration */
00897     previousEnergy = energy ;
00898 
00899     /* update clusters */
00900     memset(clusterMasses, 0, sizeof(vl_size) * numData) ;
00901     for (x = 0 ; x < numData ; ++x) {
00902       clusterMasses[assignments[x]] ++ ;
00903     }
00904 
00905     numRestartedCenters = 0 ;
00906     switch (self->distance) {
00907       case VlDistanceL2:
00908         memset(self->centers, 0, sizeof(TYPE) * self->dimension * self->numCenters) ;
00909         for (x = 0 ; x < numData ; ++x) {
00910           TYPE * cpt = (TYPE*)self->centers + assignments[x] * self->dimension ;
00911           TYPE const * xpt = data + x * self->dimension ;
00912           for (d = 0 ; d < self->dimension ; ++d) {
00913             cpt[d] += xpt[d] ;
00914           }
00915         }
00916         for (c = 0 ; c < self->numCenters ; ++c) {
00917           TYPE * cpt = (TYPE*)self->centers + c * self->dimension ;
00918           if (clusterMasses[c] > 0) {
00919             TYPE mass = clusterMasses[c] ;
00920             for (d = 0 ; d < self->dimension ; ++d) {
00921               cpt[d] /= mass ;
00922             }
00923           } else {
00924             vl_uindex x = vl_rand_uindex(rand, numData) ;
00925             numRestartedCenters ++ ;
00926             for (d = 0 ; d < self->dimension ; ++d) {
00927               cpt[d] = data[x * self->dimension + d] ;
00928             }
00929           }
00930         }
00931         break ;
00932       case VlDistanceL1:
00933         for (d = 0 ; d < self->dimension ; ++d) {
00934           vl_uint32 * perm = permutations + d * numData ;
00935           memset(numSeenSoFar, 0, sizeof(vl_size) * self->numCenters) ;
00936           for (x = 0; x < numData ; ++x) {
00937             c = assignments[perm[x]] ;
00938             if (2 * numSeenSoFar[c] < clusterMasses[c]) {
00939               ((TYPE*)self->centers) [d + c * self->dimension] =
00940                 data [d + perm[x] * self->dimension] ;
00941             }
00942             numSeenSoFar[c] ++ ;
00943           }
00944           /* restart the centers as required  */
00945           for (c = 0 ; c < self->numCenters ; ++c) {
00946             if (clusterMasses[c] == 0) {
00947               TYPE * cpt = (TYPE*)self->centers + c * self->dimension ;
00948               vl_uindex x = vl_rand_uindex(rand, numData) ;
00949               numRestartedCenters ++ ;
00950               for (d = 0 ; d < self->dimension ; ++d) {
00951                 cpt[d] = data[x * self->dimension + d] ;
00952               }
00953             }
00954           }
00955         }
00956         break ;
00957       default:
00958         abort();
00959     } /* done compute centers */
00960 
00961     totNumRestartedCenters += numRestartedCenters ;
00962     if (self->verbosity && numRestartedCenters) {
00963       VL_PRINTF("kmeans: Lloyd iter %d: restarted %d centers\n", iteration,
00964                 numRestartedCenters) ;
00965     }
00966   } /* next Lloyd iteration */
00967 
00968   if (permutations) {
00969     vl_free(permutations) ;
00970   }
00971   if (numSeenSoFar) {
00972     vl_free(numSeenSoFar) ;
00973   }
00974   vl_free(distances) ;
00975   vl_free(assignments) ;
00976   vl_free(clusterMasses) ;
00977   return energy ;
00978 }
00979 
00980 static double
00981 VL_XCAT(_vl_kmeans_update_center_distances_, SFX)
00982 (VlKMeans * self)
00983 {
00984 #if (FLT == VL_TYPE_FLOAT)
00985   VlFloatVectorComparisonFunction distFn = vl_get_vector_comparison_function_f(self->distance) ;
00986 #else
00987   VlDoubleVectorComparisonFunction distFn = vl_get_vector_comparison_function_d(self->distance) ;
00988 #endif
00989 
00990   if (! self->centerDistances) {
00991     self->centerDistances = vl_malloc (sizeof(TYPE) *
00992                                        self->numCenters *
00993                                        self->numCenters) ;
00994   }
00995   VL_XCAT(vl_eval_vector_comparison_on_all_pairs_, SFX)(self->centerDistances,
00996       self->dimension,
00997       self->centers, self->numCenters,
00998       NULL, 0,
00999       distFn) ;
01000   return self->numCenters * (self->numCenters - 1) / 2 ;
01001 }
01002 
01003 static double
01004 VL_XCAT(_vl_kmeans_refine_centers_ann_, SFX)
01005 (VlKMeans * self,
01006  TYPE const * data,
01007  vl_size numData)
01008 {
01009   vl_size c, d, x, iteration ;
01010   double initialEnergy = VL_INFINITY_D ;
01011   double previousEnergy = VL_INFINITY_D ;
01012   double energy ;
01013 
01014   vl_uint32 * permutations = NULL ;
01015   vl_size * numSeenSoFar = NULL ;
01016   VlRand * rand = vl_get_rand () ;
01017   vl_size totNumRestartedCenters = 0 ;
01018   vl_size numRestartedCenters = 0 ;
01019 
01020   vl_uint32 * assignments = vl_malloc (sizeof(vl_uint32) * numData) ;
01021   vl_size * clusterMasses = vl_malloc (sizeof(vl_size) * numData) ;
01022   TYPE * distances = vl_malloc (sizeof(TYPE) * numData) ;
01023 
01024   if (self->distance == VlDistanceL1) {
01025     permutations = vl_malloc(sizeof(vl_uint32) * numData * self->dimension) ;
01026     numSeenSoFar = vl_malloc(sizeof(vl_size) * self->numCenters) ;
01027     VL_XCAT(_vl_kmeans_sort_data_helper_, SFX)(self, permutations, data, numData) ;
01028   }
01029 
01030   for (energy = VL_INFINITY_D,
01031        iteration = 0;
01032        1 ;
01033        ++ iteration) {
01034 
01035     /* assign data to cluters */
01036     VL_XCAT(_vl_kmeans_quantize_ann_, SFX)(self, assignments, distances, data, numData, iteration > 0) ;
01037 
01038     /* compute energy */
01039     energy = 0 ;
01040     for (x = 0 ; x < numData ; ++x) energy += distances[x] ;
01041     if (self->verbosity) {
01042       VL_PRINTF("kmeans: ANN iter %d: energy = %g\n", iteration,
01043                 energy) ;
01044     }
01045 
01046     /* check termination conditions */
01047     if (iteration >= self->maxNumIterations) {
01048       if (self->verbosity) {
01049         VL_PRINTF("kmeans: ANN terminating because the maximum number of iterations has been reached\n") ;
01050       }
01051       break ;
01052     }
01053     if (energy == previousEnergy) {
01054       if (self->verbosity) {
01055         VL_PRINTF("kmeans: ANN terminating because the algorithm fully converged\n") ;
01056       }
01057       break ;
01058     }
01059     
01060     if (iteration == 0) {
01061       initialEnergy = energy ;
01062     } else {
01063       double eps = (previousEnergy - energy) / (initialEnergy - energy) ;
01064       if (eps < self->minEnergyVariation) {
01065         if (self->verbosity) {
01066           VL_PRINTF("kmeans: ANN terminating because the energy relative variation was less than %f\n", self->minEnergyVariation) ;
01067         }
01068         break ;
01069       }
01070     }
01071 
01072     /* begin next iteration */
01073     previousEnergy = energy ;
01074 
01075     /* update clusters */
01076     memset(clusterMasses, 0, sizeof(vl_size) * numData) ;
01077     for (x = 0 ; x < numData ; ++x) {
01078       clusterMasses[assignments[x]] ++ ;
01079     }
01080 
01081     numRestartedCenters = 0 ;
01082     switch (self->distance) {
01083       case VlDistanceL2:
01084         memset(self->centers, 0, sizeof(TYPE) * self->dimension * self->numCenters) ;
01085         for (x = 0 ; x < numData ; ++x) {
01086           TYPE * cpt = (TYPE*)self->centers + assignments[x] * self->dimension ;
01087           TYPE const * xpt = data + x * self->dimension ;
01088           for (d = 0 ; d < self->dimension ; ++d) {
01089             cpt[d] += xpt[d] ;
01090           }
01091         }
01092         for (c = 0 ; c < self->numCenters ; ++c) {
01093           TYPE * cpt = (TYPE*)self->centers + c * self->dimension ;
01094           if (clusterMasses[c] > 0) {
01095             TYPE mass = clusterMasses[c] ;
01096             for (d = 0 ; d < self->dimension ; ++d) {
01097               cpt[d] /= mass ;
01098             }
01099           } else {
01100             vl_uindex x = vl_rand_uindex(rand, numData) ;
01101             numRestartedCenters ++ ;
01102             for (d = 0 ; d < self->dimension ; ++d) {
01103               cpt[d] = data[x * self->dimension + d] ;
01104             }
01105           }
01106         }
01107         break ;
01108       case VlDistanceL1:
01109         for (d = 0 ; d < self->dimension ; ++d) {
01110           vl_uint32 * perm = permutations + d * numData ;
01111           memset(numSeenSoFar, 0, sizeof(vl_size) * self->numCenters) ;
01112           for (x = 0; x < numData ; ++x) {
01113             c = assignments[perm[x]] ;
01114             if (2 * numSeenSoFar[c] < clusterMasses[c]) {
01115               ((TYPE*)self->centers) [d + c * self->dimension] =
01116                 data [d + perm[x] * self->dimension] ;
01117             }
01118             numSeenSoFar[c] ++ ;
01119           }
01120           /* restart the centers as required  */
01121           for (c = 0 ; c < self->numCenters ; ++c) {
01122             if (clusterMasses[c] == 0) {
01123               TYPE * cpt = (TYPE*)self->centers + c * self->dimension ;
01124               vl_uindex x = vl_rand_uindex(rand, numData) ;
01125               numRestartedCenters ++ ;
01126               for (d = 0 ; d < self->dimension ; ++d) {
01127                 cpt[d] = data[x * self->dimension + d] ;
01128               }
01129             }
01130           }
01131         }
01132         break ;
01133       default:
01134         VL_PRINT("bad distance set: %d\n",self->distance);
01135         abort();
01136     } /* done compute centers */
01137 
01138     totNumRestartedCenters += numRestartedCenters ;
01139     if (self->verbosity && numRestartedCenters) {
01140       VL_PRINTF("kmeans: ANN iter %d: restarted %d centers\n", iteration,
01141                 numRestartedCenters) ;
01142     }
01143   }
01144 
01145   if (permutations) {
01146     vl_free(permutations) ;
01147   }
01148   if (numSeenSoFar) {
01149     vl_free(numSeenSoFar) ;
01150   }
01151 
01152   vl_free(distances) ;
01153   vl_free(assignments) ;
01154   vl_free(clusterMasses) ;
01155   return energy ;
01156 }
01157 
01158 /* ---------------------------------------------------------------- */
01159 /*                                                 Elkan refinement */
01160 /* ---------------------------------------------------------------- */
01161 
01162 static double
01163 VL_XCAT(_vl_kmeans_refine_centers_elkan_, SFX)
01164 (VlKMeans * self,
01165  TYPE const * data,
01166  vl_size numData)
01167 {
01168   vl_size d, iteration ;
01169   vl_index x ;
01170   vl_uint32 c, j ;
01171   vl_bool allDone ;
01172   TYPE * distances = vl_malloc (sizeof(TYPE) * numData) ;
01173   vl_uint32 * assignments = vl_malloc (sizeof(vl_uint32) * numData) ;
01174   vl_size * clusterMasses = vl_malloc (sizeof(vl_size) * numData) ;
01175   VlRand * rand = vl_get_rand () ;
01176 
01177 #if (FLT == VL_TYPE_FLOAT)
01178   VlFloatVectorComparisonFunction distFn = vl_get_vector_comparison_function_f(self->distance) ;
01179 #else
01180   VlDoubleVectorComparisonFunction distFn = vl_get_vector_comparison_function_d(self->distance) ;
01181 #endif
01182 
01183   TYPE * nextCenterDistances = vl_malloc (sizeof(TYPE) * self->numCenters) ;
01184   TYPE * pointToClosestCenterUB = vl_malloc (sizeof(TYPE) * numData) ;
01185   vl_bool * pointToClosestCenterUBIsStrict = vl_malloc (sizeof(vl_bool) * numData) ;
01186   TYPE * pointToCenterLB = vl_malloc (sizeof(TYPE) * numData * self->numCenters) ;
01187   TYPE * newCenters = vl_malloc(sizeof(TYPE) * self->dimension * self->numCenters) ;
01188   TYPE * centerToNewCenterDistances = vl_malloc (sizeof(TYPE) * self->numCenters) ;
01189 
01190   vl_uint32 * permutations = NULL ;
01191   vl_size * numSeenSoFar = NULL ;
01192 
01193   double energy ;
01194 
01195   vl_size totDistanceComputationsToInit = 0 ;
01196   vl_size totDistanceComputationsToRefreshUB = 0 ;
01197   vl_size totDistanceComputationsToRefreshLB = 0 ;
01198   vl_size totDistanceComputationsToRefreshCenterDistances = 0 ;
01199   vl_size totDistanceComputationsToNewCenters = 0 ;
01200   vl_size totDistanceComputationsToFinalize = 0 ;
01201   vl_size totNumRestartedCenters = 0 ;
01202 
01203   if (self->distance == VlDistanceL1) {
01204     permutations = vl_malloc(sizeof(vl_uint32) * numData * self->dimension) ;
01205     numSeenSoFar = vl_malloc(sizeof(vl_size) * self->numCenters) ;
01206     VL_XCAT(_vl_kmeans_sort_data_helper_, SFX)(self, permutations, data, numData) ;
01207   }
01208 
01209   /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */
01210   /*                          Initialization                        */
01211   /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */
01212 
01213   /* An iteration is: get_new_centers + reassign + get_energy.
01214    This counts as iteration 0, where get_new_centers is assumed
01215    to be performed before calling the train function by
01216    the initialization function */
01217 
01218   /* update distances between centers */
01219   totDistanceComputationsToInit +=
01220   VL_XCAT(_vl_kmeans_update_center_distances_, SFX)(self) ;
01221 
01222   /* assigmen points to the initial centers and initialize bounds */
01223   memset(pointToCenterLB, 0, sizeof(TYPE) * self->numCenters *  numData) ;
01224   for (x = 0 ; x < (signed)numData ; ++x) {
01225     TYPE distance ;
01226 
01227     /* do the first center */
01228     assignments[x] = 0 ;
01229     distance = distFn(self->dimension,
01230                       data + x * self->dimension,
01231                       (TYPE*)self->centers + 0) ;
01232     pointToClosestCenterUB[x] = distance ;
01233     pointToClosestCenterUBIsStrict[x] = VL_TRUE ;
01234     pointToCenterLB[0 + x * self->numCenters] = distance ;
01235     totDistanceComputationsToInit += 1 ;
01236 
01237     /* do other centers */
01238     for (c = 1 ; c < self->numCenters ; ++c) {
01239 
01240       /* Can skip if the center assigned so far is twice as close
01241        as its distance to the center under consideration */
01242 
01243       if (((self->distance == VlDistanceL1) ? 2.0 : 4.0) *
01244           pointToClosestCenterUB[x] <=
01245           ((TYPE*)self->centerDistances)
01246           [c + assignments[x] * self->numCenters]) {
01247         continue ;
01248       }
01249 
01250       distance = distFn(self->dimension,
01251                         data + x * self->dimension,
01252                         (TYPE*)self->centers + c * self->dimension) ;
01253       pointToCenterLB[c + x * self->numCenters] = distance ;
01254       totDistanceComputationsToInit += 1 ;
01255       if (distance < pointToClosestCenterUB[x]) {
01256         pointToClosestCenterUB[x] = distance ;
01257         assignments[x] = c ;
01258       }
01259     }
01260   }
01261 
01262   /* compute UB on energy */
01263   energy = 0 ;
01264   for (x = 0 ; x < (signed)numData ; ++x) {
01265     energy += pointToClosestCenterUB[x] ;
01266   }
01267 
01268   if (self->verbosity) {
01269     VL_PRINTF("kmeans: Elkan iter 0: energy = %g, dist. calc. = %d\n",
01270               energy, totDistanceComputationsToInit) ;
01271   }
01272 
01273   /* #define SANITY*/
01274 #ifdef SANITY
01275   {
01276     int xx ;
01277     int cc ;
01278     TYPE tol = 1e-5 ;
01279     VL_PRINTF("inconsistencies after initial assignments:\n");
01280     for (xx = 0 ; xx < numData ; ++xx) {
01281       for (cc = 0 ; cc < self->numCenters ; ++cc) {
01282         TYPE a = pointToCenterLB[cc + xx * self->numCenters] ;
01283         TYPE b = distFn(self->dimension,
01284                         data + self->dimension * xx,
01285                         (TYPE*)self->centers + self->dimension * cc) ;
01286         if (cc == assignments[xx]) {
01287           TYPE z = pointToClosestCenterUB[xx] ;
01288           if (z+tol<b) VL_PRINTF("UB %d %d = %f < %f\n",
01289                                  cc, xx, z, b) ;
01290         }
01291         if (a>b+tol) VL_PRINTF("LB %d %d = %f  > %f\n",
01292                                cc, xx, a, b) ;
01293       }
01294     }
01295   }
01296 #endif
01297 
01298   /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */
01299   /*                          Iterations                            */
01300   /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */
01301 
01302   for (iteration = 1 ; 1; ++iteration) {
01303 
01304     vl_size numDistanceComputationsToRefreshUB = 0 ;
01305     vl_size numDistanceComputationsToRefreshLB = 0 ;
01306     vl_size numDistanceComputationsToRefreshCenterDistances = 0 ;
01307     vl_size numDistanceComputationsToNewCenters = 0 ;
01308     vl_size numRestartedCenters = 0 ;
01309 
01310     /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */
01311     /*                         Compute new centers                  */
01312     /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */
01313 
01314     memset(clusterMasses, 0, sizeof(vl_size) * numData) ;
01315     for (x = 0 ; x < (signed)numData ; ++x) {
01316       clusterMasses[assignments[x]] ++ ;
01317     }
01318 
01319     switch (self->distance) {
01320       case VlDistanceL2:
01321         memset(newCenters, 0, sizeof(TYPE) * self->dimension * self->numCenters) ;
01322         for (x = 0 ; x < (signed)numData ; ++x) {
01323           TYPE * cpt = newCenters + assignments[x] * self->dimension ;
01324           TYPE const * xpt = data + x * self->dimension ;
01325           for (d = 0 ; d < self->dimension ; ++d) {
01326             cpt[d] += xpt[d] ;
01327           }
01328         }
01329         for (c = 0 ; c < self->numCenters ; ++c) {
01330           TYPE * cpt = newCenters + c * self->dimension ;
01331           if (clusterMasses[c] > 0) {
01332             TYPE mass = clusterMasses[c] ;
01333             for (d = 0 ; d < self->dimension ; ++d) {
01334               cpt[d] /= mass ;
01335             }
01336           } else {
01337             /* restart the center */
01338             vl_uindex x = vl_rand_uindex(rand, numData) ;
01339             numRestartedCenters ++ ;
01340             for (d = 0 ; d < self->dimension ; ++d) {
01341               cpt[d] = data[x * self->dimension + d] ;
01342             }
01343           }
01344         }
01345         break ;
01346       case VlDistanceL1:
01347         for (d = 0 ; d < self->dimension ; ++d) {
01348           vl_uint32 * perm = permutations + d * numData ;
01349           memset(numSeenSoFar, 0, sizeof(vl_size) * self->numCenters) ;
01350           for (x = 0; x < (signed)numData ; ++x) {
01351             c = assignments[perm[x]] ;
01352             if (2 * numSeenSoFar[c] < clusterMasses[c]) {
01353               newCenters [d + c * self->dimension] =
01354               data [d + perm[x] * self->dimension] ;
01355             }
01356             numSeenSoFar[c] ++ ;
01357           }
01358         }
01359         /* restart the centers as required  */
01360         for (c = 0 ; c < self->numCenters ; ++c) {
01361           if (clusterMasses[c] == 0) {
01362             TYPE * cpt = newCenters + c * self->dimension ;
01363             vl_uindex x = vl_rand_uindex(rand, numData) ;
01364             numRestartedCenters ++ ;
01365             for (d = 0 ; d < self->dimension ; ++d) {
01366               cpt[d] = data[x * self->dimension + d] ;
01367             }
01368           }
01369         }
01370         break ;
01371       default:
01372         abort();
01373     } /* done compute centers */
01374 
01375     /* compute the distance from the old centers to the new centers */
01376     for (c = 0 ; c < self->numCenters ; ++c) {
01377       TYPE distance = distFn(self->dimension,
01378                              newCenters + c * self->dimension,
01379                              (TYPE*)self->centers + c * self->dimension) ;
01380       centerToNewCenterDistances[c] = distance ;
01381       numDistanceComputationsToNewCenters += 1 ;
01382     }
01383 
01384     /* make the new centers current */
01385     {
01386       TYPE * tmp = self->centers ;
01387       self->centers = newCenters ;
01388       newCenters = tmp ;
01389     }
01390 
01391     /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */
01392     /*                Reassign points to a centers                  */
01393     /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */
01394 
01395     /*
01396      Update distances between centers.
01397      */
01398     numDistanceComputationsToRefreshCenterDistances
01399     += VL_XCAT(_vl_kmeans_update_center_distances_, SFX)(self) ;
01400 
01401     for (c = 0 ; c < self->numCenters ; ++c) {
01402       nextCenterDistances[c] = (TYPE) VL_INFINITY_D ;
01403       for (j = 0 ; j < self->numCenters ; ++j) {
01404         if (j == c) continue ;
01405         nextCenterDistances[c] = VL_MIN(nextCenterDistances[c],
01406                                         ((TYPE*)self->centerDistances)
01407                                         [j + c * self->numCenters]) ;
01408       }
01409     }
01410 
01411     /*
01412      Update upper bounds on point-to-closest-center distances
01413      based on the center variation.
01414      */
01415     for (x = 0 ; x < (signed)numData ; ++x) {
01416       TYPE a = pointToClosestCenterUB[x] ;
01417       TYPE b = centerToNewCenterDistances[assignments[x]] ;
01418       if (self->distance == VlDistanceL1) {
01419         pointToClosestCenterUB[x] = a + b ;
01420       } else {
01421 #if (FLT == VL_TYPE_FLOAT)
01422         TYPE sqrtab =  sqrtf (a * b) ;
01423 #else
01424         TYPE sqrtab =  sqrt (a * b) ;
01425 #endif
01426         pointToClosestCenterUB[x] = a + b + 2.0 * sqrtab ;
01427       }
01428       pointToClosestCenterUBIsStrict[x] = VL_FALSE ;
01429     }
01430 
01431     /*
01432      Update lower bounds on point-to-center distances
01433      based on the center variation.
01434      */
01435 
01436 #if defined(_OPENMP)
01437 #pragma omp parallel for default(shared) private(x,c) num_threads(vl_get_max_threads())
01438 #endif
01439     for (x = 0 ; x < (signed)numData ; ++x) {
01440       for (c = 0 ; c < self->numCenters ; ++c) {
01441         TYPE a = pointToCenterLB[c + x * self->numCenters] ;
01442         TYPE b = centerToNewCenterDistances[c] ;
01443         if (a < b) {
01444           pointToCenterLB[c + x * self->numCenters] = 0 ;
01445         } else {
01446           if (self->distance == VlDistanceL1) {
01447             pointToCenterLB[c + x * self->numCenters]  = a - b ;
01448           } else {
01449 #if (FLT == VL_TYPE_FLOAT)
01450             TYPE sqrtab =  sqrtf (a * b) ;
01451 #else
01452             TYPE sqrtab =  sqrt (a * b) ;
01453 #endif
01454             pointToCenterLB[c + x * self->numCenters]  = a + b - 2.0 * sqrtab ;
01455           }
01456         }
01457       }
01458     }
01459 
01460 #ifdef SANITY
01461     {
01462       int xx ;
01463       int cc ;
01464       TYPE tol = 1e-5 ;
01465       VL_PRINTF("inconsistencies before assignments:\n");
01466       for (xx = 0 ; xx < numData ; ++xx) {
01467         for (cc = 0 ; cc < self->numCenters ; ++cc) {
01468           TYPE a = pointToCenterLB[cc + xx * self->numCenters] ;
01469           TYPE b = distFn(self->dimension,
01470                           data + self->dimension * xx,
01471                           (TYPE*)self->centers + self->dimension * cc) ;
01472           if (cc == assignments[xx]) {
01473             TYPE z = pointToClosestCenterUB[xx] ;
01474             if (z+tol<b) VL_PRINTF("UB %d %d = %f < %f\n",
01475                                    cc, xx, z, b) ;
01476           }
01477           if (a>b+tol) VL_PRINTF("LB %d %d = %f  > %f (assign = %d)\n",
01478                                  cc, xx, a, b, assignments[xx]) ;
01479         }
01480       }
01481     }
01482 #endif
01483 
01484     /*
01485      Scan the data and do the reassignments. Use the bounds to
01486      skip as many point-to-center distance calculations as possible.
01487      */
01488     allDone = VL_TRUE ;
01489 
01490 #if defined(_OPENMP)
01491 #pragma omp parallel for \
01492             default(none) \
01493             shared(self,numData, \
01494               pointToClosestCenterUB,pointToCenterLB, \
01495               nextCenterDistances,pointToClosestCenterUBIsStrict, \
01496               assignments,data,distFn,allDone) \
01497             private(c,x) \
01498             reduction(+:numDistanceComputationsToRefreshUB,numDistanceComputationsToRefreshLB) \
01499             num_threads(vl_get_max_threads())
01500 #endif
01501     for (x = 0 ; x < (signed)numData ; ++ x) {
01502       /*
01503        A point x sticks with its current center assignmets[x]
01504        the UB to d(x, c[assigmnets[x]]) is not larger than half
01505        the distance of c[assigments[x]] to any other center c.
01506        */
01507       if (((self->distance == VlDistanceL1) ? 2.0 : 4.0) *
01508           pointToClosestCenterUB[x] <= nextCenterDistances[assignments[x]]) {
01509         continue ;
01510       }
01511 
01512       for (c = 0 ; c < self->numCenters ; ++c) {
01513         vl_uint32 cx = assignments[x] ;
01514         TYPE distance ;
01515 
01516         /* The point is not reassigned to a given center c
01517          if either:
01518 
01519          0 - c is already the assigned center
01520          1 - The UB of d(x, c[assignments[x]]) is smaller than half
01521          the distance of c[assigments[x]] to c, OR
01522          2 - The UB of d(x, c[assignmets[x]]) is smaller than the
01523          LB of the distance of x to c.
01524          */
01525         if (cx == c) {
01526           continue ;
01527         }
01528         if (((self->distance == VlDistanceL1) ? 2.0 : 4.0) *
01529             pointToClosestCenterUB[x] <= ((TYPE*)self->centerDistances)
01530             [c + cx * self->numCenters]) {
01531           continue ;
01532         }
01533         if (pointToClosestCenterUB[x] <= pointToCenterLB
01534             [c + x * self->numCenters]) {
01535           continue ;
01536         }
01537 
01538         /* If the UB is loose, try recomputing it and test again */
01539         if (! pointToClosestCenterUBIsStrict[x]) {
01540           distance = distFn(self->dimension,
01541                             data + self->dimension * x,
01542                             (TYPE*)self->centers + self->dimension * cx) ;
01543           pointToClosestCenterUB[x] = distance ;
01544           pointToClosestCenterUBIsStrict[x] = VL_TRUE ;
01545           pointToCenterLB[cx + x * self->numCenters] = distance ;
01546           numDistanceComputationsToRefreshUB += 1 ;
01547 
01548           if (((self->distance == VlDistanceL1) ? 2.0 : 4.0) *
01549               pointToClosestCenterUB[x] <= ((TYPE*)self->centerDistances)
01550               [c + cx * self->numCenters]) {
01551             continue ;
01552           }
01553           if (pointToClosestCenterUB[x] <= pointToCenterLB
01554               [c + x * self->numCenters]) {
01555             continue ;
01556           }
01557         }
01558 
01559         /*
01560          Now the UB is strict (equal to d(x, assignments[x])), but
01561          we still could not exclude that x should be reassigned to
01562          c. We therefore compute the distance, update the LB,
01563          and check if a reassigmnet must be made
01564          */
01565         distance = distFn(self->dimension,
01566                           data + x * self->dimension,
01567                           (TYPE*)self->centers + c *  self->dimension) ;
01568         numDistanceComputationsToRefreshLB += 1 ;
01569         pointToCenterLB[c + x * self->numCenters] = distance ;
01570 
01571         if (distance < pointToClosestCenterUB[x]) {
01572           assignments[x] = c ;
01573           pointToClosestCenterUB[x] = distance ;
01574           allDone = VL_FALSE ;
01575           /* the UB strict flag is already set here */
01576         }
01577 
01578       } /* assign center */
01579     } /* next data point */
01580 
01581 
01582     totDistanceComputationsToRefreshUB
01583     += numDistanceComputationsToRefreshUB ;
01584 
01585     totDistanceComputationsToRefreshLB
01586     += numDistanceComputationsToRefreshLB ;
01587 
01588     totDistanceComputationsToRefreshCenterDistances
01589     += numDistanceComputationsToRefreshCenterDistances ;
01590 
01591     totDistanceComputationsToNewCenters
01592     += numDistanceComputationsToNewCenters ;
01593 
01594     totNumRestartedCenters
01595     += numRestartedCenters ;
01596 
01597 #ifdef SANITY
01598     {
01599       int xx ;
01600       int cc ;
01601       TYPE tol = 1e-5 ;
01602       VL_PRINTF("inconsistencies after assignments:\n");
01603       for (xx = 0 ; xx < numData ; ++xx) {
01604         for (cc = 0 ; cc < self->numCenters ; ++cc) {
01605           TYPE a = pointToCenterLB[cc + xx * self->numCenters] ;
01606           TYPE b = distFn(self->dimension,
01607                           data + self->dimension * xx,
01608                           (TYPE*)self->centers + self->dimension * cc) ;
01609           if (cc == assignments[xx]) {
01610             TYPE z = pointToClosestCenterUB[xx] ;
01611             if (z+tol<b) VL_PRINTF("UB %d %d = %f < %f\n",
01612                                    cc, xx, z, b) ;
01613           }
01614           if (a>b+tol) VL_PRINTF("LB %d %d = %f  > %f (assign = %d)\n",
01615                                  cc, xx, a, b, assignments[xx]) ;
01616         }
01617       }
01618     }
01619 #endif
01620 
01621     /* compute UB on energy */
01622     energy = 0 ;
01623     for (x = 0 ; x < (signed)numData ; ++x) {
01624       energy += pointToClosestCenterUB[x] ;
01625     }
01626 
01627     if (self->verbosity) {
01628       vl_size numDistanceComputations =
01629       numDistanceComputationsToRefreshUB +
01630       numDistanceComputationsToRefreshLB +
01631       numDistanceComputationsToRefreshCenterDistances +
01632       numDistanceComputationsToNewCenters ;
01633       VL_PRINTF("kmeans: Elkan iter %d: energy <= %g, dist. calc. = %d\n",
01634                 iteration,
01635                 energy,
01636                 numDistanceComputations) ;
01637       if (numRestartedCenters) {
01638         VL_PRINTF("kmeans: Elkan iter %d: restarted %d centers\n",
01639                   iteration,
01640                   energy,
01641                   numRestartedCenters) ;
01642       }
01643       if (self->verbosity > 1) {
01644         VL_PRINTF("kmeans: Elkan iter %d: total dist. calc. per type: "
01645                   "UB: %.1f%% (%d), LB: %.1f%% (%d), "
01646                   "intra_center: %.1f%% (%d), "
01647                   "new_center: %.1f%% (%d)\n",
01648                   iteration,
01649                   100.0 * numDistanceComputationsToRefreshUB / numDistanceComputations,
01650                   numDistanceComputationsToRefreshUB,
01651                   100.0 *numDistanceComputationsToRefreshLB / numDistanceComputations,
01652                   numDistanceComputationsToRefreshLB,
01653                   100.0 * numDistanceComputationsToRefreshCenterDistances / numDistanceComputations,
01654                   numDistanceComputationsToRefreshCenterDistances,
01655                   100.0 * numDistanceComputationsToNewCenters / numDistanceComputations,
01656                   numDistanceComputationsToNewCenters) ;
01657       }
01658     }
01659 
01660     /* check termination conditions */
01661     if (iteration >= self->maxNumIterations) {
01662       if (self->verbosity) {
01663         VL_PRINTF("kmeans: Elkan terminating because maximum number of iterations reached\n") ;
01664       }
01665       break ;
01666     }
01667     if (allDone) {
01668       if (self->verbosity) {
01669         VL_PRINTF("kmeans: Elkan terminating because the algorithm fully converged\n") ;
01670       }
01671       break ;
01672     }
01673 
01674   } /* next Elkan iteration */
01675 
01676   /* compute true energy */
01677   energy = 0 ;
01678   for (x = 0 ; x < (signed)numData ; ++ x) {
01679     vl_uindex cx = assignments [x] ;
01680     energy += distFn(self->dimension,
01681                      data + self->dimension * x,
01682                      (TYPE*)self->centers + self->dimension * cx) ;
01683     totDistanceComputationsToFinalize += 1 ;
01684   }
01685 
01686   {
01687     vl_size totDistanceComputations =
01688     totDistanceComputationsToInit +
01689     totDistanceComputationsToRefreshUB +
01690     totDistanceComputationsToRefreshLB +
01691     totDistanceComputationsToRefreshCenterDistances +
01692     totDistanceComputationsToNewCenters +
01693     totDistanceComputationsToFinalize ;
01694 
01695     double saving = (double)totDistanceComputations
01696     / (iteration * self->numCenters * numData) ;
01697 
01698     if (self->verbosity) {
01699       VL_PRINTF("kmeans: Elkan: total dist. calc.: %d (%.2f %% of Lloyd)\n",
01700                 totDistanceComputations, saving * 100.0) ;
01701       if (totNumRestartedCenters) {
01702         VL_PRINTF("kmeans: Elkan: there have been %d restarts\n",
01703                   totNumRestartedCenters) ;
01704       }
01705     }
01706 
01707     if (self->verbosity > 1) {
01708       VL_PRINTF("kmeans: Elkan: total dist. calc. per type: "
01709                 "init: %.1f%% (%d), UB: %.1f%% (%d), LB: %.1f%% (%d), "
01710                 "intra_center: %.1f%% (%d), "
01711                 "new_center: %.1f%% (%d), "
01712                 "finalize: %.1f%% (%d)\n",
01713                 100.0 * totDistanceComputationsToInit / totDistanceComputations,
01714                 totDistanceComputationsToInit,
01715                 100.0 * totDistanceComputationsToRefreshUB / totDistanceComputations,
01716                 totDistanceComputationsToRefreshUB,
01717                 100.0 *totDistanceComputationsToRefreshLB / totDistanceComputations,
01718                 totDistanceComputationsToRefreshLB,
01719                 100.0 * totDistanceComputationsToRefreshCenterDistances / totDistanceComputations,
01720                 totDistanceComputationsToRefreshCenterDistances,
01721                 100.0 * totDistanceComputationsToNewCenters / totDistanceComputations,
01722                 totDistanceComputationsToNewCenters,
01723                 100.0 * totDistanceComputationsToFinalize / totDistanceComputations,
01724                 totDistanceComputationsToFinalize) ;
01725     }
01726   }
01727 
01728   if (permutations) {
01729     vl_free(permutations) ;
01730   }
01731   if (numSeenSoFar) {
01732     vl_free(numSeenSoFar) ;
01733   }
01734 
01735   vl_free(distances) ;
01736   vl_free(assignments) ;
01737   vl_free(clusterMasses) ;
01738 
01739   vl_free(nextCenterDistances) ;
01740   vl_free(pointToClosestCenterUB) ;
01741   vl_free(pointToClosestCenterUBIsStrict) ;
01742   vl_free(pointToCenterLB) ;
01743   vl_free(newCenters) ;
01744   vl_free(centerToNewCenterDistances) ;
01745 
01746   return energy ;
01747 }
01748 
01749 /* ---------------------------------------------------------------- */
01750 static double
01751 VL_XCAT(_vl_kmeans_refine_centers_, SFX)
01752 (VlKMeans * self,
01753  TYPE const * data,
01754  vl_size numData)
01755 {
01756   switch (self->algorithm) {
01757     case VlKMeansLloyd:
01758       return
01759         VL_XCAT(_vl_kmeans_refine_centers_lloyd_, SFX)(self, data, numData) ;
01760       break ;
01761     case VlKMeansElkan:
01762       return
01763         VL_XCAT(_vl_kmeans_refine_centers_elkan_, SFX)(self, data, numData) ;
01764       break ;
01765     case VlKMeansANN:
01766       return
01767         VL_XCAT(_vl_kmeans_refine_centers_ann_, SFX)(self, data, numData) ;
01768       break ;
01769     default:
01770       abort() ;
01771   }
01772 }
01773 
01774 /* VL_KMEANS_INSTANTIATING */
01775 #else
01776 
01777 #ifndef __DOXYGEN__
01778 #define FLT VL_TYPE_FLOAT
01779 #define TYPE float
01780 #define SFX f
01781 #define VL_KMEANS_INSTANTIATING
01782 #include "kmeans.c"
01783 
01784 #define FLT VL_TYPE_DOUBLE
01785 #define TYPE double
01786 #define SFX d
01787 #define VL_KMEANS_INSTANTIATING
01788 #include "kmeans.c"
01789 #endif
01790 
01791 /* VL_KMEANS_INSTANTIATING */
01792 #endif
01793 
01794 /* ================================================================ */
01795 #ifndef VL_KMEANS_INSTANTIATING
01796 
01805 VL_EXPORT void
01806 vl_kmeans_set_centers
01807 (VlKMeans * self,
01808  void const * centers,
01809  vl_size dimension,
01810  vl_size numCenters)
01811 {
01812   vl_kmeans_reset (self) ;
01813 
01814   switch (self->dataType) {
01815     case VL_TYPE_FLOAT :
01816       _vl_kmeans_set_centers_f
01817       (self, (float const *)centers, dimension, numCenters) ;
01818       break ;
01819     case VL_TYPE_DOUBLE :
01820       _vl_kmeans_set_centers_d
01821       (self, (double const *)centers, dimension, numCenters) ;
01822       break ;
01823     default:
01824       abort() ;
01825   }
01826 }
01827 
01840 VL_EXPORT void
01841 vl_kmeans_init_centers_with_rand_data
01842 (VlKMeans * self,
01843  void const * data,
01844  vl_size dimension,
01845  vl_size numData,
01846  vl_size numCenters)
01847 {
01848   vl_kmeans_reset (self) ;
01849 
01850   switch (self->dataType) {
01851     case VL_TYPE_FLOAT :
01852       _vl_kmeans_init_centers_with_rand_data_f
01853       (self, (float const *)data, dimension, numData, numCenters) ;
01854       break ;
01855     case VL_TYPE_DOUBLE :
01856       _vl_kmeans_init_centers_with_rand_data_d
01857       (self, (double const *)data, dimension, numData, numCenters) ;
01858       break ;
01859     default:
01860       abort() ;
01861   }
01862 }
01863 
01873 VL_EXPORT void
01874 vl_kmeans_init_centers_plus_plus
01875 (VlKMeans * self,
01876  void const * data,
01877  vl_size dimension,
01878  vl_size numData,
01879  vl_size numCenters)
01880 {
01881   vl_kmeans_reset (self) ;
01882 
01883   switch (self->dataType) {
01884     case VL_TYPE_FLOAT :
01885       _vl_kmeans_init_centers_plus_plus_f
01886       (self, (float const *)data, dimension, numData, numCenters) ;
01887       break ;
01888     case VL_TYPE_DOUBLE :
01889       _vl_kmeans_init_centers_plus_plus_d
01890       (self, (double const *)data, dimension, numData, numCenters) ;
01891       break ;
01892     default:
01893       abort() ;
01894   }
01895 }
01896 
01906 VL_EXPORT void
01907 vl_kmeans_quantize
01908 (VlKMeans * self,
01909  vl_uint32 * assignments,
01910  void * distances,
01911  void const * data,
01912  vl_size numData)
01913 {
01914   switch (self->dataType) {
01915     case VL_TYPE_FLOAT :
01916       _vl_kmeans_quantize_f
01917       (self, assignments, distances, (float const *)data, numData) ;
01918       break ;
01919     case VL_TYPE_DOUBLE :
01920       _vl_kmeans_quantize_d
01921       (self, assignments, distances, (double const *)data, numData) ;
01922       break ;
01923     default:
01924       abort() ;
01925   }
01926 }
01927 
01946 VL_EXPORT void
01947 vl_kmeans_quantize_ann
01948 (VlKMeans * self,
01949  vl_uint32 * assignments,
01950  void * distances,
01951  void const * data,
01952  vl_size numData,
01953  vl_bool update)
01954 {
01955   switch (self->dataType) {
01956     case VL_TYPE_FLOAT :
01957       _vl_kmeans_quantize_ann_f
01958       (self, assignments, distances, (float const *)data, numData, update) ;
01959       break ;
01960     case VL_TYPE_DOUBLE :
01961       _vl_kmeans_quantize_ann_d
01962       (self, assignments, distances, (double const *)data, numData, update) ;
01963       break ;
01964     default:
01965       abort() ;
01966   }
01967 }
01968 
01983 VL_EXPORT double
01984 vl_kmeans_refine_centers
01985 (VlKMeans * self,
01986  void const * data,
01987  vl_size numData)
01988 {
01989   assert (self->centers) ;
01990 
01991   switch (self->dataType) {
01992     case VL_TYPE_FLOAT :
01993       return
01994         _vl_kmeans_refine_centers_f
01995         (self, (float const *)data, numData) ;
01996     case VL_TYPE_DOUBLE :
01997       return
01998         _vl_kmeans_refine_centers_d
01999         (self, (double const *)data, numData) ;
02000     default:
02001       abort() ;
02002   }
02003 }
02004 
02005 
02023 VL_EXPORT double
02024 vl_kmeans_cluster (VlKMeans * self,
02025                    void const * data,
02026                    vl_size dimension,
02027                    vl_size numData,
02028                    vl_size numCenters)
02029 {
02030   vl_uindex repetition ;
02031   double bestEnergy = VL_INFINITY_D ;
02032   void * bestCenters = NULL ;
02033 
02034   for (repetition = 0 ; repetition < self->numRepetitions ; ++ repetition) {
02035     double energy ;
02036     double timeRef ;
02037 
02038     if (self->verbosity) {
02039       VL_PRINTF("kmeans: repetition %d of %d\n", repetition + 1, self->numRepetitions) ;
02040     }
02041 
02042     timeRef = vl_get_cpu_time() ;
02043     switch (self->initialization) {
02044       case VlKMeansRandomSelection :
02045         vl_kmeans_init_centers_with_rand_data (self,
02046                                                data, dimension, numData,
02047                                                numCenters) ;
02048         break ;
02049       case VlKMeansPlusPlus :
02050         vl_kmeans_init_centers_plus_plus (self,
02051                                           data, dimension, numData,
02052                                           numCenters) ;
02053         break ;
02054       default:
02055         abort() ;
02056     }
02057 
02058     if (self->verbosity) {
02059       VL_PRINTF("kmeans: K-means initialized in %.2f s\n",
02060                 vl_get_cpu_time() - timeRef) ;
02061     }
02062 
02063     timeRef = vl_get_cpu_time () ;
02064     energy = vl_kmeans_refine_centers (self, data, numData) ;
02065     if (self->verbosity) {
02066       VL_PRINTF("kmeans: K-means terminated in %.2f s with energy %g\n",
02067                 vl_get_cpu_time() - timeRef, energy) ;
02068     }
02069 
02070     /* copy centers to output if current solution is optimal */
02071     /* check repetition == 0 as well in case energy = NaN, which */
02072     /* can happen if the data contain NaNs */
02073     if (energy < bestEnergy || repetition == 0) {
02074       void * temp ;
02075       bestEnergy = energy ;
02076 
02077       if (bestCenters == NULL) {
02078         bestCenters = vl_malloc(vl_get_type_size(self->dataType) *
02079                                 self->dimension *
02080                                 self->numCenters) ;
02081       }
02082 
02083       /* swap buffers */
02084       temp = bestCenters ;
02085       bestCenters = self->centers ;
02086       self->centers = temp ;
02087     } /* better energy */
02088   } /* next repetition */
02089 
02090   vl_free (self->centers) ;
02091   self->centers = bestCenters ;
02092   return bestEnergy ;
02093 }
02094 
02095 /* VL_KMEANS_INSTANTIATING */
02096 #endif
02097 
02098 #undef SFX
02099 #undef TYPE
02100 #undef FLT
02101 #undef VL_KMEANS_INSTANTIATING


libvlfeat
Author(s): Andrea Vedaldi
autogenerated on Thu Jun 6 2019 20:25:51