00001
00006
00007
00008
00009
00010
00011
00012
00013
00014
00363 #include "kmeans.h"
00364 #include "generic.h"
00365 #include "mathop.h"
00366 #include <string.h>
00367
00368 #ifdef _OPENMP
00369 #include <omp.h>
00370 #endif
00371
00372
00373 #ifndef VL_KMEANS_INSTANTIATING
00374
00375
00385 VL_EXPORT void
00386 vl_kmeans_reset (VlKMeans * self)
00387 {
00388 self->numCenters = 0 ;
00389 self->dimension = 0 ;
00390
00391 if (self->centers) vl_free(self->centers) ;
00392 if (self->centerDistances) vl_free(self->centerDistances) ;
00393
00394 self->centers = NULL ;
00395 self->centerDistances = NULL ;
00396 }
00397
00405 VL_EXPORT VlKMeans *
00406 vl_kmeans_new (vl_type dataType,
00407 VlVectorComparisonType distance)
00408 {
00409 VlKMeans * self = vl_calloc(1, sizeof(VlKMeans)) ;
00410
00411 self->algorithm = VlKMeansLloyd ;
00412 self->distance = distance ;
00413 self->dataType = dataType ;
00414 self->verbosity = 0 ;
00415 self->maxNumIterations = 100 ;
00416 self->minEnergyVariation = 1e-4 ;
00417 self->numRepetitions = 1 ;
00418 self->centers = NULL ;
00419 self->centerDistances = NULL ;
00420 self->numTrees = 3;
00421 self->maxNumComparisons = 100;
00422
00423 vl_kmeans_reset (self) ;
00424 return self ;
00425 }
00426
00433 VL_EXPORT VlKMeans *
00434 vl_kmeans_new_copy (VlKMeans const * kmeans)
00435 {
00436 VlKMeans * self = vl_malloc(sizeof(VlKMeans)) ;
00437
00438 self->algorithm = kmeans->algorithm ;
00439 self->distance = kmeans->distance ;
00440 self->dataType = kmeans->dataType ;
00441
00442 self->verbosity = kmeans->verbosity ;
00443 self->maxNumIterations = kmeans->maxNumIterations ;
00444 self->numRepetitions = kmeans->numRepetitions ;
00445
00446 self->dimension = kmeans->dimension ;
00447 self->numCenters = kmeans->numCenters ;
00448 self->centers = NULL ;
00449 self->centerDistances = NULL ;
00450
00451 self->numTrees = kmeans->numTrees;
00452 self->maxNumComparisons = kmeans->maxNumComparisons;
00453
00454 if (kmeans->centers) {
00455 vl_size dataSize = vl_get_type_size(self->dataType) * self->dimension * self->numCenters ;
00456 self->centers = vl_malloc(dataSize) ;
00457 memcpy (self->centers, kmeans->centers, dataSize) ;
00458 }
00459
00460 if (kmeans->centerDistances) {
00461 vl_size dataSize = vl_get_type_size(self->dataType) * self->numCenters * self->numCenters ;
00462 self->centerDistances = vl_malloc(dataSize) ;
00463 memcpy (self->centerDistances, kmeans->centerDistances, dataSize) ;
00464 }
00465
00466 return self ;
00467 }
00468
00477 VL_EXPORT void
00478 vl_kmeans_delete (VlKMeans * self)
00479 {
00480 vl_kmeans_reset (self) ;
00481 vl_free (self) ;
00482 }
00483
00484
00485 typedef struct _VlKMeansSortWrapper {
00486 vl_uint32 * permutation ;
00487 void const * data ;
00488 vl_size stride ;
00489 } VlKMeansSortWrapper ;
00490
00491
00492
00493
00494
00495 #define VL_SHUFFLE_type vl_uindex
00496 #define VL_SHUFFLE_prefix _vl_kmeans
00497 #include "shuffle-def.h"
00498
00499
00500 #endif
00501
00502
00503 #ifdef VL_KMEANS_INSTANTIATING
00504
00505
00506
00507
00508
00509 static void
00510 VL_XCAT(_vl_kmeans_set_centers_, SFX)
00511 (VlKMeans * self,
00512 TYPE const * centers,
00513 vl_size dimension,
00514 vl_size numCenters)
00515 {
00516 self->dimension = dimension ;
00517 self->numCenters = numCenters ;
00518 self->centers = vl_malloc (sizeof(TYPE) * dimension * numCenters) ;
00519 memcpy ((TYPE*)self->centers, centers,
00520 sizeof(TYPE) * dimension * numCenters) ;
00521 }
00522
00523
00524
00525
00526
00527 static void
00528 VL_XCAT(_vl_kmeans_init_centers_with_rand_data_, SFX)
00529 (VlKMeans * self,
00530 TYPE const * data,
00531 vl_size dimension,
00532 vl_size numData,
00533 vl_size numCenters)
00534 {
00535 vl_uindex i, j, k ;
00536 VlRand * rand = vl_get_rand () ;
00537
00538 self->dimension = dimension ;
00539 self->numCenters = numCenters ;
00540 self->centers = vl_malloc (sizeof(TYPE) * dimension * numCenters) ;
00541
00542 {
00543 vl_uindex * perm = vl_malloc (sizeof(vl_uindex) * numData) ;
00544 #if (FLT == VL_TYPE_FLOAT)
00545 VlFloatVectorComparisonFunction distFn = vl_get_vector_comparison_function_f(self->distance) ;
00546 #else
00547 VlDoubleVectorComparisonFunction distFn = vl_get_vector_comparison_function_d(self->distance) ;
00548 #endif
00549 TYPE * distances = vl_malloc (sizeof(TYPE) * numCenters) ;
00550
00551
00552 for (i = 0 ; i < numData ; ++i) perm[i] = i ;
00553 _vl_kmeans_shuffle (perm, numData, rand) ;
00554
00555 for (k = 0, i = 0 ; k < numCenters ; ++ i) {
00556
00557
00558
00559
00560 if (numCenters - k < numData - i) {
00561 vl_bool duplicateDetected = VL_FALSE ;
00562 VL_XCAT(vl_eval_vector_comparison_on_all_pairs_, SFX)(distances,
00563 dimension,
00564 data + dimension * perm[i], 1,
00565 (TYPE*)self->centers, k,
00566 distFn) ;
00567 for (j = 0 ; j < k ; ++j) {
00568 duplicateDetected |= (distances[j] == 0) ;
00569 }
00570 if (duplicateDetected) continue ;
00571 }
00572
00573
00574 memcpy ((TYPE*)self->centers + dimension * k,
00575 data + dimension * perm[i],
00576 sizeof(TYPE) * dimension) ;
00577 k ++ ;
00578 }
00579 vl_free(distances) ;
00580 vl_free(perm) ;
00581 }
00582 }
00583
00584
00585
00586
00587
00588 static void
00589 VL_XCAT(_vl_kmeans_init_centers_plus_plus_, SFX)
00590 (VlKMeans * self,
00591 TYPE const * data,
00592 vl_size dimension,
00593 vl_size numData,
00594 vl_size numCenters)
00595 {
00596 vl_uindex x, c ;
00597 VlRand * rand = vl_get_rand () ;
00598 TYPE * distances = vl_malloc (sizeof(TYPE) * numData) ;
00599 TYPE * minDistances = vl_malloc (sizeof(TYPE) * numData) ;
00600 #if (FLT == VL_TYPE_FLOAT)
00601 VlFloatVectorComparisonFunction distFn = vl_get_vector_comparison_function_f(self->distance) ;
00602 #else
00603 VlDoubleVectorComparisonFunction distFn = vl_get_vector_comparison_function_d(self->distance) ;
00604 #endif
00605
00606 self->dimension = dimension ;
00607 self->numCenters = numCenters ;
00608 self->centers = vl_malloc (sizeof(TYPE) * dimension * numCenters) ;
00609
00610 for (x = 0 ; x < numData ; ++x) {
00611 minDistances[x] = (TYPE) VL_INFINITY_D ;
00612 }
00613
00614
00615 x = vl_rand_uindex (rand, numData) ;
00616 c = 0 ;
00617 while (1) {
00618 TYPE energy = 0 ;
00619 TYPE acc = 0 ;
00620 TYPE thresh = (TYPE) vl_rand_real1 (rand) ;
00621
00622 memcpy ((TYPE*)self->centers + c * dimension,
00623 data + x * dimension,
00624 sizeof(TYPE) * dimension) ;
00625
00626 c ++ ;
00627 if (c == numCenters) break ;
00628
00629 VL_XCAT(vl_eval_vector_comparison_on_all_pairs_, SFX)
00630 (distances,
00631 dimension,
00632 (TYPE*)self->centers + (c - 1) * dimension, 1,
00633 data, numData,
00634 distFn) ;
00635
00636 for (x = 0 ; x < numData ; ++x) {
00637 minDistances[x] = VL_MIN(minDistances[x], distances[x]) ;
00638 energy += minDistances[x] ;
00639 }
00640
00641 for (x = 0 ; x < numData - 1 ; ++x) {
00642 acc += minDistances[x] ;
00643 if (acc >= thresh * energy) break ;
00644 }
00645 }
00646
00647 vl_free(distances) ;
00648 vl_free(minDistances) ;
00649 }
00650
00651
00652
00653
00654
00655 static void
00656 VL_XCAT(_vl_kmeans_quantize_, SFX)
00657 (VlKMeans * self,
00658 vl_uint32 * assignments,
00659 TYPE * distances,
00660 TYPE const * data,
00661 vl_size numData)
00662 {
00663 vl_index i ;
00664
00665 #if (FLT == VL_TYPE_FLOAT)
00666 VlFloatVectorComparisonFunction distFn = vl_get_vector_comparison_function_f(self->distance) ;
00667 #else
00668 VlDoubleVectorComparisonFunction distFn = vl_get_vector_comparison_function_d(self->distance) ;
00669 #endif
00670
00671 #ifdef _OPENMP
00672 #pragma omp parallel default(none) \
00673 shared(self, distances, assignments, numData, distFn, data) \
00674 num_threads(vl_get_max_threads())
00675 #endif
00676 {
00677
00678 TYPE * distanceToCenters = malloc(sizeof(TYPE) * self->numCenters) ;
00679
00680 #ifdef _OPENMP
00681 #pragma omp for
00682 #endif
00683 for (i = 0 ; i < (signed)numData ; ++i) {
00684 vl_uindex k ;
00685 TYPE bestDistance = (TYPE) VL_INFINITY_D ;
00686 VL_XCAT(vl_eval_vector_comparison_on_all_pairs_, SFX)(distanceToCenters,
00687 self->dimension,
00688 data + self->dimension * i, 1,
00689 (TYPE*)self->centers, self->numCenters,
00690 distFn) ;
00691 for (k = 0 ; k < self->numCenters ; ++k) {
00692 if (distanceToCenters[k] < bestDistance) {
00693 bestDistance = distanceToCenters[k] ;
00694 assignments[i] = (vl_uint32)k ;
00695 }
00696 }
00697 if (distances) distances[i] = bestDistance ;
00698 }
00699
00700 free(distanceToCenters) ;
00701 }
00702 }
00703
00704
00705
00706
00707
00708 static void
00709 VL_XCAT(_vl_kmeans_quantize_ann_, SFX)
00710 (VlKMeans * self,
00711 vl_uint32 * assignments,
00712 TYPE * distances,
00713 TYPE const * data,
00714 vl_size numData,
00715 vl_bool update)
00716 {
00717 #if (FLT == VL_TYPE_FLOAT)
00718 VlFloatVectorComparisonFunction distFn = vl_get_vector_comparison_function_f(self->distance) ;
00719 #else
00720 VlDoubleVectorComparisonFunction distFn = vl_get_vector_comparison_function_d(self->distance) ;
00721 #endif
00722
00723 VlKDForest * forest = vl_kdforest_new(self->dataType,self->dimension,self->numTrees, self->distance) ;
00724 vl_kdforest_set_max_num_comparisons(forest,self->maxNumComparisons);
00725 vl_kdforest_set_thresholding_method(forest,VL_KDTREE_MEDIAN);
00726 vl_kdforest_build(forest,self->numCenters,self->centers);
00727
00728 #ifdef _OPENMP
00729 #pragma omp parallel default(none) \
00730 num_threads(vl_get_max_threads()) \
00731 shared(self, forest, update, assignments, distances, data, numData, distFn)
00732 #endif
00733 {
00734 VlKDForestNeighbor neighbor ;
00735 VlKDForestSearcher * searcher ;
00736 vl_index x;
00737
00738 #ifdef _OPENMP
00739 #pragma omp critical
00740 #endif
00741 searcher = vl_kdforest_new_searcher (forest) ;
00742
00743 #ifdef _OPENMP
00744 #pragma omp for
00745 #endif
00746 for(x = 0 ; x < (signed)numData ; ++x) {
00747 vl_kdforestsearcher_query (searcher, &neighbor, 1, (TYPE const *) (data + x*self->dimension));
00748
00749 if (distances) {
00750 if(!update) {
00751 distances[x] = (TYPE) neighbor.distance;
00752 assignments[x] = (vl_uint32) neighbor.index ;
00753 } else {
00754 TYPE prevDist = (TYPE) distFn(self->dimension,
00755 data + self->dimension * x,
00756 (TYPE*)self->centers + self->dimension *assignments[x]);
00757 if (prevDist > (TYPE) neighbor.distance) {
00758 distances[x] = (TYPE) neighbor.distance ;
00759 assignments[x] = (vl_uint32) neighbor.index ;
00760 } else {
00761 distances[x] = prevDist ;
00762 }
00763 }
00764 } else {
00765 assignments[x] = (vl_uint32) neighbor.index ;
00766 }
00767 }
00768 }
00769
00770 vl_kdforest_delete(forest);
00771 }
00772
00773
00774
00775
00776
00777
00778
00779
00780
00781 VL_INLINE TYPE
00782 VL_XCAT3(_vl_kmeans_, SFX, _qsort_cmp)
00783 (VlKMeansSortWrapper * array, vl_uindex indexA, vl_uindex indexB)
00784 {
00785 return
00786 ((TYPE*)array->data) [array->permutation[indexA] * array->stride]
00787 -
00788 ((TYPE*)array->data) [array->permutation[indexB] * array->stride] ;
00789 }
00790
00791 VL_INLINE void
00792 VL_XCAT3(_vl_kmeans_, SFX, _qsort_swap)
00793 (VlKMeansSortWrapper * array, vl_uindex indexA, vl_uindex indexB)
00794 {
00795 vl_uint32 tmp = array->permutation[indexA] ;
00796 array->permutation[indexA] = array->permutation[indexB] ;
00797 array->permutation[indexB] = tmp ;
00798 }
00799
00800 #define VL_QSORT_prefix VL_XCAT3(_vl_kmeans_, SFX, _qsort)
00801 #define VL_QSORT_array VlKMeansSortWrapper*
00802 #define VL_QSORT_cmp VL_XCAT3(_vl_kmeans_, SFX, _qsort_cmp)
00803 #define VL_QSORT_swap VL_XCAT3(_vl_kmeans_, SFX, _qsort_swap)
00804 #include "qsort-def.h"
00805
00806 static void
00807 VL_XCAT(_vl_kmeans_sort_data_helper_, SFX)
00808 (VlKMeans * self, vl_uint32 * permutations, TYPE const * data, vl_size numData)
00809 {
00810 vl_uindex d, x ;
00811
00812 for (d = 0 ; d < self->dimension ; ++d) {
00813 VlKMeansSortWrapper array ;
00814 array.permutation = permutations + d * numData ;
00815 array.data = data + d ;
00816 array.stride = self->dimension ;
00817 for (x = 0 ; x < numData ; ++x) {
00818 array.permutation[x] = (vl_uint32)x ;
00819 }
00820 VL_XCAT3(_vl_kmeans_, SFX, _qsort_sort)(&array, numData) ;
00821 }
00822 }
00823
00824
00825
00826
00827
00828 static double
00829 VL_XCAT(_vl_kmeans_refine_centers_lloyd_, SFX)
00830 (VlKMeans * self,
00831 TYPE const * data,
00832 vl_size numData)
00833 {
00834 vl_size c, d, x, iteration ;
00835 double previousEnergy = VL_INFINITY_D ;
00836 double initialEnergy = VL_INFINITY_D ;
00837 double energy ;
00838 TYPE * distances = vl_malloc (sizeof(TYPE) * numData) ;
00839
00840 vl_uint32 * assignments = vl_malloc (sizeof(vl_uint32) * numData) ;
00841 vl_size * clusterMasses = vl_malloc (sizeof(vl_size) * numData) ;
00842 vl_uint32 * permutations = NULL ;
00843 vl_size * numSeenSoFar = NULL ;
00844 VlRand * rand = vl_get_rand () ;
00845 vl_size totNumRestartedCenters = 0 ;
00846 vl_size numRestartedCenters = 0 ;
00847
00848 if (self->distance == VlDistanceL1) {
00849 permutations = vl_malloc(sizeof(vl_uint32) * numData * self->dimension) ;
00850 numSeenSoFar = vl_malloc(sizeof(vl_size) * self->numCenters) ;
00851 VL_XCAT(_vl_kmeans_sort_data_helper_, SFX)(self, permutations, data, numData) ;
00852 }
00853
00854 for (energy = VL_INFINITY_D,
00855 iteration = 0;
00856 1 ;
00857 ++ iteration) {
00858
00859
00860 VL_XCAT(_vl_kmeans_quantize_, SFX)(self, assignments, distances, data, numData) ;
00861
00862
00863 energy = 0 ;
00864 for (x = 0 ; x < numData ; ++x) energy += distances[x] ;
00865 if (self->verbosity) {
00866 VL_PRINTF("kmeans: Lloyd iter %d: energy = %g\n", iteration,
00867 energy) ;
00868 }
00869
00870
00871 if (iteration >= self->maxNumIterations) {
00872 if (self->verbosity) {
00873 VL_PRINTF("kmeans: Lloyd terminating because maximum number of iterations reached\n") ;
00874 }
00875 break ;
00876 }
00877 if (energy == previousEnergy) {
00878 if (self->verbosity) {
00879 VL_PRINTF("kmeans: Lloyd terminating because the algorithm fully converged\n") ;
00880 }
00881 break ;
00882 }
00883
00884 if (iteration == 0) {
00885 initialEnergy = energy ;
00886 } else {
00887 double eps = (previousEnergy - energy) / (initialEnergy - energy) ;
00888 if (eps < self->minEnergyVariation) {
00889 if (self->verbosity) {
00890 VL_PRINTF("kmeans: ANN terminating because the energy relative variation was less than %f\n", self->minEnergyVariation) ;
00891 }
00892 break ;
00893 }
00894 }
00895
00896
00897 previousEnergy = energy ;
00898
00899
00900 memset(clusterMasses, 0, sizeof(vl_size) * numData) ;
00901 for (x = 0 ; x < numData ; ++x) {
00902 clusterMasses[assignments[x]] ++ ;
00903 }
00904
00905 numRestartedCenters = 0 ;
00906 switch (self->distance) {
00907 case VlDistanceL2:
00908 memset(self->centers, 0, sizeof(TYPE) * self->dimension * self->numCenters) ;
00909 for (x = 0 ; x < numData ; ++x) {
00910 TYPE * cpt = (TYPE*)self->centers + assignments[x] * self->dimension ;
00911 TYPE const * xpt = data + x * self->dimension ;
00912 for (d = 0 ; d < self->dimension ; ++d) {
00913 cpt[d] += xpt[d] ;
00914 }
00915 }
00916 for (c = 0 ; c < self->numCenters ; ++c) {
00917 TYPE * cpt = (TYPE*)self->centers + c * self->dimension ;
00918 if (clusterMasses[c] > 0) {
00919 TYPE mass = clusterMasses[c] ;
00920 for (d = 0 ; d < self->dimension ; ++d) {
00921 cpt[d] /= mass ;
00922 }
00923 } else {
00924 vl_uindex x = vl_rand_uindex(rand, numData) ;
00925 numRestartedCenters ++ ;
00926 for (d = 0 ; d < self->dimension ; ++d) {
00927 cpt[d] = data[x * self->dimension + d] ;
00928 }
00929 }
00930 }
00931 break ;
00932 case VlDistanceL1:
00933 for (d = 0 ; d < self->dimension ; ++d) {
00934 vl_uint32 * perm = permutations + d * numData ;
00935 memset(numSeenSoFar, 0, sizeof(vl_size) * self->numCenters) ;
00936 for (x = 0; x < numData ; ++x) {
00937 c = assignments[perm[x]] ;
00938 if (2 * numSeenSoFar[c] < clusterMasses[c]) {
00939 ((TYPE*)self->centers) [d + c * self->dimension] =
00940 data [d + perm[x] * self->dimension] ;
00941 }
00942 numSeenSoFar[c] ++ ;
00943 }
00944
00945 for (c = 0 ; c < self->numCenters ; ++c) {
00946 if (clusterMasses[c] == 0) {
00947 TYPE * cpt = (TYPE*)self->centers + c * self->dimension ;
00948 vl_uindex x = vl_rand_uindex(rand, numData) ;
00949 numRestartedCenters ++ ;
00950 for (d = 0 ; d < self->dimension ; ++d) {
00951 cpt[d] = data[x * self->dimension + d] ;
00952 }
00953 }
00954 }
00955 }
00956 break ;
00957 default:
00958 abort();
00959 }
00960
00961 totNumRestartedCenters += numRestartedCenters ;
00962 if (self->verbosity && numRestartedCenters) {
00963 VL_PRINTF("kmeans: Lloyd iter %d: restarted %d centers\n", iteration,
00964 numRestartedCenters) ;
00965 }
00966 }
00967
00968 if (permutations) {
00969 vl_free(permutations) ;
00970 }
00971 if (numSeenSoFar) {
00972 vl_free(numSeenSoFar) ;
00973 }
00974 vl_free(distances) ;
00975 vl_free(assignments) ;
00976 vl_free(clusterMasses) ;
00977 return energy ;
00978 }
00979
00980 static double
00981 VL_XCAT(_vl_kmeans_update_center_distances_, SFX)
00982 (VlKMeans * self)
00983 {
00984 #if (FLT == VL_TYPE_FLOAT)
00985 VlFloatVectorComparisonFunction distFn = vl_get_vector_comparison_function_f(self->distance) ;
00986 #else
00987 VlDoubleVectorComparisonFunction distFn = vl_get_vector_comparison_function_d(self->distance) ;
00988 #endif
00989
00990 if (! self->centerDistances) {
00991 self->centerDistances = vl_malloc (sizeof(TYPE) *
00992 self->numCenters *
00993 self->numCenters) ;
00994 }
00995 VL_XCAT(vl_eval_vector_comparison_on_all_pairs_, SFX)(self->centerDistances,
00996 self->dimension,
00997 self->centers, self->numCenters,
00998 NULL, 0,
00999 distFn) ;
01000 return self->numCenters * (self->numCenters - 1) / 2 ;
01001 }
01002
01003 static double
01004 VL_XCAT(_vl_kmeans_refine_centers_ann_, SFX)
01005 (VlKMeans * self,
01006 TYPE const * data,
01007 vl_size numData)
01008 {
01009 vl_size c, d, x, iteration ;
01010 double initialEnergy = VL_INFINITY_D ;
01011 double previousEnergy = VL_INFINITY_D ;
01012 double energy ;
01013
01014 vl_uint32 * permutations = NULL ;
01015 vl_size * numSeenSoFar = NULL ;
01016 VlRand * rand = vl_get_rand () ;
01017 vl_size totNumRestartedCenters = 0 ;
01018 vl_size numRestartedCenters = 0 ;
01019
01020 vl_uint32 * assignments = vl_malloc (sizeof(vl_uint32) * numData) ;
01021 vl_size * clusterMasses = vl_malloc (sizeof(vl_size) * numData) ;
01022 TYPE * distances = vl_malloc (sizeof(TYPE) * numData) ;
01023
01024 if (self->distance == VlDistanceL1) {
01025 permutations = vl_malloc(sizeof(vl_uint32) * numData * self->dimension) ;
01026 numSeenSoFar = vl_malloc(sizeof(vl_size) * self->numCenters) ;
01027 VL_XCAT(_vl_kmeans_sort_data_helper_, SFX)(self, permutations, data, numData) ;
01028 }
01029
01030 for (energy = VL_INFINITY_D,
01031 iteration = 0;
01032 1 ;
01033 ++ iteration) {
01034
01035
01036 VL_XCAT(_vl_kmeans_quantize_ann_, SFX)(self, assignments, distances, data, numData, iteration > 0) ;
01037
01038
01039 energy = 0 ;
01040 for (x = 0 ; x < numData ; ++x) energy += distances[x] ;
01041 if (self->verbosity) {
01042 VL_PRINTF("kmeans: ANN iter %d: energy = %g\n", iteration,
01043 energy) ;
01044 }
01045
01046
01047 if (iteration >= self->maxNumIterations) {
01048 if (self->verbosity) {
01049 VL_PRINTF("kmeans: ANN terminating because the maximum number of iterations has been reached\n") ;
01050 }
01051 break ;
01052 }
01053 if (energy == previousEnergy) {
01054 if (self->verbosity) {
01055 VL_PRINTF("kmeans: ANN terminating because the algorithm fully converged\n") ;
01056 }
01057 break ;
01058 }
01059
01060 if (iteration == 0) {
01061 initialEnergy = energy ;
01062 } else {
01063 double eps = (previousEnergy - energy) / (initialEnergy - energy) ;
01064 if (eps < self->minEnergyVariation) {
01065 if (self->verbosity) {
01066 VL_PRINTF("kmeans: ANN terminating because the energy relative variation was less than %f\n", self->minEnergyVariation) ;
01067 }
01068 break ;
01069 }
01070 }
01071
01072
01073 previousEnergy = energy ;
01074
01075
01076 memset(clusterMasses, 0, sizeof(vl_size) * numData) ;
01077 for (x = 0 ; x < numData ; ++x) {
01078 clusterMasses[assignments[x]] ++ ;
01079 }
01080
01081 numRestartedCenters = 0 ;
01082 switch (self->distance) {
01083 case VlDistanceL2:
01084 memset(self->centers, 0, sizeof(TYPE) * self->dimension * self->numCenters) ;
01085 for (x = 0 ; x < numData ; ++x) {
01086 TYPE * cpt = (TYPE*)self->centers + assignments[x] * self->dimension ;
01087 TYPE const * xpt = data + x * self->dimension ;
01088 for (d = 0 ; d < self->dimension ; ++d) {
01089 cpt[d] += xpt[d] ;
01090 }
01091 }
01092 for (c = 0 ; c < self->numCenters ; ++c) {
01093 TYPE * cpt = (TYPE*)self->centers + c * self->dimension ;
01094 if (clusterMasses[c] > 0) {
01095 TYPE mass = clusterMasses[c] ;
01096 for (d = 0 ; d < self->dimension ; ++d) {
01097 cpt[d] /= mass ;
01098 }
01099 } else {
01100 vl_uindex x = vl_rand_uindex(rand, numData) ;
01101 numRestartedCenters ++ ;
01102 for (d = 0 ; d < self->dimension ; ++d) {
01103 cpt[d] = data[x * self->dimension + d] ;
01104 }
01105 }
01106 }
01107 break ;
01108 case VlDistanceL1:
01109 for (d = 0 ; d < self->dimension ; ++d) {
01110 vl_uint32 * perm = permutations + d * numData ;
01111 memset(numSeenSoFar, 0, sizeof(vl_size) * self->numCenters) ;
01112 for (x = 0; x < numData ; ++x) {
01113 c = assignments[perm[x]] ;
01114 if (2 * numSeenSoFar[c] < clusterMasses[c]) {
01115 ((TYPE*)self->centers) [d + c * self->dimension] =
01116 data [d + perm[x] * self->dimension] ;
01117 }
01118 numSeenSoFar[c] ++ ;
01119 }
01120
01121 for (c = 0 ; c < self->numCenters ; ++c) {
01122 if (clusterMasses[c] == 0) {
01123 TYPE * cpt = (TYPE*)self->centers + c * self->dimension ;
01124 vl_uindex x = vl_rand_uindex(rand, numData) ;
01125 numRestartedCenters ++ ;
01126 for (d = 0 ; d < self->dimension ; ++d) {
01127 cpt[d] = data[x * self->dimension + d] ;
01128 }
01129 }
01130 }
01131 }
01132 break ;
01133 default:
01134 VL_PRINT("bad distance set: %d\n",self->distance);
01135 abort();
01136 }
01137
01138 totNumRestartedCenters += numRestartedCenters ;
01139 if (self->verbosity && numRestartedCenters) {
01140 VL_PRINTF("kmeans: ANN iter %d: restarted %d centers\n", iteration,
01141 numRestartedCenters) ;
01142 }
01143 }
01144
01145 if (permutations) {
01146 vl_free(permutations) ;
01147 }
01148 if (numSeenSoFar) {
01149 vl_free(numSeenSoFar) ;
01150 }
01151
01152 vl_free(distances) ;
01153 vl_free(assignments) ;
01154 vl_free(clusterMasses) ;
01155 return energy ;
01156 }
01157
01158
01159
01160
01161
01162 static double
01163 VL_XCAT(_vl_kmeans_refine_centers_elkan_, SFX)
01164 (VlKMeans * self,
01165 TYPE const * data,
01166 vl_size numData)
01167 {
01168 vl_size d, iteration ;
01169 vl_index x ;
01170 vl_uint32 c, j ;
01171 vl_bool allDone ;
01172 TYPE * distances = vl_malloc (sizeof(TYPE) * numData) ;
01173 vl_uint32 * assignments = vl_malloc (sizeof(vl_uint32) * numData) ;
01174 vl_size * clusterMasses = vl_malloc (sizeof(vl_size) * numData) ;
01175 VlRand * rand = vl_get_rand () ;
01176
01177 #if (FLT == VL_TYPE_FLOAT)
01178 VlFloatVectorComparisonFunction distFn = vl_get_vector_comparison_function_f(self->distance) ;
01179 #else
01180 VlDoubleVectorComparisonFunction distFn = vl_get_vector_comparison_function_d(self->distance) ;
01181 #endif
01182
01183 TYPE * nextCenterDistances = vl_malloc (sizeof(TYPE) * self->numCenters) ;
01184 TYPE * pointToClosestCenterUB = vl_malloc (sizeof(TYPE) * numData) ;
01185 vl_bool * pointToClosestCenterUBIsStrict = vl_malloc (sizeof(vl_bool) * numData) ;
01186 TYPE * pointToCenterLB = vl_malloc (sizeof(TYPE) * numData * self->numCenters) ;
01187 TYPE * newCenters = vl_malloc(sizeof(TYPE) * self->dimension * self->numCenters) ;
01188 TYPE * centerToNewCenterDistances = vl_malloc (sizeof(TYPE) * self->numCenters) ;
01189
01190 vl_uint32 * permutations = NULL ;
01191 vl_size * numSeenSoFar = NULL ;
01192
01193 double energy ;
01194
01195 vl_size totDistanceComputationsToInit = 0 ;
01196 vl_size totDistanceComputationsToRefreshUB = 0 ;
01197 vl_size totDistanceComputationsToRefreshLB = 0 ;
01198 vl_size totDistanceComputationsToRefreshCenterDistances = 0 ;
01199 vl_size totDistanceComputationsToNewCenters = 0 ;
01200 vl_size totDistanceComputationsToFinalize = 0 ;
01201 vl_size totNumRestartedCenters = 0 ;
01202
01203 if (self->distance == VlDistanceL1) {
01204 permutations = vl_malloc(sizeof(vl_uint32) * numData * self->dimension) ;
01205 numSeenSoFar = vl_malloc(sizeof(vl_size) * self->numCenters) ;
01206 VL_XCAT(_vl_kmeans_sort_data_helper_, SFX)(self, permutations, data, numData) ;
01207 }
01208
01209
01210
01211
01212
01213
01214
01215
01216
01217
01218
01219 totDistanceComputationsToInit +=
01220 VL_XCAT(_vl_kmeans_update_center_distances_, SFX)(self) ;
01221
01222
01223 memset(pointToCenterLB, 0, sizeof(TYPE) * self->numCenters * numData) ;
01224 for (x = 0 ; x < (signed)numData ; ++x) {
01225 TYPE distance ;
01226
01227
01228 assignments[x] = 0 ;
01229 distance = distFn(self->dimension,
01230 data + x * self->dimension,
01231 (TYPE*)self->centers + 0) ;
01232 pointToClosestCenterUB[x] = distance ;
01233 pointToClosestCenterUBIsStrict[x] = VL_TRUE ;
01234 pointToCenterLB[0 + x * self->numCenters] = distance ;
01235 totDistanceComputationsToInit += 1 ;
01236
01237
01238 for (c = 1 ; c < self->numCenters ; ++c) {
01239
01240
01241
01242
01243 if (((self->distance == VlDistanceL1) ? 2.0 : 4.0) *
01244 pointToClosestCenterUB[x] <=
01245 ((TYPE*)self->centerDistances)
01246 [c + assignments[x] * self->numCenters]) {
01247 continue ;
01248 }
01249
01250 distance = distFn(self->dimension,
01251 data + x * self->dimension,
01252 (TYPE*)self->centers + c * self->dimension) ;
01253 pointToCenterLB[c + x * self->numCenters] = distance ;
01254 totDistanceComputationsToInit += 1 ;
01255 if (distance < pointToClosestCenterUB[x]) {
01256 pointToClosestCenterUB[x] = distance ;
01257 assignments[x] = c ;
01258 }
01259 }
01260 }
01261
01262
01263 energy = 0 ;
01264 for (x = 0 ; x < (signed)numData ; ++x) {
01265 energy += pointToClosestCenterUB[x] ;
01266 }
01267
01268 if (self->verbosity) {
01269 VL_PRINTF("kmeans: Elkan iter 0: energy = %g, dist. calc. = %d\n",
01270 energy, totDistanceComputationsToInit) ;
01271 }
01272
01273
01274 #ifdef SANITY
01275 {
01276 int xx ;
01277 int cc ;
01278 TYPE tol = 1e-5 ;
01279 VL_PRINTF("inconsistencies after initial assignments:\n");
01280 for (xx = 0 ; xx < numData ; ++xx) {
01281 for (cc = 0 ; cc < self->numCenters ; ++cc) {
01282 TYPE a = pointToCenterLB[cc + xx * self->numCenters] ;
01283 TYPE b = distFn(self->dimension,
01284 data + self->dimension * xx,
01285 (TYPE*)self->centers + self->dimension * cc) ;
01286 if (cc == assignments[xx]) {
01287 TYPE z = pointToClosestCenterUB[xx] ;
01288 if (z+tol<b) VL_PRINTF("UB %d %d = %f < %f\n",
01289 cc, xx, z, b) ;
01290 }
01291 if (a>b+tol) VL_PRINTF("LB %d %d = %f > %f\n",
01292 cc, xx, a, b) ;
01293 }
01294 }
01295 }
01296 #endif
01297
01298
01299
01300
01301
01302 for (iteration = 1 ; 1; ++iteration) {
01303
01304 vl_size numDistanceComputationsToRefreshUB = 0 ;
01305 vl_size numDistanceComputationsToRefreshLB = 0 ;
01306 vl_size numDistanceComputationsToRefreshCenterDistances = 0 ;
01307 vl_size numDistanceComputationsToNewCenters = 0 ;
01308 vl_size numRestartedCenters = 0 ;
01309
01310
01311
01312
01313
01314 memset(clusterMasses, 0, sizeof(vl_size) * numData) ;
01315 for (x = 0 ; x < (signed)numData ; ++x) {
01316 clusterMasses[assignments[x]] ++ ;
01317 }
01318
01319 switch (self->distance) {
01320 case VlDistanceL2:
01321 memset(newCenters, 0, sizeof(TYPE) * self->dimension * self->numCenters) ;
01322 for (x = 0 ; x < (signed)numData ; ++x) {
01323 TYPE * cpt = newCenters + assignments[x] * self->dimension ;
01324 TYPE const * xpt = data + x * self->dimension ;
01325 for (d = 0 ; d < self->dimension ; ++d) {
01326 cpt[d] += xpt[d] ;
01327 }
01328 }
01329 for (c = 0 ; c < self->numCenters ; ++c) {
01330 TYPE * cpt = newCenters + c * self->dimension ;
01331 if (clusterMasses[c] > 0) {
01332 TYPE mass = clusterMasses[c] ;
01333 for (d = 0 ; d < self->dimension ; ++d) {
01334 cpt[d] /= mass ;
01335 }
01336 } else {
01337
01338 vl_uindex x = vl_rand_uindex(rand, numData) ;
01339 numRestartedCenters ++ ;
01340 for (d = 0 ; d < self->dimension ; ++d) {
01341 cpt[d] = data[x * self->dimension + d] ;
01342 }
01343 }
01344 }
01345 break ;
01346 case VlDistanceL1:
01347 for (d = 0 ; d < self->dimension ; ++d) {
01348 vl_uint32 * perm = permutations + d * numData ;
01349 memset(numSeenSoFar, 0, sizeof(vl_size) * self->numCenters) ;
01350 for (x = 0; x < (signed)numData ; ++x) {
01351 c = assignments[perm[x]] ;
01352 if (2 * numSeenSoFar[c] < clusterMasses[c]) {
01353 newCenters [d + c * self->dimension] =
01354 data [d + perm[x] * self->dimension] ;
01355 }
01356 numSeenSoFar[c] ++ ;
01357 }
01358 }
01359
01360 for (c = 0 ; c < self->numCenters ; ++c) {
01361 if (clusterMasses[c] == 0) {
01362 TYPE * cpt = newCenters + c * self->dimension ;
01363 vl_uindex x = vl_rand_uindex(rand, numData) ;
01364 numRestartedCenters ++ ;
01365 for (d = 0 ; d < self->dimension ; ++d) {
01366 cpt[d] = data[x * self->dimension + d] ;
01367 }
01368 }
01369 }
01370 break ;
01371 default:
01372 abort();
01373 }
01374
01375
01376 for (c = 0 ; c < self->numCenters ; ++c) {
01377 TYPE distance = distFn(self->dimension,
01378 newCenters + c * self->dimension,
01379 (TYPE*)self->centers + c * self->dimension) ;
01380 centerToNewCenterDistances[c] = distance ;
01381 numDistanceComputationsToNewCenters += 1 ;
01382 }
01383
01384
01385 {
01386 TYPE * tmp = self->centers ;
01387 self->centers = newCenters ;
01388 newCenters = tmp ;
01389 }
01390
01391
01392
01393
01394
01395
01396
01397
01398 numDistanceComputationsToRefreshCenterDistances
01399 += VL_XCAT(_vl_kmeans_update_center_distances_, SFX)(self) ;
01400
01401 for (c = 0 ; c < self->numCenters ; ++c) {
01402 nextCenterDistances[c] = (TYPE) VL_INFINITY_D ;
01403 for (j = 0 ; j < self->numCenters ; ++j) {
01404 if (j == c) continue ;
01405 nextCenterDistances[c] = VL_MIN(nextCenterDistances[c],
01406 ((TYPE*)self->centerDistances)
01407 [j + c * self->numCenters]) ;
01408 }
01409 }
01410
01411
01412
01413
01414
01415 for (x = 0 ; x < (signed)numData ; ++x) {
01416 TYPE a = pointToClosestCenterUB[x] ;
01417 TYPE b = centerToNewCenterDistances[assignments[x]] ;
01418 if (self->distance == VlDistanceL1) {
01419 pointToClosestCenterUB[x] = a + b ;
01420 } else {
01421 #if (FLT == VL_TYPE_FLOAT)
01422 TYPE sqrtab = sqrtf (a * b) ;
01423 #else
01424 TYPE sqrtab = sqrt (a * b) ;
01425 #endif
01426 pointToClosestCenterUB[x] = a + b + 2.0 * sqrtab ;
01427 }
01428 pointToClosestCenterUBIsStrict[x] = VL_FALSE ;
01429 }
01430
01431
01432
01433
01434
01435
01436 #if defined(_OPENMP)
01437 #pragma omp parallel for default(shared) private(x,c) num_threads(vl_get_max_threads())
01438 #endif
01439 for (x = 0 ; x < (signed)numData ; ++x) {
01440 for (c = 0 ; c < self->numCenters ; ++c) {
01441 TYPE a = pointToCenterLB[c + x * self->numCenters] ;
01442 TYPE b = centerToNewCenterDistances[c] ;
01443 if (a < b) {
01444 pointToCenterLB[c + x * self->numCenters] = 0 ;
01445 } else {
01446 if (self->distance == VlDistanceL1) {
01447 pointToCenterLB[c + x * self->numCenters] = a - b ;
01448 } else {
01449 #if (FLT == VL_TYPE_FLOAT)
01450 TYPE sqrtab = sqrtf (a * b) ;
01451 #else
01452 TYPE sqrtab = sqrt (a * b) ;
01453 #endif
01454 pointToCenterLB[c + x * self->numCenters] = a + b - 2.0 * sqrtab ;
01455 }
01456 }
01457 }
01458 }
01459
01460 #ifdef SANITY
01461 {
01462 int xx ;
01463 int cc ;
01464 TYPE tol = 1e-5 ;
01465 VL_PRINTF("inconsistencies before assignments:\n");
01466 for (xx = 0 ; xx < numData ; ++xx) {
01467 for (cc = 0 ; cc < self->numCenters ; ++cc) {
01468 TYPE a = pointToCenterLB[cc + xx * self->numCenters] ;
01469 TYPE b = distFn(self->dimension,
01470 data + self->dimension * xx,
01471 (TYPE*)self->centers + self->dimension * cc) ;
01472 if (cc == assignments[xx]) {
01473 TYPE z = pointToClosestCenterUB[xx] ;
01474 if (z+tol<b) VL_PRINTF("UB %d %d = %f < %f\n",
01475 cc, xx, z, b) ;
01476 }
01477 if (a>b+tol) VL_PRINTF("LB %d %d = %f > %f (assign = %d)\n",
01478 cc, xx, a, b, assignments[xx]) ;
01479 }
01480 }
01481 }
01482 #endif
01483
01484
01485
01486
01487
01488 allDone = VL_TRUE ;
01489
01490 #if defined(_OPENMP)
01491 #pragma omp parallel for \
01492 default(none) \
01493 shared(self,numData, \
01494 pointToClosestCenterUB,pointToCenterLB, \
01495 nextCenterDistances,pointToClosestCenterUBIsStrict, \
01496 assignments,data,distFn,allDone) \
01497 private(c,x) \
01498 reduction(+:numDistanceComputationsToRefreshUB,numDistanceComputationsToRefreshLB) \
01499 num_threads(vl_get_max_threads())
01500 #endif
01501 for (x = 0 ; x < (signed)numData ; ++ x) {
01502
01503
01504
01505
01506
01507 if (((self->distance == VlDistanceL1) ? 2.0 : 4.0) *
01508 pointToClosestCenterUB[x] <= nextCenterDistances[assignments[x]]) {
01509 continue ;
01510 }
01511
01512 for (c = 0 ; c < self->numCenters ; ++c) {
01513 vl_uint32 cx = assignments[x] ;
01514 TYPE distance ;
01515
01516
01517
01518
01519
01520
01521
01522
01523
01524
01525 if (cx == c) {
01526 continue ;
01527 }
01528 if (((self->distance == VlDistanceL1) ? 2.0 : 4.0) *
01529 pointToClosestCenterUB[x] <= ((TYPE*)self->centerDistances)
01530 [c + cx * self->numCenters]) {
01531 continue ;
01532 }
01533 if (pointToClosestCenterUB[x] <= pointToCenterLB
01534 [c + x * self->numCenters]) {
01535 continue ;
01536 }
01537
01538
01539 if (! pointToClosestCenterUBIsStrict[x]) {
01540 distance = distFn(self->dimension,
01541 data + self->dimension * x,
01542 (TYPE*)self->centers + self->dimension * cx) ;
01543 pointToClosestCenterUB[x] = distance ;
01544 pointToClosestCenterUBIsStrict[x] = VL_TRUE ;
01545 pointToCenterLB[cx + x * self->numCenters] = distance ;
01546 numDistanceComputationsToRefreshUB += 1 ;
01547
01548 if (((self->distance == VlDistanceL1) ? 2.0 : 4.0) *
01549 pointToClosestCenterUB[x] <= ((TYPE*)self->centerDistances)
01550 [c + cx * self->numCenters]) {
01551 continue ;
01552 }
01553 if (pointToClosestCenterUB[x] <= pointToCenterLB
01554 [c + x * self->numCenters]) {
01555 continue ;
01556 }
01557 }
01558
01559
01560
01561
01562
01563
01564
01565 distance = distFn(self->dimension,
01566 data + x * self->dimension,
01567 (TYPE*)self->centers + c * self->dimension) ;
01568 numDistanceComputationsToRefreshLB += 1 ;
01569 pointToCenterLB[c + x * self->numCenters] = distance ;
01570
01571 if (distance < pointToClosestCenterUB[x]) {
01572 assignments[x] = c ;
01573 pointToClosestCenterUB[x] = distance ;
01574 allDone = VL_FALSE ;
01575
01576 }
01577
01578 }
01579 }
01580
01581
01582 totDistanceComputationsToRefreshUB
01583 += numDistanceComputationsToRefreshUB ;
01584
01585 totDistanceComputationsToRefreshLB
01586 += numDistanceComputationsToRefreshLB ;
01587
01588 totDistanceComputationsToRefreshCenterDistances
01589 += numDistanceComputationsToRefreshCenterDistances ;
01590
01591 totDistanceComputationsToNewCenters
01592 += numDistanceComputationsToNewCenters ;
01593
01594 totNumRestartedCenters
01595 += numRestartedCenters ;
01596
01597 #ifdef SANITY
01598 {
01599 int xx ;
01600 int cc ;
01601 TYPE tol = 1e-5 ;
01602 VL_PRINTF("inconsistencies after assignments:\n");
01603 for (xx = 0 ; xx < numData ; ++xx) {
01604 for (cc = 0 ; cc < self->numCenters ; ++cc) {
01605 TYPE a = pointToCenterLB[cc + xx * self->numCenters] ;
01606 TYPE b = distFn(self->dimension,
01607 data + self->dimension * xx,
01608 (TYPE*)self->centers + self->dimension * cc) ;
01609 if (cc == assignments[xx]) {
01610 TYPE z = pointToClosestCenterUB[xx] ;
01611 if (z+tol<b) VL_PRINTF("UB %d %d = %f < %f\n",
01612 cc, xx, z, b) ;
01613 }
01614 if (a>b+tol) VL_PRINTF("LB %d %d = %f > %f (assign = %d)\n",
01615 cc, xx, a, b, assignments[xx]) ;
01616 }
01617 }
01618 }
01619 #endif
01620
01621
01622 energy = 0 ;
01623 for (x = 0 ; x < (signed)numData ; ++x) {
01624 energy += pointToClosestCenterUB[x] ;
01625 }
01626
01627 if (self->verbosity) {
01628 vl_size numDistanceComputations =
01629 numDistanceComputationsToRefreshUB +
01630 numDistanceComputationsToRefreshLB +
01631 numDistanceComputationsToRefreshCenterDistances +
01632 numDistanceComputationsToNewCenters ;
01633 VL_PRINTF("kmeans: Elkan iter %d: energy <= %g, dist. calc. = %d\n",
01634 iteration,
01635 energy,
01636 numDistanceComputations) ;
01637 if (numRestartedCenters) {
01638 VL_PRINTF("kmeans: Elkan iter %d: restarted %d centers\n",
01639 iteration,
01640 energy,
01641 numRestartedCenters) ;
01642 }
01643 if (self->verbosity > 1) {
01644 VL_PRINTF("kmeans: Elkan iter %d: total dist. calc. per type: "
01645 "UB: %.1f%% (%d), LB: %.1f%% (%d), "
01646 "intra_center: %.1f%% (%d), "
01647 "new_center: %.1f%% (%d)\n",
01648 iteration,
01649 100.0 * numDistanceComputationsToRefreshUB / numDistanceComputations,
01650 numDistanceComputationsToRefreshUB,
01651 100.0 *numDistanceComputationsToRefreshLB / numDistanceComputations,
01652 numDistanceComputationsToRefreshLB,
01653 100.0 * numDistanceComputationsToRefreshCenterDistances / numDistanceComputations,
01654 numDistanceComputationsToRefreshCenterDistances,
01655 100.0 * numDistanceComputationsToNewCenters / numDistanceComputations,
01656 numDistanceComputationsToNewCenters) ;
01657 }
01658 }
01659
01660
01661 if (iteration >= self->maxNumIterations) {
01662 if (self->verbosity) {
01663 VL_PRINTF("kmeans: Elkan terminating because maximum number of iterations reached\n") ;
01664 }
01665 break ;
01666 }
01667 if (allDone) {
01668 if (self->verbosity) {
01669 VL_PRINTF("kmeans: Elkan terminating because the algorithm fully converged\n") ;
01670 }
01671 break ;
01672 }
01673
01674 }
01675
01676
01677 energy = 0 ;
01678 for (x = 0 ; x < (signed)numData ; ++ x) {
01679 vl_uindex cx = assignments [x] ;
01680 energy += distFn(self->dimension,
01681 data + self->dimension * x,
01682 (TYPE*)self->centers + self->dimension * cx) ;
01683 totDistanceComputationsToFinalize += 1 ;
01684 }
01685
01686 {
01687 vl_size totDistanceComputations =
01688 totDistanceComputationsToInit +
01689 totDistanceComputationsToRefreshUB +
01690 totDistanceComputationsToRefreshLB +
01691 totDistanceComputationsToRefreshCenterDistances +
01692 totDistanceComputationsToNewCenters +
01693 totDistanceComputationsToFinalize ;
01694
01695 double saving = (double)totDistanceComputations
01696 / (iteration * self->numCenters * numData) ;
01697
01698 if (self->verbosity) {
01699 VL_PRINTF("kmeans: Elkan: total dist. calc.: %d (%.2f %% of Lloyd)\n",
01700 totDistanceComputations, saving * 100.0) ;
01701 if (totNumRestartedCenters) {
01702 VL_PRINTF("kmeans: Elkan: there have been %d restarts\n",
01703 totNumRestartedCenters) ;
01704 }
01705 }
01706
01707 if (self->verbosity > 1) {
01708 VL_PRINTF("kmeans: Elkan: total dist. calc. per type: "
01709 "init: %.1f%% (%d), UB: %.1f%% (%d), LB: %.1f%% (%d), "
01710 "intra_center: %.1f%% (%d), "
01711 "new_center: %.1f%% (%d), "
01712 "finalize: %.1f%% (%d)\n",
01713 100.0 * totDistanceComputationsToInit / totDistanceComputations,
01714 totDistanceComputationsToInit,
01715 100.0 * totDistanceComputationsToRefreshUB / totDistanceComputations,
01716 totDistanceComputationsToRefreshUB,
01717 100.0 *totDistanceComputationsToRefreshLB / totDistanceComputations,
01718 totDistanceComputationsToRefreshLB,
01719 100.0 * totDistanceComputationsToRefreshCenterDistances / totDistanceComputations,
01720 totDistanceComputationsToRefreshCenterDistances,
01721 100.0 * totDistanceComputationsToNewCenters / totDistanceComputations,
01722 totDistanceComputationsToNewCenters,
01723 100.0 * totDistanceComputationsToFinalize / totDistanceComputations,
01724 totDistanceComputationsToFinalize) ;
01725 }
01726 }
01727
01728 if (permutations) {
01729 vl_free(permutations) ;
01730 }
01731 if (numSeenSoFar) {
01732 vl_free(numSeenSoFar) ;
01733 }
01734
01735 vl_free(distances) ;
01736 vl_free(assignments) ;
01737 vl_free(clusterMasses) ;
01738
01739 vl_free(nextCenterDistances) ;
01740 vl_free(pointToClosestCenterUB) ;
01741 vl_free(pointToClosestCenterUBIsStrict) ;
01742 vl_free(pointToCenterLB) ;
01743 vl_free(newCenters) ;
01744 vl_free(centerToNewCenterDistances) ;
01745
01746 return energy ;
01747 }
01748
01749
01750 static double
01751 VL_XCAT(_vl_kmeans_refine_centers_, SFX)
01752 (VlKMeans * self,
01753 TYPE const * data,
01754 vl_size numData)
01755 {
01756 switch (self->algorithm) {
01757 case VlKMeansLloyd:
01758 return
01759 VL_XCAT(_vl_kmeans_refine_centers_lloyd_, SFX)(self, data, numData) ;
01760 break ;
01761 case VlKMeansElkan:
01762 return
01763 VL_XCAT(_vl_kmeans_refine_centers_elkan_, SFX)(self, data, numData) ;
01764 break ;
01765 case VlKMeansANN:
01766 return
01767 VL_XCAT(_vl_kmeans_refine_centers_ann_, SFX)(self, data, numData) ;
01768 break ;
01769 default:
01770 abort() ;
01771 }
01772 }
01773
01774
01775 #else
01776
01777 #ifndef __DOXYGEN__
01778 #define FLT VL_TYPE_FLOAT
01779 #define TYPE float
01780 #define SFX f
01781 #define VL_KMEANS_INSTANTIATING
01782 #include "kmeans.c"
01783
01784 #define FLT VL_TYPE_DOUBLE
01785 #define TYPE double
01786 #define SFX d
01787 #define VL_KMEANS_INSTANTIATING
01788 #include "kmeans.c"
01789 #endif
01790
01791
01792 #endif
01793
01794
01795 #ifndef VL_KMEANS_INSTANTIATING
01796
01805 VL_EXPORT void
01806 vl_kmeans_set_centers
01807 (VlKMeans * self,
01808 void const * centers,
01809 vl_size dimension,
01810 vl_size numCenters)
01811 {
01812 vl_kmeans_reset (self) ;
01813
01814 switch (self->dataType) {
01815 case VL_TYPE_FLOAT :
01816 _vl_kmeans_set_centers_f
01817 (self, (float const *)centers, dimension, numCenters) ;
01818 break ;
01819 case VL_TYPE_DOUBLE :
01820 _vl_kmeans_set_centers_d
01821 (self, (double const *)centers, dimension, numCenters) ;
01822 break ;
01823 default:
01824 abort() ;
01825 }
01826 }
01827
01840 VL_EXPORT void
01841 vl_kmeans_init_centers_with_rand_data
01842 (VlKMeans * self,
01843 void const * data,
01844 vl_size dimension,
01845 vl_size numData,
01846 vl_size numCenters)
01847 {
01848 vl_kmeans_reset (self) ;
01849
01850 switch (self->dataType) {
01851 case VL_TYPE_FLOAT :
01852 _vl_kmeans_init_centers_with_rand_data_f
01853 (self, (float const *)data, dimension, numData, numCenters) ;
01854 break ;
01855 case VL_TYPE_DOUBLE :
01856 _vl_kmeans_init_centers_with_rand_data_d
01857 (self, (double const *)data, dimension, numData, numCenters) ;
01858 break ;
01859 default:
01860 abort() ;
01861 }
01862 }
01863
01873 VL_EXPORT void
01874 vl_kmeans_init_centers_plus_plus
01875 (VlKMeans * self,
01876 void const * data,
01877 vl_size dimension,
01878 vl_size numData,
01879 vl_size numCenters)
01880 {
01881 vl_kmeans_reset (self) ;
01882
01883 switch (self->dataType) {
01884 case VL_TYPE_FLOAT :
01885 _vl_kmeans_init_centers_plus_plus_f
01886 (self, (float const *)data, dimension, numData, numCenters) ;
01887 break ;
01888 case VL_TYPE_DOUBLE :
01889 _vl_kmeans_init_centers_plus_plus_d
01890 (self, (double const *)data, dimension, numData, numCenters) ;
01891 break ;
01892 default:
01893 abort() ;
01894 }
01895 }
01896
01906 VL_EXPORT void
01907 vl_kmeans_quantize
01908 (VlKMeans * self,
01909 vl_uint32 * assignments,
01910 void * distances,
01911 void const * data,
01912 vl_size numData)
01913 {
01914 switch (self->dataType) {
01915 case VL_TYPE_FLOAT :
01916 _vl_kmeans_quantize_f
01917 (self, assignments, distances, (float const *)data, numData) ;
01918 break ;
01919 case VL_TYPE_DOUBLE :
01920 _vl_kmeans_quantize_d
01921 (self, assignments, distances, (double const *)data, numData) ;
01922 break ;
01923 default:
01924 abort() ;
01925 }
01926 }
01927
01946 VL_EXPORT void
01947 vl_kmeans_quantize_ann
01948 (VlKMeans * self,
01949 vl_uint32 * assignments,
01950 void * distances,
01951 void const * data,
01952 vl_size numData,
01953 vl_bool update)
01954 {
01955 switch (self->dataType) {
01956 case VL_TYPE_FLOAT :
01957 _vl_kmeans_quantize_ann_f
01958 (self, assignments, distances, (float const *)data, numData, update) ;
01959 break ;
01960 case VL_TYPE_DOUBLE :
01961 _vl_kmeans_quantize_ann_d
01962 (self, assignments, distances, (double const *)data, numData, update) ;
01963 break ;
01964 default:
01965 abort() ;
01966 }
01967 }
01968
01983 VL_EXPORT double
01984 vl_kmeans_refine_centers
01985 (VlKMeans * self,
01986 void const * data,
01987 vl_size numData)
01988 {
01989 assert (self->centers) ;
01990
01991 switch (self->dataType) {
01992 case VL_TYPE_FLOAT :
01993 return
01994 _vl_kmeans_refine_centers_f
01995 (self, (float const *)data, numData) ;
01996 case VL_TYPE_DOUBLE :
01997 return
01998 _vl_kmeans_refine_centers_d
01999 (self, (double const *)data, numData) ;
02000 default:
02001 abort() ;
02002 }
02003 }
02004
02005
02023 VL_EXPORT double
02024 vl_kmeans_cluster (VlKMeans * self,
02025 void const * data,
02026 vl_size dimension,
02027 vl_size numData,
02028 vl_size numCenters)
02029 {
02030 vl_uindex repetition ;
02031 double bestEnergy = VL_INFINITY_D ;
02032 void * bestCenters = NULL ;
02033
02034 for (repetition = 0 ; repetition < self->numRepetitions ; ++ repetition) {
02035 double energy ;
02036 double timeRef ;
02037
02038 if (self->verbosity) {
02039 VL_PRINTF("kmeans: repetition %d of %d\n", repetition + 1, self->numRepetitions) ;
02040 }
02041
02042 timeRef = vl_get_cpu_time() ;
02043 switch (self->initialization) {
02044 case VlKMeansRandomSelection :
02045 vl_kmeans_init_centers_with_rand_data (self,
02046 data, dimension, numData,
02047 numCenters) ;
02048 break ;
02049 case VlKMeansPlusPlus :
02050 vl_kmeans_init_centers_plus_plus (self,
02051 data, dimension, numData,
02052 numCenters) ;
02053 break ;
02054 default:
02055 abort() ;
02056 }
02057
02058 if (self->verbosity) {
02059 VL_PRINTF("kmeans: K-means initialized in %.2f s\n",
02060 vl_get_cpu_time() - timeRef) ;
02061 }
02062
02063 timeRef = vl_get_cpu_time () ;
02064 energy = vl_kmeans_refine_centers (self, data, numData) ;
02065 if (self->verbosity) {
02066 VL_PRINTF("kmeans: K-means terminated in %.2f s with energy %g\n",
02067 vl_get_cpu_time() - timeRef, energy) ;
02068 }
02069
02070
02071
02072
02073 if (energy < bestEnergy || repetition == 0) {
02074 void * temp ;
02075 bestEnergy = energy ;
02076
02077 if (bestCenters == NULL) {
02078 bestCenters = vl_malloc(vl_get_type_size(self->dataType) *
02079 self->dimension *
02080 self->numCenters) ;
02081 }
02082
02083
02084 temp = bestCenters ;
02085 bestCenters = self->centers ;
02086 self->centers = temp ;
02087 }
02088 }
02089
02090 vl_free (self->centers) ;
02091 self->centers = bestCenters ;
02092 return bestEnergy ;
02093 }
02094
02095
02096 #endif
02097
02098 #undef SFX
02099 #undef TYPE
02100 #undef FLT
02101 #undef VL_KMEANS_INSTANTIATING