# Copyright (c) 2008, Willow Garage, Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#     * Redistributions of source code must retain the above copyright
#       notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#     * Neither the name of the Willow Garage, Inc. nor the names of its
#       contributors may be used to endorse or promote products derived from
#       this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
## @author Hai Nguyen/hai@gatech.edu
import numpy as np
#from pylab import *


def pca_gain_threshold(s, percentage_change_threshold=.15):
    """Return the first index at which the values in ``s`` stop dropping quickly.

    Each entry is compared with its successor (the last entry is compared
    with itself, so its relative drop is 0).  The relative drop is
    ``(s[i] - s[i + 1]) / s[i]``; the index of the first entry whose relative
    drop is below ``percentage_change_threshold`` is returned.

    Args:
        s: ``numpy.ndarray`` of values (exactly ``ndarray``; subclasses such
            as ``numpy.matrix`` are rejected).  Intended for a 1-D array such
            as the singular values returned by ``pca``.
        percentage_change_threshold: relative drop below which the sequence
            is considered to have levelled off.

    Returns:
        Index of the first entry whose relative drop is below the threshold,
        or the last index if no entry qualifies.

    Raises:
        ValueError: if ``s`` is not exactly a ``numpy.ndarray``.
        IndexError: if ``s`` is empty.
    """
    # Exact type check on purpose: np.matrix is an ndarray subclass but its
    # indexing/arithmetic semantics differ, so it must not slip through.
    if type(s) is not np.ndarray:
        raise ValueError('Need ndarray as input.')

    # Shift left by one, repeating the final value.  The pieces are joined
    # along axis 0: the original axis=1 is out of bounds for 1-D input and
    # raises on current NumPy releases.
    shifted = np.concatenate((s[1:], np.array([s[-1]])), axis=0)
    percent_diff = (s - shifted) / s
    positions = np.where(percent_diff < percentage_change_threshold)
    if positions[0].size == 0:
        # Nothing levelled off (e.g. a non-positive threshold): use the last
        # index instead of failing with an opaque IndexError.
        return len(s) - 1
    return positions[0][0]


def pca_variance_threshold(eigen_values, percent_variance=.9):
    """Return the index at which the cumulative variance exceeds a fraction.

    Args:
        eigen_values: sequence of eigenvalues (or singular values of a
            covariance matrix), in the order they should be accumulated.
        percent_variance: fraction of the total that the cumulative sum must
            strictly exceed.

    Returns:
        Zero-based index of the first entry at which the normalised
        cumulative sum is greater than ``percent_variance``.  If that never
        happens (e.g. ``percent_variance >= 1.0``) the last index is
        returned, meaning every component is required.

    Side effects:
        Prints the requested ``percent_variance`` to stdout.
    """
    eigen_normed = np.cumsum(eigen_values) / np.sum(eigen_values)
    positions = np.where(eigen_normed > percent_variance)
    # One pre-formatted string so the output is identical whether print is a
    # statement (Python 2) or a function (Python 3).
    print('pca_variance_threshold: percent_variance %s' % (percent_variance,))
    if positions[0].size == 0:
        return len(eigen_normed) - 1
    return positions[0][0]


def pca(data):
    """Decompose the covariance matrix of ``data`` with an SVD.

    Args:
        data: array passed straight to ``numpy.cov`` (which, by default,
            treats each row as a variable and each column as an observation).

    Returns:
        Tuple ``(u, s, vh)`` as returned by ``numpy.linalg.svd`` applied to
        the covariance matrix.
    """
    cov_data = np.cov(data)
    u, s, vh = np.linalg.svd(cov_data)
    return u, s, vh


def pca_vectors(data, percent_variance):
    """Return the leading principal directions of ``data`` as a matrix.

    Args:
        data: array passed to ``pca``.
        percent_variance: fraction of total variance that the selected
            directions must exceed (see ``pca_variance_threshold``).

    Returns:
        ``numpy.matrix`` whose columns are the selected left singular vectors
        of the covariance matrix.
    """
    u, s, vh = pca(data)
    # The threshold helper returns a zero-based index, hence the +1 to turn
    # it into a column count.
    last_index = pca_variance_threshold(s, percent_variance=percent_variance)
    return np.matrix(u[:, 0:last_index + 1])


def randomized_vectors(dataset, number_of_vectors):
    """Generate random unit-length column vectors.

    Args:
        dataset: object exposing ``num_attributes()``, which gives the number
            of rows (dimensions) of each generated vector.
        number_of_vectors: number of columns to generate.

    Returns:
        ``numpy.matrix`` of shape ``(dataset.num_attributes(),
        number_of_vectors)`` whose columns have components drawn uniformly
        from ``[-1, 1)`` and are then scaled to unit Euclidean length.
    """
    shape = (dataset.num_attributes(), number_of_vectors)
    rvectors = np.random.random_sample(shape) * 2 - 1.0
    # Normalise each column directly.  The previous np.diag(...) approach
    # received a 1 x k matrix, for which np.diag extracts a diagonal rather
    # than building a k x k scaling matrix, so the product failed for k > 1.
    norms = np.sqrt(np.sum(np.square(rvectors), axis=0))
    return np.matrix(rvectors / norms)