random_forest.py
# Copyright (c) 2008, Willow Garage, Inc.
# All rights reserved.
# 
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# 
#     * Redistributions of source code must retain the above copyright
#       notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#     * Neither the name of the Willow Garage, Inc. nor the names of its
#       contributors may be used to endorse or promote products derived from
#       this software without specific prior written permission.
# 
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
#
## @author Hai Nguyen/hai@gatech.edu
import numpy as np
import itertools as it
import functools as ft
import time

class Dataset:
    def __init__(self, inputs, outputs):
        """
            inputs: numpy matrix of attributes where each column is one example
            outputs: numpy matrix of labels where each column is the label of
                     the corresponding input column
        """
        self.inputs  = inputs
        self.outputs = outputs
        assert(inputs.shape[1] == outputs.shape[1])

    def num_examples(self):
        return self.inputs.shape[1]

    def num_attributes(self):
        return self.inputs.shape[0]

    def split_continuous(self, attribute, split_point):
        '''
            Partition the examples by thresholding one attribute at split_point;
            partitions that end up empty are dropped from the returned list.
        '''
        selected_attribute = self.inputs[attribute, :]
        leq_bool           = selected_attribute <= split_point
        _, leq_col         = np.where(leq_bool)

        #print 'leq_col', leq_col
        if leq_col.shape[1] > 0:
            leq_dataset        = Dataset(self.inputs[:, leq_col.A[0]], self.outputs[:, leq_col.A[0]])
        else:
            leq_dataset        = Dataset(np.matrix([]), np.matrix([]))

        _, gt_col          = np.where(~leq_bool)
        if gt_col.shape[1] > 0:
            gt_dataset         = Dataset(self.inputs[:, gt_col.A[0]], self.outputs[:, gt_col.A[0]])
        else:
            gt_dataset         = Dataset(np.matrix([]), np.matrix([]))

        ret_sets = []
        if leq_dataset.num_examples() > 0:
            ret_sets.append(leq_dataset)
        if gt_dataset.num_examples() > 0:
            ret_sets.append(gt_dataset)
        return ret_sets

    def unique_values(self, attribute_number, set='input'):
        '''
            Return the sorted list of values that the given input attribute
            (or output row) takes on in this dataset.
        '''
        if set == 'input':
            examples = self.inputs
        else:
            examples = self.outputs

        values   = dict()
        for instance_idx in xrange(examples.shape[1]):
            values[examples[attribute_number, instance_idx]] = True
        k = values.keys()
        k.sort()
        return k

    def bootstrap_samples(self, number_samples, points_per_sample):
        '''
            Generate datasets resampled (with replacement) from this one.
        '''
        for i in xrange(number_samples):
            selected_pts     = np.random.randint(0, self.inputs.shape[1], int(points_per_sample))  # sample size must be an int
            selected_inputs  = self.inputs[:, selected_pts]
            selected_outputs = self.outputs[:, selected_pts]
            #print 'Dataset.bootstrap count', i
            yield Dataset(selected_inputs, selected_outputs)

    def entropy_discrete(self):
        '''
            Shannon entropy of the discrete class labels in the first output row.
        '''
        values = self.unique_values(0, 'output')
        #print 'entropy_discrete: values', values
        #for each output class calculate
        def calc_class_entropy(value):
            number_in_class     = np.sum(self.outputs[0,:] == value)
            percentage_in_class = (number_in_class / float(self.num_examples()))
            return -percentage_in_class * np.log2(percentage_in_class)
        return np.sum(map(calc_class_entropy, values))

    def append(self, another_dataset):
        self.inputs  = np.concatenate((self.inputs, another_dataset.inputs), axis=1)
        self.outputs = np.concatenate((self.outputs, another_dataset.outputs), axis=1)

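# Illustrative sketch added for documentation (not part of the original module
# and never called): it shows the column-major convention Dataset expects and
# how split_continuous() and entropy_discrete() behave on a tiny hand-made
# dataset.
def _example_dataset_usage():
    ins  = np.matrix([[0.1, 0.9, 0.4, 0.8],
                      [1.0, 2.0, 3.0, 4.0]])       # 2 attributes (rows), 4 examples (columns)
    outs = np.matrix([['a', 'b', 'a', 'b']])       # one label per column
    data = Dataset(ins, outs)
    # Threshold attribute 0 at 0.5: columns 0 and 2 fall in the <= partition,
    # columns 1 and 3 in the > partition, so two subsets come back.
    subsets = data.split_continuous(0, 0.5)
    # Labels are evenly split between 'a' and 'b', so the entropy is 1 bit.
    return data.num_attributes(), data.num_examples(), len(subsets), data.entropy_discrete()
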
class LinearDimReduceDataset(Dataset):
    def __init__(self, inputs, outputs):
        Dataset.__init__(self, inputs, outputs)

    def set_projection_vectors(self, vec):
        '''
            projection vectors are assumed to be stored columnwise
        '''
        self.projection_basis = vec
        print 'LinearDimReduceDataset: projection_basis', vec.shape

    def reduce(self, data_points):
        '''
            project data_points onto the projection basis
        '''
        return self.projection_basis.T * data_points

    def reduce_input(self):
        '''
           reduce dimensionality of this dataset
        '''
        self.inputs = self.projection_basis.T * self.inputs

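# Illustrative sketch added for documentation (not part of the original module
# and never called): project 3-dimensional inputs onto a 2-dimensional basis,
# the way evaluate_classifier() does when test_pca is enabled.
def _example_dim_reduce_usage():
    ins   = np.matrix(np.eye(3))                   # 3 attributes, 3 examples
    outs  = np.matrix([[0, 1, 0]])
    data  = LinearDimReduceDataset(ins, outs)
    basis = np.matrix([[1., 0.],
                       [0., 1.],
                       [0., 0.]])                  # basis vectors stored columnwise
    data.set_projection_vectors(basis)
    data.reduce_input()                            # inputs are now 2 x 3
    return data.num_attributes()
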
def binary_less_than(attribute, threshold, input_vec):
    return input_vec[attribute,0] <= threshold

def binary_greater_than(attribute, threshold, input_vec):
    return input_vec[attribute,0] > threshold

def create_binary_tests(attribute, threshold):
    return [ft.partial(binary_less_than, attribute, threshold),
            ft.partial(binary_greater_than, attribute, threshold)]

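# Illustrative sketch added for documentation (not part of the original module
# and never called): the pair of tests produced by create_binary_tests() is
# complementary, so exactly one of them accepts any given input column.
def _example_binary_tests():
    leq_test, gt_test = create_binary_tests(0, 0.5)
    sample = np.matrix([[0.3], [7.0]])             # column vector; attribute 0 has value 0.3
    return leq_test(sample), gt_test(sample)       # (True, False)
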
def mode_exhaustive(set):
    '''
       Finds the most common element of the given collection; returns that
       element along with a dictionary of counts for every element seen.
    '''

    #Count, store in dictionary
    mdict = dict()
    for s in set:
        has_stored = False
        for k in mdict.keys():
            if k == s:
                mdict[k] = 1+mdict[k]
                has_stored = True
        if not has_stored:
            mdict[s] = 1

    #Find the key with maximum votes
    max_key   = None
    max_count = -1
    for k in mdict.keys():
        if mdict[k] > max_count:
            max_key   = k
            max_count = mdict[k]
    #print 'mode_exhaustive: ', mdict
    return max_key, mdict

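# Illustrative sketch added for documentation (not part of the original module
# and never called): mode_exhaustive() returns the most common element together
# with the full vote counts, which is how the forests combine tree predictions.
def _example_mode_exhaustive():
    winner, counts = mode_exhaustive(['cat', 'dog', 'cat', 'bird', 'cat'])
    return winner, counts['cat']                   # ('cat', 3)
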
def min_entropy_split(dataset):
    '''
        Find the split that produces subsets with the minimum combined entropy;
        return the splitting attribute and split point for that attribute.
    '''
    #print 'in min_entropy_split'
    # Assume inputs are continuous, and are column vectors.
    hypotheses     = []
    entropies      = []
    # For each attribute find the best split point.
    for attribute in xrange(dataset.num_attributes()):
        values = dataset.unique_values(attribute)
        # Iterate over the possible split values and calculate the entropy of each split.
        for split_point in values:
            def calc_entropy(data_set):
                num_points = data_set.num_examples()
                return (num_points / float(dataset.num_examples())) * data_set.entropy_discrete()
            split_entropy = map(calc_entropy, dataset.split_continuous(attribute, split_point))
            hypotheses.append((attribute, split_point))
            entropies.append(sum(split_entropy))
    # Select the attribute/split pair that has the lowest entropy.
    entropies = np.matrix(entropies)
    min_idx   = np.argmin(entropies)
    return hypotheses[min_idx]

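# Illustrative sketch added for documentation (not part of the original module
# and never called): when one attribute separates the classes perfectly,
# min_entropy_split() picks that attribute and a zero-entropy split point.
def _example_min_entropy_split():
    ins  = np.matrix([[0.2, 0.8, 0.3, 0.9],
                      [1.0, 1.0, 5.0, 5.0]])
    outs = np.matrix([[0,   0,   1,   1  ]])
    attribute, split_point = min_entropy_split(Dataset(ins, outs))
    return attribute, split_point                  # expect (1, 1.0)
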
def random_subset(subset_size, total_size):
    '''
        Pick subset_size distinct indices uniformly at random from [0, total_size).
    '''
    #print 'in random_subset'
    assert(subset_size <= total_size)
    occupancy = np.matrix(np.zeros((1, total_size)))
    while occupancy.sum() < subset_size:
        occupancy[0, np.random.randint(0, total_size)] = 1
    rows, columns = np.where(occupancy > 0)
    return columns.A[0]

def split_random_subset(subset_size, total_size):
    '''
        Like random_subset, but also return the indices that were not selected.
    '''
    assert(subset_size <= total_size)
    occupancy = np.matrix(np.zeros((1, total_size)))
    while occupancy.sum() < subset_size:
        occupancy[0, np.random.randint(0, total_size)] = 1
    bool_sel                = occupancy > 0
    rows, columns_subset    = np.where(bool_sel)
    rows, columns_remaining = np.where(np.invert(bool_sel))
    return columns_subset.A[0], columns_remaining.A[0]

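# Illustrative sketch added for documentation (not part of the original module
# and never called): random_subset() draws k distinct indices, while
# split_random_subset() also returns the complementary indices (used further
# below for train/test splits).
def _example_random_subsets():
    chosen = random_subset(3, 10)                        # 3 distinct indices in [0, 10)
    train_idx, test_idx = split_random_subset(7, 10)     # 7 training, 3 held-out indices
    return len(chosen), len(train_idx), len(test_idx)    # (3, 7, 3)
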
def random_subset_split(num_subset, dataset):
    '''
        Splitting function for decision trees: find the minimum-entropy split
        considering only a random subset of num_subset attributes.
    '''
    #print 'in random_subset_split'
    #print 'num_subset', num_subset, dataset, 'dataset.input.shape', dataset.inputs.shape
    subset_indexes   = random_subset(num_subset, dataset.num_attributes())
    sub_dataset      = Dataset(dataset.inputs[subset_indexes,:], dataset.outputs)
    attribute, point = min_entropy_split(sub_dataset)
    return subset_indexes[attribute], point

def totally_random_split(dataset):
    '''
        Splitting function for decision trees: pick a random attribute and use
        the value of a randomly chosen example as the split point.
    '''
    #print 'totally random'
    attr     = np.random.randint(0, dataset.num_attributes())
    split_pt = dataset.inputs[attr, np.random.randint(0, dataset.num_examples())]
    return attr, split_pt

class DecisionTree:
    def __init__(self, dataset=None, splitting_func=min_entropy_split):
        self.children   = None
        self.prediction = None
        if dataset is not None:
            self.train(dataset, splitting_func=splitting_func)

    def train(self, dataset, splitting_func=min_entropy_split):
        if not self.make_leaf(dataset):
            #print 'in train.splitting', dataset.num_examples()
            self.split_attribute, self.split_point = splitting_func(dataset)
            #print 'self.split_attribute, self.split_point', self.split_attribute, self.split_point
            data_sets = dataset.split_continuous(self.split_attribute, self.split_point)
            if len(data_sets) < 2:
                self.prediction = dataset.outputs
                return

            def tree_split(set):
                #print 'tree', set.num_examples()
                return DecisionTree(set, splitting_func=splitting_func)
            # Create & train child decision nodes
            tests            = create_binary_tests(self.split_attribute, self.split_point)
            self.children    = zip(tests, map(tree_split, data_sets))

    def make_leaf(self, dataset):
        if np.all(dataset.outputs[:,0] == dataset.outputs):
            self.prediction = dataset.outputs[:,0]
            #print 'leaf'
            return True
        elif np.all(dataset.inputs[:,0] == dataset.inputs):
            self.prediction = dataset.outputs
            #print 'leaf'
            return True
        else:
            return False

    def predict(self, input):
        if self.prediction is not None:
            return self.prediction[:, np.random.randint(0, self.prediction.shape[1])]
        else:
            for test, child in self.children:
                if test(input):
                    return child.predict(input)
            raise RuntimeError("DecisionTree: splits not exhaustive, unable to split for input " + str(input.T))

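# Illustrative sketch added for documentation (not part of the original module
# and never called): train a DecisionTree on a toy problem where attribute 0
# separates the two classes, then query it with a new column vector.
def _example_decision_tree():
    ins  = np.matrix([[0.1, 0.2, 0.8, 0.9],
                      [0.5, 0.4, 0.6, 0.5]])
    outs = np.matrix([['lo', 'lo', 'hi', 'hi']])
    tree = DecisionTree(Dataset(ins, outs))
    return tree.predict(np.matrix([[0.15], [0.45]]))     # expect matrix([['lo']])
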
class RFBase:
    def __init__(self, dataset=None, number_of_dimensions=None, number_of_learners=100):
        """
            number_of_dimensions   size of the random attribute subset given to
                                   each tree; roughly 10-20% of the original data
                                   dimension is a reasonable choice
            number_of_learners     number of trees in the forest; more is better,
                                   limited mainly by training time
        """
        self.number_of_learners   = number_of_learners
        self.number_of_dimensions = number_of_dimensions
        if dataset is not None:
            self.train(dataset)

    def predict(self, data, vote_combine_function=None):
        '''
            Have every tree vote on the given column vector; return either
            vote_combine_function(votes) or (most common vote, vote counts).
        '''
        def predict_(learner):
            return learner.predict(learner.transform_input(data))
        predictions = map(predict_, self.learners)
        if vote_combine_function is not None:
            return vote_combine_function(predictions)
        else:
            return mode_exhaustive(predictions)

    def train(self, dataset):
        pass

def identity(x):
    return x

class RFBreiman(RFBase):
    def train(self, dataset):
        def train_trees(examples_subset):
            tree = DecisionTree()
            #tree.train(examples_subset, splitting_func=ft.partial(random_subset_split, self.number_of_dimensions))
            tree.train(examples_subset, splitting_func=totally_random_split)
            #use identity function
            tree.transform_input = identity
            return tree

        if self.number_of_dimensions is None:
            # use at least one attribute (log2(d) + 1 is the usual Forest-RI default)
            self.number_of_dimensions = max(int(np.log2(dataset.num_attributes())) + 1, 1)
        points_per_sample = int(round(dataset.num_examples() / 3.0))  # integer sample size
        self.learners     = map(train_trees, dataset.bootstrap_samples(self.number_of_learners, points_per_sample))

class RFRandomInputSubset(RFBase):
    def train(self, dataset):
        def train_trees(examples_subset):
            #select a subset of dimensions
            dims               = random_subset(self.number_of_dimensions, examples_subset.num_attributes())
            subset_input       = examples_subset.inputs[dims, :]
            reduced_sample     = Dataset(subset_input, examples_subset.outputs)
            tree               = DecisionTree(reduced_sample)
            tree.dimensions_subset = dims
            def transform_input(input):
                return input[dims, :]
            tree.transform_input = transform_input
            return tree

        if self.number_of_dimensions is None:
            # use at least one attribute (log2(d) + 1 is the usual Forest-RI default)
            self.number_of_dimensions = max(int(np.log2(dataset.num_attributes())) + 1, 1)
        points_per_sample = int(round(dataset.num_examples() / 3.0))  # integer sample size
        self.learners     = map(train_trees, dataset.bootstrap_samples(self.number_of_learners, points_per_sample))

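# Illustrative sketch added for documentation (not part of the original module
# and never called): build a small forest on synthetic data and classify one
# column; RFBase.predict() returns the winning label plus the vote counts.
def _example_random_forest():
    ins    = np.matrix(np.random.rand(3, 60))
    outs   = np.matrix((ins[0, :] > 0.5).astype(int))    # label depends only on attribute 0
    forest = RFBreiman(Dataset(ins, outs), number_of_learners=10)
    label, votes = forest.predict(ins[:, 0])
    return label, votes
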
def evaluate_classifier(building_func, data, times=10.0, percentage=None, extra_args={}, test_pca=False):
    '''
        Evaluate a classifier by repeatedly dividing the dataset into training and test sets.
        @param building_func Function that will build classifier given data and args in extra_args.
        @param data Dataset to use for evaluation/training.
        @param times The number of train/test rounds to run.
        @param percentage The percentage of data to use for training.
        @param extra_args Extra arguments to pass to building_func.
        @param test_pca If True, reduce input dimensionality with PCA before training.
    '''
    print 'evaluate_classifier: extra_args', extra_args
    total_pts            = data.num_examples()
    testing_errors       = []
    training_errors      = []
    build_times          = []
    classification_times = []
    for i in range(int(times)):
        if percentage is None:
            percentage = (i+1)/float(times)
        num_examples = int(round(total_pts*percentage))
        print 'Evaluate classifier built with', percentage*100, '% data, num examples', num_examples
        subset, unselected = split_random_subset(num_examples, total_pts)
        in_subset          = data.inputs[:,  subset]
        out_subset         = data.outputs[:, subset]
        print "Building classifier..."
        if test_pca:
            print '            TESTING PCA'
            import dimreduce as dr
            subseted_dataset   = LinearDimReduceDataset(in_subset, out_subset)
            subseted_dataset.set_projection_vectors(dr.pca_vectors(subseted_dataset.inputs, percent_variance=.95))
            subseted_dataset.reduce_input()
            print 'subseted_dataset.num_attributes(), subseted_dataset.num_examples()', subseted_dataset.num_attributes(), subseted_dataset.num_examples()
        else:
            subseted_dataset   = Dataset(in_subset, out_subset)

        start_time         = time.time()
        classifier         = building_func(subseted_dataset, **extra_args)
        build_times.append(time.time() - start_time)
        print "done building..."

        ##########################################
        #Classify training set
        ##########################################
        count_selected     = []
        for i, idx in enumerate(subset):
            start_time    = time.time()
            if test_pca:
                prediction, _ = classifier.predict(data.reduce(data.inputs[:,idx]))
            else:
                prediction, _ = classifier.predict(data.inputs[:,idx])

            classification_times.append(time.time() - start_time)
            true_val      = data.outputs[:,idx]
            if prediction == true_val:
                count_selected.append(1)
            else:
                count_selected.append(0)
            if i%100 == 0:
                print i
        count_selected = np.matrix(count_selected)

        ##########################################
        #Classify testing set
        ##########################################
        confusion_matrix   = dict()
        count_unselected   = []
        print 'Total points', total_pts
        for idx in unselected:
            start_time    = time.time()
            if test_pca:
                prediction, _ = classifier.predict(data.reduce(data.inputs[:,idx]))
            else:
                prediction, _ = classifier.predict(data.inputs[:,idx])
            classification_times.append(time.time() - start_time)
            true_val      = data.outputs[:,idx]
            if prediction == true_val:
                count_unselected.append(1)
            else:
                count_unselected.append(0)
            if true_val[0,0] in confusion_matrix:
                if prediction[0,0] in confusion_matrix[true_val[0,0]]:
                    confusion_matrix[true_val[0,0]][prediction[0,0]] = confusion_matrix[true_val[0,0]][prediction[0,0]] + 1
                else:
                    confusion_matrix[true_val[0,0]][prediction[0,0]] = 1
            else:
                confusion_matrix[true_val[0,0]] = dict()
                confusion_matrix[true_val[0,0]][prediction[0,0]] = 1

        # percentage of correctly classified examples in each set
        training_error = 100.0 * np.sum(count_selected) / float(len(subset))
        testing_error  = 100.0 * np.sum(count_unselected) / float(len(unselected))
        testing_errors.append(testing_error)
        training_errors.append(training_error)
        print 'Correct on training set', training_error, '%'
        print '        on testing set',  testing_error, '%'
        print 'Confusion'
        for k in confusion_matrix.keys():
            total = 0.0
            for k2 in confusion_matrix[k]:
                total = total + confusion_matrix[k][k2]
            for k2 in confusion_matrix[k]:
                print 'true class', k, 'classified as', k2, 100.0 * (confusion_matrix[k][k2] / total), '% of the time'

    def print_stats(name, list_data):
        m = np.matrix(list_data)
        print '%s: average %f std %f' % (name, m.mean(), np.std(m))

    print_stats('percent correct on training set', training_errors)
    print_stats('percent correct on testing set', testing_errors)
    print_stats('build time', build_times)
    print_stats('classification time', classification_times)


if __name__ == '__main__':
    test_iris         = False
    test_pickle       = True
    test_number_trees = False
    test_pca          = False
    if test_iris:
        #Setup for repeated testing
        iris_array = np.matrix(np.loadtxt('iris.data', dtype='|S30', delimiter=','))
        inputs     = np.float32(iris_array[:, 0:4]).T
        outputs    = iris_array[:, 4].T
        dataset    = Dataset(inputs, outputs)

        print '================================'
        print "Test DecisionTree"
        evaluate_classifier(DecisionTree, dataset, 5, .9)

        print '================================'
        #print "Test random forest"
        #for i in range(4):
        #    #print "Test RFRandomInputSubset"
        #    #evaluate_classifier(RFRandomInputSubset, dataset, 1, .7)
        #    print "Test RFBreiman"
        #    evaluate_classifier(RFEntropySplitRandomInputSubset, dataset, 1, .7)
    if test_pickle:
        import pickle as pk
        def load_pickle(filename):
            p = open(filename, 'rb')
            picklelicious = pk.load(p)
            p.close()
            return picklelicious

        def print_separator(times=2):
            for i in xrange(times):
                print '==============================================================='

        dataset = load_pickle('PatchClassifier.dataset.pickle')
        #if test_pca:
        #    print_separator(1)
        #    print_separator(1)
        #    dataset.reduce_input()

        if test_number_trees:
            tree_types = [RFBreiman, RFRandomInputSubset]
            #tree_types = [RFBreiman]
            for tree_type in tree_types:
                print_separator()
                print 'Testing', tree_type
                for i in range(10):
                    print tree_type, 'using', (i+1)*10, 'trees'
                    evaluate_classifier(tree_type, dataset, 3, .95,
                            extra_args={'number_of_learners': (i+1)*10}, test_pca=test_pca)
        else:
            tree_types = [RFBreiman, RFRandomInputSubset]
            #tree_types = [RFRandomInputSubset]
            for tree_type in tree_types:
                print_separator()
                print tree_type
                evaluate_classifier(tree_type, dataset, 10, .95,
                        extra_args={'number_of_learners': 70}, test_pca=test_pca)