random_forest.py
## @author Hai Nguyen/hai@gatech.edu
import roslib; roslib.load_manifest('ml_lib')
import rospy

import numpy as np
import itertools as it
import functools as ft
import time
import dataset as ds

##
# Base class for random forest classifier
#
class RFBase:

    ##
    # @param dataset Dataset object to train on (optional, can train later)
    # @param number_of_dimensions   number of dimensions to use when splitting;
    #                               roughly 10-20% of the original data
    #                               dimension is a reasonable starting point
    # @param number_of_learners     number of trees in the forest; more is
    #                               generally better, limited mainly by
    #                               training and prediction time
    def __init__(self, dataset=None, number_of_dimensions=None, number_of_learners=100):
        self.number_of_learners   = number_of_learners
        self.number_of_dimensions = number_of_dimensions
        if dataset is not None:
            self.train(dataset)

    ##
    # @param data input column vector to classify
    # @param vote_combine_function function to combine votes, by default
    #                              returns the label with the most votes
    def predict(self, data, vote_combine_function=None):
        def predict_(learner):
            return learner.predict(learner.transform_input(data, learner))
        predictions = map(predict_, self.learners)
        if vote_combine_function is not None:
            return vote_combine_function(predictions)
        else:
            return mode_exhaustive(predictions)

    def train(self, dataset):
        pass

    def avg_tree_depth(self):
        return np.average(map(DecisionTree.get_tree_depth, self.learners))

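# Illustrative usage of the vote_combine_function hook (not from the original
# file): the default combiner is mode_exhaustive, which returns the pair
# (winning label, vote-count dict).  A custom combiner receives the raw list of
# per-tree predictions.  'forest' and 'sample' below are hypothetical
# placeholders for a trained forest and a single input column vector.
#
#   def vote_fraction(votes):
#       label, counts = mode_exhaustive(votes)
#       return label, counts[label] / float(len(votes))
#
#   label, confidence = forest.predict(sample, vote_combine_function=vote_fraction)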

##
# Train a random forest using DecisionTrees on bootstrap samples using splits
# on random attributes and random values on those attributes.
#
class RFBreiman(RFBase):
    def train(self, dataset):
        def train_trees(examples_subset):
            tree = DecisionTree()
            #tree.train(examples_subset, splitting_func=ft.partial(random_subset_split, self.number_of_dimensions))
            tree.train(examples_subset, splitting_func=totally_random_split)
            #DecisionTree.transform_input is already the identity, so each tree
            #sees the full input vector at prediction time
            return tree

        if self.number_of_dimensions is None:
            #default to roughly log2(d) + 1 dimensions, and at least one
            self.number_of_dimensions = max(int(np.log2(dataset.num_attributes()) + 1), 1)
        points_per_sample = dataset.num_examples() * 1.0 / 3.0
        #ds.bootstrap_samples is assumed to return (in-bag, out-of-bag) sample
        #lists, matching how it is used in RFRandomInputSubset below
        in_bags, out_bags = ds.bootstrap_samples(dataset, self.number_of_learners, points_per_sample)
        self.learners     = map(train_trees, in_bags)

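# Example usage sketch (mirrors the construction in the __main__ block below);
# ds.Dataset is assumed to take an attributes-by-examples input matrix and a
# 1-by-examples output matrix:
#
#   data   = ds.Dataset(np.mat(np.random.rand(4, 20)),           # 4 attributes, 20 examples
#                       np.mat(np.round(np.random.rand(1, 20)))) # binary labels
#   forest = RFBreiman(data, number_of_learners=50)
#   label, votes = forest.predict(np.mat(np.random.rand(4, 1)))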
##
# Train a random forest using DecisionTrees on bootstrap samples where each
# sample has a random subset of dimensions but the split point is chosen
# using a minimum entropy criterion.
#
class RFRandomInputSubset(RFBase):
    def train(self, dataset):
        def train_trees(examples_subset):
            #select a subset of dimensions
            dims               = random_subset(self.number_of_dimensions, examples_subset.num_attributes())
            subset_input       = examples_subset.inputs[dims, :]
            reduced_sample     = ds.Dataset(subset_input, examples_subset.outputs)
            tree               = DecisionTree(reduced_sample)
            tree.dimensions_subset = dims
            #RFBase.predict calls learner.transform_input(data, learner), so
            #give each tree the forest's dimension-subsetting transform
            tree.transform_input = self.transform_input
            return tree

        if self.number_of_dimensions is None:
            #default to roughly log2(d) + 1 dimensions, and at least one
            self.number_of_dimensions = max(int(np.log2(dataset.num_attributes()) + 1), 1)
        points_per_sample = dataset.num_examples() * 1.0 / 3.0
        in_bags, out_bags = ds.bootstrap_samples(dataset, 
                                                 self.number_of_learners, 
                                                 points_per_sample)
        self.learners     = map(train_trees, in_bags)

    def transform_input(self, input, tree=None):
        return input[tree.dimensions_subset, :]


def binary_less_than(attribute, threshold, input_vec):
    return input_vec[attribute,0] <= threshold

def binary_greater_than(attribute, threshold, input_vec):
    return input_vec[attribute,0] > threshold

def create_binary_tests(attribute, threshold):
    return [('binary_less_than', attribute, threshold), 
            ('binary_greater_than', attribute, threshold)]
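
# Worked example (illustrative): create_binary_tests(2, 0.5) returns
#   [('binary_less_than', 2, 0.5), ('binary_greater_than', 2, 0.5)]
# and DecisionTree.predict dispatches each tuple back to the functions above,
# e.g. binary_less_than(2, 0.5, input_vec) checks input_vec[2, 0] <= 0.5.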

###############################################################################
# Helper functions
###############################################################################

##
# Finds the mode of a given set of votes
# @return (most common element, dictionary of counts for each element)
#
def mode_exhaustive(set):

    #Count, store in dictionary
    mdict = dict()
    for s in set:
        has_stored = False
        keys = mdict.keys()
        keys.sort() # sorting these keys gives determinism to the classifier
        for k in keys:
            if k == s:
                mdict[k] = 1+mdict[k]
                has_stored = True
        if not has_stored:
            mdict[s] = 1

    #Find the key with maximum votes
    max_key   = None
    max_count = -1
    keys = mdict.keys()
    keys.sort() # sorting these keys gives determinism to the classifier
    for k in keys:
        if mdict[k] > max_count:
            max_key   = k
            max_count = mdict[k]
    #print 'mode_exhaustive: ', mdict
    return max_key, mdict

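# Example (illustrative): mode_exhaustive(['a', 'b', 'a']) returns
# ('a', {'a': 2, 'b': 1}) -- the most common element plus the full vote-count
# dictionary.  RFBase.predict returns this pair when no vote_combine_function
# is supplied.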
##
# Find the split that produces subsets with the minimum combined entropy.
#
# @param dataset Dataset to split
# @return (splitting attribute, splitting point for that attribute)
def min_entropy_split(dataset):
    #print 'in min_entropy_split'
    # Assume inputs are continuous, and are column vectors.
    hypotheses     = []
    entropies      = []
    # For each attribute find the best split point.
    for attribute in xrange(dataset.num_attributes()):
        values = ds.unique_values(dataset.inputs, attribute)
        #Iterate over the possible values of split & calculate entropy for each split.
        for split_point in values:
            def calc_entropy(data_set):
                num_points = data_set.num_examples()
                #weight each subset's entropy by the fraction of examples it contains
                return (num_points / float(dataset.num_examples())) * ds.dataset_entropy_discrete(data_set)
            split_entropy = map(calc_entropy, ds.split_continuous(dataset, attribute, split_point))
            hypotheses.append((attribute, split_point))
            entropies.append(sum(split_entropy))
    # Select the attribute split pair that has the lowest entropy.
    entropies = np.matrix(entropies)
    min_idx   = np.argmin(entropies)
    return hypotheses[min_idx]

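# The criterion above is the standard weighted child entropy: for a candidate
# split of S into subsets S_1..S_k,
#
#   H_split = sum_i (|S_i| / |S|) * H(S_i)
#
# and the (attribute, split_point) pair with the smallest H_split wins.  A
# small illustrative sketch of the discrete entropy H (the actual
# implementation used above lives in the dataset module):
#
#   def discrete_entropy(labels):
#       counts = {}
#       for l in labels:
#           counts[l] = counts.get(l, 0) + 1
#       total = float(len(labels))
#       return -sum((c / total) * np.log2(c / total) for c in counts.values())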
def random_subset(subset_size, total_size):
    #print 'in random_subset'
    assert(subset_size <= total_size)
    occupancy = np.matrix(np.zeros((1, total_size)))
    while occupancy.sum() < subset_size:
        occupancy[0, np.random.randint(0, total_size)] = 1
    rows, columns = np.where(occupancy > 0)
    return columns.A[0]

def split_random_subset(subset_size, total_size):
    assert(subset_size <= total_size)
    occupancy = np.matrix(np.zeros((1, total_size)))
    while occupancy.sum() < subset_size:
        occupancy[0, np.random.randint(0, total_size)] = 1
    bool_sel                = occupancy > 0
    rows, columns_subset    = np.where(bool_sel)
    rows, columns_remaining = np.where(np.invert(bool_sel))
    return columns_subset.A[0], columns_remaining.A[0]

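# Example (illustrative): random_subset(3, 10) returns 3 distinct indices drawn
# from 0..9, e.g. array([1, 4, 7]); split_random_subset(3, 10) returns that
# subset plus the remaining 7 indices.  evaluate_classifier below uses the two
# halves as a train/test split.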
##
# Decision tree splitter: min-entropy split restricted to a random subset of
# attributes
def random_subset_split(num_subset, dataset):
    #print 'in random_subset_split'
    #print 'num_subset', num_subset, dataset, 'dataset.input.shape', dataset.inputs.shape
    subset_indexes   = random_subset(num_subset, dataset.num_attributes())
    sub_dataset      = ds.Dataset(dataset.inputs[subset_indexes,:], dataset.outputs)
    attribute, point = min_entropy_split(sub_dataset)
    return subset_indexes[attribute], point

##
# Decision tree splitter: picks a random attribute and a random split value
# drawn from that attribute's observed values
def totally_random_split(dataset):
    #print 'totally random'
    attr     = np.random.randint(0, dataset.num_attributes())
    split_pt = dataset.inputs[attr, np.random.randint(0, dataset.num_examples())]
    return attr, split_pt

###############################################################################
# Basic DecisionTree that the random forest is based on
###############################################################################
class DecisionTree:

    def __init__(self, dataset=None, splitting_func=min_entropy_split):
        self.children   = None
        self.prediction = None
        if dataset is not None:
            self.train(dataset, splitting_func=splitting_func)

    def train(self, dataset, splitting_func=min_entropy_split):
        if not self.make_leaf(dataset):
            #print 'in train.splitting', dataset.num_examples()
            self.split_attribute, self.split_point = splitting_func(dataset)
            #print 'self.split_attribute, self.split_point', self.split_attribute, self.split_point 
            data_sets = ds.split_continuous(dataset, self.split_attribute, self.split_point)
            if len(data_sets) < 2:
                self.prediction = dataset.outputs
                return

            def tree_split(set):
                #print 'tree', set.num_examples()
                return DecisionTree(set, splitting_func=splitting_func)
            # Create & train child decision nodes
            tests            = create_binary_tests(self.split_attribute, self.split_point)
            self.children    = zip(tests, map(tree_split, data_sets))

    ##
    # Makes this node a leaf (setting self.prediction) if all outputs are
    # identical or all inputs are identical; returns True if a leaf was made.
    def make_leaf(self, dataset):
        if np.all(dataset.outputs[:,0] == dataset.outputs):
            self.prediction = dataset.outputs[:,0]
            #print 'leaf'
            return True
        elif np.all(dataset.inputs[:,0] == dataset.inputs):
            self.prediction = dataset.outputs
            #print 'leaf'
            return True
        else:
            return False

    ##
    # Classify a single input column vector by descending the tree; at a leaf
    # holding several stored outputs, one of them is returned at random.
    def predict(self, input):
        if self.prediction is not None:
            return self.prediction[:, np.random.randint(0, self.prediction.shape[1])]
        else:
            for test, child in self.children:
                test_func_name, attribute, threshold = test
                if test_func_name == 'binary_less_than':
                    test_func = binary_less_than
                elif test_func_name == 'binary_greater_than':
                    test_func = binary_greater_than
                else:
                    rospy.logerr("DecisionTree bad function name : %s" % test_func_name)
                    continue
                if test_func(attribute, threshold, input):
                    return child.predict(input)
            raise RuntimeError("DecisionTree: splits not exhaustive, unable to split for input " + str(input.T))

    ##
    # Identity function
    def transform_input(self, input, tree=None):
        return input

    def get_tree_depth(self):
        if self.prediction is not None:
            return 1

        depths = []
        for test, child in self.children:
            depths.append(child.get_tree_depth() + 1)

        return max(depths)


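# Example usage sketch for a standalone DecisionTree (mirrors the __main__
# block below; inputs are attributes x examples, outputs are 1 x examples):
#
#   data  = ds.Dataset(np.mat(np.random.rand(4, 20)),
#                      np.mat(np.round(np.random.rand(1, 20))))
#   dt    = DecisionTree(data)                    # trained with min_entropy_split
#   label = dt.predict(np.mat(np.random.rand(4, 1)))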
##
# Evaluate a classifier by dividing the dataset into training and test sets.
# @param building_func Function that will build classifier given data and args in extra_args.
# @param data Dataset to use for evaluation/training.
# @param times The number of train/test rounds to run.
# @param percentage The percentage of data to use for training.
# @param extra_args Extra arguments to pass to building_func.
# @param test_pca If True, reduce the training inputs with PCA before building the classifier.
def evaluate_classifier(building_func, data, times=10.0, percentage=None, extra_args={}, test_pca=False):
    print 'evaluate_classifier: extra_args', extra_args
    total_pts            = data.num_examples()
    testing_accuracies   = []
    training_accuracies  = []
    build_times          = []
    classification_times = []
    for i in range(int(times)):
        if percentage is None:
            percentage = (i+1) / float(times)
        num_examples = int(round(total_pts*percentage))
        print 'Evaluate classifier built with', percentage*100, '% data, num examples', num_examples
        subset, unselected = split_random_subset(num_examples, total_pts)
        train_inputs       = data.inputs[:,  subset]
        train_outputs      = data.outputs[:, subset]
        print "Building classifier..."
        if test_pca:
            print '            TESTING PCA'
            import dimreduce as dr
            subseted_dataset = dr.LinearDimReduceDataset(train_inputs, train_outputs)
            subseted_dataset.set_projection_vectors(dr.pca_vectors(subseted_dataset.inputs, percent_variance=.95))
            subseted_dataset.reduce_input()
            print 'subseted_dataset.num_attributes(), subseted_dataset.num_examples()', subseted_dataset.num_attributes(), subseted_dataset.num_examples()
        else:
            subseted_dataset = ds.Dataset(train_inputs, train_outputs)

        start_time         = time.time()
        classifier         = building_func(subseted_dataset, **extra_args)
        build_times.append(time.time() - start_time)
        print "done building..."

        ##########################################
        #Classify training set
        ##########################################
        count_selected     = []
        for i, idx in enumerate(subset):
            start_time    = time.time()
            if test_pca:
                prediction, _ = classifier.predict(data.reduce(data.inputs[:,idx]))
            else:
                prediction, _ = classifier.predict(data.inputs[:,idx])

            classification_times.append(time.time() - start_time)
            true_val      = data.outputs[:,idx]
            if prediction == true_val:
                count_selected.append(1)
            else:
                count_selected.append(0)
            if i%100 == 0:
                print i
        count_selected = np.matrix(count_selected)

        ##########################################
        #Classify testing set
        ##########################################
        confusion_matrix   = dict()
        count_unselected   = []
        print 'Total points', total_pts
        for idx in unselected:
            start_time    = time.time()
            if test_pca:
                prediction, _ = classifier.predict(data.reduce(data.inputs[:,idx]))
            else:
                prediction, _ = classifier.predict(data.inputs[:,idx])
            classification_times.append(time.time() - start_time)
            true_val      = data.outputs[:,idx]
            if prediction == true_val:
                count_unselected.append(1)
            else:
                count_unselected.append(0)
            if confusion_matrix.has_key(true_val[0,0]):
                if confusion_matrix[true_val[0,0]].has_key(prediction[0,0]):
                    confusion_matrix[true_val[0,0]][prediction[0,0]] = confusion_matrix[true_val[0,0]][prediction[0,0]] + 1
                else:
                    confusion_matrix[true_val[0,0]][prediction[0,0]] = 1
            else:
                confusion_matrix[true_val[0,0]] = dict()
                confusion_matrix[true_val[0,0]][prediction[0,0]] = 1

        #percentage of correctly classified examples in each set
        training_accuracy = 100.0 * np.sum(count_selected) / float(len(subset))
        testing_accuracy  = 100.0 * np.sum(count_unselected) / float(len(unselected))
        testing_accuracies.append(testing_accuracy)
        training_accuracies.append(training_accuracy)
        print 'Correct on training set', training_accuracy, '%'
        print '        on testing set',  testing_accuracy, '%'
        print 'Confusion'
        for k in confusion_matrix.keys():
            row_total = 0.0
            for k2 in confusion_matrix[k]:
                row_total = row_total + confusion_matrix[k][k2]
            for k2 in confusion_matrix[k]:
                print 'true class', k, 'classified as', k2, 100.0 * (confusion_matrix[k][k2] / row_total), '% of the time'

    def print_stats(name, list_data):
        m = np.matrix(list_data)
        print '%s: average %f std %f' % (name, m.mean(), np.std(m))

    print_stats('training accuracy', training_accuracies)
    print_stats('testing accuracy', testing_accuracies)
    print_stats('build time', build_times)
    print_stats('classification time', classification_times)

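# Example call (illustrative, mirrors the __main__ block below); 'dataset' is
# assumed to be a ds.Dataset holding the full labelled data:
#
#   evaluate_classifier(RFBreiman, dataset, times=5, percentage=0.9,
#                       extra_args={'number_of_learners': 50})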

if __name__ == '__main__':
    test_iris         = False
    test_pickle       = False
    test_number_trees = False
    test_pca          = False
    test_packing      = False
    test_new_design   = True

    import pickle as pk
    def save_pickle(obj, filename):
        p = open(filename, 'wb')
        pk.dump(obj, p)
        p.close()
    def load_pickle(filename):
        p = open(filename, 'rb')
        picklelicious = pk.load(p)
        p.close()
        return picklelicious

    if test_iris:
        #Setup for repeated testing
        iris_array = np.matrix(np.loadtxt('iris.data', dtype='|S30', delimiter=','))
        inputs     = np.float32(iris_array[:, 0:4]).T
        outputs    = iris_array[:, 4].T
        dataset    = ds.Dataset(inputs, outputs)

        print '================================'
        print "Test DecisionTree"
        evaluate_classifier(DecisionTree, dataset, 5, .9)

        print '================================'
        #print "Test random forest"
        #for i in range(4):
        #    #print "Test RFRandomInputSubset"
        #    #evaluate_classifier(RFRandomInputSubset, dataset, 1, .7)
        #    print "Test RFBreiman"
        #    evaluate_classifier(RFEntropySplitRandomInputSubset, dataset, 1, .7)
    if test_pickle:

        def print_separator(times=2):
            for i in xrange(times):
                print '==============================================================='

        dataset = load_pickle('PatchClassifier.dataset.pickle')
        #if test_pca:
        #    print_separator(1)
        #    print_separator(1)
        #    dataset.reduce_input()

        if test_number_trees:
            tree_types = [RFBreiman, RFRandomInputSubset]
            #tree_types = [RFBreiman]
            for tree_type in tree_types:
                print_separator()
                print 'Testing', tree_type
                for i in range(10):
                    print tree_type, 'using', (i+1)*10, 'trees'
                    evaluate_classifier(tree_type, dataset, 3, .95, 
                            extra_args={'number_of_learners': (i+1)*10}, test_pca=test_pca)
        else:
            tree_types = [RFBreiman, RFRandomInputSubset]
            #tree_types = [RFRandomInputSubset]
            for tree_type in tree_types:
                print_separator()
                print tree_type
                evaluate_classifier(tree_type, dataset, 10, .95, 
                        extra_args={'number_of_learners': 70}, test_pca=test_pca)
    if test_packing:
        train = np.mat(np.random.rand(4, 20))
        resp = np.mat(np.round(np.random.rand(1, 20)))
        data = ds.Dataset(train, resp)
        dt = DecisionTree(data)
        packed = dt.pack_tree()
        print packed
        new_dt = DecisionTree()
        new_dt.unpack_tree(packed)
        bad = False
        for i in range(500):
            np.random.seed(i)
            test = np.mat(np.random.rand(4, 1))
            np.random.seed(1)
            pred_old = dt.predict(test)
            np.random.seed(1)
            pred_new = new_dt.predict(test)
            if pred_old != pred_new:
                print "Not same prediction:"
                print pred_old, pred_new
                bad = True
        if not bad:
            print "Prediction tests successful!"

        dt = RFBreiman(data)
        packed = dt.pack_rf()
        new_dt = RFBreiman()
        new_dt.unpack_rf(packed)
        bad = False
        for i in range(500):
            np.random.seed(i)
            test = np.mat(np.random.rand(4, 1))
            np.random.seed(1)
            pred_old, _ = dt.predict(test)
            np.random.seed(1)
            pred_new, _ = new_dt.predict(test)
            if pred_old != pred_new:
                print "RF Not same prediction:"
                print pred_old, pred_new
                bad = True
        if not bad:
            print "RF Prediction tests successful!"

    if test_new_design:
        np.random.seed(2)
        train = np.mat(np.random.rand(4, 20))
        resp = np.mat(np.round(np.random.rand(1, 20)))
        data = ds.Dataset(train, resp)
        dt = RFBreiman(data)
        save_pickle(dt, "rfbreiman.pickle")
        dt = load_pickle("rfbreiman.pickle")
        preds = []
        for i in range(500):
            np.random.seed(i)
            test = np.mat(np.random.rand(4, 1))
            np.random.seed(1)
            pred, pred_dict = dt.predict(test)
            preds.append(pred[0,0])
        print preds