00001
import roslib; roslib.load_manifest('ml_lib')
import rospy
00003
00004 import numpy as np
00005 import itertools as it
00006 import functools as ft
00007 import time
00008 import dataset as ds
00009
00010
00011
00012
00013 class RFBase:
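    ''' Base class for random forest classifiers: holds the ensemble of
        trees, combines their votes at prediction time, and leaves the
        actual training strategy to subclasses. '''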
00014
00015
00016
00017
00018
00019
    def __init__(self, dataset=None, number_of_dimensions=None, number_of_learners=100):
        self.number_of_learners = number_of_learners
        self.number_of_dimensions = number_of_dimensions
        if dataset is not None:
            self.train(dataset)
00025
00026
00027
00028
00029
00030 def predict(self, data, vote_combine_function=None):
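        ''' Classify 'data' with every learner in the forest and combine
            the resulting votes, by default with mode_exhaustive. '''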
00031 def predict_(learner):
00032 return learner.predict(learner.transform_input(data, learner))
00033 predictions = map(predict_,self.learners)
00034 if vote_combine_function is not None:
00035 return vote_combine_function(predictions)
00036 else:
00037 return mode_exhaustive(predictions)
00038
00039 def train(self, dataset):
00040 pass
00041
00042 def avg_tree_depth(self):
00043 return np.average(map(DecisionTree.get_tree_depth, self.learners))

def identity(input, tree=None):
    ''' Input transform used by trees trained on all dimensions:
        return the input unchanged. '''
    return input
00044
00045
00046
00047
00048
00049
00050 class RFBreiman(RFBase):
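    ''' Random forest where each tree is grown on a bootstrap sample of
        the examples using totally random splits (random attribute,
        random threshold). '''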
    def train(self, dataset):
        def train_trees(examples_subset):
            tree = DecisionTree()
            # Randomization: each split picks a random attribute and a
            # random threshold.
            tree.train(examples_subset, splitting_func=totally_random_split)
            # Trees are trained on all dimensions, so no input transform
            # is needed at prediction time.
            tree.transform_input = identity
            return tree

        if self.number_of_dimensions is None:
            # Use at least one dimension, growing logarithmically with
            # the number of attributes.
            self.number_of_dimensions = max(int(np.log2(dataset.num_attributes())) + 1, 1)
        points_per_sample = dataset.num_examples() * 1.0 / 3.0
        in_bags, out_bags = ds.bootstrap_samples(dataset, self.number_of_learners, points_per_sample)
        self.learners = map(train_trees, in_bags)
00064
00065
00066
00067
00068
00069
00070 class RFRandomInputSubset(RFBase):
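    ''' Random forest where each tree is grown on a bootstrap sample
        restricted to a random subset of the input dimensions. '''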
    def train(self, dataset):
        def train_trees(examples_subset):
            # Pick a random subset of dimensions for this tree and train
            # on the reduced inputs.
            dims = random_subset(self.number_of_dimensions, examples_subset.num_attributes())
            subset_input = examples_subset.inputs[dims, :]
            reduced_sample = ds.Dataset(subset_input, examples_subset.outputs)
            tree = DecisionTree(reduced_sample)
            tree.dimensions_subset = dims
            # At prediction time the tree must see only the dimensions
            # it was trained on.
            tree.transform_input = self.transform_input
            return tree

        if self.number_of_dimensions is None:
            self.number_of_dimensions = max(int(np.log2(dataset.num_attributes())) + 1, 1)
        points_per_sample = dataset.num_examples() * 1.0 / 3.0
        in_bags, out_bags = ds.bootstrap_samples(dataset,
                                                 self.number_of_learners,
                                                 points_per_sample)
        self.learners = map(train_trees, in_bags)

    def transform_input(self, input, tree=None):
        return input[tree.dimensions_subset, :]
00091
00092
00093 def binary_less_than(attribute, threshold, input_vec):
00094 return input_vec[attribute,0] <= threshold
00095
00096 def binary_greater_than(attribute, threshold, input_vec):
00097 return input_vec[attribute,0] > threshold
00098
00099 def create_binary_tests(attribute, threshold):
00100 return [('binary_less_than', attribute, threshold),
00101 ('binary_greater_than', attribute, threshold)]
00102
00103
00104
00105
00106
00107
00108
00109
def mode_exhaustive(votes):
    ''' Return the most frequent element of 'votes' along with the full
        dictionary of counts (element -> count).  Equality is checked
        exhaustively with == so that keys which hash differently but
        compare equal are still counted together. '''
    mdict = dict()
    for v in votes:
        has_stored = False
        for k in mdict.keys():
            if k == v:
                mdict[k] += 1
                has_stored = True
        if not has_stored:
            mdict[v] = 1

    # Find the key with the maximum count, scanning keys in sorted order
    # so that ties break deterministically.
    max_key = None
    max_count = -1
    keys = mdict.keys()
    keys.sort()
    for k in keys:
        if mdict[k] > max_count:
            max_key = k
            max_count = mdict[k]

    return max_key, mdict
00136
00137
00138
00139
00140
00141
00142 def min_entropy_split(dataset):
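    ''' Exhaustively search every (attribute, split point) hypothesis and
        return the pair whose weighted child entropy is smallest. '''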
00143
00144
    hypotheses = []
    entropies = []
    for attribute in xrange(dataset.num_attributes()):
        values = ds.unique_values(dataset.inputs, attribute)
        # Evaluate a split at each observed value of this attribute.
        for split_point in values:
            def calc_entropy(data_set):
                num_points = data_set.num_examples()
                # Weight each child's entropy by the fraction of examples
                # that fall into it.
                return (num_points / float(dataset.num_examples())) * ds.dataset_entropy_discrete(data_set)
            split_entropy = map(calc_entropy, ds.split_continuous(dataset, attribute, split_point))
            hypotheses.append((attribute, split_point))
            entropies.append(sum(split_entropy))

    entropies = np.matrix(entropies)
    min_idx = np.argmin(entropies)
    return hypotheses[min_idx]
00163
00164 def random_subset(subset_size, total_size):
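    ''' Return 'subset_size' distinct indices drawn at random from
        range(total_size), as a sorted integer array. '''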
00165
00166 assert(subset_size <= total_size)
00167 occupancy = np.matrix(np.zeros((1, total_size)))
00168 while occupancy.sum() < subset_size:
00169 occupancy[0, np.random.randint(0, total_size)] = 1
00170 rows, columns = np.where(occupancy > 0)
00171 return columns.A[0]
00172
00173 def split_random_subset(subset_size, total_size):
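    ''' Randomly partition range(total_size) into a subset of
        'subset_size' indices and the array of remaining indices. '''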
00174 assert(subset_size <= total_size)
00175 occupancy = np.matrix(np.zeros((1, total_size)))
00176 while occupancy.sum() < subset_size:
00177 occupancy[0, np.random.randint(0, total_size)] = 1
00178 bool_sel = occupancy > 0
00179 rows, columns_subset = np.where(bool_sel)
00180 rows, columns_remaining = np.where(np.invert(bool_sel))
00181 return columns_subset.A[0], columns_remaining.A[0]
00182
00183
00184
00185 def random_subset_split(num_subset, dataset):
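    ''' Restrict the dataset to 'num_subset' random dimensions, find the
        minimum-entropy split there, and map the winning attribute back
        to its original dimension index. '''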
00186
00187
00188 subset_indexes = random_subset(num_subset, dataset.num_attributes())
    sub_dataset = ds.Dataset(dataset.inputs[subset_indexes,:], dataset.outputs)
00190 attribute, point = min_entropy_split(sub_dataset)
00191 return subset_indexes[attribute], point
00192
00193 def totally_random_split(dataset):
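    ''' Pick a random attribute, then use the value of a random example
        on that attribute as the split point. '''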
00194
00195 attr = np.random.randint(0, dataset.num_attributes())
00196 split_pt = dataset.inputs[attr, np.random.randint(0, dataset.num_examples())]
00197 return attr, split_pt
00198
00199
00200
00201 class DecisionTree:
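    ''' Binary decision tree over continuous inputs.  Internal nodes
        store a (split attribute, threshold) test and their children;
        leaves store the outputs of the training examples that reached
        them. '''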
00202
00203 def __init__(self, dataset=None, splitting_func=min_entropy_split):
00204 self.children = None
00205 self.prediction = None
00206 if dataset is not None:
00207 self.train(dataset, splitting_func=splitting_func)
00208
    def train(self, dataset, splitting_func=min_entropy_split):
        if not self.make_leaf(dataset):
            # Choose a split, then recursively grow a subtree on each side.
            self.split_attribute, self.split_point = splitting_func(dataset)
            data_sets = ds.split_continuous(dataset, self.split_attribute, self.split_point)
            if len(data_sets) < 2:
                # Degenerate split: fall back to a leaf that stores all outputs.
                self.prediction = dataset.outputs
                return

            def tree_split(examples):
                return DecisionTree(examples, splitting_func=splitting_func)

            tests = create_binary_tests(self.split_attribute, self.split_point)
            self.children = zip(tests, map(tree_split, data_sets))
00225
    def make_leaf(self, dataset):
        if np.all(dataset.outputs[:,0] == dataset.outputs):
            # All outputs agree: predict that output.
            self.prediction = dataset.outputs[:,0]
            return True
        elif np.all(dataset.inputs[:,0] == dataset.inputs):
            # Inputs are indistinguishable but outputs differ: store all
            # outputs and sample one at prediction time.
            self.prediction = dataset.outputs
            return True
        else:
            return False
00237
    def predict(self, input):
        if self.prediction is not None:
            # Leaf: sample uniformly from the stored outputs.
            return self.prediction[:, np.random.randint(0, self.prediction.shape[1])]
        else:
            for test, child in self.children:
                test_func_name, attribute, threshold = test
                if test_func_name == 'binary_less_than':
                    test_func = binary_less_than
                elif test_func_name == 'binary_greater_than':
                    test_func = binary_greater_than
                else:
                    rospy.logerr("DecisionTree bad function name : %s" % test_func_name)
                    raise ValueError("DecisionTree: unknown test function '%s'" % test_func_name)
                if test_func(attribute, threshold, input):
                    return child.predict(input)
            raise RuntimeError("DecisionTree: splits not exhaustive, unable to split for input " + str(input.T))
00254
00255
00256
00257 def transform_input(self, input, tree=None):
00258 return input
00259
00260 def get_tree_depth(self):
00261 if self.prediction is not None:
00262 return 1
00263
00264 depths = []
00265 for test, child in self.children:
00266 depths.append(child.get_tree_depth() + 1)
00267
00268 return max(depths)
00269
00270
00271
00272
00273
00274
00275
00276
00277
def evaluate_classifier(building_func, data, times=10, percentage=None, extra_args={}, test_pca=False):
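    ''' Repeatedly train a classifier built by 'building_func' on random
        subsets of 'data', then report training/testing accuracy, build
        time, classification time, and a confusion matrix over the
        held-out points. '''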
00279 print 'evaluate_classifier: extra_args', extra_args
    total_pts = data.num_examples()
    testing_accuracies = []
    training_accuracies = []
    build_times = []
    classification_times = []
    for i in range(int(times)):
        if percentage is None:
            # No percentage given: sweep from 1/times up to all of the data.
            cur_percentage = (i+1) / float(times)
        else:
            cur_percentage = percentage
        num_examples = int(round(total_pts * cur_percentage))
        print 'Evaluate classifier built with', cur_percentage*100, '% data, num examples', num_examples
        subset, unselected = split_random_subset(num_examples, total_pts)
        ins = data.inputs[:, subset]
        outs = data.outputs[:, subset]
00293 print "Building classifier..."
00294 if test_pca:
00295 print ' TESTING PCA'
00296 import dimreduce as dr
00297 subseted_dataset = dr.LinearDimReduceDataset(i,o)
00298 subseted_dataset.set_projection_vectors(dr.pca_vectors(subseted_dataset.inputs, percent_variance=.95))
00299 subseted_dataset.reduce_input()
00300 print 'subseted_dataset.num_attributes(), subseted_dataset.num_examples()', subseted_dataset.num_attributes(), subseted_dataset.num_examples()
00301 else:
00302 subseted_dataset = Dataset(i,o)
00303
00304 start_time = time.time()
00305 classifier = building_func(subseted_dataset, **extra_args)
00306 build_times.append(time.time() - start_time)
00307 print "done building..."
00308
00309
00310
00311
        # Training accuracy: classify the points used to build the classifier.
        count_selected = []
        for cnt, idx in enumerate(subset):
            start_time = time.time()
            if test_pca:
                # The projection vectors live on subseted_dataset, so
                # reduce test points with it.
                result = classifier.predict(subseted_dataset.reduce(data.inputs[:,idx]))
            else:
                result = classifier.predict(data.inputs[:,idx])
            # Forests return (prediction, vote counts); a bare DecisionTree
            # returns only the prediction.
            prediction = result[0] if isinstance(result, tuple) else result
            classification_times.append(time.time() - start_time)
            true_val = data.outputs[:,idx]
            if prediction == true_val:
                count_selected.append(1)
            else:
                count_selected.append(0)
            if cnt % 100 == 0:
                print cnt
        count_selected = np.matrix(count_selected)
00329
00330
00331
00332
        # Testing accuracy and confusion matrix over the held-out points.
        confusion_matrix = dict()
        count_unselected = []
        print 'Total points', total_pts
        for idx in unselected:
            start_time = time.time()
            if test_pca:
                result = classifier.predict(subseted_dataset.reduce(data.inputs[:,idx]))
            else:
                result = classifier.predict(data.inputs[:,idx])
            prediction = result[0] if isinstance(result, tuple) else result
            classification_times.append(time.time() - start_time)
            true_val = data.outputs[:,idx]
            if prediction == true_val:
                count_unselected.append(1)
            else:
                count_unselected.append(0)
            true_label = true_val[0,0]
            pred_label = prediction[0,0]
            if true_label not in confusion_matrix:
                confusion_matrix[true_label] = dict()
            confusion_matrix[true_label][pred_label] = confusion_matrix[true_label].get(pred_label, 0) + 1
00356
        training_accuracy = 100.0 * np.sum(count_selected) / float(len(subset))
        testing_accuracy = 100.0 * np.sum(count_unselected) / float(len(unselected))
        testing_accuracies.append(testing_accuracy)
        training_accuracies.append(training_accuracy)
        print 'Correct on training set', training_accuracy, '%'
        print '        on testing set', testing_accuracy, '%'
        print 'Confusion'
        for k in confusion_matrix.keys():
            total = 0.0
            for k2 in confusion_matrix[k]:
                total = total + confusion_matrix[k][k2]
            for k2 in confusion_matrix[k]:
                print 'true class', k, 'classified as', k2, 100.0 * (confusion_matrix[k][k2] / total), '% of the time'
00370
00371 def print_stats(name, list_data):
00372 m = np.matrix(list_data)
00373 print '%s: average %f std %f' % (name, m.mean(), np.std(m))
00374
    print_stats('training accuracy', training_accuracies)
    print_stats('testing accuracy', testing_accuracies)
00377 print_stats('build time', build_times)
00378 print_stats('classification time', classification_times)
00379
00380
00381 if __name__ == '__main__':
00382 test_iris = False
00383 test_pickle = False
00384 test_number_trees = False
00385 test_pca = False
00386 test_packing = False
00387 test_new_design = True
00388
    import pickle as pk
    def save_pickle(obj, filename):
        # Pickle files must be opened in binary mode.
        p = open(filename, 'wb')
        pk.dump(obj, p)
        p.close()
    def load_pickle(filename):
        p = open(filename, 'rb')
        picklelicious = pk.load(p)
        p.close()
        return picklelicious
00399
00400 if test_iris:
00401
00402 iris_array = np.matrix(np.loadtxt('iris.data', dtype='|S30', delimiter=','))
00403 inputs = np.float32(iris_array[:, 0:4]).T
00404 outputs = iris_array[:, 4].T
        dataset = ds.Dataset(inputs, outputs)
00406
00407 print '================================'
00408 print "Test DecisionTree"
00409 evaluate_classifier(DecisionTree, dataset, 5, .9)
00410
00411 print '================================'
00412
00413
00414
00415
00416
00417
00418 if test_pickle:
00419
00420 def print_separator(times=2):
00421 for i in xrange(times):
00422 print '==============================================================='
00423
00424 dataset = load_pickle('PatchClassifier.dataset.pickle')
00425
00426
00427
00428
00429
00430 if test_number_trees:
00431 tree_types = [RFBreiman, RFRandomInputSubset]
00432
00433 for tree_type in tree_types:
00434 print_separator()
00435 print 'Testing', tree_type
00436 for i in range(10):
00437 print tree_type, 'using', (i+1)*10, 'trees'
00438 evaluate_classifier(tree_type, dataset, 3, .95,
00439 extra_args={'number_of_learners': (i+1)*10}, test_pca=test_pca)
00440 else:
00441 tree_types = [RFBreiman, RFRandomInputSubset]
00442
00443 for tree_type in tree_types:
00444 print_separator()
00445 print tree_type
00446 evaluate_classifier(tree_type, dataset, 10, .95,
00447 extra_args={'number_of_learners': 70}, test_pca=test_pca)
00448 if test_packing:
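        # Assumes DecisionTree/RFBreiman provide pack_tree/unpack_tree and
        # pack_rf/unpack_rf serialization helpers (not defined in this file).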
00449 train = np.mat(np.random.rand(4, 20))
00450 resp = np.mat(np.round(np.random.rand(1, 20)))
00451 data = ds.Dataset(train, resp)
00452 dt = DecisionTree(data)
00453 packed = dt.pack_tree()
00454 print packed
00455 new_dt = DecisionTree()
00456 new_dt.unpack_tree(packed)
00457 bad = False
00458 for i in range(500):
00459 np.random.seed(i)
00460 test = np.mat(np.random.rand(4, 1))
00461 np.random.seed(1)
00462 pred_old = dt.predict(test)
00463 np.random.seed(1)
00464 pred_new = new_dt.predict(test)
00465 if pred_old != pred_new:
00466 print "Not same prediction:"
00467 print pred_old, pred_new
00468 bad = True
00469 if not bad:
00470 print "Prediction tests successful!"
00471
00472 dt = RFBreiman(data)
00473 packed = dt.pack_rf()
00474 new_dt = RFBreiman()
00475 new_dt.unpack_rf(packed)
00476 bad = False
00477 for i in range(500):
00478 np.random.seed(i)
00479 test = np.mat(np.random.rand(4, 1))
00480 np.random.seed(1)
00481 pred_old, _ = dt.predict(test)
00482 np.random.seed(1)
00483 pred_new, _ = new_dt.predict(test)
00484 if pred_old != pred_new:
00485 print "RF Not same prediction:"
00486 print pred_old, pred_new
00487 bad = True
00488 if not bad:
00489 print "RF Prediction tests successful!"
00490
00491 if test_new_design:
00492 np.random.seed(2)
00493 train = np.mat(np.random.rand(4, 20))
00494 resp = np.mat(np.round(np.random.rand(1, 20)))
00495 data = ds.Dataset(train, resp)
00496 dt = RFBreiman(data)
00497 save_pickle(dt, "rfbreiman.pickle")
00498 dt = load_pickle("rfbreiman.pickle")
00499 preds = []
00500 for i in range(500):
00501 np.random.seed(i)
00502 test = np.mat(np.random.rand(4, 1))
00503 np.random.seed(1)
00504 pred, pred_dict = dt.predict(test)
00505 preds.append(pred[0,0])
00506 print preds
00507
00508