import numpy as np
import itertools as it
import functools as ft
import time

class Dataset:
    def __init__(self, inputs, outputs):
        """
        inputs  numpy matrix, one column per example (column vectors)
        outputs numpy matrix, one column per example (column vectors)
        """
        self.inputs = inputs
        self.outputs = outputs
        assert(inputs.shape[1] == outputs.shape[1])

    def num_examples(self):
        return self.inputs.shape[1]

    def num_attributes(self):
        return self.inputs.shape[0]

    def split_continuous(self, attribute, split_point):
        selected_attribute = self.inputs[attribute, :]
        leq_bool = selected_attribute <= split_point
        _, leq_col = np.where(leq_bool)

        # Examples whose selected attribute is <= split_point.
        if leq_col.shape[1] > 0:
            leq_dataset = Dataset(self.inputs[:, leq_col.A[0]], self.outputs[:, leq_col.A[0]])
        else:
            leq_dataset = Dataset(np.matrix([]), np.matrix([]))

        # Examples whose selected attribute is > split_point.
        _, gt_col = np.where(~leq_bool)
        if gt_col.shape[1] > 0:
            gt_dataset = Dataset(self.inputs[:, gt_col.A[0]], self.outputs[:, gt_col.A[0]])
        else:
            gt_dataset = Dataset(np.matrix([]), np.matrix([]))

        # Return only the non-empty partitions.
        ret_sets = []
        if leq_dataset.num_examples() > 0:
            ret_sets.append(leq_dataset)
        if gt_dataset.num_examples() > 0:
            ret_sets.append(gt_dataset)
        return ret_sets

    def unique_values(self, attribute_number, which_set='input'):
        if which_set == 'input':
            examples = self.inputs
        else:
            examples = self.outputs

        # Collect the distinct values of the attribute, sorted ascending.
        values = dict()
        for instance_idx in xrange(examples.shape[1]):
            values[examples[attribute_number, instance_idx]] = True
        k = values.keys()
        k.sort()
        return k

    def bootstrap_samples(self, number_samples, points_per_sample):
        # Draw examples with replacement to form each bootstrap sample.
        for i in xrange(number_samples):
            selected_pts = np.random.randint(0, self.inputs.shape[1], points_per_sample)
            selected_inputs = self.inputs[:, selected_pts]
            selected_outputs = self.outputs[:, selected_pts]
            yield Dataset(selected_inputs, selected_outputs)

    def entropy_discrete(self):
        # Shannon entropy of the output labels: H = -sum_i p_i * log2(p_i),
        # where p_i is the fraction of examples belonging to class i.
        values = self.unique_values(0, 'output')

        def calc_class_entropy(value):
            number_in_class = np.sum(self.outputs[0, :] == value)
            percentage_in_class = number_in_class / float(self.num_examples())
            return -percentage_in_class * np.log2(percentage_in_class)
        return np.sum(map(calc_class_entropy, values))

    def append(self, another_dataset):
        self.inputs = np.concatenate((self.inputs, another_dataset.inputs), axis=1)
        self.outputs = np.concatenate((self.outputs, another_dataset.outputs), axis=1)

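# A minimal usage sketch of Dataset: the values below are made up purely to
# illustrate the column-vector convention, they do not come from any real data.
def _example_dataset_usage():
    example_inputs = np.matrix([[0.0, 1.0, 2.0, 3.0],
                                [1.0, 1.0, 0.0, 0.0]])   # 2 attributes x 4 examples
    example_outputs = np.matrix([[0, 0, 1, 1]])          # 1 output row x 4 examples
    d = Dataset(example_inputs, example_outputs)
    # d.num_attributes() == 2, d.num_examples() == 4,
    # d.entropy_discrete() == 1.0 for this 50/50 label split
    low, high = d.split_continuous(0, 1.0)               # two examples on each side
    return low, high
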
class LinearDimReduceDataset(Dataset):
    def __init__(self, inputs, outputs):
        Dataset.__init__(self, inputs, outputs)

    def set_projection_vectors(self, vec):
        '''
        projection vectors are assumed to be columnwise
        '''
        self.projection_basis = vec
        print 'LinearDimReduceDataset: projection_basis', vec.shape

    def reduce(self, data_points):
        '''
        project data_points onto the projection basis
        '''
        return self.projection_basis.T * data_points

    def reduce_input(self):
        '''
        reduce dimensionality of this dataset
        '''
        self.inputs = self.projection_basis.T * self.inputs

def binary_less_than(attribute, threshold, input_vec):
    return input_vec[attribute, 0] <= threshold

def binary_greater_than(attribute, threshold, input_vec):
    return input_vec[attribute, 0] > threshold

def create_binary_tests(attribute, threshold):
    return [ft.partial(binary_less_than, attribute, threshold),
            ft.partial(binary_greater_than, attribute, threshold)]

def mode_exhaustive(elements):
    '''
    Find the most common element; return it along with the full count dictionary.
    '''
    # Count occurrences of each element.
    mdict = dict()
    for s in elements:
        has_stored = False
        for k in mdict.keys():
            if k == s:
                mdict[k] = 1 + mdict[k]
                has_stored = True
        if not has_stored:
            mdict[s] = 1

    # Pick the element with the highest count.
    max_key = None
    max_count = -1
    for k in mdict.keys():
        if mdict[k] > max_count:
            max_key = k
            max_count = mdict[k]

    return max_key, mdict

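# Illustrative example: mode_exhaustive([1, 1, 2]) returns (1, {1: 2, 2: 1}).
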
def min_entropy_split(dataset):
    '''
    Find the split that produces subsets with the minimum combined entropy;
    return the splitting attribute and the split point for that attribute.
    '''
    hypotheses = []
    entropies = []
    # Exhaustively score a split at every observed value of every attribute.
    for attribute in xrange(dataset.num_attributes()):
        values = dataset.unique_values(attribute)
        for split_point in values:
            def calc_entropy(data_set):
                num_points = data_set.num_examples()
                # Weight each subset's entropy by the fraction of examples it holds.
                return (num_points / float(dataset.num_examples())) * data_set.entropy_discrete()
            split_entropy = map(calc_entropy, dataset.split_continuous(attribute, split_point))
            hypotheses.append((attribute, split_point))
            entropies.append(sum(split_entropy))

    entropies = np.matrix(entropies)
    min_idx = np.argmin(entropies)
    return hypotheses[min_idx]

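# Worked example of the weighted entropy minimized above (numbers illustrative
# only): splitting labels [0, 0, 1, 1] into {0, 0, 1} and {1} costs
#   (3/4) * (-(2/3)*log2(2/3) - (1/3)*log2(1/3)) + (1/4) * 0  ~=  0.689,
# while splitting into {0, 0} and {1, 1} costs 0, so the latter split is chosen.
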
def random_subset(subset_size, total_size):
    '''Pick subset_size distinct indices out of range(total_size).'''
    assert(subset_size <= total_size)
    occupancy = np.matrix(np.zeros((1, total_size)))
    while occupancy.sum() < subset_size:
        occupancy[0, np.random.randint(0, total_size)] = 1
    rows, columns = np.where(occupancy > 0)
    return columns.A[0]

def split_random_subset(subset_size, total_size):
    '''Pick subset_size distinct indices; also return the remaining indices.'''
    assert(subset_size <= total_size)
    occupancy = np.matrix(np.zeros((1, total_size)))
    while occupancy.sum() < subset_size:
        occupancy[0, np.random.randint(0, total_size)] = 1
    bool_sel = occupancy > 0
    rows, columns_subset = np.where(bool_sel)
    rows, columns_remaining = np.where(np.invert(bool_sel))
    return columns_subset.A[0], columns_remaining.A[0]

def random_subset_split(num_subset, dataset):
    ''' splitter in decision tree: restrict the entropy search to a random subset of attributes '''
    subset_indexes = random_subset(num_subset, dataset.num_attributes())
    sub_dataset = Dataset(dataset.inputs[subset_indexes, :], dataset.outputs)
    attribute, point = min_entropy_split(sub_dataset)
    return subset_indexes[attribute], point

def totally_random_split(dataset):
    '''Pick a random attribute and a random observed value of it as the split point.'''
    attr = np.random.randint(0, dataset.num_attributes())
    split_pt = dataset.inputs[attr, np.random.randint(0, dataset.num_examples())]
    return attr, split_pt

class DecisionTree:
    def __init__(self, dataset=None, splitting_func=min_entropy_split):
        self.children = None
        self.prediction = None
        if dataset is not None:
            self.train(dataset, splitting_func=splitting_func)

    def train(self, dataset, splitting_func=min_entropy_split):
        if not self.make_leaf(dataset):
            self.split_attribute, self.split_point = splitting_func(dataset)
            data_sets = dataset.split_continuous(self.split_attribute, self.split_point)
            # If the chosen split does not actually separate the data, stop
            # here and predict from the remaining outputs.
            if len(data_sets) < 2:
                self.prediction = dataset.outputs
                return

            def tree_split(subset):
                return DecisionTree(subset, splitting_func=splitting_func)

            tests = create_binary_tests(self.split_attribute, self.split_point)
            self.children = zip(tests, map(tree_split, data_sets))

    def make_leaf(self, dataset):
        if np.all(dataset.outputs[:, 0] == dataset.outputs):
            # All outputs are identical: predict that output.
            self.prediction = dataset.outputs[:, 0]
            return True
        elif np.all(dataset.inputs[:, 0] == dataset.inputs):
            # All inputs are identical but outputs differ: keep every output
            # and sample one of them at prediction time.
            self.prediction = dataset.outputs
            return True
        else:
            return False

    def predict(self, input):
        if self.prediction is not None:
            return self.prediction[:, np.random.randint(0, self.prediction.shape[1])]
        else:
            for test, child in self.children:
                if test(input):
                    return child.predict(input)
            raise RuntimeError("DecisionTree: splits not exhaustive, unable to split for input " + str(input.T))

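# Hedged usage sketch for DecisionTree; the toy data and query column are
# illustrative assumptions, not taken from the original tests.
def _example_decision_tree_usage():
    example_inputs = np.matrix([[0.0, 1.0, 2.0, 3.0],
                                [1.0, 1.0, 0.0, 0.0]])
    example_outputs = np.matrix([[0, 0, 1, 1]])
    tree = DecisionTree(Dataset(example_inputs, example_outputs))
    # predict() takes a single example as a column vector and returns an output column
    return tree.predict(np.matrix([[2.5], [0.0]]))
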
class RFBase:
    def __init__(self, dataset=None, number_of_dimensions=None, number_of_learners=100):
        """
        number_of_dimensions  number of input attributes each tree is trained on
                              (used by RFRandomInputSubset); should be around
                              10-20% of the original data dimension
        number_of_learners    number of trees; limited by processor performance,
                              higher is better
        """
        self.number_of_learners = number_of_learners
        self.number_of_dimensions = number_of_dimensions
        if dataset is not None:
            self.train(dataset)

    def predict(self, data, vote_combine_function=None):
        # Ask every tree for a prediction, then combine the votes.
        def predict_(learner):
            return learner.predict(learner.transform_input(data))
        predictions = map(predict_, self.learners)
        if vote_combine_function is not None:
            return vote_combine_function(predictions)
        else:
            return mode_exhaustive(predictions)

    def train(self, dataset):
        pass

def identity(x):
    return x

class RFBreiman(RFBase):
    def train(self, dataset):
        def train_trees(examples_subset):
            tree = DecisionTree()
            # Totally random splits; bagging provides the variance reduction.
            tree.train(examples_subset, splitting_func=totally_random_split)
            # Trees see all attributes, so no input transform is needed.
            tree.transform_input = identity
            return tree

        if self.number_of_dimensions is None:
            # log2(d) + 1 attributes, but never fewer than one.
            self.number_of_dimensions = max(int(np.log2(dataset.num_attributes())) + 1, 1)
        points_per_sample = int(dataset.num_examples() * 1.0 / 3.0)
        self.learners = map(train_trees, dataset.bootstrap_samples(self.number_of_learners, points_per_sample))

class RFRandomInputSubset(RFBase):
    def train(self, dataset):
        def train_trees(examples_subset):
            # Train each tree on a random subset of the input attributes.
            dims = random_subset(self.number_of_dimensions, examples_subset.num_attributes())
            subset_input = examples_subset.inputs[dims, :]
            reduced_sample = Dataset(subset_input, examples_subset.outputs)
            tree = DecisionTree(reduced_sample)
            tree.dimensions_subset = dims

            def transform_input(input):
                return input[dims, :]
            tree.transform_input = transform_input
            return tree

        if self.number_of_dimensions is None:
            # log2(d) + 1 attributes, but never fewer than one.
            self.number_of_dimensions = max(int(np.log2(dataset.num_attributes())) + 1, 1)
        points_per_sample = int(dataset.num_examples() * 1.0 / 3.0)
        self.learners = map(train_trees, dataset.bootstrap_samples(self.number_of_learners, points_per_sample))

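# Hedged usage sketch for the random forests: the toy data, the 10-tree setting,
# and the majority_vote helper are illustrative assumptions; the __main__ block
# below shows the evaluation flow the module was written for.
def _example_random_forest_usage():
    example_inputs = np.matrix(np.random.rand(4, 60))                # 4 attributes x 60 examples
    example_outputs = np.matrix((example_inputs[0, :] > 0.5) * 1.0)  # labels derived from attribute 0
    forest = RFRandomInputSubset(Dataset(example_inputs, example_outputs),
                                 number_of_learners=10)

    def majority_vote(predictions):
        # each tree returns its prediction as a 1x1 matrix; vote on the scalars
        votes = [float(p[0, 0]) for p in predictions]
        return max(set(votes), key=votes.count)

    query = np.matrix([[0.9], [0.1], [0.2], [0.3]])
    return forest.predict(query, vote_combine_function=majority_vote)
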
def evaluate_classifier(building_func, data, times=10.0, percentage=None, extra_args={}, test_pca=False):
    '''
    Evaluate classifier by dividing dataset into training and test set.
    @param building_func Function that will build classifier given data and args in extra_args.
    @param data Dataset to use for evaluation/training.
    @param times The number of train/test splits to evaluate.
    @param percentage The percentage of data to use for training.
    @param extra_args Extra arguments to pass to building_func.
    @param test_pca Whether to reduce input dimensionality with PCA before training.
    '''
    print 'evaluate_classifier: extra_args', extra_args
    total_pts = data.num_examples()
    testing_accuracies = []
    training_accuracies = []
    build_times = []
    classification_times = []
    for trial in range(int(times)):
        # If no training percentage was given, grow it with each trial.
        if percentage is None:
            pct = (trial + 1) / float(times)
        else:
            pct = percentage
        num_examples = int(round(total_pts * pct))
        print 'Evaluate classifier built with', pct * 100, '% data, num examples', num_examples
        subset, unselected = split_random_subset(num_examples, total_pts)
        train_inputs = data.inputs[:, subset]
        train_outputs = data.outputs[:, subset]
        print "Building classifier..."
        if test_pca:
            print '     TESTING PCA'
            import dimreduce as dr
            subseted_dataset = LinearDimReduceDataset(train_inputs, train_outputs)
            subseted_dataset.set_projection_vectors(dr.pca_vectors(subseted_dataset.inputs, percent_variance=.95))
            subseted_dataset.reduce_input()
            print 'subseted_dataset.num_attributes(), subseted_dataset.num_examples()', subseted_dataset.num_attributes(), subseted_dataset.num_examples()
        else:
            subseted_dataset = Dataset(train_inputs, train_outputs)

        start_time = time.time()
        classifier = building_func(subseted_dataset, **extra_args)
        build_times.append(time.time() - start_time)
        print "done building..."
        # Measure accuracy on the training examples.
        count_selected = []
        for count, idx in enumerate(subset):
            start_time = time.time()
            if test_pca:
                prediction, _ = classifier.predict(subseted_dataset.reduce(data.inputs[:, idx]))
            else:
                prediction, _ = classifier.predict(data.inputs[:, idx])

            classification_times.append(time.time() - start_time)
            true_val = data.outputs[:, idx]
            if prediction == true_val:
                count_selected.append(1)
            else:
                count_selected.append(0)
            if count % 100 == 0:
                print count
        count_selected = np.matrix(count_selected)
        # Measure accuracy on the held-out examples and build a confusion matrix.
        confusion_matrix = dict()
        count_unselected = []
        print 'Total points', total_pts
        for idx in unselected:
            start_time = time.time()
            if test_pca:
                prediction, _ = classifier.predict(subseted_dataset.reduce(data.inputs[:, idx]))
            else:
                prediction, _ = classifier.predict(data.inputs[:, idx])
            classification_times.append(time.time() - start_time)
            true_val = data.outputs[:, idx]
            if prediction == true_val:
                count_unselected.append(1)
            else:
                count_unselected.append(0)
            if true_val[0, 0] in confusion_matrix:
                if prediction[0, 0] in confusion_matrix[true_val[0, 0]]:
                    confusion_matrix[true_val[0, 0]][prediction[0, 0]] = confusion_matrix[true_val[0, 0]][prediction[0, 0]] + 1
                else:
                    confusion_matrix[true_val[0, 0]][prediction[0, 0]] = 1
            else:
                confusion_matrix[true_val[0, 0]] = dict()
                confusion_matrix[true_val[0, 0]][prediction[0, 0]] = 1
        training_accuracy = 100.0 * np.sum(count_selected) / float(len(subset))
        testing_accuracy = 100.0 * np.sum(count_unselected) / float(len(unselected))
        testing_accuracies.append(testing_accuracy)
        training_accuracies.append(training_accuracy)
        print 'Correct on training set', training_accuracy, '%'
        print '        on testing set', testing_accuracy, '%'
        print 'Confusion'
        for k in confusion_matrix.keys():
            total = 0.0
            for k2 in confusion_matrix[k]:
                total = total + confusion_matrix[k][k2]
            for k2 in confusion_matrix[k]:
                print 'true class', k, 'classified as', k2, 100.0 * (confusion_matrix[k][k2] / total), '% of the time'

    def print_stats(name, list_data):
        m = np.matrix(list_data)
        print '%s: average %f std %f' % (name, m.mean(), np.std(m))

    print_stats('training accuracy', training_accuracies)
    print_stats('testing accuracy', testing_accuracies)
    print_stats('build time', build_times)
    print_stats('classification time', classification_times)

if __name__ == '__main__':
    test_iris = False
    test_pickle = True
    test_number_trees = False
    test_pca = False

    if test_iris:
        # Iris dataset: four numeric attributes per row, class name in the last column.
        iris_array = np.matrix(np.loadtxt('iris.data', dtype='|S30', delimiter=','))
        inputs = np.float32(iris_array[:, 0:4]).T
        outputs = iris_array[:, 4].T
        dataset = Dataset(inputs, outputs)

        print '================================'
        print "Test DecisionTree"
        evaluate_classifier(DecisionTree, dataset, 5, .9)

        print '================================'

    if test_pickle:
        import pickle as pk

        def load_pickle(filename):
            p = open(filename, 'rb')
            picklelicious = pk.load(p)
            p.close()
            return picklelicious

        def print_separator(times=2):
            for i in xrange(times):
                print '==============================================================='

        dataset = load_pickle('PatchClassifier.dataset.pickle')

        if test_number_trees:
            tree_types = [RFBreiman, RFRandomInputSubset]
            for tree_type in tree_types:
                print_separator()
                print 'Testing', tree_type
                for i in range(10):
                    print tree_type, 'using', (i + 1) * 10, 'trees'
                    evaluate_classifier(tree_type, dataset, 3, .95,
                                        extra_args={'number_of_learners': (i + 1) * 10}, test_pca=test_pca)
        else:
            tree_types = [RFBreiman, RFRandomInputSubset]
            for tree_type in tree_types:
                print_separator()
                print tree_type
                evaluate_classifier(tree_type, dataset, 10, .95,
                                    extra_args={'number_of_learners': 70}, test_pca=test_pca)