00001
00002
00003 import numpy as np
00004 import random
00005 import sys
00006 import matplotlib.pyplot as plt
00007 import multiprocessing as mp
00008 import cPickle as pickle
00009
00010 import roslib; roslib.load_manifest('pr2_overhead_grasping')
00011 import rospy
00012
00013 from helpers import log, err, node_name, FileOperations
00014
00015 import ml_lib.random_forest as rf
00016 import ml_lib.dataset as ds
00017 import ml_lib.dimreduce as ldds
00018
class AbstractClassifier(object):
    """Abstract base for collision classifiers.

    Subclasses implement train/predict/build/load.  This base class supplies
    trajectory-aware k-fold cross validation: instances are grouped by the
    trajectory they came from, so no trajectory contributes samples to both
    the training and the test side of the same fold.
    """

    def __init__(self):
        # Pickle save/load helper shared by all classifier implementations.
        self.fos = FileOperations()

    def train(self, compiled_dataset):
        """Fit the classifier on a compiled dataset dict (see _fold_predictions)."""
        err("Classifier not implemented!")

    def predict(self, instance):
        """Return the predicted class label (a float) for a single instance."""
        err("Classifier not implemented!")
        return 0.

    def build(self, compiled_dataset, filename):
        """Train the classifier and persist it under filename."""
        err("Classifier not implemented!")

    def load(self, filename):
        """Restore a classifier previously persisted by build()."""
        err("Classifier not implemented!")

    def cross_valid(self, compiled_dataset, num_folds = 10, seed = 1):
        """Run k-fold cross validation and log aggregate statistics.

        BUG FIX: the parameter was previously spelled 'compilied_dataset'
        while the body referenced 'compiled_dataset', so every call raised
        NameError.

        @param compiled_dataset dataset dict (keys listed in _fold_predictions)
        @param num_folds number of cross-validation folds
        @param seed seed for the fold-shuffling RNG (reproducible splits)
        """
        c_mat_list, traj_dicts = self._fold_predictions(compiled_dataset, num_folds, seed)
        # Element-wise sum of the per-fold confusion matrices.
        confusion_mat = sum(c_mat_list)

        confusion_matrix_stats(confusion_mat)
        log("-"*60)
        log("Analysis")
        # Collect first-detection latencies from every fold's trajectories.
        first_coll_diffs = []
        for traj_dict in traj_dicts:
            traj_list = [traj_dict[k] for k in traj_dict]
            first_coll_diffs.extend(self.analyze_testing_results(traj_list))
        log("min: %1.3f, max: %1.3f, median: %1.3f, mean: %1.3f, std: %1.3f" % (np.min(first_coll_diffs), np.max(first_coll_diffs), np.median(first_coll_diffs), np.mean(first_coll_diffs), np.std(first_coll_diffs)))
        log("[" + ", ".join(["%1.3f" % v for v in first_coll_diffs]) + "]")

    def _fold_predictions(self, compiled_dataset, num_folds, seed):
        """Split the data into trajectory-level folds and evaluate each fold.

        compiled_dataset keys read here: "training_data" (features x
        instances), "training_labels" (1 x instances), "training_times_list",
        "training_colls_list", and "trajectory_labels" — one (dir, file)
        pair per instance identifying its source trajectory.

        Returns (c_mat_list, traj_dicts): per fold, a 3x3 confusion matrix
        and a dict mapping trajectory label to its prediction records.
        """
        cdata = compiled_dataset
        times_list = cdata["training_times_list"]
        colls_list = cdata["training_colls_list"]

        train = cdata["training_data"]
        responses = cdata["training_labels"]
        random.seed(seed)
        # Group instance indices by directory, then by file name, so folds
        # can be cut at whole-trajectory granularity.
        labels = {}
        for i, l in enumerate(cdata["trajectory_labels"]):
            if l[0] not in labels:
                labels[l[0]] = {l[1] : [i]}
            elif l[1] not in labels[l[0]]:
                labels[l[0]][l[1]] = [i]
            else:
                labels[l[0]][l[1]].append(i)

        train_test_combos = [{"train" : [], "test" : [], "test_traj_labels" : []} for _ in range(num_folds)]
        for traj_dir in labels:
            # Shuffle this directory's trajectories (deterministically via the
            # seeded RNG; sorted first so the shuffle input is reproducible).
            fns = sorted(labels[traj_dir].keys())
            random.shuffle(fns)
            last_f_ind = 0
            for nf in range(num_folds):
                # Trajectories [last_f_ind, cur_f_ind) form fold nf's test set.
                cur_f_ind = int(round(float(len(fns) * (nf + 1)) / num_folds))
                train_inds, test_inds = [], []
                for f_ind, fn in enumerate(fns):
                    if last_f_ind <= f_ind < cur_f_ind:
                        test_inds.extend(labels[traj_dir][fn])
                    else:
                        train_inds.extend(labels[traj_dir][fn])
                train_test_combos[nf]["train"].extend(train_inds)
                train_test_combos[nf]["test"].extend(test_inds)
                train_test_combos[nf]["test_traj_labels"].extend([traj_dir] * len(test_inds))

                last_f_ind = cur_f_ind

        params = []
        for fold_i in range(num_folds):
            log("Fold:", fold_i + 1)
            log("Training data shape:", train.shape)
            train_fold = train[:, train_test_combos[fold_i]["train"]]
            train_fold_resp = responses[0, train_test_combos[fold_i]["train"]]
            train_fold_times = times_list[train_test_combos[fold_i]["train"]]
            train_fold_coll_times = colls_list[train_test_combos[fold_i]["train"]]
            test_fold = train[:, train_test_combos[fold_i]["test"]]
            test_fold_resp = responses[0, train_test_combos[fold_i]["test"]]
            test_fold_times = times_list[train_test_combos[fold_i]["test"]]
            test_fold_coll_times = colls_list[train_test_combos[fold_i]["test"]]
            test_fold_traj_labels = train_test_combos[fold_i]["test_traj_labels"]
            new_comp_dataset = {"training_data" : train_fold,
                                "training_labels" : train_fold_resp,
                                "training_times_list" : train_fold_times,
                                "training_colls_list" : train_fold_coll_times,
                                "testing_data" : test_fold,
                                "testing_labels" : test_fold_resp,
                                "testing_times_list" : test_fold_times,
                                "testing_colls_list" : test_fold_coll_times,
                                "trajectory_labels" : test_fold_traj_labels}
            params.append((self, new_comp_dataset,
                           train_test_combos[fold_i]["test"]))

        # Train/evaluate every fold in parallel worker processes.
        pool = mp.Pool()
        try:
            results = pool.map(run_fold_process, params)
        finally:
            # BUG FIX: the pool was never shut down, leaking worker processes.
            pool.close()
            pool.join()
        c_mat_list, traj_dicts = zip(*results)
        return c_mat_list, traj_dicts

    def analyze_testing_results(self, traj_list):
        """Compute detection-latency statistics for a list of trajectories.

        Each trajectory is a list of [time, true label, predicted label,
        collision time, index] records.  For each trajectory the first 10
        nonzero predictions and their time offsets from the collision time
        are recorded; a confusion matrix and latency statistics are then
        logged for requiring 1..5 detections in a row.

        Returns the list of first-detection time differences, one per
        trajectory (0.0 when nothing was detected).
        """
        t_labels, pred_label_lists, diff_lists = [], [], []
        for cur_traj in traj_list:
            times, labels, pred_labels, coll_times, indices = zip(*cur_traj)
            # The trajectory's true label is its first nonzero instance label.
            traj_label = 0.
            coll_time = coll_times[0]
            for l in labels:
                if l != 0.:
                    traj_label = l
                    break
            # Record up to 10 successive nonzero predictions and each one's
            # offset from the collision time.
            pred_label_list, diff_list = [None] * 10, [None] * 10
            for p_i, l in enumerate(pred_labels):
                if l != 0.:
                    for pll_i, _ in enumerate(pred_label_list):
                        if pred_label_list[pll_i] is None:
                            pred_label_list[pll_i] = l
                            diff_list[pll_i] = times[p_i] - coll_time
                            break

            t_labels.append(traj_label)
            pred_label_lists.append(pred_label_list)
            diff_lists.append(diff_list)

        first_coll_diffs = [d[0] for d in diff_lists]
        for n in range(5):
            log("Num in a row: %d" % (n+1))
            first_preds = [p[n] for p in pred_label_lists]
            for i, v in enumerate(first_coll_diffs):
                # Trajectories with no detection count as label 0 at offset 0.
                if v is None:
                    first_coll_diffs[i] = 0.0
                if first_preds[i] is None:
                    first_preds[i] = 0.
            confus_mat = np.zeros((3,3))
            for i, v in enumerate(t_labels):
                confus_mat[int(t_labels[i]), int(first_preds[i])] += 1
            log(confus_mat)
            log("min: %1.3f, max: %1.3f, median: %1.3f, mean: %1.3f, std: %1.3f" % (np.min(first_coll_diffs), np.max(first_coll_diffs), np.median(first_coll_diffs), np.mean(first_coll_diffs), np.std(first_coll_diffs)))
            log("[" + ", ".join(["%1.3f" % v for v in first_coll_diffs]) + "]")
        return first_coll_diffs

    def plot_testing_results(self, traj_list):
        """Scatter-plot predictions against time relative to the collision.

        Horizontal bands mark the predicted class (0 green, 1 red, 2 blue);
        x = 0 is the collision instant.  Each trajectory is offset slightly
        on the y axis so overlapping points stay visible.
        """
        plt.figure(1)
        plt.fill_between([-20.0, 20.0], -1., 1., facecolor='green', alpha=0.3)
        plt.fill_between([-20.0, 20.0], 1., 2., facecolor='red', alpha=0.3)
        plt.fill_between([-20.0, 20.0], 2., 3., facecolor='blue', alpha=0.3)

        fp_diffs = []
        for i, cur_traj in enumerate(traj_list):
            (pred_norm, pred_norm_t, pred_ext,
             pred_ext_t, pred_tab, pred_tab_t) = [], [], [], [], [], []
            for pt in cur_traj:
                # Trajectories without a collision use the final timestamp as
                # their reference time.  NOTE(review): this mutates the
                # caller's trajectory records in place — confirm intended.
                if pt[3] <= 0.:
                    pt[3] = cur_traj[-1][0]
                if pt[1] == 0. and pt[2] != 0.:
                    # False positive: a detection on a no-collision instance.
                    fp_diffs.append(pt[0] - pt[3])
                if pt[1] == 0:
                    pred_norm.append(pt[2] + i * 0.018)
                    pred_norm_t.append(pt[0] - pt[3])
                elif pt[1] == 1:
                    pred_ext.append(pt[2] + i * 0.018)
                    pred_ext_t.append(pt[0] - pt[3])
                elif pt[1] == 2:
                    pred_tab.append(pt[2] + i * 0.018)
                    pred_tab_t.append(pt[0] - pt[3])
            if len(pred_norm) > 0:
                plt.scatter(pred_norm_t, pred_norm, c='green', marker='o', s = 30)
            if len(pred_ext) > 0:
                plt.scatter(pred_ext_t, pred_ext, c='red', marker='o', s = 30)
            if len(pred_tab) > 0:
                plt.scatter(pred_tab_t, pred_tab, c='blue', marker='o', s = 30)
        plt.axvline(0.0, color='black', linestyle = '-')
        plt.axhline(1.0, color='black', linestyle = '-')
        plt.axhline(2.0, color='black', linestyle = '-')
        plt.axis([-4.8, 1.2, -0.1, 2.8])
        plt.show()
00209
def confusion_matrix_stats(confusion_mat):
    """Print a formatted confusion matrix plus summary statistics to stdout.

    confusion_mat is an NxN array-like with rows = actual class and
    columns = predicted class.  Prints the matrix, overall accuracy, and
    per-class TPR / specificity / FPR.
    """
    c_mat = np.matrix(confusion_mat.astype(float))
    # The matrix must be square: one row and one column per class.
    assert c_mat.shape[0] == c_mat.shape[1]
    N = c_mat.shape[0]
    print "-"*60
    print "Confusion Matrix Statistics\n"
    print
    # Header row of predicted-class indices (trailing commas keep the
    # Python 2 print cursor on the same line).
    print " Predicted "
    print " ",
    for c in range(N):
        print "%4d" % c,
    print ""
    # One row per actual class.
    for r in range(N):
        print "Actual: %d" % r,
        print "|",
        for c in range(N):
            print "%6d" % int(c_mat[r, c]),
        print "|"

    print c_mat
    print
    print "Number of instances: %d" % np.sum(c_mat)
    # Accuracy = correctly classified (diagonal) / total instances.
    acc = np.trace(c_mat) / np.sum(c_mat)
    print "Accuracy: %1.4f" % acc
    print
    for l in range(N):
        # TPR: fraction of actual-l instances predicted as l.
        tpr = c_mat[l, l] / np.sum(c_mat[l, :])
        # FPR: false positives for l over all instances not actually l.
        fpr = ((np.sum(c_mat[:, l]) - c_mat[l, l]) /
               (np.sum(c_mat) - np.sum(c_mat[l, :])))
        # Specificity is the complement of the false positive rate.
        spc = 1. - fpr
        print "Class %d stats: TPR %1.4f, SPEC %1.4f, FPR %1.4f" % (l, tpr, spc, fpr)
00241
def run_fold_process(data):
    """Worker for one cross-validation fold (run in a multiprocessing pool).

    data is a (classifier, cdata, test_fold_indices) tuple: the untrained
    classifier to evaluate, the fold's compiled dataset dict, and the
    original dataset indices of the fold's test instances.

    Returns (confusion_mat, traj_dict), where traj_dict maps each test
    trajectory label to its [time, true label, predicted label, collision
    time, dataset index] records.
    """
    (classifier, cdata, test_fold_indicies) = data
    confusion_mat = np.zeros((3, 3))
    log("Building classifier...")
    st_time = rospy.Time.now().to_sec()
    classifier.train(cdata)
    end_time = rospy.Time.now().to_sec()
    log("Done building classifier (Time taken: %3.3f)" % (end_time - st_time))
    traj_dict = {}
    for i, t_inst in enumerate(cdata["testing_data"].T):
        pred = classifier.predict(t_inst.T)
        confusion_mat[int(cdata["testing_labels"][0, i]), int(pred)] += 1
        traj_label = cdata["trajectory_labels"][i]
        if traj_label not in traj_dict:
            traj_dict[traj_label] = []
        # BUG FIX: the fourth record field is consumed downstream
        # (analyze_testing_results / plot_testing_results) as the collision
        # time, but it was previously a duplicate of the sample time;
        # "testing_colls_list" was passed in yet never used.
        traj_dict[traj_label].append([cdata["testing_times_list"][i],
                                      int(cdata["testing_labels"][0, i]),
                                      int(pred),
                                      cdata["testing_colls_list"][i],
                                      test_fold_indicies[i]])
    log("Confusion Matrix:")
    log(confusion_mat)
    return (confusion_mat, traj_dict)
00261
def pool_loading(fns):
    """Unpickle every file in fns in parallel; returns the objects in order."""
    NUM_PROCESSES = 12
    pool = mp.Pool(NUM_PROCESSES)
    try:
        learners = pool.map(hard_load_pickle, fns)
    finally:
        # BUG FIX: the pool was never shut down, leaking worker processes.
        pool.close()
        pool.join()
    return learners
00267
def pool_saving(objs, fns):
    """Pickle each object in objs to the corresponding path in fns, in parallel."""
    NUM_PROCESSES = 12
    pool = mp.Pool(NUM_PROCESSES)
    try:
        pool.map(hard_save_pickle, zip(objs, fns))
    finally:
        # BUG FIX: the pool was never shut down, leaking worker processes.
        pool.close()
        pool.join()
00272
def hard_save_pickle(params):
    """Pickle an object to disk.

    params is a single (object, filename) tuple so this function can be
    used directly as a pool.map worker.
    """
    pickle_data, fn = params
    # BUG FIX: open in binary mode (pickle data is binary; text mode breaks
    # on Windows) and use a context manager so the handle is closed even if
    # dump raises.
    with open(fn, "wb") as f:
        pickle.dump(pickle_data, f)
00278
def hard_load_pickle(fn):
    """Load and return the pickled object stored at path fn."""
    # BUG FIX: open in binary mode (pickle streams are binary) and use a
    # context manager so the handle is closed even if load raises.
    with open(fn, "rb") as f:
        return pickle.load(f)
00284
00285 class RFBreimanClassifier(AbstractClassifier):
00286
00287 def __init__(self, num_learners=100):
00288 super(RFBreimanClassifier, self).__init__()
00289 self.rfb = rf.RFBreiman(None, None, num_learners)
00290 self.num_learners = num_learners
00291
00292 def predict(self, instance):
00293 pred, _ = self.rfb.predict(instance)
00294 return pred[0,0]
00295
00296 def train(self, compiled_dataset):
00297 train = compiled_dataset["training_data"]
00298 responses = compiled_dataset["training_labels"]
00299 dataset = ds.Dataset(train, responses)
00300 self.rfb.train(dataset)
00301
00302 def build(self, compiled_dataset, filename):
00303
00304
00305
00306
00307
00308 log("Building classifier...")
00309 st_time = rospy.Time.now().to_sec()
00310 self.train(compiled_dataset)
00311 end_time = rospy.Time.now().to_sec()
00312 log("Done building classifier (Time taken: %3.3f)" % (end_time - st_time))
00313 log("Average tree depth: %3f" % self.rfb.avg_tree_depth())
00314
00315 log("Saving...")
00316 pb = None
00317 self.fos.save_pickle((pb, self.rfb),
00318 filename)
00319 self._random_forest_split(filename)
00320 log("Finished saving")
00321
00322
00323
00324 def _random_forest_split(self, filename, num_processes = 8):
00325 proj_basis, classifier = self.fos.load_pickle(filename)
00326 self.fos.save_pickle((classifier.number_of_learners, proj_basis),
00327 filename.split(".")[0] + "_split_index.pickle")
00328 fns = [self.fos.get_pickle_name(filename.split(".")[0] +
00329 "_%03d.pickle" % (i)) for i in range(
00330 classifier.number_of_learners)]
00331 pool_saving(classifier.learners, fns)
00332
00333
00334
00335 def load(self, filename):
00336 try:
00337 log("Loading random forest classifier from pickle...")
00338 num_trees, projection_basis = self.fos.load_pickle(filename.split(".")[0] + "_split_index.pickle")
00339 self.rfb = rf.RFBreiman(number_of_learners=num_trees)
00340 fns = [self.fos.get_pickle_name(filename.split(".")[0] +
00341 "_%03d.pickle" % (i)) for i in range(num_trees)]
00342 self.rfb.learners = pool_loading(fns)
00343 log("Classifier loaded")
00344 except Exception as e:
00345 err("Problem loading classifier (Has it been built?)")
00346 print e
00347 sys.exit()
00348
00349 class RFBreimanRefinedClassifier(RFBreimanClassifier):
00350
00351 def __init__(self, num_learners=100, refine_folds=5, refine_learners=100,
00352 refine_runs=5, refine_cut=2):
00353 super(RFBreimanRefinedClassifier, self).__init__(num_learners)
00354 self.refine_folds = refine_folds
00355 self.refine_learners = refine_learners
00356 self.refine_runs = refine_runs
00357 self.refine_cut = refine_cut
00358
00359 def train(self, compiled_dataset):
00360 train = compiled_dataset["training_data"]
00361 responses = compiled_dataset["training_labels"]
00362 index_false_counts = [0] * train.shape[1]
00363 index_true_count = 0
00364 for r_run in range(self.refine_runs):
00365 cur_classifier = RFBreimanClassifier(self.refine_learners)
00366 c_mat_list, traj_dicts = cur_classifier._fold_predictions(compiled_dataset,
00367 self.refine_folds,
00368 random.randint(0, 1000))
00369
00370
00371 for traj_dict in traj_dicts:
00372 for i, k in enumerate(traj_dict):
00373 cur_traj = traj_dict[k]
00374
00375
00376 for pt in cur_traj:
00377 time, label, pred_label, coll_time, index = pt
00378 if label != pred_label:
00379 index_false_counts[index] += 1
00380 else:
00381 index_true_count += 1
00382
00383 print index_true_count, sum(index_false_counts), index_true_count + sum(index_false_counts), (r_run + 1) * train.shape[1]
00384 indices_to_keep = []
00385 for ind, count in enumerate(index_false_counts):
00386 if count <= self.refine_cut:
00387 indices_to_keep.append(ind)
00388 print "Old shapes:", train.shape, responses.shape
00389 train = train[:, indices_to_keep]
00390 responses = responses[:, indices_to_keep]
00391 print "New shapes:", train.shape, responses.shape
00392 dataset = ds.Dataset(train, responses)
00393 self.rfb.train(dataset)
00394
00395
# Registry of the available classifier configurations, keyed by the name
# callers use to select one.
classifiers_dict = {
    "small_random_forest": RFBreimanClassifier(20),
    "large_random_forest": RFBreimanClassifier(150),
    "small_refined_random_forest": RFBreimanRefinedClassifier(20, 8, 100, 5, 2),
    "large_refined_random_forest": RFBreimanRefinedClassifier(100, 8, 100, 5, 2),
}