# classifiers.py
# (doxygen-extracted source listing; see the package documentation for this file)
00001 #! /usr/bin/python
00002 
00003 import numpy as np
00004 import random
00005 import sys
00006 import matplotlib.pyplot as plt
00007 import multiprocessing as mp
00008 import cPickle as pickle
00009 
00010 import roslib; roslib.load_manifest('pr2_overhead_grasping')
00011 import rospy
00012 
00013 from helpers import log, err, node_name, FileOperations
00014 
00015 import ml_lib.random_forest as rf
00016 import ml_lib.dataset as ds
00017 import ml_lib.dimreduce as ldds
00018                                             
00019 class AbstractClassifier(object):
00020 
00021     def __init__(self):
00022         # self.load_parameters()
00023         self.fos = FileOperations()
00024 
00025     # def load_parameters(self):
00026     #     self.RTREE_CLASSIFIER = rospy.get_param("/overhead_grasping/rtree_classifier")
00027 
00028     def train(self, compiled_dataset):
00029         err("Classifier not implemented!")
00030 
00031     def predict(self, instance):
00032         err("Classifier not implemented!")
00033         return 0.
00034 
00035     def build(self, compiled_dataset, filename):
00036         err("Classifier not implemented!")
00037 
00038     def load(self, filename):
00039         err("Classifier not implemented!")
00040 
00041     def cross_valid(self, compilied_dataset, num_folds = 10, seed = 1):
00042 
00043         c_mat_list, traj_dicts = self._fold_predictions(compiled_dataset, num_folds, seed)
00044         confusion_mat = sum(c_mat_list)
00045 
00046         confusion_matrix_stats(confusion_mat)
00047         log("-"*60)
00048         log("Analysis")
00049         first_coll_diffs = []
00050         for traj_dict in traj_dicts:
00051             traj_list = [traj_dict[k] for k in traj_dict]
00052             first_coll_diffs.extend(self.analyze_testing_results(traj_list))
00053         log("min: %1.3f, max: %1.3f, median: %1.3f, mean: %1.3f, std: %1.3f" % (np.min(first_coll_diffs), np.max(first_coll_diffs), np.median(first_coll_diffs), np.mean(first_coll_diffs), np.std(first_coll_diffs)))
00054         log("[" + ", ".join(["%1.3f" % v for v in first_coll_diffs]) + "]")
00055         # self.plot_testing_results(traj_lists[0])
00056 
00057     def _fold_predictions(self, compiled_dataset, num_folds, seed):
00058         cdata = compiled_dataset
00059         times_list = cdata["training_times_list"]
00060         colls_list = cdata["training_colls_list"]
00061 
00062         train = cdata["training_data"]
00063         responses = cdata["training_labels"]
00064         random.seed(seed)
00065         labels = {}
00066         for i, l in enumerate(cdata["trajectory_labels"]):
00067             if l[0] not in labels:
00068                 labels[l[0]] = {l[1] : [i]}
00069             elif l[1] not in labels[l[0]]:
00070                 labels[l[0]][l[1]] = [i]
00071             else:
00072                 labels[l[0]][l[1]].append(i)
00073 
00074         train_test_combos = [{"train" : [], "test" : [], "test_traj_labels" : []} for i in range(num_folds)]
00075         for dir in labels:
00076             fns = labels[dir].keys()
00077             split_ind = int(round(float(len(fns)) / num_folds))
00078             fns.sort()
00079             random.shuffle(fns)
00080             last_f_ind = 0
00081             for nf in range(num_folds):
00082                 cur_f_ind = int(round(float(len(fns) * (nf + 1)) / num_folds))
00083                 train_inds, test_inds = [], []
00084                 test_range = range(last_f_ind, cur_f_ind)
00085                 for f_ind, fn in enumerate(fns):
00086                     if f_ind not in test_range:
00087                         train_inds.extend(labels[dir][fns[f_ind]])
00088                     else:
00089                         test_inds.extend(labels[dir][fns[f_ind]])
00090                 train_test_combos[nf]["train"].extend(train_inds)
00091                 train_test_combos[nf]["test"].extend(test_inds)
00092                 train_test_combos[nf]["test_traj_labels"].extend([dir for i in test_inds])
00093 
00094                 last_f_ind = cur_f_ind
00095         
00096         params = []
00097         for fold_i in range(num_folds):
00098             log("Fold:", fold_i + 1)
00099             print train.shape
00100             train_fold = train[:, train_test_combos[fold_i]["train"]]
00101             train_fold_resp = responses[0, train_test_combos[fold_i]["train"]]
00102             train_fold_times = times_list[train_test_combos[fold_i]["train"]]
00103             train_fold_coll_times = colls_list[train_test_combos[fold_i]["train"]]
00104             test_fold = train[:, train_test_combos[fold_i]["test"]]
00105             test_fold_resp = responses[0, train_test_combos[fold_i]["test"]]
00106             test_fold_times = times_list[train_test_combos[fold_i]["test"]]
00107             test_fold_coll_times = colls_list[train_test_combos[fold_i]["test"]]
00108             test_fold_traj_labels = train_test_combos[fold_i]["test_traj_labels"]
00109             new_comp_dataset = {"training_data" : train_fold,
00110                                 "training_labels" : train_fold_resp,
00111                                 "training_times_list" : train_fold_times, 
00112                                 "training_colls_list" : train_fold_coll_times, 
00113                                 "testing_data" : test_fold,
00114                                 "testing_labels" : test_fold_resp,
00115                                 "testing_times_list" : test_fold_times, 
00116                                 "testing_colls_list" : test_fold_coll_times,
00117                                 "trajectory_labels" : test_fold_traj_labels}
00118             params.append((self, new_comp_dataset,
00119                            train_test_combos[fold_i]["test"]))
00120 
00121         pool = mp.Pool()
00122         results = pool.map(run_fold_process, params)
00123         c_mat_list, traj_dicts = zip(*results)
00124         return c_mat_list, traj_dicts
00125 
00126     def analyze_testing_results(self, traj_list):
00127         t_labels, pred_label_lists, diff_lists = [], [], []
00128         for i, cur_traj in enumerate(traj_list):
00129             times, labels, pred_labels, coll_times, indicies = zip(*cur_traj)
00130             traj_label = 0.
00131             coll_time = coll_times[0]
00132             for l in labels:
00133                 if l != 0.:
00134                     traj_label = l
00135                     break
00136             pred_label_list, diff_list = [None] * 10, [None] * 10
00137             for i, l in enumerate(pred_labels):
00138                 if l != 0.:
00139                     for pll_i, _ in enumerate(pred_label_list):
00140                         if pred_label_list[pll_i] is None:
00141                             pred_label_list[pll_i] = l
00142                             diff_list[pll_i] = times[i] - coll_time
00143                             break
00144             # log(traj_label, pred_label_list, diff_list)
00145             t_labels.append(traj_label)
00146             pred_label_lists.append(pred_label_list)
00147             diff_lists.append(diff_list)
00148 
00149         first_coll_diffs = list(zip(*diff_lists)[0])
00150         for n in range(5):
00151             log("Num in a row: %d" % (n+1))
00152             first_preds = list(zip(*pred_label_lists)[n])
00153             for i, v in enumerate(first_coll_diffs):
00154                 if v is None:
00155                     first_coll_diffs[i] = 0.0
00156                 if first_preds[i] is None:
00157                     first_preds[i] = 0.
00158             confus_mat = np.zeros((3,3))
00159             for i, v in enumerate(t_labels):
00160                 confus_mat[int(t_labels[i]), int(first_preds[i])] += 1
00161             log(confus_mat)
00162         log("min: %1.3f, max: %1.3f, median: %1.3f, mean: %1.3f, std: %1.3f" % (np.min(first_coll_diffs), np.max(first_coll_diffs), np.median(first_coll_diffs), np.mean(first_coll_diffs), np.std(first_coll_diffs)))
00163         log("[" + ", ".join(["%1.3f" % v for v in first_coll_diffs]) + "]")
00164         return first_coll_diffs
00165 
00166     ##
00167     # Takes the trajectory list from eval_predicts and plots it.
00168     # each trajectory is stacked in a horizontal lines.  Vertical placement
00169     # indicates predicted result.  Color indicates actual result.
00170     # Collision trajectories are offset so that the colllision occurs at t = 0
00171     # Empty grasps are offset so that the end of the trajectory is at t = 0
00172     # Green = No collision, Red = External collision, Blue = Table collision
00173     def plot_testing_results(self, traj_list):
00174 
00175         plt.figure(1)
00176         plt.fill_between([-20.0, 20.0], -1., 1., facecolor='green', alpha=0.3)
00177         plt.fill_between([-20.0, 20.0], 1., 2., facecolor='red', alpha=0.3)
00178         plt.fill_between([-20.0, 20.0], 2., 3., facecolor='blue', alpha=0.3)
00179 
00180         fp_diffs = []
00181         for i, cur_traj in enumerate(traj_list):
00182             (pred_norm, pred_norm_t, pred_ext, 
00183              pred_ext_t, pred_tab, pred_tab_t) = [], [], [], [], [], []
00184             for pt in cur_traj:
00185                 if pt[3] <= 0.:
00186                     pt[3] = cur_traj[-1][0]
00187                 if pt[1] == 0. and pt[2] != 0.:
00188                     fp_diffs.append(pt[0] - pt[3])
00189                 if pt[1] == 0:
00190                     pred_norm.append(pt[2] + i * 0.018)
00191                     pred_norm_t.append(pt[0] - pt[3])
00192                 elif pt[1] == 1:
00193                     pred_ext.append(pt[2] + i * 0.018)
00194                     pred_ext_t.append(pt[0] - pt[3])
00195                 elif pt[1] == 2:
00196                     pred_tab.append(pt[2] + i * 0.018)
00197                     pred_tab_t.append(pt[0] - pt[3])
00198             if len(pred_norm) > 0:
00199                 plt.scatter(pred_norm_t, pred_norm, c='green', marker='o', s = 30)
00200             if len(pred_ext) > 0:
00201                 plt.scatter(pred_ext_t, pred_ext, c='red', marker='o', s = 30)
00202             if len(pred_tab) > 0:
00203                 plt.scatter(pred_tab_t, pred_tab, c='blue', marker='o', s = 30)
00204         plt.axvline(0.0, color='black', linestyle = '-')
00205         plt.axhline(1.0, color='black', linestyle = '-')
00206         plt.axhline(2.0, color='black', linestyle = '-')
00207         plt.axis([-4.8, 1.2, -0.1, 2.8])
00208         plt.show()
00209 
def confusion_matrix_stats(confusion_mat):
    """Print a formatted confusion matrix plus accuracy and per-class
    TPR / specificity / FPR statistics to stdout."""
    c_mat = np.matrix(confusion_mat.astype(float))
    # Must be square: rows = actual class, columns = predicted class.
    assert c_mat.shape[0] == c_mat.shape[1]
    N = c_mat.shape[0]
    print "-"*60
    print "Confusion Matrix Statistics\n"
    print
    print "            Predicted        "
    print     "              ",
    for c in range(N):
        print "%4d" % c,
    print ""
    for r in range(N):
        print "Actual: %d" % r,
        print "|",
        for c in range(N):
            print "%6d" % int(c_mat[r, c]),
        print "|"

    print c_mat
    print
    print "Number of instances: %d" % np.sum(c_mat)
    # Accuracy = correctly classified (diagonal) / total instances.
    acc = np.trace(c_mat) / np.sum(c_mat)
    print "Accuracy: %1.4f" % acc
    print
    for l in range(N):
        # True positive rate: fraction of actual-l instances predicted l.
        tpr = c_mat[l, l] / np.sum(c_mat[l, :])
        # False positive rate: non-l instances predicted l / all non-l.
        fpr = ((np.sum(c_mat[:, l]) - c_mat[l, l]) /
                (np.sum(c_mat) - np.sum(c_mat[l, :])))
        # Specificity is the complement of the false positive rate.
        spc = 1. - fpr
        print "Class %d stats: TPR %1.4f, SPEC %1.4f, FPR %1.4f" % (l, tpr, spc, fpr)
00241 
def run_fold_process(data):
    """Train the given classifier on one fold and evaluate its test set.

    data is a tuple (classifier, compiled-dataset dict, list of the test
    instances' indices into the full training matrix).  Module-level so it
    can be dispatched through multiprocessing.Pool.

    Returns (3x3 confusion matrix, dict mapping trajectory label to a list
    of rows [time, true label, predicted label, collision time, index]).
    """
    (classifier, cdata, test_fold_indices) = data
    confusion_mat = np.zeros((3, 3))
    log("Building classifier...")
    st_time = rospy.Time.now().to_sec()
    classifier.train(cdata)
    end_time = rospy.Time.now().to_sec()
    log("Done building classifier (Time taken: %3.3f)" % (end_time - st_time))
    traj_dict = {}
    # Instances are columns of testing_data; iterate them as rows.
    for i, t_inst in enumerate(cdata["testing_data"].T):
        pred = classifier.predict(t_inst.T)
        confusion_mat[int(cdata["testing_labels"][0, i]), int(pred)] += 1
        if not cdata["trajectory_labels"][i] in traj_dict:
            traj_dict[cdata["trajectory_labels"][i]] = []
        # BUG FIX: the 4th row element is the collision time consumed by
        # analyze_testing_results / plot_testing_results; it previously
        # duplicated testing_times_list[i], so every time-to-collision
        # offset came out relative to the sample itself instead of the
        # collision.  testing_colls_list was computed but never used.
        traj_dict[cdata["trajectory_labels"][i]].append(
                     [cdata["testing_times_list"][i],
                      int(cdata["testing_labels"][0, i]),
                      int(pred),
                      cdata["testing_colls_list"][i],
                      test_fold_indices[i]])
    log("Confusion Matrix:")
    log(confusion_mat)
    return (confusion_mat, traj_dict)
00261 
def pool_loading(fns):
    """Deserialize many pickle files concurrently; returns the objects in
    the same order as fns."""
    worker_count = 12
    proc_pool = mp.Pool(worker_count)
    return proc_pool.map(hard_load_pickle, fns)
00267 
def pool_saving(objs, fns):
    """Serialize each object in objs to the matching filename in fns,
    spreading the work across worker processes."""
    worker_count = 12
    proc_pool = mp.Pool(worker_count)
    proc_pool.map(hard_save_pickle, zip(objs, fns))
00272 
def hard_save_pickle(params):
    """Pickle an object to disk; params is a (object, filename) tuple so the
    function can be dispatched through multiprocessing Pool.map.

    Fixed: the file is now opened in binary mode ("wb", required for pickle
    portability) inside a `with` block so it is closed even if dump raises.
    """
    pickle_data, fn = params
    with open(fn, "wb") as f:
        pickle.dump(pickle_data, f)
00278 
def hard_load_pickle(fn):
    """Load and return one pickled object from filename fn.

    Fixed: the file is now opened in binary mode ("rb", required for pickle
    portability) inside a `with` block so it is closed even if load raises.
    """
    with open(fn, "rb") as f:
        return pickle.load(f)
00284 
00285 class RFBreimanClassifier(AbstractClassifier):
00286     
00287     def __init__(self, num_learners=100):
00288         super(RFBreimanClassifier, self).__init__()
00289         self.rfb = rf.RFBreiman(None, None, num_learners)
00290         self.num_learners = num_learners
00291 
00292     def predict(self, instance):
00293         pred, _ = self.rfb.predict(instance)
00294         return pred[0,0]
00295 
00296     def train(self, compiled_dataset):
00297         train = compiled_dataset["training_data"]
00298         responses = compiled_dataset["training_labels"]
00299         dataset = ds.Dataset(train, responses)
00300         self.rfb.train(dataset)
00301 
00302     def build(self, compiled_dataset, filename):
00303         # dataset = ldds.LinearDimReduceDataset(train, responses)
00304         # log("PCA dimension reduction")
00305         # dataset.pca_reduce(percent_var)
00306         # log("Reducing dataset to %d dimensions" % (dataset.projection_basis.shape[1]))
00307         # dataset.reduce_input()
00308         log("Building classifier...")
00309         st_time = rospy.Time.now().to_sec()
00310         self.train(compiled_dataset)
00311         end_time = rospy.Time.now().to_sec()
00312         log("Done building classifier (Time taken: %3.3f)" % (end_time - st_time))
00313         log("Average tree depth: %3f" % self.rfb.avg_tree_depth())
00314         # pb = dataset.projection_basis
00315         log("Saving...")
00316         pb = None
00317         self.fos.save_pickle((pb, self.rfb), 
00318                              filename)
00319         self._random_forest_split(filename)
00320         log("Finished saving")
00321 
00322     ##
00323     # multithreaded saving
00324     def _random_forest_split(self, filename, num_processes = 8):
00325         proj_basis, classifier = self.fos.load_pickle(filename)
00326         self.fos.save_pickle((classifier.number_of_learners, proj_basis), 
00327                         filename.split(".")[0] + "_split_index.pickle")
00328         fns = [self.fos.get_pickle_name(filename.split(".")[0] + 
00329                                  "_%03d.pickle" % (i)) for i in range(
00330                                                 classifier.number_of_learners)]
00331         pool_saving(classifier.learners, fns)
00332 
00333     ##
00334     # multithreaded load
00335     def load(self, filename):
00336         try:
00337             log("Loading random forest classifier from pickle...")
00338             num_trees, projection_basis = self.fos.load_pickle(filename.split(".")[0] + "_split_index.pickle")
00339             self.rfb = rf.RFBreiman(number_of_learners=num_trees)
00340             fns = [self.fos.get_pickle_name(filename.split(".")[0] + 
00341                                  "_%03d.pickle" % (i)) for i in range(num_trees)]
00342             self.rfb.learners = pool_loading(fns)
00343             log("Classifier loaded")
00344         except Exception as e:
00345             err("Problem loading classifier (Has it been built?)")
00346             print e
00347             sys.exit()
00348 
00349 class RFBreimanRefinedClassifier(RFBreimanClassifier):
00350     
00351     def __init__(self, num_learners=100, refine_folds=5, refine_learners=100, 
00352                                          refine_runs=5, refine_cut=2):
00353         super(RFBreimanRefinedClassifier, self).__init__(num_learners)
00354         self.refine_folds = refine_folds
00355         self.refine_learners = refine_learners
00356         self.refine_runs = refine_runs
00357         self.refine_cut = refine_cut
00358 
00359     def train(self, compiled_dataset):
00360         train = compiled_dataset["training_data"]
00361         responses = compiled_dataset["training_labels"]
00362         index_false_counts = [0] * train.shape[1]
00363         index_true_count = 0
00364         for r_run in range(self.refine_runs):
00365             cur_classifier = RFBreimanClassifier(self.refine_learners)
00366             c_mat_list, traj_dicts = cur_classifier._fold_predictions(compiled_dataset, 
00367                                                             self.refine_folds,
00368                                                             random.randint(0, 1000))
00369             # traj_dicts is a list of different folds,
00370             # together they represent a test on all of the data
00371             for traj_dict in traj_dicts:
00372                 for i, k in enumerate(traj_dict):
00373                     cur_traj = traj_dict[k]
00374                     # pt is the result of an instance of data after prediction
00375                     # in a trajectory
00376                     for pt in cur_traj:
00377                         time, label, pred_label, coll_time, index = pt
00378                         if label != pred_label:
00379                             index_false_counts[index] += 1
00380                         else:
00381                             index_true_count += 1
00382 
00383             print index_true_count, sum(index_false_counts), index_true_count + sum(index_false_counts), (r_run + 1) * train.shape[1]
00384         indices_to_keep = []
00385         for ind, count in enumerate(index_false_counts):
00386             if count <= self.refine_cut:
00387                 indices_to_keep.append(ind)
00388         print "Old shapes:", train.shape, responses.shape
00389         train = train[:, indices_to_keep]
00390         responses = responses[:, indices_to_keep]
00391         print "New shapes:", train.shape, responses.shape
00392         dataset = ds.Dataset(train, responses)
00393         self.rfb.train(dataset)
00394 
00395 
# Registry of ready-made classifier configurations, keyed by the name used
# to select one (tree counts and refinement parameters differ per entry).
classifiers_dict = {"small_random_forest" : RFBreimanClassifier(20),
                    "large_random_forest" : RFBreimanClassifier(150),
                    "small_refined_random_forest" : RFBreimanRefinedClassifier(20, 8, 100, 5, 2),
                    "large_refined_random_forest" : RFBreimanRefinedClassifier(100, 8, 100, 5, 2)}


# kelsey_sandbox
# Author(s): kelsey
# autogenerated on Wed Nov 27 2013 11:52:03