import numpy as np
import copy
class Dataset:
    """A set of examples stored column-wise: one example per column and one
    attribute per row, in both `inputs` and `outputs`."""

    def __init__(self, inputs, outputs):
        assert inputs.shape[1] == outputs.shape[1], \
            "inputs and outputs must contain the same number of examples"
        self.inputs = inputs
        self.outputs = outputs
        self.metadata = []

    def num_examples(self):
        return self.inputs.shape[1]

    def num_attributes(self):
        return self.inputs.shape[0]

    def add_attribute_descriptor(self, descriptor):
        self.metadata.append(descriptor)

    def append(self, another_dataset):
        """Concatenate another dataset's examples (columns) onto this one."""
        # `!= None` triggers elementwise comparison on numpy arrays;
        # identity checks are required here.
        if self.inputs is not None:
            self.inputs = np.concatenate((self.inputs, another_dataset.inputs), axis=1)
        else:
            self.inputs = another_dataset.inputs

        if self.outputs is not None:
            self.outputs = np.concatenate((self.outputs, another_dataset.outputs), axis=1)
        else:
            self.outputs = another_dataset.outputs
class AttributeDescriptor:
    """Describes a single attribute: its name and its extent."""

    def __init__(self, name, extent):
        self.name = name
        self.extent = extent
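# A minimal usage sketch, not part of the original source: the shapes, values,
# and the `extent=None` argument below are illustrative assumptions.
def _example_dataset_usage():
    a = Dataset(np.array([[1.0, 2.0],      # attribute 0 for examples 0 and 1
                          [3.0, 4.0]]),    # attribute 1
                np.array([[0.0, 1.0]]))    # one output row
    a.add_attribute_descriptor(AttributeDescriptor('x0', None))
    a.add_attribute_descriptor(AttributeDescriptor('x1', None))
    b = Dataset(np.array([[5.0], [6.0]]), np.array([[1.0]]))
    a.append(b)    # column-wise concatenation
    assert a.num_examples() == 3 and a.num_attributes() == 2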
def leave_one_out(dataset, index):
    """Remove example `index` from the dataset.

    Returns (reduced dataset, held-out input column, held-out output column)."""
    inputs = np.column_stack((dataset.inputs[:, :index], dataset.inputs[:, index + 1:]))
    outputs = np.column_stack((dataset.outputs[:, :index], dataset.outputs[:, index + 1:]))
    d = Dataset(inputs, outputs)
    d.metadata = copy.copy(dataset.metadata)
    return d, dataset.inputs[:, index], dataset.outputs[:, index]
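# Illustrative sketch (assumed toy data, not from the original source): hold
# out each example in turn, as in leave-one-out cross-validation.
def _example_leave_one_out():
    data = Dataset(np.array([[0.0, 1.0, 2.0]]), np.array([[0.0, 1.0, 0.0]]))
    for i in range(data.num_examples()):
        rest, held_input, held_output = leave_one_out(data, i)
        assert rest.num_examples() == data.num_examples() - 1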
def split_continuous(dataset, attribute, split_point):
    """Partition the dataset on a continuous attribute.

    Returns a list holding the `<= split_point` partition followed by the
    `> split_point` partition, with empty partitions omitted."""
    selected_attribute = np.asarray(dataset.inputs[attribute, :]).ravel()
    leq_mask = selected_attribute <= split_point
    gt_mask = ~leq_mask

    # Boolean masks index columns directly, replacing the broken
    # matrix-era np.where/.A indexing.
    ret_sets = []
    if np.any(leq_mask):
        ret_sets.append(Dataset(dataset.inputs[:, leq_mask],
                                dataset.outputs[:, leq_mask]))
    if np.any(gt_mask):
        ret_sets.append(Dataset(dataset.inputs[:, gt_mask],
                                dataset.outputs[:, gt_mask]))
    return ret_sets
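# Illustrative sketch (assumed toy data): split one continuous attribute at 5.0.
def _example_split_continuous():
    data = Dataset(np.array([[0.0, 5.0, 10.0]]), np.array([[0, 1, 1]]))
    parts = split_continuous(data, attribute=0, split_point=5.0)
    # two examples fall in the <= partition, one in the > partition
    assert [p.num_examples() for p in parts] == [2, 1]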
def bootstrap_samples(dataset, number_samples, points_per_sample):
    """Draw `number_samples` bootstrap samples of `points_per_sample` examples
    each, sampled with replacement.

    Returns (in_bags, out_bags), where each out-of-bag dataset holds the
    examples the matching in-bag sample never drew."""
    in_bags, out_bags = [], []
    for _ in range(number_samples):
        selected_pts = np.random.randint(0, dataset.inputs.shape[1], points_per_sample)
        n_selected_pts = np.setdiff1d(np.arange(dataset.inputs.shape[1]), selected_pts)
        selected_inputs = dataset.inputs[:, selected_pts]
        selected_outputs = dataset.outputs[:, selected_pts]
        n_selected_inputs = dataset.inputs[:, n_selected_pts]
        n_selected_outputs = dataset.outputs[:, n_selected_pts]

        in_bags.append(Dataset(selected_inputs, selected_outputs))
        out_bags.append(Dataset(n_selected_inputs, n_selected_outputs))
    return in_bags, out_bags
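# Illustrative sketch (random toy data): draw five bags for bagging-style
# training; the out-of-bag sets could serve as validation data.
def _example_bootstrap_samples():
    data = Dataset(np.random.rand(3, 10), np.random.rand(1, 10))
    in_bags, out_bags = bootstrap_samples(data, number_samples=5, points_per_sample=10)
    assert len(in_bags) == 5 and len(out_bags) == 5
    # every in-bag sample has exactly points_per_sample examples; out-of-bag
    # sizes vary with how many indices were drawn more than once
    assert all(bag.num_examples() == 10 for bag in in_bags)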
def unique_values(data, attribute_number=0):
    """Return the sorted unique values found in row `attribute_number` of `data`."""
    values = dict()
    for instance_idx in range(data.shape[1]):
        values[data[attribute_number, instance_idx]] = True
    return sorted(values.keys())
def entropy_discrete(data):
    """Shannon entropy (in bits) of the discrete values in `data`, a 1 x N row."""
    values = unique_values(data)

    def calc_class_entropy(value):
        number_in_class = np.sum(data == value)
        num_examples = data.shape[1]
        percentage_in_class = number_in_class / float(num_examples)
        return -percentage_in_class * np.log2(percentage_in_class)

    # a list comprehension keeps this correct under Python 3, where map()
    # returns an iterator that np.sum cannot reduce
    return np.sum([calc_class_entropy(v) for v in values])
def dataset_entropy_discrete(dataset):
    """Entropy of the first output attribute of `dataset`."""
    # slice rather than index so the row stays two-dimensional for plain ndarrays
    return entropy_discrete(dataset.outputs[0:1, :])
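# Illustrative sketch (assumed toy labels): a balanced binary labelling has an
# entropy of exactly one bit.
def _example_entropy():
    labels = np.array([[0, 0, 1, 1]])
    assert abs(entropy_discrete(labels) - 1.0) < 1e-12
    data = Dataset(np.array([[0.0, 1.0, 2.0, 3.0]]), labels)
    assert abs(dataset_entropy_discrete(data) - 1.0) < 1e-12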