import numpy as np
import copy
class Dataset:
    """A set of examples stored column-wise: one example per column and one
    attribute per row, in both `inputs` and `outputs`."""

    def __init__(self, inputs, outputs):
        assert inputs.shape[1] == outputs.shape[1], \
            "inputs and outputs must contain the same number of examples"
        self.inputs = inputs
        self.outputs = outputs
        self.metadata = []

    def num_examples(self):
        return self.inputs.shape[1]

    def num_attributes(self):
        return self.inputs.shape[0]

    def add_attribute_descriptor(self, descriptor):
        self.metadata.append(descriptor)

    def append(self, another_dataset):
        """Concatenate another dataset's examples (columns) onto this one."""
        # `!= None` triggers elementwise comparison on numpy arrays;
        # identity checks are required here.
        if self.inputs is not None:
            self.inputs = np.concatenate((self.inputs, another_dataset.inputs), axis=1)
        else:
            self.inputs = another_dataset.inputs

        if self.outputs is not None:
            self.outputs = np.concatenate((self.outputs, another_dataset.outputs), axis=1)
        else:
            self.outputs = another_dataset.outputs
class AttributeDescriptor:
    """Describes a single attribute: its name and its extent."""

    def __init__(self, name, extent):
        self.name = name
        self.extent = extent
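# A minimal usage sketch, not part of the original source: the shapes, values,
# and the `extent=None` argument below are illustrative assumptions.
def _example_dataset_usage():
    a = Dataset(np.array([[1.0, 2.0],      # attribute 0 for examples 0 and 1
                          [3.0, 4.0]]),    # attribute 1
                np.array([[0.0, 1.0]]))    # one output row
    a.add_attribute_descriptor(AttributeDescriptor('x0', None))
    a.add_attribute_descriptor(AttributeDescriptor('x1', None))
    b = Dataset(np.array([[5.0], [6.0]]), np.array([[1.0]]))
    a.append(b)    # column-wise concatenation
    assert a.num_examples() == 3 and a.num_attributes() == 2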
def leave_one_out(dataset, index):
    """Remove example `index` from the dataset.

    Returns (reduced dataset, held-out input column, held-out output column)."""
    inputs = np.column_stack((dataset.inputs[:, :index], dataset.inputs[:, index + 1:]))
    outputs = np.column_stack((dataset.outputs[:, :index], dataset.outputs[:, index + 1:]))
    d = Dataset(inputs, outputs)
    d.metadata = copy.copy(dataset.metadata)
    return d, dataset.inputs[:, index], dataset.outputs[:, index]
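# Illustrative sketch (assumed toy data, not from the original source): hold
# out each example in turn, as in leave-one-out cross-validation.
def _example_leave_one_out():
    data = Dataset(np.array([[0.0, 1.0, 2.0]]), np.array([[0.0, 1.0, 0.0]]))
    for i in range(data.num_examples()):
        rest, held_input, held_output = leave_one_out(data, i)
        assert rest.num_examples() == data.num_examples() - 1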
def split_continuous(dataset, attribute, split_point):
    """Partition the dataset on a continuous attribute.

    Returns a list holding the `<= split_point` partition followed by the
    `> split_point` partition, with empty partitions omitted."""
    selected_attribute = np.asarray(dataset.inputs[attribute, :]).ravel()
    leq_mask = selected_attribute <= split_point
    gt_mask = ~leq_mask

    # Boolean masks index columns directly, replacing the broken
    # matrix-era np.where/.A indexing.
    ret_sets = []
    if np.any(leq_mask):
        ret_sets.append(Dataset(dataset.inputs[:, leq_mask],
                                dataset.outputs[:, leq_mask]))
    if np.any(gt_mask):
        ret_sets.append(Dataset(dataset.inputs[:, gt_mask],
                                dataset.outputs[:, gt_mask]))
    return ret_sets
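# Illustrative sketch (assumed toy data): split one continuous attribute at 5.0.
def _example_split_continuous():
    data = Dataset(np.array([[0.0, 5.0, 10.0]]), np.array([[0, 1, 1]]))
    parts = split_continuous(data, attribute=0, split_point=5.0)
    # two examples fall in the <= partition, one in the > partition
    assert [p.num_examples() for p in parts] == [2, 1]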
def bootstrap_samples(dataset, number_samples, points_per_sample):
    """Draw `number_samples` bootstrap samples of `points_per_sample` examples
    each, sampled with replacement.

    Returns (in_bags, out_bags), where each out-of-bag dataset holds the
    examples the matching in-bag sample never drew."""
    in_bags, out_bags = [], []
    for _ in range(number_samples):
        selected_pts = np.random.randint(0, dataset.inputs.shape[1], points_per_sample)
        n_selected_pts = np.setdiff1d(np.arange(dataset.inputs.shape[1]), selected_pts)
        selected_inputs = dataset.inputs[:, selected_pts]
        selected_outputs = dataset.outputs[:, selected_pts]
        n_selected_inputs = dataset.inputs[:, n_selected_pts]
        n_selected_outputs = dataset.outputs[:, n_selected_pts]

        in_bags.append(Dataset(selected_inputs, selected_outputs))
        out_bags.append(Dataset(n_selected_inputs, n_selected_outputs))
    return in_bags, out_bags
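# Illustrative sketch (random toy data): draw five bags for bagging-style
# training; the out-of-bag sets could serve as validation data.
def _example_bootstrap_samples():
    data = Dataset(np.random.rand(3, 10), np.random.rand(1, 10))
    in_bags, out_bags = bootstrap_samples(data, number_samples=5, points_per_sample=10)
    assert len(in_bags) == 5 and len(out_bags) == 5
    # every in-bag sample has exactly points_per_sample examples; out-of-bag
    # sizes vary with how many indices were drawn more than once
    assert all(bag.num_examples() == 10 for bag in in_bags)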
def unique_values(data, attribute_number=0):
    """Return the sorted unique values found in row `attribute_number` of `data`."""
    values = dict()
    for instance_idx in range(data.shape[1]):
        values[data[attribute_number, instance_idx]] = True
    return sorted(values.keys())
def entropy_discrete(data):
    """Shannon entropy (in bits) of the discrete values in `data`, a 1 x N row."""
    values = unique_values(data)

    def calc_class_entropy(value):
        number_in_class = np.sum(data == value)
        num_examples = data.shape[1]
        percentage_in_class = number_in_class / float(num_examples)
        return -percentage_in_class * np.log2(percentage_in_class)

    # a list comprehension keeps this correct under Python 3, where map()
    # returns an iterator that np.sum cannot reduce
    return np.sum([calc_class_entropy(v) for v in values])
def dataset_entropy_discrete(dataset):
    """Entropy of the first output attribute of `dataset`."""
    # slice rather than index so the row stays two-dimensional for plain ndarrays
    return entropy_discrete(dataset.outputs[0:1, :])
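# Illustrative sketch (assumed toy labels): a balanced binary labelling has an
# entropy of exactly one bit.
def _example_entropy():
    labels = np.array([[0, 0, 1, 1]])
    assert abs(entropy_discrete(labels) - 1.0) < 1e-12
    data = Dataset(np.array([[0.0, 1.0, 2.0, 3.0]]), labels)
    assert abs(dataset_entropy_discrete(data) - 1.0) < 1e-12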