00001
00002
00003 import collections
00004 import copy
00005 import numpy as np
00006 import itertools
00007 import math
00008 import matplotlib.cm as cm
00009 import matplotlib.colors as colors
00010 import matplotlib.pyplot as plt; plt.rcdefaults()
00011 from mpl_toolkits.mplot3d import Axes3D
00012 import re
00013 import scipy.stats as stats
00014 import pandas as pd
00015
00016
# Fill colours cycled through (by series index) in draw_bar_chart.
METHOD_COLORS = [
    'yellow',
    'red',
    'aqua',
    'green',
    'lightgray',
    'blue',
]

# Hatch patterns cycled through (by series index) in draw_bar_chart.
METHOD_HATCH = [
    '/',
    '\\',
    'x',
    '*',
    'o',
    'O',
    '.',
]

# Line colours cycled through (by series index) in draw_line_graph.
LINE_COLORS = [
    'red',
    'blue',
    'green',
]

# Dash sequences handed to the ``dashes`` argument of ``ax.plot`` in
# draw_line_graph, cycled through by series index.
LINE_HATCH = [
    (20, 0),
    (20, 5),
    (5, 5),
    (15, 5, 5, 5),
    (15, 5, 2, 5),
]
00021
def calculate_mean_and_standard_error(data):
    """Return ``(mean, standard_error)`` for the values in *data*.

    *data* is converted to a numpy array and multiplied by 1.0 before the
    statistics are taken.  The standard error comes from ``scipy.stats.sem``.
    """
    values = np.array(data) * 1.0
    mean = np.mean(values)
    standard_error = stats.sem(values)
    return mean, standard_error
00025
def mean_confidence_interval(data, confidence=0.95):
    """Return the mean of *data* and the half-width of its confidence interval.

    The half-width is the standard error of the mean multiplied by the
    two-sided Student-t critical value for ``len(data) - 1`` degrees of
    freedom, so the interval is ``mean - h .. mean + h``.

    Args:
        data: Sequence of numeric samples.
        confidence: Two-sided confidence level, e.g. 0.95.

    Returns:
        Tuple ``(mean, half_width)``.  With fewer than two samples the
        standard error is undefined and the half-width is NaN.
    """
    values = 1.0 * np.array(data)
    sample_count = len(values)
    mean, standard_error = np.mean(values), stats.sem(values)
    # Use the public ppf() rather than the private _ppf(): it validates its
    # arguments and is part of scipy's supported API.
    critical_value = stats.t.ppf((1 + confidence) / 2., sample_count - 1)
    half_width = standard_error * critical_value
    return mean, half_width
00032
def is_significant(a, b, confidence=0.95):
    """Tell whether samples *a* and *b* differ under ``scipy.stats.ttest_ind``.

    Returns the result of comparing the test's p-value against
    ``1.0 - confidence`` (true when the p-value is smaller).
    """
    significance_level = 1.0 - confidence
    # ttest_ind yields (statistic, p-value); only the p-value is needed.
    p_value = stats.ttest_ind(a, b)[1]
    return p_value < significance_level
00036
def mix_colors(c1, c2, amount):
    """Blend the first three channels of *c1* and *c2*.

    Each channel is ``(1 - amount) * c1[i] + amount * c2[i]`` truncated with
    ``int``; the result is a three-element list.
    """
    blended = []
    for channel in range(3):
        value = (1 - amount) * c1[channel] + amount * c2[channel]
        blended.append(int(value))
    return blended
00039
def color_to_html_string(c):
    """Format the first three entries of *c* as a ``#rrggbb`` hex string."""
    red, green, blue = c[0], c[1], c[2]
    return '#%02x%02x%02x' % (red, green, blue)
00043
def get_formatted_float(f):
    """Format *f* with two decimals, then drop trailing zeros and a bare dot.

    Text without a decimal point (for example ``inf``) is returned as is.
    """
    text = "%.2f" % f
    if '.' not in text:
        return text
    without_zeros = text.rstrip('0')
    return without_zeros.rstrip('.')
00047
def draw_bar_chart(samples, top_level_names, second_level_names=None,
                   title=None, xlabel=None, ylabel=None, color=None,
                   bottom=None, yticklabels=None):
    """Draw a grouped bar chart of sample means with standard-error bars.

    ``samples`` is either a list of sample lists (one bar per entry) or a
    list of lists of sample lists, where ``samples[i][j]`` holds the raw
    values of top-level series ``i`` in second-level group ``j``.  The layout
    is detected by checking whether ``samples[0][0]`` is a sequence.

    For every bar the mean and standard error are printed to stdout.  When a
    group has more than one member, a note is printed for each member that
    ``is_significant`` reports as different from all the others.

    Args:
        samples: Raw samples, in one of the two layouts described above.
        top_level_names: Legend labels, one per top-level series.  May be
            None or empty, in which case no legend is drawn and the series
            index is used in the printed summary.
        second_level_names: Optional x tick labels, one per group.
        title, xlabel, ylabel: Optional axes texts.
        color: Single colour for all bars.  When None, colours and hatches
            are cycled from METHOD_COLORS and METHOD_HATCH.
        bottom: Forwarded to ``ax.bar`` as the bar baseline.
        yticklabels: Optional replacement y tick labels.

    Returns:
        Tuple ``(fig, ax, rects, means)`` where ``rects`` holds one bar
        container per top-level series and ``means[i][j]`` is the mean of
        ``samples[i][j]``.
    """
    # collections.Sequence was removed in Python 3.10; keep a Python 2 fallback.
    try:
        from collections.abc import Sequence
    except ImportError:
        from collections import Sequence

    second_level_grouping_available = isinstance(samples[0][0], Sequence)
    top_level_methods = len(samples)

    if second_level_grouping_available:
        second_level_methods = len(samples[0])
        samples2 = samples
    else:
        # Wrap the flat layout so both layouts can be indexed as [i][j].
        second_level_methods = 1
        samples2 = [[group] for group in samples]

    means = [[] for _ in range(top_level_methods)]
    confs = [[] for _ in range(top_level_methods)]

    for i in range(top_level_methods):
        for j in range(second_level_methods):
            m, h = calculate_mean_and_standard_error(samples2[i][j])

            # Fall back to indices when names are missing instead of crashing.
            top_label = str(top_level_names[i]) if top_level_names else str(i)
            if second_level_names is not None:
                group_label = str(second_level_names[j])
            else:
                group_label = str(j)
            print("%s, %s : %.2f+-%.2f" % (top_label, group_label, m, h))

            # Compare this entry against every other entry of series i.
            rivals = [samples2[i][k]
                      for k in range(second_level_methods) if k != j]
            # Skip the note when there is nothing to compare against.
            if rivals and all(is_significant(samples2[i][j], rival)
                              for rival in rivals):
                print(" is significantly different from all other methods in the group.")

            means[i].append(m)
            confs[i].append(h)

    ind = np.arange(second_level_methods)
    width = 1.0 / (top_level_methods + 1)
    fig, ax = plt.subplots()
    rects = []
    for i in range(top_level_methods):
        if color is None:
            barcolor = METHOD_COLORS[i % len(METHOD_COLORS)]
            barhatch = METHOD_HATCH[i % len(METHOD_HATCH)]
        else:
            barcolor = color
            barhatch = None
        # align='edge' keeps bars starting at ind + i*width, which is what the
        # tick positions below assume (newer matplotlib defaults to 'center').
        rect = ax.bar(ind + i * width, means[i], width,
                      color=barcolor,
                      hatch=barhatch,
                      yerr=confs[i], ecolor='black', bottom=bottom,
                      align='edge')
        rects.append(rect)

    if xlabel:
        ax.set_xlabel(xlabel)
    if ylabel:
        ax.set_ylabel(ylabel)
    if title:
        ax.set_title(title)

    if second_level_grouping_available:
        # Centre each tick under its group of bars.
        ax.set_xticks(ind + 0.5 - width / 2)
        if second_level_names:
            ax.set_xticklabels(second_level_names)
    else:
        ax.set_xticklabels([])

    if yticklabels:
        ax.set_yticklabels(yticklabels)

    if top_level_names:
        ax.legend(rects, top_level_names)

    return fig, ax, rects, means
00136
def draw_line_graph(samples, top_level_names, second_level_names=None,
                    title=None, xlabel=None, ylabel=None, yticklabels=None):
    """Draw one line per top-level series through the means of its groups.

    ``samples`` is either a list of sample lists or a list of lists of sample
    lists, where ``samples[i][j]`` holds the raw values of series ``i`` at
    x position ``j``.  The layout is detected by checking whether
    ``samples[0][0]`` is a sequence.  Only the means are plotted; standard
    errors are not drawn.

    Args:
        samples: Raw samples, in one of the two layouts described above.
        top_level_names: Legend labels, one per series.  When None or empty
            no legend is drawn.
        second_level_names: Optional x tick labels.  If there are fewer
            labels than x positions, the labels are spread out using an
            integer stride.
        title, xlabel, ylabel: Optional axes texts.
        yticklabels: Optional replacement y tick labels.

    Returns:
        Tuple ``(fig, ax, rects, means)`` where ``rects`` holds one line
        artist per series and ``means[i][j]`` is the mean of
        ``samples[i][j]``.
    """
    # collections.Sequence was removed in Python 3.10; keep a Python 2 fallback.
    try:
        from collections.abc import Sequence
    except ImportError:
        from collections import Sequence

    second_level_grouping_available = isinstance(samples[0][0], Sequence)
    top_level_methods = len(samples)

    if second_level_grouping_available:
        second_level_methods = len(samples[0])
        samples2 = samples
    else:
        # Wrap the flat layout so both layouts can be indexed as [i][j].
        second_level_methods = 1
        samples2 = [[group] for group in samples]

    means = []
    for i in range(top_level_methods):
        # Index the normalised samples2 so the flat layout works as well.
        means.append([calculate_mean_and_standard_error(samples2[i][j])[0]
                      for j in range(second_level_methods)])

    ind = np.arange(second_level_methods)
    fig, ax = plt.subplots()
    rects = []
    for i in range(top_level_methods):
        rect, = ax.plot(ind, means[i],
                        color=LINE_COLORS[i % len(LINE_COLORS)],
                        dashes=LINE_HATCH[i % len(LINE_HATCH)],
                        linewidth=2)
        rects.append(rect)

    if xlabel:
        ax.set_xlabel(xlabel)
    if ylabel:
        ax.set_ylabel(ylabel)
    if title:
        ax.set_title(title)

    if second_level_grouping_available:
        if second_level_names:
            # Create exactly one tick per label so ticks and labels match.
            label_count = len(second_level_names)
            tick_multiplier = int(math.ceil(
                float(second_level_methods) / float(label_count)))
            ax.set_xticks(tick_multiplier * np.arange(label_count))
            ax.set_xticklabels(second_level_names)
        else:
            ax.set_xticks(ind)

    if yticklabels:
        ax.set_yticklabels(yticklabels)

    if top_level_names:
        ax.legend(rects, top_level_names, handlelength=4)

    return fig, ax, rects, means
00199
def draw_3d_bar_chart(samples, top_level_names=None, second_level_names=None,
                      title=None, xlabel=None, ylabel=None, zlabel=None,
                      flip_y=True, third_level_names=None):
    """Draw a 3D bar chart whose bar heights are the means of the samples.

    ``samples`` is either a list of sample lists or a list of lists of sample
    lists, where ``samples[i][j]`` holds the raw values for x index ``i`` and
    y index ``j``.  The layout is detected by checking whether
    ``samples[0][0]`` is a sequence.  Bars are coloured from the upper half
    of the ``jet`` colormap according to their height.

    When ``top_level_names`` is given, the user is asked on stdin for the
    rotation of the x tick labels; an empty answer means zero.

    Args:
        samples: Raw samples, in one of the two layouts described above.
        top_level_names: Optional x tick labels.
        second_level_names: Optional y tick labels.  The list is not
            modified; a reversed copy is used when ``flip_y`` is set.
        title, xlabel, ylabel, zlabel: Optional axes texts.
        flip_y: If true, y index ``j`` is drawn at position
            ``second_level_methods - j - 1`` and the y labels are reversed.
        third_level_names: Optional z tick labels.

    Returns:
        Tuple ``(fig, ax, rects, means)`` where ``rects`` is the value
        returned by ``ax.bar3d`` and ``means[i][j]`` is the mean of
        ``samples[i][j]``.

    Raises:
        ValueError: If the rotation typed at the prompt is not a number.
    """
    # collections.Sequence was removed in Python 3.10; keep a Python 2 fallback.
    try:
        from collections.abc import Sequence
    except ImportError:
        from collections import Sequence
    # raw_input only exists on Python 2; input is its Python 3 equivalent.
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    second_level_grouping_available = isinstance(samples[0][0], Sequence)
    top_level_methods = len(samples)

    if second_level_grouping_available:
        second_level_methods = len(samples[0])
        samples2 = samples
    else:
        # Wrap the flat layout so both layouts can be indexed as [i][j].
        second_level_methods = 1
        samples2 = [[group] for group in samples]

    means = []
    for i in range(top_level_methods):
        # Index the normalised samples2 so the flat layout works as well.
        means.append([calculate_mean_and_standard_error(samples2[i][j])[0]
                      for j in range(second_level_methods)])

    ind = np.arange(second_level_methods)
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')

    # One bar per (i, j) pair, emitted row by row along y.
    xs, ys, heights = [], [], []
    for j in range(second_level_methods):
        row = second_level_methods - j - 1 if flip_y else j
        for i in range(top_level_methods):
            xs.append(i)
            ys.append(row)
            heights.append(means[i][j])
    xpos = np.array(xs, dtype=float)
    ypos = np.array(ys, dtype=float)
    dz = np.array(heights, dtype=float)
    zpos = np.zeros_like(dz)
    dx = np.ones_like(dz)
    dy = 0.5 * np.ones_like(dz)

    # Map bar heights onto the upper half of the jet colormap.
    offset = dz + np.abs(dz.min())
    peak = offset.max()
    if peak > 0:
        fracs = offset.astype(float) / peak
    else:
        # All heights are zero: avoid dividing by zero.
        fracs = np.zeros_like(offset)
    norm = colors.Normalize(fracs.min(), fracs.max())
    colors_t = cm.jet(norm(fracs) / 2 + 0.5)

    rects = ax.bar3d(xpos, ypos, zpos, dx, dy, dz, color=colors_t,
                     zsort='average')

    if xlabel:
        ax.set_xlabel(xlabel)
    if ylabel:
        ax.set_ylabel(ylabel)
    if zlabel:
        ax.set_zlabel(zlabel)
    if title:
        ax.set_title(title)

    if second_level_grouping_available:
        ax.set_yticks(ind + 0.5)
        if second_level_names:
            # Reverse a copy so the caller's list is left untouched.
            if flip_y:
                y_labels = list(reversed(second_level_names))
            else:
                y_labels = second_level_names
            ax.set_yticklabels(y_labels)

    if third_level_names:
        ax.set_zticklabels(third_level_names)

    if top_level_names:
        label_count = len(top_level_names)
        tick_multiplier = int(math.ceil(
            float(top_level_methods) / float(label_count)))
        ax.set_xticks(tick_multiplier * np.arange(label_count) + 0.5)

        answer = read_line("Specify rotation for xticklabels [hit enter to use zero]: ")
        xtickrotation = float(answer) if answer else 0.0
        ax.set_xticklabels(top_level_names, rotation=xtickrotation)

    return fig, ax, rects, means
00299
def is_greek_alphabet(str):
    """Return True if *str*, lower-cased, is the name of a Greek letter."""
    greek_letter_names = (
        'alpha', 'beta', 'gamma', 'delta', 'epsilon', 'zeta',
        'eta', 'theta', 'iota', 'kappa', 'lambda', 'mu',
        'nu', 'xi', 'omicron', 'pi', 'rho', 'sigma',
        'tau', 'upsilon', 'phi', 'chi', 'psi', 'omega',
    )
    lowered = str.lower()
    return lowered in greek_letter_names
00304
def format_word(str):
    """Format one word of a label.

    Words that ``is_greek_alphabet`` accepts are wrapped as ``$\\word$``;
    every other word is returned title-cased.
    """
    if not is_greek_alphabet(str):
        return str.title()
    return '$\\' + str + '$'
00310
00311
def get_formatted_name(name, name_mappings=None):
    """Turn a column-style identifier into a display name.

    An entry for *name* in *name_mappings* wins outright.  Otherwise the
    name is split at camelCase boundaries and underscores, every word goes
    through ``format_word``, and the words are concatenated with no
    separator.
    """
    has_mapping = name_mappings is not None and name in name_mappings
    if has_mapping:
        return name_mappings[name]

    # Insert a space before each capitalised word, then treat '_' as a space.
    spaced = re.sub('(.)([A-Z][a-z]+)', r'\1 \2', name)
    spaced = spaced.replace('_', ' ')
    words = spaced.split(' ')
    return ''.join(format_word(word) for word in words)
00319
def get_formatted_combination_name(name_dict, name_mappings=None):
    """Build a display label for one combination of filter values.

    Every key other than ``"name"`` contributes ``<formatted key>=<value>``,
    with the key formatted by ``get_formatted_name``; the parts are joined
    with commas in the dictionary's iteration order.  If a ``"name"`` entry
    exists the result is that name, followed by the joined parts in square
    brackets when there are any.

    Args:
        name_dict: Mapping from column name to the selected value.
        name_mappings: Optional mapping forwarded to ``get_formatted_name``.

    Returns:
        The label as a string; empty when *name_dict* is empty.
    """
    # items() works on both Python 2 and 3 (iteritems is Python 2 only).
    parts = [
        get_formatted_name(key, name_mappings) + "=" + str(value)
        for key, value in name_dict.items()
        if key != "name"
    ]
    formatted_name = ",".join(parts)

    if "name" not in name_dict:
        return formatted_name

    base_name = str(name_dict["name"])
    if formatted_name == "":
        return base_name
    return base_name + "[" + formatted_name + "]"
00334
def draw_from_data_frame(filename, output, plot_type, filter=None, secondary_filter=None,
                         attempt_auto_mapping=True, name_mappings=None):
    """Load CSV data, group it by filter columns and plot the output column.

    Every entry of *filename* is read with ``pandas.read_csv`` and the
    frames are concatenated.  Rows are grouped by each combination of
    primary filter values (top-level series) and, optionally, by each
    combination of secondary filter values inside that group.  For every
    primary combination the user is asked on stdin to accept, replace or
    skip the suggested series name.

    Args:
        filename: Iterable of CSV paths.
        output: Name of the column holding the plotted values.
        plot_type: ``'line'`` for a line graph, ``'3d'`` for a 3D bar chart,
            anything else for a 2D bar chart.
        filter: Comma-separated primary filter column names.  Defaults to
            ``"name"`` when the data has such a column.  Columns with a
            single unique value are dropped from the grouping.
        secondary_filter: Comma-separated secondary filter column names.
        attempt_auto_mapping: Accepted but not used in this function.
        name_mappings: Optional mapping from column names to display names,
            forwarded to the name-formatting helpers.

    Returns:
        The ``(fig, ax, rects, means)`` tuple of the selected draw function,
        or None when *output* is not a column of the data.
    """
    # raw_input only exists on Python 2; input is its Python 3 equivalent.
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    # pd.concat is the public spelling; pd.tools.merge.concat no longer exists.
    data = pd.concat([pd.read_csv(path) for path in filename],
                     ignore_index=True)

    if output not in data:
        print("Output column name not in data!")
        return None

    # Fall back to grouping by the "name" column when no filter was given.
    if filter is None and "name" in data:
        filter = "name"

    primary_filters = filter.split(",") if filter is not None else None
    secondary_filters = None
    if secondary_filter is not None:
        secondary_filters = secondary_filter.split(",")

    title = None
    xlabel = None
    ylabel = None
    zlabel = get_formatted_name(output, name_mappings)
    samples = []
    top_level_names = None
    second_level_names = None
    is_first_primary_combination = True

    if primary_filters is None:
        samples.append(data[output].tolist())
    else:
        if len(primary_filters) == 1 and filter != "name":
            xlabel = get_formatted_name(filter, name_mappings)
        else:
            xlabel = 'Methods'

        # Keep only filters that actually vary; a constant column cannot
        # distinguish series.
        kept_filters = []
        primary_unique_values = []
        for pf in primary_filters:
            unique_values = data[pf].unique().tolist()
            if len(unique_values) != 1:
                kept_filters.append(pf)
                primary_unique_values.append(unique_values)
        primary_filters = kept_filters

        all_possible_primary_combinations = sorted(
            itertools.product(*primary_unique_values))

        top_level_names = []
        for primary_combination in all_possible_primary_combinations:
            combination_data = data
            combination_name_dict = {}
            for i, filter_value in enumerate(primary_combination):
                combination_data = combination_data[
                    combination_data[primary_filters[i]] == filter_value]
                if isinstance(filter_value, float):
                    filter_value = get_formatted_float(filter_value)
                combination_name_dict[primary_filters[i]] = filter_value
            # Not every combination of values occurs in the data.
            if len(combination_data.index) == 0:
                continue

            combination_name = get_formatted_combination_name(
                combination_name_dict, name_mappings)
            entered_name = read_line("Suggested combincation name (Hit Enter to use default, Enter 'skip' to skip this combination)[" + combination_name + "]: ")
            if entered_name:
                if entered_name == 'skip':
                    continue
                combination_name = entered_name

            if secondary_filters is None:
                combination_samples = combination_data[output].tolist()
            else:
                secondary_unique_values = [
                    combination_data[sf].unique().tolist()
                    for sf in secondary_filters
                ]
                all_possible_secondary_combinations = sorted(
                    itertools.product(*secondary_unique_values))

                combination_samples = []
                # Group labels are collected from the first plotted series only.
                if is_first_primary_combination:
                    second_level_names = []
                    if len(secondary_filters) == 1:
                        ylabel = get_formatted_name(secondary_filter, name_mappings)
                    else:
                        ylabel = 'Methods'

                for secondary_combination in all_possible_secondary_combinations:
                    secondary_combination_data = combination_data
                    secondary_combination_name_dict = {}
                    for i, filter_value in enumerate(secondary_combination):
                        secondary_combination_data = secondary_combination_data[
                            secondary_combination_data[secondary_filters[i]] == filter_value]
                        if isinstance(filter_value, float):
                            filter_value = get_formatted_float(filter_value)
                        secondary_combination_name_dict[secondary_filters[i]] = filter_value
                    secondary_combination_name = get_formatted_combination_name(
                        secondary_combination_name_dict, name_mappings)
                    secondary_combination_samples = \
                        secondary_combination_data[output].tolist()

                    if is_first_primary_combination:
                        if ylabel == 'Methods':
                            second_level_names.append(secondary_combination_name)
                        else:
                            second_level_names.append(
                                secondary_combination_name_dict[secondary_filters[0]])
                    combination_samples.append(secondary_combination_samples)

            # On a 3D plot with a single named axis the bare filter value is
            # the tick label; otherwise the full combination name is used.
            # The extra emptiness check avoids indexing primary_filters when
            # every primary filter was dropped above.
            use_combination_name = (xlabel == 'Methods'
                                    or plot_type != '3d'
                                    or combination_name == ' '
                                    or not primary_filters)
            if use_combination_name:
                top_level_names.append(combination_name)
            else:
                top_level_names.append(combination_name_dict[primary_filters[0]])
            samples.append(combination_samples)
            is_first_primary_combination = False

    if plot_type == '3d':
        return draw_3d_bar_chart(samples, top_level_names, second_level_names,
                                 title, xlabel, ylabel, zlabel)

    # 2D plots put the secondary grouping on x and the output on y.
    xlabel = ylabel
    ylabel = zlabel
    if plot_type == 'line':
        return draw_line_graph(samples, top_level_names, second_level_names,
                               title, xlabel, ylabel)
    return draw_bar_chart(samples, top_level_names, second_level_names,
                          title, xlabel, ylabel)