bwi_tools: graph.py Source File

Go to the documentation of this file.
00001 #!/usr/bin/env python
00002 
00003 import collections
00004 import copy
00005 import numpy as np
00006 import itertools
00007 import math
00008 import matplotlib.cm as cm
00009 import matplotlib.colors as colors
00010 import matplotlib.pyplot as plt; plt.rcdefaults()
00011 from mpl_toolkits.mplot3d import Axes3D
00012 import re
00013 import scipy.stats as stats
00014 import pandas as pd
00015 
# Style cycles for the charts below.  Entries are selected with
# ``index % len(cycle)``, so keep the paired lists at different lengths: a
# (colour, hatch) pair then only repeats after lcm(len(colours), len(hatches))
# entries, which produces more distinct combinations.
# Fill colours and hatch patterns for the bars drawn by draw_bar_chart.
METHOD_COLORS = ['yellow', 'red', 'aqua', 'green', 'lightgray', 'blue']
METHOD_HATCH = ['/', '\\', 'x', '*', 'o', 'O', '.']
# Line colours and dash sequences for draw_line_graph; each LINE_HATCH entry
# is passed unchanged as the ``dashes`` argument of ``ax.plot``.
LINE_COLORS = ['red', 'blue', 'green']
LINE_HATCH = [(20,0),(20,5),(5,5),(15,5,5,5),(15,5,2,5)]
00021 
def calculate_mean_and_standard_error(data):
    """Return ``(mean, standard error of the mean)`` for ``data``.

    ``data`` is converted to a NumPy array and multiplied by 1.0 so that
    integer input is promoted to floating point before the statistics are
    computed with ``np.mean`` and ``scipy.stats.sem``.
    """
    values = np.array(data) * 1.0
    mean = np.mean(values)
    standard_error = stats.sem(values)
    return mean, standard_error
00025 
def mean_confidence_interval(data, confidence=0.95):
    """Return the mean of ``data`` and the half-width of its confidence interval.

    The half-width is the standard error of the mean multiplied by the
    two-sided Student-t critical value for ``len(data) - 1`` degrees of
    freedom, so the interval is ``mean - h`` to ``mean + h``.

    Args:
        data: sequence of numeric samples.
        confidence: two-sided confidence level, 0.95 by default.

    Returns:
        Tuple ``(mean, half_width)``.
    """
    a = 1.0 * np.array(data)
    n = len(a)
    m, se = np.mean(a), stats.sem(a)
    # Use the public ``ppf``: the private ``_ppf`` skips argument validation
    # and is not part of SciPy's supported API.
    h = se * stats.t.ppf((1 + confidence) / 2., n - 1)
    return m, h
00032 
def is_significant(a, b, confidence=0.95):
    """Tell whether samples ``a`` and ``b`` differ significantly.

    Runs ``scipy.stats.ttest_ind`` on the two samples and compares the
    resulting p-value against ``1 - confidence``.
    """
    _, p_value = stats.ttest_ind(a, b)
    significance_level = 1.0 - confidence
    return p_value < significance_level
00036 
def mix_colors(c1, c2, amount):
    """Linearly blend the first three channels of ``c1`` and ``c2``.

    ``amount`` is the weight given to ``c2`` (``1 - amount`` goes to ``c1``).
    Each blended channel is truncated with ``int`` and the three results are
    returned as a list.
    """
    keep = 1 - amount
    blended = []
    for channel in (0, 1, 2):
        blended.append(int(keep * c1[channel] + amount * c2[channel]))
    return blended
00039 
def color_to_html_string(c):
    """Format the first three entries of ``c`` as a ``#rrggbb`` hex string."""
    # http://stackoverflow.com/questions/13998901/generating-a-random-hex-color-in-python
    red, green, blue = c[0], c[1], c[2]
    return '#%02x%02x%02x' % (red, green, blue)
00043 
def get_formatted_float(f):
    """Format ``f`` with two decimals, then drop trailing zeros and a bare dot.

    For example 1.50 becomes ``'1.5'`` and 2.0 becomes ``'2'``.
    """
    text = "%.2f" % f
    if '.' not in text:
        return text
    return text.rstrip('0').rstrip('.')
00047 
def draw_bar_chart(samples, top_level_names, second_level_names=None,
                   title=None, xlabel=None, ylabel=None, color=None,
                   bottom=None, yticklabels=None):
    """Draw a grouped bar chart of sample means with standard-error bars.

    ``samples`` has one of two shapes:

    * a list of lists: one list of raw samples per top-level group, or
    * a list of lists of lists: top-level groups, each holding second-level
      groups, each holding raw samples.

    The shape is detected by checking whether ``samples[0][0]`` is a
    sequence.  For every (top-level, second-level) cell the mean and standard
    error are printed, followed by a note when the cell differs significantly
    (see ``is_significant``) from every other second-level cell of the same
    top-level group.

    Args:
        samples: nested sample lists as described above.
        top_level_names: legend labels, one per top-level group; may be None,
            in which case group indices are printed and no legend is drawn.
        second_level_names: x tick labels, one per second-level group.
        title, xlabel, ylabel: optional axes texts.
        color: single colour for all bars; when None the METHOD_COLORS and
            METHOD_HATCH cycles are used.
        bottom: forwarded to ``ax.bar``.
        yticklabels: optional y tick labels.

    Returns:
        Tuple ``(fig, ax, rects, means)`` where ``rects`` holds one bar
        container per top-level group and ``means[i][j]`` is the mean of cell
        (i, j).
    """
    # collections.Sequence was removed in Python 3.10; fall back for Python 2.
    try:
        from collections.abc import Sequence
    except ImportError:
        from collections import Sequence

    second_level_grouping_available = isinstance(samples[0][0], Sequence)
    top_level_methods = len(samples)

    if second_level_grouping_available:
        second_level_methods = len(samples[0])
        samples2 = samples
    else:
        # Create artificial second level grouping
        second_level_methods = 1
        samples2 = [[group] for group in samples]

    means = [[] for _ in range(top_level_methods)]
    confs = [[] for _ in range(top_level_methods)]

    for i in range(top_level_methods):
        # Fall back to the index so a missing name list does not crash the report.
        top_label = str(top_level_names[i]) if top_level_names else str(i)
        for j in range(second_level_methods):
            m, h = calculate_mean_and_standard_error(samples2[i][j])
            if second_level_names is not None:
                second_label = str(second_level_names[j])
            else:
                second_label = str(j)
            # Single-argument print works on both Python 2 and Python 3.
            print("%s, %s : %.2f+-%.2f" % (top_label, second_label, m, h))

            # Only claim significance when there is something to compare with.
            others = [k for k in range(second_level_methods) if k != j]
            if others and all(is_significant(samples2[i][j], samples2[i][k])
                              for k in others):
                print("  is significantly different from all other methods in the group.")
            means[i].append(m)
            confs[i].append(h)

    ind = np.arange(second_level_methods)
    # Leave one bar width of empty space between neighbouring groups.
    width = 1.0 / (top_level_methods + 1)
    fig, ax = plt.subplots()
    rects = []
    for i in range(top_level_methods):
        if color is None:
            barcolor = METHOD_COLORS[i % len(METHOD_COLORS)]
            barhatch = METHOD_HATCH[i % len(METHOD_HATCH)]
        else:
            barcolor = color
            barhatch = None
        # The tick positions below assume bars start at their x coordinate,
        # so request edge alignment explicitly instead of relying on the
        # matplotlib default.
        rect = ax.bar(ind + i * width, means[i], width,
                      color=barcolor,
                      hatch=barhatch,
                      yerr=confs[i], ecolor='black', bottom=bottom,
                      align='edge')
        rects.append(rect)

    if xlabel:
        ax.set_xlabel(xlabel)
    if ylabel:
        ax.set_ylabel(ylabel)
    if title:
        ax.set_title(title)

    if second_level_grouping_available:
        # Centre each tick under its group of bars.
        ax.set_xticks(ind + 0.5 - width / 2)
        if second_level_names:
            ax.set_xticklabels(second_level_names)
    else:
        ax.set_xticklabels([])

    if yticklabels:
        ax.set_yticklabels(yticklabels)

    if top_level_names:
        ax.legend(rects, top_level_names)

    return fig, ax, rects, means
00136 
def draw_line_graph(samples, top_level_names, second_level_names=None,
                    title=None, xlabel=None, ylabel=None, yticklabels=None):
    """Draw one line per top-level group through its second-level means.

    ``samples`` has one of two shapes:

    * a list of lists: one list of raw samples per top-level group, or
    * a list of lists of lists: top-level groups, each holding second-level
      groups, each holding raw samples.

    The shape is detected by checking whether ``samples[0][0]`` is a
    sequence.  Lines are styled from the LINE_COLORS and LINE_HATCH cycles.

    Args:
        samples: nested sample lists as described above.
        top_level_names: legend labels, one per top-level group; no legend is
            drawn when this is None or empty.
        second_level_names: x tick labels.  When there are fewer labels than
            second-level groups the ticks are spread out by an integer step.
        title, xlabel, ylabel: optional axes texts.
        yticklabels: optional y tick labels.

    Returns:
        Tuple ``(fig, ax, rects, means)`` where ``rects`` holds one line
        artist per top-level group and ``means[i][j]`` is the mean of cell
        (i, j).
    """
    # collections.Sequence was removed in Python 3.10; fall back for Python 2.
    try:
        from collections.abc import Sequence
    except ImportError:
        from collections import Sequence

    second_level_grouping_available = isinstance(samples[0][0], Sequence)
    top_level_methods = len(samples)

    if second_level_grouping_available:
        second_level_methods = len(samples[0])
        samples2 = samples
    else:
        # Create artificial second level grouping
        second_level_methods = 1
        samples2 = [[group] for group in samples]

    # Read from samples2 (not samples) so that flat input is averaged per
    # group instead of taking the statistics of a single scalar sample.
    means = [
        [calculate_mean_and_standard_error(samples2[i][j])[0]
         for j in range(second_level_methods)]
        for i in range(top_level_methods)
    ]

    ind = np.arange(second_level_methods)
    fig, ax = plt.subplots()
    rects = []
    for i in range(top_level_methods):
        rect, = ax.plot(ind, means[i],
                        color=LINE_COLORS[i % len(LINE_COLORS)],
                        dashes=LINE_HATCH[i % len(LINE_HATCH)],
                        linewidth=2)
        rects.append(rect)

    if xlabel:
        ax.set_xlabel(xlabel)
    if ylabel:
        ax.set_ylabel(ylabel)
    if title:
        ax.set_title(title)

    if second_level_grouping_available:
        if second_level_names:
            label_count = len(second_level_names)
            tick_multiplier = int(math.ceil(float(second_level_methods) / float(label_count)))
            # One tick per label, matching how draw_3d_bar_chart places its
            # x ticks; identical to the data positions when every group has
            # a label.
            ax.set_xticks(tick_multiplier * np.arange(label_count))
            ax.set_xticklabels(second_level_names)
        else:
            ax.set_xticks(ind)

    if yticklabels:
        ax.set_yticklabels(yticklabels)
    if top_level_names:
        ax.legend(rects, top_level_names, handlelength=4)

    return fig, ax, rects, means
00199 
def draw_3d_bar_chart(samples, top_level_names=None, second_level_names=None,
                      title=None, xlabel=None, ylabel=None, zlabel=None,
                      flip_y=True, third_level_names=None):
    """Draw a 3D bar chart with one bar per (top-level, second-level) mean.

    ``samples`` has one of two shapes:

    * a list of lists: one list of raw samples per top-level group, or
    * a list of lists of lists: top-level groups, each holding second-level
      groups, each holding raw samples.

    The shape is detected by checking whether ``samples[0][0]`` is a
    sequence.  Top-level groups run along x, second-level groups along y and
    bar height is the cell mean; bars are coloured from the upper half of the
    ``jet`` colour map according to their height.

    When ``top_level_names`` is given the function prompts on standard input
    for the rotation of the x tick labels.

    Args:
        samples: nested sample lists as described above.
        top_level_names: x tick labels; x ticks are left at their defaults
            (and no prompt is shown) when None or empty.
        second_level_names: y tick labels; not modified by this function.
        title, xlabel, ylabel, zlabel: optional axes texts.
        flip_y: when true, second-level group j is drawn at
            y = ``n - j - 1`` and the y labels are reversed to match.
        third_level_names: optional z tick labels.

    Returns:
        Tuple ``(fig, ax, rects, means)`` where ``rects`` is the value
        returned by ``ax.bar3d`` and ``means[i][j]`` is the mean of cell
        (i, j).
    """
    # collections.Sequence was removed in Python 3.10; fall back for Python 2.
    try:
        from collections.abc import Sequence
    except ImportError:
        from collections import Sequence
    try:
        read_line = raw_input  # Python 2
    except NameError:
        read_line = input  # Python 3

    second_level_grouping_available = isinstance(samples[0][0], Sequence)
    top_level_methods = len(samples)

    if second_level_grouping_available:
        second_level_methods = len(samples[0])
        samples2 = samples
    else:
        # Create artificial second level grouping
        second_level_methods = 1
        samples2 = [[group] for group in samples]

    # Read from samples2 (not samples) so that flat input is averaged per
    # group instead of taking the statistics of a single scalar sample.
    means = [
        [calculate_mean_and_standard_error(samples2[i][j])[0]
         for j in range(second_level_methods)]
        for i in range(top_level_methods)
    ]

    ind = np.arange(second_level_methods)
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')

    # One bar per cell, second-level index varying slowest.
    cells = [(i, j)
             for j in range(second_level_methods)
             for i in range(top_level_methods)]
    xpos = np.array([i for i, _ in cells], dtype=float)
    ypos = np.array([(second_level_methods - j - 1) if flip_y else j
                     for _, j in cells], dtype=float)
    zpos = np.zeros(len(cells))
    dx = np.ones(len(cells))
    dy = np.full(len(cells), 0.5)
    dz = np.array([means[i][j] for i, j in cells], dtype=float)

    #http://stackoverflow.com/questions/11950375/apply-color-map-to-mpl-toolkits-mplot3d-axes3d-bar3d
    offset = dz + np.abs(dz.min())
    fracs = offset.astype(float) / offset.max()
    # ``colors.normalize`` was a legacy alias; the class is ``Normalize``.
    norm = colors.Normalize(fracs.min(), fracs.max())
    colors_t = cm.jet(norm(fracs) / 2 + 0.5)

    # 'average' is what the legacy ``zsort=True`` stood for.
    rects = ax.bar3d(xpos, ypos, zpos, dx, dy, dz, color=colors_t, zsort='average')

    if xlabel:
        ax.set_xlabel(xlabel)
    if ylabel:
        ax.set_ylabel(ylabel)
    if zlabel:
        ax.set_zlabel(zlabel)
    if title:
        ax.set_title(title)

    if second_level_grouping_available:
        ax.set_yticks(ind + 0.5)
        if second_level_names:
            # Work on a copy so the caller's list is not reversed in place.
            y_labels = list(second_level_names)
            if flip_y:
                y_labels.reverse()
            ax.set_yticklabels(y_labels)

    if third_level_names:
        ax.set_zticklabels(third_level_names)

    if top_level_names:
        label_count = len(top_level_names)
        tick_multiplier = int(math.ceil(float(top_level_methods) / float(label_count)))
        ax.set_xticks(tick_multiplier * np.arange(label_count) + 0.5)

        entered = read_line("Specify rotation for xticklabels [hit enter to use zero]: ")
        xtickrotation = float(entered) if entered else 0.0
        ax.set_xticklabels(top_level_names, rotation=xtickrotation)

    return fig, ax, rects, means
00299 
def is_greek_alphabet(str):
    """Return True when ``str``, compared case-insensitively, names a Greek letter."""
    greek_letter_names = (
        'alpha', 'beta', 'gamma', 'delta', 'epsilon', 'zeta', 'eta', 'theta',
        'iota', 'kappa', 'lambda', 'mu', 'nu', 'xi', 'omicron', 'pi', 'rho',
        'sigma', 'tau', 'upsilon', 'phi', 'chi', 'psi', 'omega',
    )
    lowered = str.lower()
    return lowered in greek_letter_names
00304 
def format_word(str):
    """Format a single word for use in a label.

    Words recognised by ``is_greek_alphabet`` are wrapped as ``$\\word$``
    (a backslash command between dollar signs, original case kept); every
    other word is title-cased.
    """
    if not is_greek_alphabet(str):
        return str.title()
    return ''.join(('$\\', str, '$'))
00310 
#TODO fix the name mapping file stuff
def get_formatted_name(name, name_mappings=None):
    """Turn a column or parameter name into display text.

    If ``name`` is a key of ``name_mappings`` the mapped value is returned
    unchanged.  Otherwise the name is split into words at camelCase
    boundaries and underscores, each word is passed through ``format_word``,
    and the results are concatenated without a separator.
    """
    if name_mappings is not None and name in name_mappings:
        return name_mappings[name]
    # Insert a space in front of every capitalised word (camelCase boundary).
    spaced = re.sub('(.)([A-Z][a-z]+)', r'\1 \2', name)
    # Underscores separate words as well (snake_case).
    spaced = spaced.replace('_', ' ')
    formatted_words = [format_word(word) for word in spaced.split(' ')]
    return ''.join(formatted_words)
00319 
def get_formatted_combination_name(name_dict, name_mappings=None):
    """Build a display name for one combination of filter values.

    Every key other than ``"name"`` contributes ``<formatted key>=<value>``,
    where the key is formatted with ``get_formatted_name``; the parts are
    joined with commas.  If ``name_dict`` has a ``"name"`` entry the result is
    ``name[parts]``, or just the name when there are no other keys.

    Args:
        name_dict: mapping from filter name to the selected value.
        name_mappings: optional mapping forwarded to ``get_formatted_name``.

    Returns:
        The formatted combination name (an empty string for an empty dict).
    """
    # ``items()`` exists on both Python 2 and Python 3 dicts, unlike
    # ``iteritems()``.
    parts = [
        get_formatted_name(key, name_mappings) + "=" + str(value)
        for key, value in name_dict.items()
        if key != "name"
    ]
    formatted_name = ",".join(parts)

    if "name" in name_dict:
        base_name = str(name_dict["name"])
        if formatted_name == "":
            formatted_name = base_name
        else:
            formatted_name = base_name + "[" + formatted_name + "]"
    return formatted_name
00334 
def draw_from_data_frame(filename, output, plot_type, filter=None, secondary_filter=None,
                         attempt_auto_mapping=True, name_mappings=None):
    """Load CSV results, group them by filter columns and plot one output column.

    All files are read with ``pandas.read_csv`` and concatenated.  Rows are
    grouped by every combination of values of the primary filter columns
    (top-level groups) and, optionally, of the secondary filter columns
    (second-level groups).  For each primary combination the user is prompted
    on standard input to accept, replace or skip the suggested name.

    Args:
        filename: iterable of CSV paths.
        output: name of the column whose values are plotted.
        plot_type: ``'line'`` for draw_line_graph, ``'3d'`` for
            draw_3d_bar_chart, anything else for draw_bar_chart.
        filter: comma-separated primary filter column names.  Defaults to
            ``"name"`` when the data has such a column.  Columns holding a
            single unique value are ignored.
        secondary_filter: comma-separated secondary filter column names.
        attempt_auto_mapping: currently unused.
        name_mappings: optional mapping forwarded to ``get_formatted_name``.

    Returns:
        Whatever the selected draw function returns, or None when ``output``
        is not a column of the data.
    """
    try:
        read_line = raw_input  # Python 2
    except NameError:
        read_line = input  # Python 3

    # ``pd.concat`` is the public entry point; ``pd.tools.merge.concat`` was
    # removed from pandas.
    data = pd.concat([pd.read_csv(path) for path in filename], ignore_index=True)

    # Check if the output column exists in the data frame.
    if output not in data:
        # Single-argument print works on both Python 2 and Python 3.
        print("Output column name not in data!")
        return None

    # Get primary and secondary filters.
    if (filter is None) and "name" in data:
        filter = "name"

    primary_filters = filter.split(",") if filter is not None else None
    secondary_filters = secondary_filter.split(",") if secondary_filter is not None else None

    title = None
    xlabel = None
    ylabel = None
    zlabel = get_formatted_name(output, name_mappings)
    samples = []
    top_level_names = None
    second_level_names = None
    is_first_primary_combination = True

    if primary_filters is None:
        samples.append(data[output].tolist())
    else:
        if len(primary_filters) == 1 and filter != "name":
            xlabel = get_formatted_name(filter, name_mappings)
        else:
            xlabel = 'Methods'

        # Iterate over a copy because constant columns are removed from
        # primary_filters along the way.
        primary_unique_values = []
        for pf in list(primary_filters):
            unique_values = data[pf].unique().tolist()
            if len(unique_values) == 1:  # This column is not really being used.
                primary_filters.remove(pf)
            else:
                primary_unique_values.append(unique_values)

        all_possible_primary_combinations = sorted(itertools.product(*primary_unique_values))

        top_level_names = []
        for primary_combination in all_possible_primary_combinations:
            combination_data = data
            combination_name_dict = {}
            for i, filter_value in enumerate(primary_combination):
                combination_data = combination_data[combination_data[primary_filters[i]] == filter_value]
                if isinstance(filter_value, float):
                    filter_value = get_formatted_float(filter_value)
                combination_name_dict[primary_filters[i]] = filter_value
            if len(combination_data.index) == 0:
                continue

            combination_name = get_formatted_combination_name(combination_name_dict, name_mappings)
            entered_name = read_line("Suggested combincation name (Hit Enter to use default, Enter 'skip' to skip this combination)[" + combination_name + "]: ")
            if entered_name == 'skip':
                continue
            if entered_name:
                combination_name = entered_name

            # Now that we have combination data, apply secondary filtering if necessary
            if secondary_filters is None:
                combination_samples = combination_data[output].tolist()
            else:
                secondary_unique_values = [
                    combination_data[sf].unique().tolist() for sf in secondary_filters
                ]
                all_possible_secondary_combinations = sorted(itertools.product(*secondary_unique_values))

                combination_samples = []
                # Second-level names and the y label are taken from the first
                # primary combination that is actually plotted.
                if is_first_primary_combination:
                    second_level_names = []
                    if len(secondary_filters) == 1:
                        ylabel = get_formatted_name(secondary_filter, name_mappings)
                    else:
                        ylabel = 'Methods'
                for secondary_combination in all_possible_secondary_combinations:
                    secondary_combination_data = combination_data
                    secondary_combination_name_dict = {}
                    for i, filter_value in enumerate(secondary_combination):
                        secondary_combination_data = secondary_combination_data[secondary_combination_data[secondary_filters[i]] == filter_value]
                        if isinstance(filter_value, float):
                            filter_value = get_formatted_float(filter_value)
                        secondary_combination_name_dict[secondary_filters[i]] = filter_value
                    secondary_combination_name = get_formatted_combination_name(secondary_combination_name_dict, name_mappings)
                    secondary_combination_samples = secondary_combination_data[output].tolist()

                    if is_first_primary_combination:
                        if ylabel == 'Methods':
                            second_level_names.append(secondary_combination_name)
                        else:
                            second_level_names.append(secondary_combination_name_dict[secondary_filters[0]])
                    combination_samples.append(secondary_combination_samples)

            # An empty combination_name_dict means every primary filter column
            # was constant; use the combination name rather than indexing into
            # the now-empty primary_filters list.
            if (xlabel == 'Methods' or plot_type != '3d' or combination_name == ' '
                    or not combination_name_dict):
                top_level_names.append(combination_name)
            else:
                top_level_names.append(combination_name_dict[primary_filters[0]])
            samples.append(combination_samples)
            is_first_primary_combination = False

    if plot_type == '3d':
        return draw_3d_bar_chart(samples, top_level_names, second_level_names, title, xlabel, ylabel, zlabel)

    # 2D plots put the second-level grouping on x and the output on y.
    xlabel = ylabel
    ylabel = zlabel
    if plot_type == 'line':
        return draw_line_graph(samples, top_level_names, second_level_names, title, xlabel, ylabel)
    return draw_bar_chart(samples, top_level_names, second_level_names, title, xlabel, ylabel)