stats.py
00001 # Copyright (c) 1999-2007 Gary Strangman; All Rights Reserved.
00002 #
00003 # Permission is hereby granted, free of charge, to any person obtaining a copy
00004 # of this software and associated documentation files (the "Software"), to deal
00005 # in the Software without restriction, including without limitation the rights
00006 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
00007 # copies of the Software, and to permit persons to whom the Software is
00008 # furnished to do so, subject to the following conditions:
00009 # 
00010 # The above copyright notice and this permission notice shall be included in
00011 # all copies or substantial portions of the Software.
00012 # 
00013 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
00014 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
00015 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
00016 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
00017 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
00018 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
00019 # THE SOFTWARE.
00020 #
00021 # Comments and/or additions are welcome (send e-mail to:
00022 # strang@nmr.mgh.harvard.edu).
00023 # 
00024 """
00025 stats.py module
00026 
00027 (Requires pstat.py module.)
00028 
00029 #################################################
00030 #######  Written by:  Gary Strangman  ###########
00031 #######  Last modified:  Dec 18, 2007 ###########
00032 #################################################
00033 
00034 A collection of basic statistical functions for python.  The function
00035 names appear below.
00036 
00037 IMPORTANT:  There are really *3* sets of functions.  The first set has an 'l'
00038 prefix, which can be used with list or tuple arguments.  The second set has
00039 an 'a' prefix, which can accept NumPy array arguments.  These latter
00040 functions are defined only when NumPy is available on the system.  The third
00041 type has NO prefix (i.e., has the name that appears below).  Functions of
00042 this set are members of a "Dispatch" class, c/o David Ascher.  This class
00043 allows different functions to be called depending on the type of the passed
00044 arguments.  Thus, stats.mean is a member of the Dispatch class and
00045 stats.mean(range(20)) will call stats.lmean(range(20)) while
00046 stats.mean(Numeric.arange(20)) will call stats.amean(Numeric.arange(20)).
00047 This is a handy way to keep consistent function names when different
00048 argument types require different functions to be called.  Having
00049 implemented the Dispatch class, however, means that to get info on
00050 a given function, you must use the REAL function name ... that is,
00051 "print stats.lmean.__doc__" or "print stats.amean.__doc__" works fine,
00052 while "print stats.mean.__doc__" will print the doc for the Dispatch
00053 class.  NUMPY FUNCTIONS ('a' prefix) generally have more argument options
00054 but should otherwise be consistent with the corresponding list functions.
00055 
00056 Disclaimers:  The function list is obviously incomplete and, worse, the
00057 functions are not optimized.  All functions have been tested (some more
00058 so than others), but they are far from bulletproof.  Thus, as with any
00059 free software, no warranty or guarantee is expressed or implied. :-)  A
00060 few extra functions that don't appear in the list below can be found by
00061 interested treasure-hunters.  These functions don't necessarily have
00062 both list and array versions but were deemed useful.
00063 
00064 CENTRAL TENDENCY:  geometricmean
00065                    harmonicmean
00066                    mean
00067                    median
00068                    medianscore
00069                    mode
00070 
00071 MOMENTS:  moment
00072           variation
00073           skew
00074           kurtosis
00075           skewtest   (for Numpy arrays only)
00076           kurtosistest (for Numpy arrays only)
00077           normaltest (for Numpy arrays only)
00078 
00079 ALTERED VERSIONS:  tmean  (for Numpy arrays only)
00080                    tvar   (for Numpy arrays only)
00081                    tmin   (for Numpy arrays only)
00082                    tmax   (for Numpy arrays only)
00083                    tstdev (for Numpy arrays only)
00084                    tsem   (for Numpy arrays only)
00085                    describe
00086 
00087 FREQUENCY STATS:  itemfreq
00088                   scoreatpercentile
00089                   percentileofscore
00090                   histogram
00091                   cumfreq
00092                   relfreq
00093 
00094 VARIABILITY:  obrientransform
00095               samplevar
00096               samplestdev
00097               signaltonoise (for Numpy arrays only)
00098               var
00099               stdev
00100               sterr
00101               sem
00102               z
00103               zs
00104               zmap (for Numpy arrays only)
00105 
00106 TRIMMING FCNS:  threshold (for Numpy arrays only)
00107                 trimboth
00108                 trim1
00109                 round (round all vals to 'n' decimals; Numpy only)
00110 
00111 CORRELATION FCNS:  covariance  (for Numpy arrays only)
00112                    correlation (for Numpy arrays only)
00113                    paired
00114                    pearsonr
00115                    spearmanr
00116                    pointbiserialr
00117                    kendalltau
00118                    linregress
00119 
00120 INFERENTIAL STATS:  ttest_1samp
00121                     ttest_ind
00122                     ttest_rel
00123                     chisquare
00124                     ks_2samp
00125                     mannwhitneyu
00126                     ranksums
00127                     wilcoxont
00128                     kruskalwallish
00129                     friedmanchisquare
00130 
00131 PROBABILITY CALCS:  chisqprob
00132                     erfcc
00133                     zprob
00134                     ksprob
00135                     fprob
00136                     betacf
00137                     gammln 
00138                     betai
00139 
00140 ANOVA FUNCTIONS:  F_oneway
00141                   F_value
00142 
00143 SUPPORT FUNCTIONS:  writecc
00144                     incr
00145                     sign  (for Numpy arrays only)
00146                     sum
00147                     cumsum
00148                     ss
00149                     summult
00150                     sumdiffsquared
00151                     square_of_sums
00152                     shellsort
00153                     rankdata
00154                     outputpairedstats
00155                     findwithin
00156 """
00157 ## CHANGE LOG:
00158 ## ===========
00159 ## 07-11-26 ... conversion for numpy started
00160 ## 07-05-16 ... added Lin's Concordance Correlation Coefficient (alincc) and acov
00161 ## 05-08-21 ... added "Dice's coefficient"
00162 ## 04-10-26 ... added ap2t(), an ugly fcn for converting p-vals to T-vals
00163 ## 04-04-03 ... added amasslinregress() function to do regression on N-D arrays
00164 ## 03-01-03 ... CHANGED VERSION TO 0.6
00165 ##              fixed atsem() to properly handle limits=None case
00166 ##              improved histogram and median functions (estbinwidth) and
00167 ##                   fixed atvar() function (wrong answers for neg numbers?!?)
00168 ## 02-11-19 ... fixed attest_ind and attest_rel for div-by-zero Overflows
00169 ## 02-05-10 ... fixed lchisqprob indentation (failed when df=even)
00170 ## 00-12-28 ... removed aanova() to separate module, fixed licensing to
00171 ##                   match Python License, fixed doc string & imports
00172 ## 00-04-13 ... pulled all "global" statements, except from aanova()
00173 ##              added/fixed lots of documentation, removed io.py dependency
00174 ##              changed to version 0.5
00175 ## 99-11-13 ... added asign() function
00176 ## 99-11-01 ... changed version to 0.4 ... enough incremental changes now
00177 ## 99-10-25 ... added acovariance and acorrelation functions
00178 ## 99-10-10 ... fixed askew/akurtosis to avoid divide-by-zero errors
00179 ##              added aglm function (crude, but will be improved)
00180 ## 99-10-04 ... upgraded acumsum, ass, asummult, asamplevar, avar, etc. to
00181 ##                   all handle lists of 'dimension's and keepdims
00182 ##              REMOVED ar0, ar2, ar3, ar4 and replaced them with around
00183 ##              reinserted fixes for abetai to avoid math overflows
00184 ## 99-09-05 ... rewrote achisqprob/aerfcc/aksprob/afprob/abetacf/abetai to
00185 ##                   handle multi-dimensional arrays (whew!)
00186 ## 99-08-30 ... fixed l/amoment, l/askew, l/akurtosis per D'Agostino (1990)
00187 ##              added anormaltest per same reference
00188 ##              re-wrote azprob to calc arrays of probs all at once
00189 ## 99-08-22 ... edited attest_ind printing section so arrays could be rounded
00190 ## 99-08-19 ... fixed amean and aharmonicmean for non-error(!) overflow on
00191 ##                   short/byte arrays (mean of #s btw 100-300 = -150??)
00192 ## 99-08-09 ... fixed asum so that the None case works for Byte arrays
00193 ## 99-08-08 ... fixed 7/3 'improvement' to handle t-calcs on N-D arrays
00194 ## 99-07-03 ... improved attest_ind, attest_rel (zero-division errortrap)
00195 ## 99-06-24 ... fixed bug(?) in attest_ind (n1=a.shape[0])
00196 ## 04/11/99 ... added asignaltonoise, athreshold functions, changed all
00197 ##                   max/min in array section to N.maximum/N.minimum,
00198 ##                   fixed square_of_sums to prevent integer overflow
00199 ## 04/10/99 ... !!! Changed function name ... sumsquared ==> square_of_sums
00200 ## 03/18/99 ... Added ar0, ar2, ar3 and ar4 rounding functions
00201 ## 02/28/99 ... Fixed aobrientransform to return an array rather than a list
00202 ## 01/15/99 ... Essentially ceased updating list-versions of functions (!!!)
00203 ## 01/13/99 ... CHANGED TO VERSION 0.3
00204 ##              fixed bug in a/lmannwhitneyu p-value calculation
00205 ## 12/31/98 ... fixed variable-name bug in ldescribe
00206 ## 12/19/98 ... fixed bug in findwithin (fcns needed pstat. prefix)
00207 ## 12/16/98 ... changed amedianscore to return float (not array) for 1 score
00208 ## 12/14/98 ... added atmin and atmax functions
00209 ##              removed umath from import line (not needed)
00210 ##              l/ageometricmean modified to reduce chance of overflows (take
00211 ##                   nth root first, then multiply)
00212 ## 12/07/98 ... added __version__ variable (now 0.2)
00213 ##              removed all 'stats.' from anova() fcn
00214 ## 12/06/98 ... changed those functions (except shellsort) that altered
00215 ##                   arguments in-place ... cumsum, ranksort, ...
00216 ##              updated (and fixed some) doc-strings
00217 ## 12/01/98 ... added anova() function (requires NumPy)
00218 ##              incorporated Dispatch class
00219 ## 11/12/98 ... added functionality to amean, aharmonicmean, ageometricmean
00220 ##              added 'asum' function (added functionality to N.add.reduce)
00221 ##              fixed both moment and amoment (two errors)
00222 ##              changed name of skewness and askewness to skew and askew
00223 ##              fixed (a)histogram (which sometimes counted points <lowerlimit)
00224 
00225 import pstat               # required 3rd party module
00226 import math, string, copy  # required python modules
00227 from types import *
00228 
00229 __version__ = 0.6
00230 
00231 ############# DISPATCH CODE ##############
00232 
00233 
00234 class Dispatch:
00235     """
00236 The Dispatch class, care of David Ascher, allows different functions to
00237 be called depending on the argument types.  This way, there can be one
00238 function name regardless of the argument type.  To access function doc
00239 in stats.py module, prefix the function with an 'l' or 'a' for list or
00240 array arguments, respectively.  That is, print stats.lmean.__doc__ or
00241 print stats.amean.__doc__ or whatever.
00242 """
00243 
00244     def __init__(self, *tuples):
00245         self._dispatch = {}
00246         for func, types in tuples:
00247             for t in types:
00248                 if t in self._dispatch.keys():
00249                     raise ValueError, "can't have two dispatches on "+str(t)
00250                 self._dispatch[t] = func
00251         self._types = self._dispatch.keys()
00252 
00253     def __call__(self, arg1, *args, **kw):
00254         if type(arg1) not in self._types:
00255             raise TypeError, "don't know how to dispatch %s arguments" %  type(arg1)
00256         return apply(self._dispatch[type(arg1)], (arg1,) + args, kw)
00257 
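# Editorial note (not part of the original file): a minimal sketch of how this
# dispatcher is typically wired up further down in the module, assuming the
# usual pattern of (function, type-tuple) pairs:
#
#     mean = Dispatch( (lmean, (ListType, TupleType)), )
#
# With that binding, stats.mean([1, 2, 3]) routes to lmean(); when NumPy is
# available, an additional (amean, (array-type,)) pair routes arrays to amean().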
00258 
00259 ##########################################################################
00260 ########################   LIST-BASED FUNCTIONS   ########################
00261 ##########################################################################
00262 
00263 ### Define these regardless
00264 
00265 ####################################
00266 #######  CENTRAL TENDENCY  #########
00267 ####################################
00268 
00269 def lgeometricmean (inlist):
00270     """
00271 Calculates the geometric mean of the values in the passed list.
00272 That is:  n-th root of (x1 * x2 * ... * xn).  Assumes a '1D' list.
00273 
00274 Usage:   lgeometricmean(inlist)
00275 """
00276     mult = 1.0
00277     one_over_n = 1.0/len(inlist)
00278     for item in inlist:
00279         mult = mult * pow(item,one_over_n)
00280     return mult
00281 
00282 
00283 def lharmonicmean (inlist):
00284     """
00285 Calculates the harmonic mean of the values in the passed list.
00286 That is:  n / (1/x1 + 1/x2 + ... + 1/xn).  Assumes a '1D' list.
00287 
00288 Usage:   lharmonicmean(inlist)
00289 """
00290     sum = 0
00291     for item in inlist:
00292         sum = sum + 1.0/item
00293     return len(inlist) / sum
00294 
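# Example (illustrative, not in the original source): lgeometricmean([2, 8])
# is the square root of 2*8, i.e. 4.0, and lharmonicmean([1, 2, 4]) is
# 3/(1 + 1/2. + 1/4.) = 1.714...; both assume a flat list of positive numbers.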
00295 
00296 def lmean (inlist):
00297     """
00298 Returns the arithmetic mean of the values in the passed list.
00299 Assumes a '1D' list, but will function on the 1st dim of an array(!).
00300 
00301 Usage:   lmean(inlist)
00302 """
00303     sum = 0
00304     for item in inlist:
00305         sum = sum + item
00306     return sum/float(len(inlist))
00307 
00308 
00309 def lmedian (inlist,numbins=1000):
00310     """
00311 Returns the computed median value of a list of numbers, given the
00312 number of bins to use for the histogram (more bins bring the computed value
00313 closer to the median score, default number of bins = 1000).  See G.W.
00314 Heiman's Basic Stats (1st Edition), or CRC Probability & Statistics.
00315 
00316 Usage:   lmedian (inlist, numbins=1000)
00317 """
00318     (hist, smallest, binsize, extras) = histogram(inlist,numbins,[min(inlist),max(inlist)]) # make histog
00319     cumhist = cumsum(hist)              # make cumulative histogram
00320     for i in range(len(cumhist)):        # get 1st(!) index holding 50%ile score
00321         if cumhist[i]>=len(inlist)/2.0:
00322             cfbin = i
00323             break
00324     LRL = smallest + binsize*cfbin        # get lower real limit of that bin
00325     cfbelow = cumhist[cfbin-1]
00326     freq = float(hist[cfbin])                # frequency IN the 50%ile bin
00327     median = LRL + ((len(inlist)/2.0 - cfbelow)/float(freq))*binsize  # median formula
00328     return median
00329 
00330 
00331 def lmedianscore (inlist):
00332     """
00333 Returns the 'middle' score of the passed list.  If there is an even
00334 number of scores, the mean of the 2 middle scores is returned.
00335 
00336 Usage:   lmedianscore(inlist)
00337 """
00338 
00339     newlist = copy.deepcopy(inlist)
00340     newlist.sort()
00341     if len(newlist) % 2 == 0:   # if even number of scores, average middle 2
00342         index = len(newlist)/2  # integer division correct
00343         median = float(newlist[index] + newlist[index-1]) /2
00344     else:
00345         index = len(newlist)/2  # int division gives mid value when counting from 0
00346         median = newlist[index]
00347     return median
00348 
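# Example (illustrative, not in the original source): lmedianscore([7, 1, 3, 5])
# sorts to [1, 3, 5, 7] and, with an even count, averages the two middle scores:
# (3 + 5)/2 = 4.0.  lmedian() on the same data instead interpolates within a
# histogram bin, so it returns an estimate near 4 rather than exactly 4.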
00349 
00350 def lmode(inlist):
00351     """
00352 Returns a list of the modal (most common) score(s) in the passed
00353 list.  If there is more than one such score, all are returned.  The
00354 bin-count for the mode(s) is also returned.
00355 
00356 Usage:   lmode(inlist)
00357 Returns: bin-count for mode(s), a list of modal value(s)
00358 """
00359 
00360     scores = pstat.unique(inlist)
00361     scores.sort()
00362     freq = []
00363     for item in scores:
00364         freq.append(inlist.count(item))
00365     maxfreq = max(freq)
00366     mode = []
00367     stillmore = 1
00368     while stillmore:
00369         try:
00370             indx = freq.index(maxfreq)
00371             mode.append(scores[indx])
00372             del freq[indx]
00373             del scores[indx]
00374         except ValueError:
00375             stillmore=0
00376     return maxfreq, mode
00377 
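# Example (illustrative, not in the original source): lmode([1, 2, 2, 3, 3])
# returns (2, [2, 3]) -- the highest bin-count is 2, and both 2 and 3 occur
# that many times.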
00378 
00379 ####################################
00380 ############  MOMENTS  #############
00381 ####################################
00382 
00383 def lmoment(inlist,moment=1):
00384     """
00385 Calculates the nth moment about the mean for a sample (defaults to
00386 the 1st moment).  Used to calculate coefficients of skewness and kurtosis.
00387 
00388 Usage:   lmoment(inlist,moment=1)
00389 Returns: appropriate moment (r) from ... 1/n * SUM((inlist(i)-mean)**r)
00390 """
00391     if moment == 1:
00392         return 0.0
00393     else:
00394         mn = mean(inlist)
00395         n = len(inlist)
00396         s = 0
00397         for x in inlist:
00398             s = s + (x-mn)**moment
00399         return s/float(n)
00400 
00401 
00402 def lvariation(inlist):
00403     """
00404 Returns the coefficient of variation, as defined in CRC Standard
00405 Probability and Statistics, p.6.
00406 
00407 Usage:   lvariation(inlist)
00408 """
00409     return 100.0*samplestdev(inlist)/float(mean(inlist))
00410 
00411 
00412 def lskew(inlist):
00413     """
00414 Returns the skewness of a distribution, as defined in Numerical
00415 Recipes (alternate defn in CRC Standard Probability and Statistics, p.6.)
00416 
00417 Usage:   lskew(inlist)
00418 """
00419     return moment(inlist,3)/pow(moment(inlist,2),1.5)
00420 
00421 
00422 def lkurtosis(inlist):
00423     """
00424 Returns the kurtosis of a distribution, as defined in Numerical
00425 Recipes (alternate defn in CRC Standard Probability and Statistics, p.6.)
00426 
00427 Usage:   lkurtosis(inlist)
00428 """
00429     return moment(inlist,4)/pow(moment(inlist,2),2.0)
00430 
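# Example (illustrative, not in the original source): for the symmetric list
# x = [1, 2, 3, 4, 5], lmoment(x, 2) is 2.0 and lmoment(x, 3) is 0.0, so
# lskew(x) is 0.0, and lkurtosis(x) is moment(x,4)/moment(x,2)**2 = 6.8/4.0 = 1.7
# (note that this list version applies no "-3" normal-kurtosis correction).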
00431 
00432 def ldescribe(inlist):
00433     """
00434 Returns some descriptive statistics of the passed list (assumed to be 1D).
00435 
00436 Usage:   ldescribe(inlist)
00437 Returns: n, mean, standard deviation, skew, kurtosis
00438 """
00439     n = len(inlist)
00440     mm = (min(inlist),max(inlist))
00441     m = mean(inlist)
00442     sd = stdev(inlist)
00443     sk = skew(inlist)
00444     kurt = kurtosis(inlist)
00445     return n, mm, m, sd, sk, kurt
00446 
00447 
00448 ####################################
00449 #######  FREQUENCY STATS  ##########
00450 ####################################
00451 
00452 def litemfreq(inlist):
00453     """
00454 Returns a list of pairs.  Each pair consists of one of the scores in inlist
00455 and its frequency count.  Assumes a 1D list is passed.
00456 
00457 Usage:   litemfreq(inlist)
00458 Returns: a 2D frequency table (col [0:n-1]=scores, col n=frequencies)
00459 """
00460     scores = pstat.unique(inlist)
00461     scores.sort()
00462     freq = []
00463     for item in scores:
00464         freq.append(inlist.count(item))
00465     return pstat.abut(scores, freq)
00466 
00467 
00468 def lscoreatpercentile (inlist, percent):
00469     """
00470 Returns the score at a given percentile relative to the distribution
00471 given by inlist.
00472 
00473 Usage:   lscoreatpercentile(inlist,percent)
00474 """
00475     if percent > 1:
00476         print "\nDividing percent>1 by 100 in lscoreatpercentile().\n"
00477         percent = percent / 100.0
00478     targetcf = percent*len(inlist)
00479     h, lrl, binsize, extras = histogram(inlist)
00480     cumhist = cumsum(copy.deepcopy(h))
00481     for i in range(len(cumhist)):
00482         if cumhist[i] >= targetcf:
00483             break
00484     score = binsize * ((targetcf - cumhist[i-1]) / float(h[i])) + (lrl+binsize*i)
00485     return score
00486 
00487 
00488 def lpercentileofscore (inlist, score,histbins=10,defaultlimits=None):
00489     """
00490 Returns the percentile value of a score relative to the distribution
00491 given by inlist.  Formula depends on the values used to histogram the data(!).
00492 
00493 Usage:   lpercentileofscore(inlist,score,histbins=10,defaultlimits=None)
00494 """
00495 
00496     h, lrl, binsize, extras = histogram(inlist,histbins,defaultlimits)
00497     cumhist = cumsum(copy.deepcopy(h))
00498     i = int((score - lrl)/float(binsize))
00499     pct = (cumhist[i-1]+((score-(lrl+binsize*i))/float(binsize))*h[i])/float(len(inlist)) * 100
00500     return pct
00501 
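# Example (illustrative, not in the original source): lscoreatpercentile(inlist, 50)
# reads the 50th-percentile score off the cumulative histogram, and
# lpercentileofscore() inverts that mapping, so
# lpercentileofscore(inlist, lscoreatpercentile(inlist, 50)) comes back close
# to 50, up to histogram-binning error.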
00502 
00503 def lhistogram (inlist,numbins=10,defaultreallimits=None,printextras=0):
00504     """
00505 Returns (i) a list of histogram bin counts, (ii) the smallest value
00506 of the histogram binning, and (iii) the bin width (the last 2 are not
00507 necessarily integers).  Default number of bins is 10.  If no sequence object
00508 is given for defaultreallimits, the routine picks (usually non-pretty) bins
00509 spanning all the numbers in the inlist.
00510 
00511 Usage:   lhistogram (inlist, numbins=10, defaultreallimits=None, printextras=0)
00512 Returns: list of bin values, lowerreallimit, binsize, extrapoints
00513 """
00514     if (defaultreallimits <> None):
00515         if type(defaultreallimits) not in [ListType,TupleType] or len(defaultreallimits)==1: # only one limit given, assumed to be lower one & upper is calc'd
00516             lowerreallimit = defaultreallimits
00517             upperreallimit = 1.000001 * max(inlist)
00518         else: # assume both limits given
00519             lowerreallimit = defaultreallimits[0]
00520             upperreallimit = defaultreallimits[1]
00521         binsize = (upperreallimit-lowerreallimit)/float(numbins)
00522     else:     # no limits given for histogram, both must be calc'd
00523         estbinwidth=(max(inlist)-min(inlist))/float(numbins) +1e-6 # +1e-6 => cover all values
00524         binsize = ((max(inlist)-min(inlist)+estbinwidth))/float(numbins)
00525         lowerreallimit = min(inlist) - binsize/2 #lower real limit,1st bin
00526     bins = [0]*(numbins)
00527     extrapoints = 0
00528     for num in inlist:
00529         try:
00530             if (num-lowerreallimit) < 0:
00531                 extrapoints = extrapoints + 1
00532             else:
00533                 bintoincrement = int((num-lowerreallimit)/float(binsize))
00534                 bins[bintoincrement] = bins[bintoincrement] + 1
00535         except:
00536             extrapoints = extrapoints + 1
00537     if (extrapoints > 0 and printextras == 1):
00538         print '\nPoints outside given histogram range =',extrapoints
00539     return (bins, lowerreallimit, binsize, extrapoints)
00540 
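# Example (illustrative, not in the original source): lhistogram(range(10), 5)
# returns approximately ([2, 2, 2, 2, 2], -1.08, 2.16, 0) -- five bins of two
# counts each, a lower real limit just below the minimum, the bin width, and
# zero out-of-range points.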
00541 
00542 def lcumfreq(inlist,numbins=10,defaultreallimits=None):
00543     """
00544 Returns a cumulative frequency histogram, using the histogram function.
00545 
00546 Usage:   lcumfreq(inlist,numbins=10,defaultreallimits=None)
00547 Returns: list of cumfreq bin values, lowerreallimit, binsize, extrapoints
00548 """
00549     h,l,b,e = histogram(inlist,numbins,defaultreallimits)
00550     cumhist = cumsum(copy.deepcopy(h))
00551     return cumhist,l,b,e
00552 
00553 
00554 def lrelfreq(inlist,numbins=10,defaultreallimits=None):
00555     """
00556 Returns a relative frequency histogram, using the histogram function.
00557 
00558 Usage:   lrelfreq(inlist,numbins=10,defaultreallimits=None)
00559 Returns: list of relative freq bin values, lowerreallimit, binsize, extrapoints
00560 """
00561     h,l,b,e = histogram(inlist,numbins,defaultreallimits)
00562     for i in range(len(h)):
00563         h[i] = h[i]/float(len(inlist))
00564     return h,l,b,e
00565 
00566 
00567 ####################################
00568 #####  VARIABILITY FUNCTIONS  ######
00569 ####################################
00570 
00571 def lobrientransform(*args):
00572     """
00573 Computes a transform on input data (any number of columns).  Used to
00574 test for homogeneity of variance prior to running one-way stats.  From
00575 Maxwell and Delaney, p.112.
00576 
00577 Usage:   lobrientransform(*args)
00578 Returns: transformed data for use in an ANOVA
00579 """
00580     TINY = 1e-10
00581     k = len(args)
00582     n = [0.0]*k
00583     v = [0.0]*k
00584     m = [0.0]*k
00585     nargs = []
00586     for i in range(k):
00587         nargs.append(copy.deepcopy(args[i]))
00588         n[i] = float(len(nargs[i]))
00589         v[i] = var(nargs[i])
00590         m[i] = mean(nargs[i])
00591     for j in range(k):
00592         for i in range(int(n[j])):   # n[j] was stored as a float above
00593             t1 = (n[j]-1.5)*n[j]*(nargs[j][i]-m[j])**2
00594             t2 = 0.5*v[j]*(n[j]-1.0)
00595             t3 = (n[j]-1.0)*(n[j]-2.0)
00596             nargs[j][i] = (t1-t2) / float(t3)
00597     check = 1
00598     for j in range(k):
00599         if v[j] - mean(nargs[j]) > TINY:
00600             check = 0
00601     if check <> 1:
00602         raise ValueError, 'Problem in obrientransform.'
00603     else:
00604         return nargs
00605 
00606 
00607 def lsamplevar (inlist):
00608     """
00609 Returns the variance of the values in the passed list using
00610 N for the denominator (i.e., DESCRIBES the sample variance only).
00611 
00612 Usage:   lsamplevar(inlist)
00613 """
00614     n = len(inlist)
00615     mn = mean(inlist)
00616     deviations = []
00617     for item in inlist:
00618         deviations.append(item-mn)
00619     return ss(deviations)/float(n)
00620 
00621 
00622 def lsamplestdev (inlist):
00623     """
00624 Returns the standard deviation of the values in the passed list using
00625 N for the denominator (i.e., DESCRIBES the sample stdev only).
00626 
00627 Usage:   lsamplestdev(inlist)
00628 """
00629     return math.sqrt(samplevar(inlist))
00630 
00631 
00632 def lcov (x,y, keepdims=0):
00633     """
00634 Returns the estimated covariance of the values in the two passed lists,
00635 using N-1 in the denominator.  The keepdims argument is accepted only for
00636 call-compatibility with the array version (acov) and is ignored here.
00639 
00640 Usage:   lcov(x,y,keepdims=0)
00641 """
00642 
00643     n = len(x)
00644     xmn = mean(x)
00645     ymn = mean(y)
00646     xdeviations = [0]*len(x)
00647     ydeviations = [0]*len(y)
00648     for i in range(len(x)):
00649         xdeviations[i] = x[i] - xmn
00650         ydeviations[i] = y[i] - ymn
00651     ss = 0.0
00652     for i in range(len(xdeviations)):
00653         ss = ss + xdeviations[i]*ydeviations[i]
00654     return ss/float(n-1)
00655 
00656 
00657 def lvar (inlist):
00658     """
00659 Returns the variance of the values in the passed list using N-1
00660 for the denominator (i.e., for estimating population variance).
00661 
00662 Usage:   lvar(inlist)
00663 """
00664     n = len(inlist)
00665     mn = mean(inlist)
00666     deviations = [0]*len(inlist)
00667     for i in range(len(inlist)):
00668         deviations[i] = inlist[i] - mn
00669     return ss(deviations)/float(n-1)
00670 
00671 
00672 def lstdev (inlist):
00673     """
00674 Returns the standard deviation of the values in the passed list
00675 using N-1 in the denominator (i.e., to estimate population stdev).
00676 
00677 Usage:   lstdev(inlist)
00678 """
00679     return math.sqrt(var(inlist))
00680 
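# Example (illustrative, not in the original source): for [1, 2, 3, 4] the N-1
# estimators give lvar = 5/3. = 1.667 and lstdev = 1.291, whereas
# lsamplevar/lsamplestdev (N in the denominator) give 1.25 and about 1.118.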
00681 
00682 def lsterr(inlist):
00683     """
00684 Returns the standard error of the values in the passed list using N-1
00685 in the denominator (i.e., to estimate population standard error).
00686 
00687 Usage:   lsterr(inlist)
00688 """
00689     return stdev(inlist) / float(math.sqrt(len(inlist)))
00690 
00691 
00692 def lsem (inlist):
00693     """
00694 Returns the estimated standard error of the mean (sx-bar) of the
00695 values in the passed list.  sem = stdev / sqrt(n)
00696 
00697 Usage:   lsem(inlist)
00698 """
00699     sd = stdev(inlist)
00700     n = len(inlist)
00701     return sd/math.sqrt(n)
00702 
00703 
00704 def lz (inlist, score):
00705     """
00706 Returns the z-score for a given input score, given that score and the
00707 list from which that score came.  Not appropriate for population calculations.
00708 
00709 Usage:   lz(inlist, score)
00710 """
00711     z = (score-mean(inlist))/samplestdev(inlist)
00712     return z
00713 
00714 
00715 def lzs (inlist):
00716     """
00717 Returns a list of z-scores, one for each score in the passed list.
00718 
00719 Usage:   lzs(inlist)
00720 """
00721     zscores = []
00722     for item in inlist:
00723         zscores.append(z(inlist,item))
00724     return zscores
00725 
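# Example (illustrative, not in the original source): for
# inlist = [2, 4, 4, 4, 5, 5, 7, 9] the mean is 5 and samplestdev (N in the
# denominator) is 2.0, so lz(inlist, 9) = 2.0, and lzs(inlist) rescales every
# score in the list the same way.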
00726 
00727 ####################################
00728 #######  TRIMMING FUNCTIONS  #######
00729 ####################################
00730 
00731 def ltrimboth (l,proportiontocut):
00732     """
00733 Slices off the passed proportion of items from BOTH ends of the passed
00734 list (i.e., with proportiontocut=0.1, slices 'leftmost' 10% AND 'rightmost'
00735 10% of scores).  Assumes list is sorted by magnitude.  Slices off LESS if
00736 proportion results in a non-integer slice index (i.e., conservatively
00737 slices off proportiontocut).
00738 
00739 Usage:   ltrimboth (l,proportiontocut)
00740 Returns: trimmed version of list l
00741 """
00742     lowercut = int(proportiontocut*len(l))
00743     uppercut = len(l) - lowercut
00744     return l[lowercut:uppercut]
00745 
00746 
00747 def ltrim1 (l,proportiontocut,tail='right'):
00748     """
00749 Slices off the passed proportion of items from ONE end of the passed
00750 list (i.e., if proportiontocut=0.1, slices off 'leftmost' or 'rightmost'
00751 10% of scores).  Slices off LESS if proportion results in a non-integer
00752 slice index (i.e., conservatively slices off proportiontocut).
00753 
00754 Usage:   ltrim1 (l,proportiontocut,tail='right')  or set tail='left'
00755 Returns: trimmed version of list l
00756 """
00757     if tail == 'right':
00758         lowercut = 0
00759         uppercut = len(l) - int(proportiontocut*len(l))
00760     elif tail == 'left':
00761         lowercut = int(proportiontocut*len(l))
00762         uppercut = len(l)
00763     return l[lowercut:uppercut]
00764 
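# Example (illustrative, not in the original source): with l = range(10)
# (already sorted), ltrimboth(l, 0.1) drops one score from each end, returning
# l[1:9], while ltrim1(l, 0.2, tail='right') drops int(0.2*10) = 2 scores from
# the right, returning l[0:8].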
00765 
00766 ####################################
00767 #####  CORRELATION FUNCTIONS  ######
00768 ####################################
00769 
00770 def lpaired(x,y):
00771     """
00772 Interactively determines the type of data and then runs the
00773 appropriate statistic for paired group data.
00774 
00775 Usage:   lpaired(x,y)
00776 Returns: appropriate statistic name, value, and probability
00777 """
00778     samples = ''
00779     while samples not in ['i','r','I','R','c','C']:
00780         print '\nIndependent or related samples, or correlation (i,r,c): ',
00781         samples = raw_input()
00782 
00783     if samples in ['i','I','r','R']:
00784         print '\nComparing variances ...',
00785 # USE O'BRIEN'S TEST FOR HOMOGENEITY OF VARIANCE, Maxwell & delaney, p.112
00786         r = obrientransform(x,y)
00787         f,p = F_oneway(pstat.colex(r,0),pstat.colex(r,1))
00788         if p<0.05:
00789             vartype='unequal, p='+str(round(p,4))
00790         else:
00791             vartype='equal'
00792         print vartype
00793         if samples in ['i','I']:
00794             if vartype[0]=='e':
00795                 t,p = ttest_ind(x,y,0)
00796                 print '\nIndependent samples t-test:  ', round(t,4),round(p,4)
00797             else:
00798                 if len(x)>20 or len(y)>20:
00799                     z,p = ranksums(x,y)
00800                     print '\nRank Sums test (NONparametric, n>20):  ', round(z,4),round(p,4)
00801                 else:
00802                     u,p = mannwhitneyu(x,y)
00803                     print '\nMann-Whitney U-test (NONparametric, ns<20):  ', round(u,4),round(p,4)
00804 
00805         else:  # RELATED SAMPLES
00806             if vartype[0]=='e':
00807                 t,p = ttest_rel(x,y,0)
00808                 print '\nRelated samples t-test:  ', round(t,4),round(p,4)
00809             else:
00810                 t,p = wilcoxont(x,y)   # related-samples nonparametric test
00811                 print '\nWilcoxon T-test (NONparametric):  ', round(t,4),round(p,4)
00812     else:  # CORRELATION ANALYSIS
00813         corrtype = ''
00814         while corrtype not in ['c','C','r','R','d','D']:
00815             print '\nIs the data Continuous, Ranked, or Dichotomous (c,r,d): ',
00816             corrtype = raw_input()
00817         if corrtype in ['c','C']:
00818             m,b,r,p,see = linregress(x,y)
00819             print '\nLinear regression for continuous variables ...'
00820             lol = [['Slope','Intercept','r','Prob','SEestimate'],[round(m,4),round(b,4),round(r,4),round(p,4),round(see,4)]]
00821             pstat.printcc(lol)
00822         elif corrtype in ['r','R']:
00823             r,p = spearmanr(x,y)
00824             print '\nCorrelation for ranked variables ...'
00825             print "Spearman's r: ",round(r,4),round(p,4)
00826         else: # DICHOTOMOUS
00827             r,p = pointbiserialr(x,y)
00828             print '\nAssuming x contains a dichotomous variable ...'
00829             print 'Point Biserial r: ',round(r,4),round(p,4)
00830     print '\n\n'
00831     return None
00832 
00833 
00834 def lpearsonr(x,y):
00835     """
00836 Calculates a Pearson correlation coefficient and the associated
00837 probability value.  Taken from Heiman's Basic Statistics for the Behav.
00838 Sci (2nd), p.195.
00839 
00840 Usage:   lpearsonr(x,y)      where x and y are equal-length lists
00841 Returns: Pearson's r value, two-tailed p-value
00842 """
00843     TINY = 1.0e-30
00844     if len(x) <> len(y):
00845         raise ValueError, 'Input values not paired in pearsonr.  Aborting.'
00846     n = len(x)
00847     x = map(float,x)
00848     y = map(float,y)
00849     xmean = mean(x)
00850     ymean = mean(y)
00851     r_num = n*(summult(x,y)) - sum(x)*sum(y)
00852     r_den = math.sqrt((n*ss(x) - square_of_sums(x))*(n*ss(y)-square_of_sums(y)))
00853     r = (r_num / r_den)  # denominator already a float
00854     df = n-2
00855     t = r*math.sqrt(df/((1.0-r+TINY)*(1.0+r+TINY)))
00856     prob = betai(0.5*df,0.5,df/float(df+t*t))
00857     return r, prob
00858 
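# Example (illustrative, not in the original source):
# lpearsonr([1, 2, 3, 4], [2, 4, 6, 8]) returns r = 1.0 (a perfect linear
# relation) with a two-tailed p that is essentially 0; reversing one of the two
# lists gives r = -1.0.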
00859 
00860 def llincc(x,y):
00861     """
00862 Calculates Lin's concordance correlation coefficient.
00863 
00864 Usage:   llincc(x,y)    where x, y are equal-length lists
00865 Returns: Lin's CC
00866 """
00867     covar = lcov(x,y)*(len(x)-1)/float(len(x))  # correct denom to n
00868     xvar = lvar(x)*(len(x)-1)/float(len(x))  # correct denom to n
00869     yvar = lvar(y)*(len(y)-1)/float(len(y))  # correct denom to n
00870     lincc = (2 * covar) / ((xvar+yvar) +((mean(x)-mean(y))**2))
00871     return lincc
00872 
00873 
00874 def lspearmanr(x,y):
00875     """
00876 Calculates a Spearman rank-order correlation coefficient.  Taken
00877 from Heiman's Basic Statistics for the Behav. Sci (1st), p.192.
00878 
00879 Usage:   lspearmanr(x,y)      where x and y are equal-length lists
00880 Returns: Spearman's r, two-tailed p-value
00881 """
00882     TINY = 1e-30
00883     if len(x) <> len(y):
00884         raise ValueError, 'Input values not paired in spearmanr.  Aborting.'
00885     n = len(x)
00886     rankx = rankdata(x)
00887     ranky = rankdata(y)
00888     dsq = sumdiffsquared(rankx,ranky)
00889     rs = 1 - 6*dsq / float(n*(n**2-1))
00890     t = rs * math.sqrt((n-2) / ((rs+1.0)*(1.0-rs)))
00891     df = n-2
00892     probrs = betai(0.5*df,0.5,df/(df+t*t))  # t already a float
00893 # probability values for rs are from part 2 of the spearman function in
00894 # Numerical Recipes, p.510.  They are close to tables, but not exact. (?)
00895     return rs, probrs
00896 
00897 
00898 def lpointbiserialr(x,y):
00899     """
00900 Calculates a point-biserial correlation coefficient and the associated
00901 probability value.  Taken from Heiman's Basic Statistics for the Behav.
00902 Sci (1st), p.194.
00903 
00904 Usage:   lpointbiserialr(x,y)      where x,y are equal-length lists
00905 Returns: Point-biserial r, two-tailed p-value
00906 """
00907     TINY = 1e-30
00908     if len(x) <> len(y):
00909         raise ValueError, 'INPUT VALUES NOT PAIRED IN pointbiserialr.  ABORTING.'
00910     data = pstat.abut(x,y)
00911     categories = pstat.unique(x)
00912     if len(categories) <> 2:
00913         raise ValueError, "Exactly 2 categories required for pointbiserialr()."
00914     else:   # there are 2 categories, continue
00915         codemap = pstat.abut(categories,range(2))
00916         recoded = pstat.recode(data,codemap,0)
00917         x = pstat.linexand(data,0,categories[0])
00918         y = pstat.linexand(data,0,categories[1])
00919         xmean = mean(pstat.colex(x,1))
00920         ymean = mean(pstat.colex(y,1))
00921         n = len(data)
00922         adjust = math.sqrt((len(x)/float(n))*(len(y)/float(n)))
00923         rpb = (ymean - xmean)/samplestdev(pstat.colex(data,1))*adjust
00924         df = n-2
00925         t = rpb*math.sqrt(df/((1.0-rpb+TINY)*(1.0+rpb+TINY)))
00926         prob = betai(0.5*df,0.5,df/(df+t*t))  # t already a float
00927         return rpb, prob
00928 
00929 
00930 def lkendalltau(x,y):
00931     """
00932 Calculates Kendall's tau ... correlation of ordinal data.  Adapted
00933 from function kendl1 in Numerical Recipes.  Needs good test-routine.@@@
00934 
00935 Usage:   lkendalltau(x,y)
00936 Returns: Kendall's tau, two-tailed p-value
00937 """
00938     n1 = 0
00939     n2 = 0
00940     iss = 0
00941     for j in range(len(x)-1):
00942         for k in range(j+1,len(y)):   # compare each pair only once (k > j)
00943             a1 = x[j] - x[k]
00944             a2 = y[j] - y[k]
00945             aa = a1 * a2
00946             if (aa):             # neither list has a tie
00947                 n1 = n1 + 1
00948                 n2 = n2 + 1
00949                 if aa > 0:
00950                     iss = iss + 1
00951                 else:
00952                     iss = iss -1
00953             else:
00954                 if (a1):
00955                     n1 = n1 + 1
00956                 else:
00957                     n2 = n2 + 1
00958     tau = iss / math.sqrt(n1*n2)
00959     svar = (4.0*len(x)+10.0) / (9.0*len(x)*(len(x)-1))
00960     z = tau / math.sqrt(svar)
00961     prob = erfcc(abs(z)/1.4142136)
00962     return tau, prob
00963 
00964 
00965 def llinregress(x,y):
00966     """
00967 Calculates a regression line on x,y pairs.  
00968 
00969 Usage:   llinregress(x,y)      x,y are equal-length lists of x-y coordinates
00970 Returns: slope, intercept, r, two-tailed prob, sterr-of-estimate
00971 """
00972     TINY = 1.0e-20
00973     if len(x) <> len(y):
00974         raise ValueError, 'Input values not paired in linregress.  Aborting.'
00975     n = len(x)
00976     x = map(float,x)
00977     y = map(float,y)
00978     xmean = mean(x)
00979     ymean = mean(y)
00980     r_num = float(n*(summult(x,y)) - sum(x)*sum(y))
00981     r_den = math.sqrt((n*ss(x) - square_of_sums(x))*(n*ss(y)-square_of_sums(y)))
00982     r = r_num / r_den
00983     z = 0.5*math.log((1.0+r+TINY)/(1.0-r+TINY))
00984     df = n-2
00985     t = r*math.sqrt(df/((1.0-r+TINY)*(1.0+r+TINY)))
00986     prob = betai(0.5*df,0.5,df/(df+t*t))
00987     slope = r_num / float(n*ss(x) - square_of_sums(x))
00988     intercept = ymean - slope*xmean
00989     sterrest = math.sqrt(1-r*r)*samplestdev(y)
00990     return slope, intercept, r, prob, sterrest
00991 
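# Example (illustrative, not in the original source):
# llinregress([0, 1, 2, 3], [1, 3, 5, 7]) recovers the line y = 2x + 1:
# slope 2.0, intercept 1.0, r = 1.0, a p-value near 0, and a standard error of
# estimate of 0.0.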
00992 
00993 ####################################
00994 #####  INFERENTIAL STATISTICS  #####
00995 ####################################
00996 
00997 def lttest_1samp(a,popmean,printit=0,name='Sample',writemode='a'):
00998     """
00999 Calculates the t-obtained for the single-sample t-test comparing ONE group
01000 of scores a to a given population mean.  If printit=1, results are printed
01001 to the screen.  If printit='filename', the results are output to 'filename'
01002 using the given writemode (default=append).  Returns t-value, and prob.
01003 
01004 Usage:   lttest_1samp(a,popmean,printit=0,name='Sample',writemode='a')
01005 Returns: t-value, two-tailed prob
01006 """
01007     x = mean(a)
01008     v = var(a)
01009     n = len(a)
01010     df = n-1
01011     svar = ((n-1)*v)/float(df)
01012     t = (x-popmean)/math.sqrt(svar*(1.0/n))
01013     prob = betai(0.5*df,0.5,float(df)/(df+t*t))
01014 
01015     if printit <> 0:
01016         statname = 'Single-sample T-test.'
01017         outputpairedstats(printit,writemode,
01018                           'Population','--',popmean,0,0,0,
01019                           name,n,x,v,min(a),max(a),
01020                           statname,t,prob)
01021     return t,prob
01022 
01023 
01024 def lttest_ind (a, b, printit=0, name1='Samp1', name2='Samp2', writemode='a'):
01025     """
01026 Calculates the t-obtained T-test on TWO INDEPENDENT samples of
01027 scores a, and b.  From Numerical Recipes, p.483.  If printit=1, results
01028 are printed to the screen.  If printit='filename', the results are output
01029 to 'filename' using the given writemode (default=append).  Returns t-value,
01030 and prob.
01031 
01032 Usage:   lttest_ind(a,b,printit=0,name1='Samp1',name2='Samp2',writemode='a')
01033 Returns: t-value, two-tailed prob
01034 """
01035     x1 = mean(a)
01036     x2 = mean(b)
01037     v1 = stdev(a)**2
01038     v2 = stdev(b)**2
01039     n1 = len(a)
01040     n2 = len(b)
01041     df = n1+n2-2
01042     svar = ((n1-1)*v1+(n2-1)*v2)/float(df)
01043     t = (x1-x2)/math.sqrt(svar*(1.0/n1 + 1.0/n2))
01044     prob = betai(0.5*df,0.5,df/(df+t*t))
01045 
01046     if printit <> 0:
01047         statname = 'Independent samples T-test.'
01048         outputpairedstats(printit,writemode,
01049                           name1,n1,x1,v1,min(a),max(a),
01050                           name2,n2,x2,v2,min(b),max(b),
01051                           statname,t,prob)
01052     return t,prob
01053 
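# Example (illustrative, not in the original source):
# lttest_ind([1, 2, 3, 4, 5], [2, 3, 4, 5, 6]) gives t = -1.0 with df = 8
# (pooled variance 2.5) and a two-tailed p of roughly 0.35, i.e. no evidence
# that the two sample means differ.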
01054 
01055 def lttest_rel (a,b,printit=0,name1='Sample1',name2='Sample2',writemode='a'):
01056     """
01057 Calculates the t-obtained T-test on TWO RELATED samples of scores,
01058 a and b.  From Numerical Recipes, p.483.  If printit=1, results are
01059 printed to the screen.  If printit='filename', the results are output to
01060 'filename' using the given writemode (default=append).  Returns t-value,
01061 and prob.
01062 
01063 Usage:   lttest_rel(a,b,printit=0,name1='Sample1',name2='Sample2',writemode='a')
01064 Returns: t-value, two-tailed prob
01065 """
01066     if len(a)<>len(b):
01067         raise ValueError, 'Unequal length lists in ttest_rel.'
01068     x1 = mean(a)
01069     x2 = mean(b)
01070     v1 = var(a)
01071     v2 = var(b)
01072     n = len(a)
01073     cov = 0
01074     for i in range(len(a)):
01075         cov = cov + (a[i]-x1) * (b[i]-x2)
01076     df = n-1
01077     cov = cov / float(df)
01078     sd = math.sqrt((v1+v2 - 2.0*cov)/float(n))
01079     t = (x1-x2)/sd
01080     prob = betai(0.5*df,0.5,df/(df+t*t))
01081 
01082     if printit <> 0:
01083         statname = 'Related samples T-test.'
01084         outputpairedstats(printit,writemode,
01085                           name1,n,x1,v1,min(a),max(a),
01086                           name2,n,x2,v2,min(b),max(b),
01087                           statname,t,prob)
01088     return t, prob
01089 
01090 
01091 def lchisquare(f_obs,f_exp=None):
01092     """
01093 Calculates a one-way chi square for list of observed frequencies and returns
01094 the result.  If no expected frequencies are given, the total N is assumed to
01095 be equally distributed across all groups.
01096 
01097 Usage:   lchisquare(f_obs, f_exp=None)   f_obs = list of observed cell freq.
01098 Returns: chisquare-statistic, associated p-value
01099 """
01100     k = len(f_obs)                 # number of groups
01101     if f_exp == None:
01102         f_exp = [sum(f_obs)/float(k)] * len(f_obs) # create k bins with = freq.
01103     chisq = 0
01104     for i in range(len(f_obs)):
01105         chisq = chisq + (f_obs[i]-f_exp[i])**2 / float(f_exp[i])
01106     return chisq, chisqprob(chisq, k-1)
01107 
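# Example (illustrative, not in the original source): lchisquare([10, 10, 20])
# tests the observed counts against an equal-split expectation of 40/3. per
# cell; the statistic is 2*(10 - 40/3.)**2/(40/3.) + (20 - 40/3.)**2/(40/3.)
# = 5.0, with 2 df and a p-value of about 0.082.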
01108 
01109 def lks_2samp (data1,data2):
01110     """
01111 Computes the Kolmogorov-Smirnov statistic on 2 samples.  From
01112 Numerical Recipes in C, page 493.
01113 
01114 Usage:   lks_2samp(data1,data2)   data1&2 are lists of values for 2 conditions
01115 Returns: KS D-value, associated p-value
01116 """
01117     j1 = 0
01118     j2 = 0
01119     fn1 = 0.0
01120     fn2 = 0.0
01121     n1 = len(data1)
01122     n2 = len(data2)
01123     en1 = n1
01124     en2 = n2
01125     d = 0.0
01126     data1.sort()
01127     data2.sort()
01128     while j1 < n1 and j2 < n2:
01129         d1=data1[j1]
01130         d2=data2[j2]
01131         if d1 <= d2:
01132             fn1 = (j1)/float(en1)
01133             j1 = j1 + 1
01134         if d2 <= d1:
01135             fn2 = (j2)/float(en2)
01136             j2 = j2 + 1
01137         dt = (fn2-fn1)
01138         if math.fabs(dt) > math.fabs(d):
01139             d = dt
01140     try:
01141         en = math.sqrt(en1*en2/float(en1+en2))
01142         prob = ksprob((en+0.12+0.11/en)*abs(d))
01143     except:
01144         prob = 1.0
01145     return d, prob
01146 
01147 
01148 def lmannwhitneyu(x,y):
01149     """
01150 Calculates a Mann-Whitney U statistic on the provided scores and
01151 returns the result.  Use only when the n in each condition is < 20 and
01152 you have 2 independent samples of ranks.  NOTE: Mann-Whitney U is
01153 significant if the u-obtained is LESS THAN or equal to the critical
01154 value of U found in the tables.  Equivalent to Kruskal-Wallis H with
01155 just 2 groups.
01156 
01157 Usage:   lmannwhitneyu(x,y)
01158 Returns: u-statistic, one-tailed p-value (i.e., p(z(U)))
01159 """
01160     n1 = len(x)
01161     n2 = len(y)
01162     ranked = rankdata(x+y)
01163     rankx = ranked[0:n1]       # get the x-ranks
01164     ranky = ranked[n1:]        # the rest are y-ranks
01165     u1 = n1*n2 + (n1*(n1+1))/2.0 - sum(rankx)  # calc U for x
01166     u2 = n1*n2 - u1                            # remainder is U for y
01167     bigu = max(u1,u2)
01168     smallu = min(u1,u2)
01169     T = math.sqrt(tiecorrect(ranked))  # correction factor for tied scores
01170     if T == 0:
01171         raise ValueError, 'All numbers are identical in lmannwhitneyu'
01172     sd = math.sqrt(T*n1*n2*(n1+n2+1)/12.0)
01173     z = abs((bigu-n1*n2/2.0) / sd)  # normal approximation for prob calc
01174     return smallu, 1.0 - zprob(z)
01175 
01176 
01177 def ltiecorrect(rankvals):
01178     """
01179 Corrects for ties in Mann Whitney U and Kruskal Wallis H tests.  See
01180 Siegel, S. (1956) Nonparametric Statistics for the Behavioral Sciences.
01181 New York: McGraw-Hill.  Code adapted from |Stat rankind.c code.
01182 
01183 Usage:   ltiecorrect(rankvals)
01184 Returns: T correction factor for U or H
01185 """
01186     sorted,posn = shellsort(rankvals)
01187     n = len(sorted)
01188     T = 0.0
01189     i = 0
01190     while (i<n-1):
01191         if sorted[i] == sorted[i+1]:
01192             nties = 1
01193             while (i<n-1) and (sorted[i] == sorted[i+1]):
01194                 nties = nties +1
01195                 i = i +1
01196             T = T + nties**3 - nties
01197         i = i+1
01198     T = T / float(n**3-n)
01199     return 1.0 - T
01200 
01201 
01202 def lranksums(x,y):
01203     """
01204 Calculates the rank sums statistic on the provided scores and
01205 returns the result.  Use only when the n in each condition is > 20 and you
01206 have 2 independent samples of ranks.
01207 
01208 Usage:   lranksums(x,y)
01209 Returns: a z-statistic, two-tailed p-value
01210 """
01211     n1 = len(x)
01212     n2 = len(y)
01213     alldata = x+y
01214     ranked = rankdata(alldata)
01215     x = ranked[:n1]
01216     y = ranked[n1:]
01217     s = sum(x)
01218     expected = n1*(n1+n2+1) / 2.0
01219     z = (s - expected) / math.sqrt(n1*n2*(n1+n2+1)/12.0)
01220     prob = 2*(1.0 -zprob(abs(z)))
01221     return z, prob
01222 
01223 
01224 def lwilcoxont(x,y):
01225     """
01226 Calculates the Wilcoxon T-test for related samples and returns the
01227 result.  A non-parametric T-test.
01228 
01229 Usage:   lwilcoxont(x,y)
01230 Returns: a t-statistic, two-tail probability estimate
01231 """
01232     if len(x) <> len(y):
01233         raise ValueError, 'Unequal N in wilcoxont.  Aborting.'
01234     d=[]
01235     for i in range(len(x)):
01236         diff = x[i] - y[i]
01237         if diff <> 0:
01238             d.append(diff)
01239     count = len(d)
01240     absd = map(abs,d)
01241     absranked = rankdata(absd)
01242     r_plus = 0.0
01243     r_minus = 0.0
01244     for i in range(len(absd)):
01245         if d[i] < 0:
01246             r_minus = r_minus + absranked[i]
01247         else:
01248             r_plus = r_plus + absranked[i]
01249     wt = min(r_plus, r_minus)
01250     mn = count * (count+1) * 0.25
01251     se =  math.sqrt(count*(count+1)*(2.0*count+1.0)/24.0)
01252     z = math.fabs(wt-mn) / se
01253     prob = 2*(1.0 -zprob(abs(z)))
01254     return wt, prob
01255 
01256 
01257 def lkruskalwallish(*args):
01258     """
01259 The Kruskal-Wallis H-test is a non-parametric ANOVA for 3 or more
01260 groups, requiring at least 5 subjects in each group.  This function
01261 calculates the Kruskal-Wallis H-test for 3 or more independent samples
01262 and returns the result.  
01263 
01264 Usage:   lkruskalwallish(*args)
01265 Returns: H-statistic (corrected for ties), associated p-value
01266 """
01267     args = list(args)
01268     n = [0]*len(args)
01269     all = []
01270     n = map(len,args)
01271     for i in range(len(args)):
01272         all = all + args[i]
01273     ranked = rankdata(all)
01274     T = tiecorrect(ranked)
01275     for i in range(len(args)):
01276         args[i] = ranked[0:n[i]]
01277         del ranked[0:n[i]]
01278     rsums = []
01279     for i in range(len(args)):
01280         rsums.append(sum(args[i])**2)
01281         rsums[i] = rsums[i] / float(n[i])
01282     ssbn = sum(rsums)
01283     totaln = sum(n)
01284     h = 12.0 / (totaln*(totaln+1)) * ssbn - 3*(totaln+1)
01285     df = len(args) - 1
01286     if T == 0:
01287         raise ValueError, 'All numbers are identical in lkruskalwallish'
01288     h = h / float(T)
01289     return h, chisqprob(h,df)
01290 
01291 
01292 def lfriedmanchisquare(*args):
01293     """
01294 Friedman Chi-Square is a non-parametric, one-way within-subjects
01295 ANOVA.  This function calculates the Friedman Chi-square test for repeated
01296 measures and returns the result, along with the associated probability
01297 value.  It assumes 3 or more repeated measures.  With only 3 levels, a
01298 minimum of 10 subjects in the study is required; four levels requires 5
01299 subjects per level(??).
01300 
01301 Usage:   lfriedmanchisquare(*args)
01302 Returns: chi-square statistic, associated p-value
01303 """
01304     k = len(args)
01305     if k < 3:
01306         raise ValueError, 'Less than 3 levels.  Friedman test not appropriate.'
01307     n = len(args[0])
01308     data = apply(pstat.abut,tuple(args))
01309     for i in range(len(data)):
01310         data[i] = rankdata(data[i])
01311     ssbn = 0
01312     for i in range(k):
01313         ssbn = ssbn + sum(pstat.colex(data,i))**2  # sum the ranks (not raw scores) per condition
01314     chisq = 12.0 / (k*n*(k+1)) * ssbn - 3*n*(k+1)
01315     return chisq, chisqprob(chisq,k-1)
01316 
01317 
01318 ####################################
01319 ####  PROBABILITY CALCULATIONS  ####
01320 ####################################
01321 
01322 def lchisqprob(chisq,df):
01323     """
01324 Returns the (1-tailed) probability value associated with the provided
01325 chi-square value and df.  Adapted from chisq.c in Gary Perlman's |Stat.
01326 
01327 Usage:   lchisqprob(chisq,df)
01328 """
01329     BIG = 20.0
01330     def ex(x):
01331         BIG = 20.0
01332         if x < -BIG:
01333             return 0.0
01334         else:
01335             return math.exp(x)
01336 
01337     if chisq <=0 or df < 1:
01338         return 1.0
01339     a = 0.5 * chisq
01340     if df%2 == 0:
01341         even = 1
01342     else:
01343         even = 0
01344     if df > 1:
01345         y = ex(-a)
01346     if even:
01347         s = y
01348     else:
01349         s = 2.0 * zprob(-math.sqrt(chisq))
01350     if (df > 2):
01351         chisq = 0.5 * (df - 1.0)
01352         if even:
01353             z = 1.0
01354         else:
01355             z = 0.5
01356         if a > BIG:
01357             if even:
01358                 e = 0.0
01359             else:
01360                 e = math.log(math.sqrt(math.pi))
01361             c = math.log(a)
01362             while (z <= chisq):
01363                 e = math.log(z) + e
01364                 s = s + ex(c*z-a-e)
01365                 z = z + 1.0
01366             return s
01367         else:
01368             if even:
01369                 e = 1.0
01370             else:
01371                 e = 1.0 / math.sqrt(math.pi) / math.sqrt(a)
01372             c = 0.0
01373             while (z <= chisq):
01374                 e = e * (a/float(z))
01375                 c = c + e
01376                 z = z + 1.0
01377             return (c*y+s)
01378     else:
01379         return s
01380 
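# Example (illustrative, not in the original source): lchisqprob(3.84, 1) and
# lchisqprob(5.99, 2) both come out at about 0.05, matching the familiar
# chi-square critical values for 1 and 2 degrees of freedom.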
01381 
01382 def lerfcc(x):
01383     """
01384 Returns the complementary error function erfc(x) with fractional
01385 error everywhere less than 1.2e-7.  Adapted from Numerical Recipes.
01386 
01387 Usage:   lerfcc(x)
01388 """
01389     z = abs(x)
01390     t = 1.0 / (1.0+0.5*z)
01391     ans = t * math.exp(-z*z-1.26551223 + t*(1.00002368+t*(0.37409196+t*(0.09678418+t*(-0.18628806+t*(0.27886807+t*(-1.13520398+t*(1.48851587+t*(-0.82215223+t*0.17087277)))))))))
01392     if x >= 0:
01393         return ans
01394     else:
01395         return 2.0 - ans
01396 
01397 
01398 def lzprob(z):
01399     """
01400 Returns the area under the normal curve 'to the left of' the given z value.
01401 Thus, 
01402     for z<0, zprob(z) = 1-tail probability
01403     for z>0, 1.0-zprob(z) = 1-tail probability
01404     for any z, 2.0*(1.0-zprob(abs(z))) = 2-tail probability
01405 Adapted from z.c in Gary Perlman's |Stat.
01406 
01407 Usage:   lzprob(z)
01408 """
01409     Z_MAX = 6.0    # maximum meaningful z-value
01410     if z == 0.0:
01411         x = 0.0
01412     else:
01413         y = 0.5 * math.fabs(z)
01414         if y >= (Z_MAX*0.5):
01415             x = 1.0
01416         elif (y < 1.0):
01417             w = y*y
01418             x = ((((((((0.000124818987 * w
01419                         -0.001075204047) * w +0.005198775019) * w
01420                       -0.019198292004) * w +0.059054035642) * w
01421                     -0.151968751364) * w +0.319152932694) * w
01422                   -0.531923007300) * w +0.797884560593) * y * 2.0
01423         else:
01424             y = y - 2.0
01425             x = (((((((((((((-0.000045255659 * y
01426                              +0.000152529290) * y -0.000019538132) * y
01427                            -0.000676904986) * y +0.001390604284) * y
01428                          -0.000794620820) * y -0.002034254874) * y
01429                        +0.006549791214) * y -0.010557625006) * y
01430                      +0.011630447319) * y -0.009279453341) * y
01431                    +0.005353579108) * y -0.002141268741) * y
01432                  +0.000535310849) * y +0.999936657524
01433     if z > 0.0:
01434         prob = ((x+1.0)*0.5)
01435     else:
01436         prob = ((1.0-x)*0.5)
01437     return prob
01438 
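# Example (illustrative, not in the original source): lzprob(0.0) is 0.5,
# lzprob(1.96) is about 0.975, and 2.0*(1.0 - lzprob(1.96)) is about 0.05 --
# the usual two-tailed 5% cutoff.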
01439 
01440 def lksprob(alam):
01441     """
01442 Computes a Kolmogorov-Smirnov significance level.  Adapted from
01443 Numerical Recipes.
01444 
01445 Usage:   lksprob(alam)
01446 """
01447     fac = 2.0
01448     sum = 0.0
01449     termbf = 0.0
01450     a2 = -2.0*alam*alam
01451     for j in range(1,201):
01452         term = fac*math.exp(a2*j*j)
01453         sum = sum + term
01454         if math.fabs(term) <= (0.001*termbf) or math.fabs(term) < (1.0e-8*sum):
01455             return sum
01456         fac = -fac
01457         termbf = math.fabs(term)
01458     return 1.0             # Get here only if fails to converge; was 0.0!!
01459 
01460 
01461 def lfprob (dfnum, dfden, F):
01462     """
01463 Returns the (1-tailed) significance level (p-value) of an F
01464 statistic given the degrees of freedom for the numerator (dfR-dfF) and
01465 the degrees of freedom for the denominator (dfF).
01466 
01467 Usage:   lfprob(dfnum, dfden, F)   where usually dfnum=dfbn, dfden=dfwn
01468 """
01469     p = betai(0.5*dfden, 0.5*dfnum, dfden/float(dfden+dfnum*F))
01470     return p
01471 
01472 
01473 def lbetacf(a,b,x):
01474     """
01475 This function evaluates the continued fraction form of the incomplete
01476 Beta function, betai.  (Adapted from: Numerical Recipes in C.)
01477 
01478 Usage:   lbetacf(a,b,x)
01479 """
01480     ITMAX = 200
01481     EPS = 3.0e-7
01482 
01483     bm = az = am = 1.0
01484     qab = a+b
01485     qap = a+1.0
01486     qam = a-1.0
01487     bz = 1.0-qab*x/qap
01488     for i in range(ITMAX+1):
01489         em = float(i+1)
01490         tem = em + em
01491         d = em*(b-em)*x/((qam+tem)*(a+tem))
01492         ap = az + d*am
01493         bp = bz+d*bm
01494         d = -(a+em)*(qab+em)*x/((qap+tem)*(a+tem))
01495         app = ap+d*az
01496         bpp = bp+d*bz
01497         aold = az
01498         am = ap/bpp
01499         bm = bp/bpp
01500         az = app/bpp
01501         bz = 1.0
01502         if (abs(az-aold)<(EPS*abs(az))):
01503             return az
01504     print 'a or b too big, or ITMAX too small in Betacf.'
01505 
01506 
01507 def lgammln(xx):
01508     """
01509 Returns the natural log of the gamma function of xx.
01510     Gamma(z) = Integral(0,infinity) of t^(z-1)exp(-t) dt.
01511 (Adapted from: Numerical Recipes in C.)
01512 
01513 Usage:   lgammln(xx)
01514 """
01515 
01516     coeff = [76.18009173, -86.50532033, 24.01409822, -1.231739516,
01517              0.120858003e-2, -0.536382e-5]
01518     x = xx - 1.0
01519     tmp = x + 5.5
01520     tmp = tmp - (x+0.5)*math.log(tmp)
01521     ser = 1.0
01522     for j in range(len(coeff)):
01523         x = x + 1
01524         ser = ser + coeff[j]/x
01525     return -tmp + math.log(2.50662827465*ser)
01526 
01527 
01528 def lbetai(a,b,x):
01529     """
01530 Returns the incomplete beta function:
01531 
01532     I-sub-x(a,b) = 1/B(a,b)*(Integral(0,x) of t^(a-1)(1-t)^(b-1) dt)
01533 
01534 where a,b>0 and B(a,b) = G(a)*G(b)/(G(a+b)) where G(a) is the gamma
01535 function of a.  The continued fraction formulation is implemented here,
01536 using the betacf function.  (Adapted from: Numerical Recipes in C.)
01537 
01538 Usage:   lbetai(a,b,x)
01539 """
01540     if (x<0.0 or x>1.0):
01541         raise ValueError, 'Bad x in lbetai'
01542     if (x==0.0 or x==1.0):
01543         bt = 0.0
01544     else:
01545         bt = math.exp(gammln(a+b)-gammln(a)-gammln(b)+a*math.log(x)+b*
01546                       math.log(1.0-x))
01547     if (x<(a+1.0)/(a+b+2.0)):
01548         return bt*betacf(a,b,x)/float(a)
01549     else:
01550         return 1.0-bt*betacf(b,a,1.0-x)/float(b)
01551 
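# Hedged example (added for illustration; not part of the original module):
# the incomplete beta function converts a Student's t statistic into a
# two-tailed p-value via I_{df/(df+t^2)}(df/2, 1/2), a standard identity
# from Numerical Recipes.
def _example_t_to_p(t, df):
    """Illustrative only: 2-tailed p for a t statistic with df degrees of freedom."""
    return lbetai(0.5 * df, 0.5, df / float(df + t * t))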
01552 
01553 ####################################
01554 #######  ANOVA CALCULATIONS  #######
01555 ####################################
01556 
01557 def lF_oneway(*lists):
01558     """
01559 Performs a 1-way ANOVA, returning an F-value and probability given
01560 any number of groups.  From Heiman, pp.394-7.
01561 
01562 Usage:   F_oneway(*lists)    where *lists is any number of lists, one per
01563                                   treatment group
01564 Returns: F value, one-tailed p-value
01565 """
01566     a = len(lists)           # ANOVA on 'a' groups, each in its own list
01567     means = [0]*a
01568     vars = [0]*a
01569     ns = [0]*a
01570     alldata = []
01571     tmp = map(N.array,lists)
01572     means = map(amean,tmp)
01573     vars = map(avar,tmp)
01574     ns = map(len,lists)
01575     for i in range(len(lists)):
01576         alldata = alldata + lists[i]
01577     alldata = N.array(alldata)
01578     bign = len(alldata)
01579     sstot = ass(alldata)-(asquare_of_sums(alldata)/float(bign))
01580     ssbn = 0
01581     for list in lists:
01582         ssbn = ssbn + asquare_of_sums(N.array(list))/float(len(list))
01583     ssbn = ssbn - (asquare_of_sums(alldata)/float(bign))
01584     sswn = sstot-ssbn
01585     dfbn = a-1
01586     dfwn = bign - a
01587     msb = ssbn/float(dfbn)
01588     msw = sswn/float(dfwn)
01589     f = msb/msw
01590     prob = fprob(dfbn,dfwn,f)
01591     return f, prob
01592 
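# Hedged example (added for illustration; not part of the original module):
# three made-up treatment groups fed to lF_oneway, which returns the F ratio
# and its one-tailed p-value.  Note that lF_oneway leans on the array helpers
# (N.array, amean, avar), so NumPy must be available for this to run.
def _example_oneway_anova():
    """Illustrative only: F and p for three small, hypothetical groups."""
    g1 = [7, 8, 9, 8, 7]
    g2 = [5, 6, 5, 6, 5]
    g3 = [9, 10, 9, 10, 9]
    return lF_oneway(g1, g2, g3)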
01593 
01594 def lF_value (ER,EF,dfnum,dfden):
01595     """
01596 Returns an F-statistic given the following:
01597         ER  = error associated with the null hypothesis (the Restricted model)
01598         EF  = error associated with the alternate hypothesis (the Full model)
01599         dfR-dfF = degrees of freedom of the numerator
01600         dfF = degrees of freedom associated with the denominator/Full model
01601 
01602 Usage:   lF_value(ER,EF,dfnum,dfden)
01603 """
01604     return ((ER-EF)/float(dfnum) / (EF/float(dfden)))
01605 
01606 
01607 ####################################
01608 ########  SUPPORT FUNCTIONS  #######
01609 ####################################
01610 
01611 def writecc (listoflists,file,writetype='w',extra=2):
01612     """
01613 Writes a list of lists to the specified file in columns, sizing each
01614 column to the longest item it contains plus 'extra' characters (2 by
01615 default).  File-overwrite ('w') is the default writetype.
01616 
01617 Usage:   writecc (listoflists,file,writetype='w',extra=2)
01618 Returns: None
01619 """
01620     if type(listoflists[0]) not in [ListType,TupleType]:
01621         listoflists = [listoflists]
01622     outfile = open(file,writetype)
01623     rowstokill = []
01624     list2print = copy.deepcopy(listoflists)
01625     for i in range(len(listoflists)):
01626         if listoflists[i] == ['\n'] or listoflists[i]=='\n' or listoflists[i]=='dashes':
01627             rowstokill = rowstokill + [i]
01628     rowstokill.reverse()
01629     for row in rowstokill:
01630         del list2print[row]
01631     maxsize = [0]*len(list2print[0])
01632     for col in range(len(list2print[0])):
01633         items = pstat.colex(list2print,col)
01634         items = map(pstat.makestr,items)
01635         maxsize[col] = max(map(len,items)) + extra
01636     for row in listoflists:
01637         if row == ['\n'] or row == '\n':
01638             outfile.write('\n')
01639         elif row == ['dashes'] or row == 'dashes':
01640             dashes = [0]*len(maxsize)
01641             for j in range(len(maxsize)):
01642                 dashes[j] = '-'*(maxsize[j]-2)
01643             outfile.write(pstat.lineincustcols(dashes,maxsize))
01644         else:
01645             outfile.write(pstat.lineincustcols(row,maxsize))
01646         outfile.write('\n')
01647     outfile.close()
01648     return None
01649 
01650 
01651 def lincr(l,cap):        # to increment a list up to a max-list of 'cap'
01652     """
01653 Simulate a counting system from an n-dimensional list.
01654 
01655 Usage:   lincr(l,cap)   l=list to increment, cap=max values for each list pos'n
01656 Returns: next set of values for list l, OR -1 (if overflow)
01657 """
01658     l[0] = l[0] + 1     # e.g., [0,0,0] --> [2,4,3] (=cap)
01659     for i in range(len(l)):
01660         if l[i] > cap[i] and i < len(l)-1: # if carryover AND not done
01661             l[i] = 0
01662             l[i+1] = l[i+1] + 1
01663         elif l[i] > cap[i] and i == len(l)-1: # overflow past last column, must be finished
01664             l = -1
01665     return l
01666 
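# Hedged example (added for illustration; not part of the original module):
# lincr behaves like an odometer, so applying it repeatedly enumerates every
# index combination from [0,...,0] up to 'cap', then returns -1.
def _example_enumerate_cells(cap):
    """Illustrative only: collect every index tuple counted out by lincr."""
    cells = []
    idx = [0] * len(cap)
    while idx != -1:
        cells.append(tuple(idx))
        idx = lincr(idx, cap)
    return cells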
01667 
01668 def lsum (inlist):
01669     """
01670 Returns the sum of the items in the passed list.
01671 
01672 Usage:   lsum(inlist)
01673 """
01674     s = 0
01675     for item in inlist:
01676         s = s + item
01677     return s
01678 
01679 
01680 def lcumsum (inlist):
01681     """
01682 Returns a list consisting of the cumulative sum of the items in the
01683 passed list.
01684 
01685 Usage:   lcumsum(inlist)
01686 """
01687     newlist = copy.deepcopy(inlist)
01688     for i in range(1,len(newlist)):
01689         newlist[i] = newlist[i] + newlist[i-1]
01690     return newlist
01691 
01692 
01693 def lss(inlist):
01694     """
01695 Squares each value in the passed list, adds up these squares and
01696 returns the result.
01697 
01698 Usage:   lss(inlist)
01699 """
01700     ss = 0
01701     for item in inlist:
01702         ss = ss + item*item
01703     return ss
01704 
01705 
01706 def lsummult (list1,list2):
01707     """
01708 Multiplies elements in list1 and list2, element by element, and
01709 returns the sum of all resulting multiplications.  Must provide equal
01710 length lists.
01711 
01712 Usage:   lsummult(list1,list2)
01713 """
01714     if len(list1) <> len(list2):
01715         raise ValueError, "Lists not equal length in summult."
01716     s = 0
01717     for item1,item2 in pstat.abut(list1,list2):
01718         s = s + item1*item2
01719     return s
01720 
01721 
01722 def lsumdiffsquared(x,y):
01723     """
01724 Takes pairwise differences of the values in lists x and y, squares
01725 these differences, and returns the sum of these squares.
01726 
01727 Usage:   lsumdiffsquared(x,y)
01728 Returns: sum[(x[i]-y[i])**2]
01729 """
01730     sds = 0
01731     for i in range(len(x)):
01732         sds = sds + (x[i]-y[i])**2
01733     return sds
01734 
01735 
01736 def lsquare_of_sums(inlist):
01737     """
01738 Adds the values in the passed list, squares the sum, and returns
01739 the result.
01740 
01741 Usage:   lsquare_of_sums(inlist)
01742 Returns: sum(inlist[i])**2
01743 """
01744     s = sum(inlist)
01745     return float(s)*s
01746 
01747 
01748 def lshellsort(inlist):
01749     """
01750 Shellsort algorithm.  Sorts a 1D-list.
01751 
01752 Usage:   lshellsort(inlist)
01753 Returns: sorted-inlist, sorting-index-vector (for original list)
01754 """
01755     n = len(inlist)
01756     svec = copy.deepcopy(inlist)
01757     ivec = range(n)
01758     gap = n/2   # integer division needed
01759     while gap >0:
01760         for i in range(gap,n):
01761             for j in range(i-gap,-1,-gap):
01762                 while j>=0 and svec[j]>svec[j+gap]:
01763                     temp        = svec[j]
01764                     svec[j]     = svec[j+gap]
01765                     svec[j+gap] = temp
01766                     itemp       = ivec[j]
01767                     ivec[j]     = ivec[j+gap]
01768                     ivec[j+gap] = itemp
01769         gap = gap / 2  # integer division needed
01770 # svec is now the sorted inlist, and ivec gives the order: svec[i] = inlist[ivec[i]]
01771     return svec, ivec
01772 
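# Hedged example (added for illustration; not part of the original module):
# the second value returned by lshellsort indexes back into the original
# list, i.e. svec[i] == inlist[ivec[i]].
def _example_shellsort_roundtrip():
    """Illustrative only: check the sorting-index vector from lshellsort."""
    data = [3, 1, 2]
    svec, ivec = lshellsort(data)
    return svec == [data[i] for i in ivec]   # True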
01773 
01774 def lrankdata(inlist):
01775     """
01776 Ranks the data in inlist, dealing with ties appropriately.  Assumes
01777 a 1D inlist.  Adapted from Gary Perlman's |Stat ranksort.
01778 
01779 Usage:   lrankdata(inlist)
01780 Returns: a list of length equal to inlist, containing rank scores
01781 """
01782     n = len(inlist)
01783     svec, ivec = shellsort(inlist)
01784     sumranks = 0
01785     dupcount = 0
01786     newlist = [0]*n
01787     for i in range(n):
01788         sumranks = sumranks + i
01789         dupcount = dupcount + 1
01790         if i==n-1 or svec[i] <> svec[i+1]:
01791             averank = sumranks / float(dupcount) + 1
01792             for j in range(i-dupcount+1,i+1):
01793                 newlist[ivec[j]] = averank
01794             sumranks = 0
01795             dupcount = 0
01796     return newlist
01797 
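# Hedged example (added for illustration; not part of the original module):
# tied scores share the mean of the ranks they occupy, so [1, 2, 2, 3]
# ranks to [1.0, 2.5, 2.5, 4.0].
def _example_rank_ties():
    """Illustrative only: lrankdata on a list containing one tie."""
    return lrankdata([1, 2, 2, 3])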
01798 
01799 def outputpairedstats(fname,writemode,name1,n1,m1,se1,min1,max1,name2,n2,m2,se2,min2,max2,statname,stat,prob):
01800     """
01801 Prints, or writes to a file, stats for two groups, using the name, n,
01802 mean, sterr, min and max for each group, as well as the statistic name,
01803 its value, and the associated p-value.
01804 
01805 Usage:   outputpairedstats(fname,writemode,
01806                            name1,n1,mean1,stderr1,min1,max1,
01807                            name2,n2,mean2,stderr2,min2,max2,
01808                            statname,stat,prob)
01809 Returns: None
01810 """
01811     suffix = ''                       # for *s after the p-value
01812     try:
01813         x = prob.shape
01814         prob = prob[0]
01815     except:
01816         pass
01817     if  prob < 0.001:  suffix = '  ***'
01818     elif prob < 0.01:  suffix = '  **'
01819     elif prob < 0.05:  suffix = '  *'
01820     title = [['Name','N','Mean','SD','Min','Max']]
01821     lofl = title+[[name1,n1,round(m1,3),round(math.sqrt(se1),3),min1,max1],
01822                   [name2,n2,round(m2,3),round(math.sqrt(se2),3),min2,max2]]
01823     if type(fname)<>StringType or len(fname)==0:
01824         print
01825         print statname
01826         print
01827         pstat.printcc(lofl)
01828         print
01829         try:
01830             if stat.shape == ():
01831                 stat = stat[0]
01832             if prob.shape == ():
01833                 prob = prob[0]
01834         except:
01835             pass
01836         print 'Test statistic = ',round(stat,3),'   p = ',round(prob,3),suffix
01837         print
01838     else:
01839         file = open(fname,writemode)
01840         file.write('\n'+statname+'\n\n')
01841         file.close()
01842         writecc(lofl,fname,'a')
01843         file = open(fname,'a')
01844         try:
01845             if stat.shape == ():
01846                 stat = stat[0]
01847             if prob.shape == ():
01848                 prob = prob[0]
01849         except:
01850             pass
01851         file.write(pstat.list2string(['\nTest statistic = ',round(stat,4),'   p = ',round(prob,4),suffix,'\n\n']))
01852         file.close()
01853     return None
01854 
01855 
01856 def lfindwithin (data):
01857     """
01858 Returns an integer representing a binary vector, where 1=within-
01859 subject factor, 0=between.  Input equals the entire data 2D list (i.e.,
01860 column 0=random factor, column -1=measured values; those two columns are skipped).
01861 Note: input data is in |Stat format ... a list of lists ("2D list") with 
01862 one row per measured value, first column=subject identifier, last column=
01863 score, one in-between column per factor (these columns contain level
01864 designations on each factor).  See also stats.anova.__doc__.
01865 
01866 Usage:   lfindwithin(data)     data in |Stat format
01867 """
01868 
01869     numfact = len(data[0])-1
01870     withinvec = 0
01871     for col in range(1,numfact):
01872         examplelevel = pstat.unique(pstat.colex(data,col))[0]
01873         rows = pstat.linexand(data,col,examplelevel)  # get 1 level of this factor
01874         factsubjs = pstat.unique(pstat.colex(rows,0))
01875         allsubjs = pstat.unique(pstat.colex(data,0))
01876         if len(factsubjs) == len(allsubjs):  # all Ss appear at this level => within-Ss factor
01877             withinvec = withinvec + (1 << col)
01878     return withinvec
01879 
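# Hedged example (added for illustration; not part of the original module):
# two subjects each measured at both levels of a single factor make that
# factor within-subjects, so lfindwithin should set bit 1 (returning 2).
def _example_findwithin():
    """Illustrative only: lfindwithin on a tiny |Stat-format data list."""
    data = [['s1', 1, 10], ['s1', 2, 12],
            ['s2', 1, 11], ['s2', 2, 13]]
    return lfindwithin(data)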
01880 
01881 #########################################################
01882 #########################################################
01883 ####### DISPATCH LISTS AND TUPLES TO ABOVE FCNS #########
01884 #########################################################
01885 #########################################################
01886 
01887 ## CENTRAL TENDENCY:
01888 geometricmean = Dispatch ( (lgeometricmean, (ListType, TupleType)), )
01889 harmonicmean = Dispatch ( (lharmonicmean, (ListType, TupleType)), )
01890 mean = Dispatch ( (lmean, (ListType, TupleType)), )
01891 median = Dispatch ( (lmedian, (ListType, TupleType)), )
01892 medianscore = Dispatch ( (lmedianscore, (ListType, TupleType)), )
01893 mode = Dispatch ( (lmode, (ListType, TupleType)), )
01894 
01895 ## MOMENTS:
01896 moment = Dispatch ( (lmoment, (ListType, TupleType)), )
01897 variation = Dispatch ( (lvariation, (ListType, TupleType)), )
01898 skew = Dispatch ( (lskew, (ListType, TupleType)), )
01899 kurtosis = Dispatch ( (lkurtosis, (ListType, TupleType)), )
01900 describe = Dispatch ( (ldescribe, (ListType, TupleType)), )
01901 
01902 ## FREQUENCY STATISTICS:
01903 itemfreq = Dispatch ( (litemfreq, (ListType, TupleType)), )
01904 scoreatpercentile = Dispatch ( (lscoreatpercentile, (ListType, TupleType)), )
01905 percentileofscore = Dispatch ( (lpercentileofscore, (ListType, TupleType)), )
01906 histogram = Dispatch ( (lhistogram, (ListType, TupleType)), )
01907 cumfreq = Dispatch ( (lcumfreq, (ListType, TupleType)), )
01908 relfreq = Dispatch ( (lrelfreq, (ListType, TupleType)), )
01909 
01910 ## VARIABILITY:
01911 obrientransform = Dispatch ( (lobrientransform, (ListType, TupleType)), )
01912 samplevar = Dispatch ( (lsamplevar, (ListType, TupleType)), )
01913 samplestdev = Dispatch ( (lsamplestdev, (ListType, TupleType)), )
01914 var = Dispatch ( (lvar, (ListType, TupleType)), )
01915 stdev = Dispatch ( (lstdev, (ListType, TupleType)), )
01916 sterr = Dispatch ( (lsterr, (ListType, TupleType)), )
01917 sem = Dispatch ( (lsem, (ListType, TupleType)), )
01918 z = Dispatch ( (lz, (ListType, TupleType)), )
01919 zs = Dispatch ( (lzs, (ListType, TupleType)), )
01920 
01921 ## TRIMMING FCNS:
01922 trimboth = Dispatch ( (ltrimboth, (ListType, TupleType)), )
01923 trim1 = Dispatch ( (ltrim1, (ListType, TupleType)), )
01924 
01925 ## CORRELATION FCNS:
01926 paired = Dispatch ( (lpaired, (ListType, TupleType)), )
01927 pearsonr = Dispatch ( (lpearsonr, (ListType, TupleType)), )
01928 spearmanr = Dispatch ( (lspearmanr, (ListType, TupleType)), )
01929 pointbiserialr = Dispatch ( (lpointbiserialr, (ListType, TupleType)), )
01930 kendalltau = Dispatch ( (lkendalltau, (ListType, TupleType)), )
01931 linregress = Dispatch ( (llinregress, (ListType, TupleType)), )
01932 
01933 ## INFERENTIAL STATS:
01934 ttest_1samp = Dispatch ( (lttest_1samp, (ListType, TupleType)), )
01935 ttest_ind = Dispatch ( (lttest_ind, (ListType, TupleType)), )
01936 ttest_rel = Dispatch ( (lttest_rel, (ListType, TupleType)), )
01937 chisquare = Dispatch ( (lchisquare, (ListType, TupleType)), )
01938 ks_2samp = Dispatch ( (lks_2samp, (ListType, TupleType)), )
01939 mannwhitneyu = Dispatch ( (lmannwhitneyu, (ListType, TupleType)), )
01940 ranksums = Dispatch ( (lranksums, (ListType, TupleType)), )
01941 tiecorrect = Dispatch ( (ltiecorrect, (ListType, TupleType)), )
01942 wilcoxont = Dispatch ( (lwilcoxont, (ListType, TupleType)), )
01943 kruskalwallish = Dispatch ( (lkruskalwallish, (ListType, TupleType)), )
01944 friedmanchisquare = Dispatch ( (lfriedmanchisquare, (ListType, TupleType)), )
01945 
01946 ## PROBABILITY CALCS:
01947 chisqprob = Dispatch ( (lchisqprob, (IntType, FloatType)), )
01948 zprob = Dispatch ( (lzprob, (IntType, FloatType)), )
01949 ksprob = Dispatch ( (lksprob, (IntType, FloatType)), )
01950 fprob = Dispatch ( (lfprob, (IntType, FloatType)), )
01951 betacf = Dispatch ( (lbetacf, (IntType, FloatType)), )
01952 betai = Dispatch ( (lbetai, (IntType, FloatType)), )
01953 erfcc = Dispatch ( (lerfcc, (IntType, FloatType)), )
01954 gammln = Dispatch ( (lgammln, (IntType, FloatType)), )
01955 
01956 ## ANOVA FUNCTIONS:
01957 F_oneway = Dispatch ( (lF_oneway, (ListType, TupleType)), )
01958 F_value = Dispatch ( (lF_value, (ListType, TupleType)), )
01959 
01960 ## SUPPORT FUNCTIONS:
01961 incr = Dispatch ( (lincr, (ListType, TupleType)), )
01962 sum = Dispatch ( (lsum, (ListType, TupleType)), )
01963 cumsum = Dispatch ( (lcumsum, (ListType, TupleType)), )
01964 ss = Dispatch ( (lss, (ListType, TupleType)), )
01965 summult = Dispatch ( (lsummult, (ListType, TupleType)), )
01966 square_of_sums = Dispatch ( (lsquare_of_sums, (ListType, TupleType)), )
01967 sumdiffsquared = Dispatch ( (lsumdiffsquared, (ListType, TupleType)), )
01968 shellsort = Dispatch ( (lshellsort, (ListType, TupleType)), )
01969 rankdata = Dispatch ( (lrankdata, (ListType, TupleType)), )
01970 findwithin = Dispatch ( (lfindwithin, (ListType, TupleType)), )
01971 
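# Hedged example (added for illustration; not part of the original module):
# the Dispatch wrappers defined above send list/tuple arguments to the
# 'l'-prefixed implementations, so the short names can be called directly
# on plain Python lists.
def _example_dispatch_mean():
    """Illustrative only: mean() routes a plain list to lmean."""
    return mean([1, 2, 3, 4])          # -> 2.5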
01972 
01973 #=============  THE ARRAY-VERSION OF THE STATS FUNCTIONS  ===============
01974 #=============  THE ARRAY-VERSION OF THE STATS FUNCTIONS  ===============
01975 #=============  THE ARRAY-VERSION OF THE STATS FUNCTIONS  ===============
01976 #=============  THE ARRAY-VERSION OF THE STATS FUNCTIONS  ===============
01977 #=============  THE ARRAY-VERSION OF THE STATS FUNCTIONS  ===============
01978 #=============  THE ARRAY-VERSION OF THE STATS FUNCTIONS  ===============
01979 #=============  THE ARRAY-VERSION OF THE STATS FUNCTIONS  ===============
01980 #=============  THE ARRAY-VERSION OF THE STATS FUNCTIONS  ===============
01981 #=============  THE ARRAY-VERSION OF THE STATS FUNCTIONS  ===============
01982 #=============  THE ARRAY-VERSION OF THE STATS FUNCTIONS  ===============
01983 #=============  THE ARRAY-VERSION OF THE STATS FUNCTIONS  ===============
01984 #=============  THE ARRAY-VERSION OF THE STATS FUNCTIONS  ===============
01985 #=============  THE ARRAY-VERSION OF THE STATS FUNCTIONS  ===============
01986 #=============  THE ARRAY-VERSION OF THE STATS FUNCTIONS  ===============
01987 #=============  THE ARRAY-VERSION OF THE STATS FUNCTIONS  ===============
01988 #=============  THE ARRAY-VERSION OF THE STATS FUNCTIONS  ===============
01989 #=============  THE ARRAY-VERSION OF THE STATS FUNCTIONS  ===============
01990 #=============  THE ARRAY-VERSION OF THE STATS FUNCTIONS  ===============
01991 #=============  THE ARRAY-VERSION OF THE STATS FUNCTIONS  ===============
01992 
01993 try:                         # DEFINE THESE *ONLY* IF NUMPY IS AVAILABLE
01994  import numpy as N
01995  import numpy.linalg as LA
01996 
01997 
01998 #####################################
01999 ########  ACENTRAL TENDENCY  ########
02000 #####################################
02001 
02002  def ageometricmean (inarray,dimension=None,keepdims=0):
02003     """
02004 Calculates the geometric mean of the values in the passed array.
02005 That is:  n-th root of (x1 * x2 * ... * xn).  Defaults to ALL values in
02006 the passed array.  Use dimension=None to flatten array first.  REMEMBER: if
02007 dimension=0, it collapses over dimension 0 ('rows' in a 2D array) only, and
02008 if dimension is a sequence, it collapses over all specified dimensions.  If
02009 keepdims is set to 1, the resulting array will have as many dimensions as
02010 inarray, with only 1 'level' per dim that was collapsed over.
02011 
02012 Usage:   ageometricmean(inarray,dimension=None,keepdims=0)
02013 Returns: geometric mean computed over dim(s) listed in dimension
02014 """
02015     inarray = N.array(inarray,N.float_)
02016     if dimension == None:
02017         inarray = N.ravel(inarray)
02018         size = len(inarray)
02019         mult = N.power(inarray,1.0/size)
02020         mult = N.multiply.reduce(mult)
02021     elif type(dimension) in [IntType,FloatType]:
02022         size = inarray.shape[dimension]
02023         mult = N.power(inarray,1.0/size)
02024         mult = N.multiply.reduce(mult,dimension)
02025         if keepdims == 1:
02026             shp = list(inarray.shape)
02027             shp[dimension] = 1
02028             mult = N.reshape(mult,shp)
02029     else: # must be a SEQUENCE of dims to average over
02030         dims = list(dimension)
02031         dims.sort()
02032         dims.reverse()
02033         size = N.array(N.multiply.reduce(N.take(inarray.shape,dims)),N.float_)
02034         mult = N.power(inarray,1.0/size)
02035         for dim in dims:
02036             mult = N.multiply.reduce(mult,dim)
02037         if keepdims == 1:
02038             shp = list(inarray.shape)
02039             for dim in dims:
02040                 shp[dim] = 1
02041             mult = N.reshape(mult,shp)
02042     return mult
02043 
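 # Hedged example (added for illustration; not part of the original module):
 # for [1, 10, 100] the geometric mean is the cube root of 1000, i.e. 10.0.
 def _example_geometric_mean():
    """Illustrative only: ageometricmean of a small 1D array."""
    return ageometricmean(N.array([1.0, 10.0, 100.0]))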
02044 
02045  def aharmonicmean (inarray,dimension=None,keepdims=0):
02046     """
02047 Calculates the harmonic mean of the values in the passed array.
02048 That is:  n / (1/x1 + 1/x2 + ... + 1/xn).  Defaults to ALL values in
02049 the passed array.  Use dimension=None to flatten array first.  REMEMBER: if
02050 dimension=0, it collapses over dimension 0 ('rows' in a 2D array) only, and
02051 if dimension is a sequence, it collapses over all specified dimensions.  If
02052 keepdims is set to 1, the resulting array will have as many dimensions as
02053 inarray, with only 1 'level' per dim that was collapsed over.
02054 
02055 Usage:   aharmonicmean(inarray,dimension=None,keepdims=0)
02056 Returns: harmonic mean computed over dim(s) in dimension
02057 """
02058     inarray = inarray.astype(N.float_)
02059     if dimension == None:
02060         inarray = N.ravel(inarray)
02061         size = len(inarray)
02062         s = N.add.reduce(1.0 / inarray)
02063     elif type(dimension) in [IntType,FloatType]:
02064         size = float(inarray.shape[dimension])
02065         s = N.add.reduce(1.0/inarray, dimension)
02066         if keepdims == 1:
02067             shp = list(inarray.shape)
02068             shp[dimension] = 1
02069             s = N.reshape(s,shp)
02070     else: # must be a SEQUENCE of dims to average over
02071         dims = list(dimension)
02072         dims.sort()
02073         nondims = []
02074         for i in range(len(inarray.shape)):
02075             if i not in dims:
02076                 nondims.append(i)
02077         tinarray = N.transpose(inarray,nondims+dims) # put keep-dims first
02078         idx = [0] *len(nondims)
02079         if idx == []:
02080             size = len(N.ravel(inarray))
02081             s = asum(1.0 / inarray)
02082             if keepdims == 1:
02083                 s = N.reshape([s],N.ones(len(inarray.shape)))
02084         else:
02085             idx[0] = -1
02086             loopcap = N.array(tinarray.shape[0:len(nondims)]) -1
02087             s = N.zeros(loopcap+1,N.float_)
02088             while incr(idx,loopcap) <> -1:
02089                 s[idx] = asum(1.0/tinarray[idx])
02090             size = N.multiply.reduce(N.take(inarray.shape,dims))
02091             if keepdims == 1:
02092                 shp = list(inarray.shape)
02093                 for dim in dims:
02094                     shp[dim] = 1
02095                 s = N.reshape(s,shp)
02096     return size / s
02097 
02098 
02099  def amean (inarray,dimension=None,keepdims=0):
02100     """
02101 Calculates the arithmetic mean of the values in the passed array.
02102 That is:  1/n * (x1 + x2 + ... + xn).  Defaults to ALL values in the
02103 passed array.  Use dimension=None to flatten array first.  REMEMBER: if
02104 dimension=0, it collapses over dimension 0 ('rows' in a 2D array) only, and
02105 if dimension is a sequence, it collapses over all specified dimensions.  If
02106 keepdims is set to 1, the resulting array will have as many dimensions as
02107 inarray, with only 1 'level' per dim that was collapsed over.
02108 
02109 Usage:   amean(inarray,dimension=None,keepdims=0)
02110 Returns: arithmetic mean calculated over dim(s) in dimension
02111 """
02112     if inarray.dtype in [N.int_, N.short,N.ubyte]:
02113         inarray = inarray.astype(N.float_)
02114     if dimension == None:
02115         inarray = N.ravel(inarray)
02116         sum = N.add.reduce(inarray)
02117         denom = float(len(inarray))
02118     elif type(dimension) in [IntType,FloatType]:
02119         sum = asum(inarray,dimension)
02120         denom = float(inarray.shape[dimension])
02121         if keepdims == 1:
02122             shp = list(inarray.shape)
02123             shp[dimension] = 1
02124             sum = N.reshape(sum,shp)
02125     else: # must be a TUPLE of dims to average over
02126         dims = list(dimension)
02127         dims.sort()
02128         dims.reverse()
02129         sum = inarray *1.0
02130         for dim in dims:
02131             sum = N.add.reduce(sum,dim)
02132         denom = N.array(N.multiply.reduce(N.take(inarray.shape,dims)),N.float_)
02133         if keepdims == 1:
02134             shp = list(inarray.shape)
02135             for dim in dims:
02136                 shp[dim] = 1
02137             sum = N.reshape(sum,shp)
02138     return sum/denom
02139 
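 # Hedged example (added for illustration; not part of the original module):
 # amean over dimension 0 of a 2x3 array yields the three column means;
 # passing keepdims=1 would keep the collapsed axis as a length-1 dimension.
 def _example_column_means():
    """Illustrative only: per-column means of a small 2D array."""
    a = N.array([[1.0, 2.0, 3.0],
                 [3.0, 4.0, 5.0]])
    return amean(a, dimension=0)       # three column means: [2., 3., 4.]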
02140 
02141  def amedian (inarray,numbins=1000):
02142     """
02143 Calculates the COMPUTED median value of an array of numbers, given the
02144 number of bins to use for the histogram (more bins approaches finding the
02145 precise median value of the array; default number of bins = 1000).  From
02146 G.W. Heiman's Basic Stats, or CRC Probability & Statistics.
02147 NOTE:  THIS ROUTINE ALWAYS uses the entire passed array (flattens it first).
02148 
02149 Usage:   amedian(inarray,numbins=1000)
02150 Returns: median calculated over ALL values in inarray
02151 """
02152     inarray = N.ravel(inarray)
02153     (hist, smallest, binsize, extras) = ahistogram(inarray,numbins,[min(inarray),max(inarray)])
02154     cumhist = N.cumsum(hist)            # make cumulative histogram
02155     otherbins = N.greater_equal(cumhist,len(inarray)/2.0)
02156     otherbins = list(otherbins)         # list of 0/1s, 1s start at median bin
02157     cfbin = otherbins.index(1)                # get 1st(!) index holding 50%ile score
02158     LRL = smallest + binsize*cfbin        # get lower real limit of that bin
02159     cfbelow = N.add.reduce(hist[0:cfbin])        # cum. freq. below bin
02160     freq = hist[cfbin]                        # frequency IN the 50%ile bin
02161     median = LRL + ((len(inarray)/2.0-cfbelow)/float(freq))*binsize # MEDIAN
02162     return median
02163 
02164 
02165  def amedianscore (inarray,dimension=None):
02166     """
02167 Returns the 'middle' score of the passed array.  If there is an even
02168 number of scores, the mean of the 2 middle scores is returned.  Can function
02169 with 1D arrays, or on the FIRST dimension of 2D arrays (i.e., dimension can
02170 be None, to pre-flatten the array, or else dimension must equal 0).
02171 
02172 Usage:   amedianscore(inarray,dimension=None)
02173 Returns: 'middle' score of the array, or the mean of the 2 middle scores
02174 """
02175     if dimension == None:
02176         inarray = N.ravel(inarray)
02177         dimension = 0
02178     inarray = N.sort(inarray,dimension)
02179     if inarray.shape[dimension] % 2 == 0:   # if even number of elements
02180         indx = inarray.shape[dimension]/2   # integer division correct
02181         median = N.asarray(inarray[indx]+inarray[indx-1]) / 2.0
02182     else:
02183         indx = inarray.shape[dimension] / 2 # integer division correct
02184         median = N.take(inarray,[indx],dimension)
02185         if median.shape == (1,):
02186             median = median[0]
02187     return median
02188 
02189 
02190  def amode(a, dimension=None):
02191     """
02192 Returns an array of the modal (most common) score in the passed array.
02193 If there is more than one such score, ONLY THE FIRST is returned.
02194 The bin-count for the modal values is also returned.  Operates on whole
02195 array (dimension=None), or on a given dimension.
02196 
02197 Usage:   amode(a, dimension=None)
02198 Returns: array of bin-counts for mode(s), array of corresponding modal values
02199 """
02200 
02201     if dimension == None:
02202         a = N.ravel(a)
02203         dimension = 0
02204     scores = pstat.aunique(N.ravel(a))       # get ALL unique values
02205     testshape = list(a.shape)
02206     testshape[dimension] = 1
02207     oldmostfreq = N.zeros(testshape)
02208     oldcounts = N.zeros(testshape)
02209     for score in scores:
02210         template = N.equal(a,score)
02211         counts = asum(template,dimension,1)
02212         mostfrequent = N.where(counts>oldcounts,score,oldmostfreq)
02213         oldcounts = N.where(counts>oldcounts,counts,oldcounts)
02214         oldmostfreq = mostfrequent
02215     return oldcounts, mostfrequent
02216 
02217 
02218  def atmean(a,limits=None,inclusive=(1,1)):
02219      """
02220 Returns the arithmetic mean of all values in an array, ignoring values
02221 strictly outside the sequence passed to 'limits'.   Note: either limit
02222 in the sequence, or the value of limits itself, can be set to None.  The
02223 inclusive list/tuple determines whether the lower and upper limiting bounds
02224 (respectively) are open/exclusive (0) or closed/inclusive (1).
02225 
02226 Usage:   atmean(a,limits=None,inclusive=(1,1))
02227 """
02228      if a.dtype in [N.int_, N.short,N.ubyte]:
02229          a = a.astype(N.float_)
02230      if limits == None:
02231          return mean(a)
02232      assert type(limits) in [ListType,TupleType,N.ndarray], "Wrong type for limits in atmean"
02233      if inclusive[0]:         lowerfcn = N.greater_equal
02234      else:               lowerfcn = N.greater
02235      if inclusive[1]:         upperfcn = N.less_equal
02236      else:               upperfcn = N.less
02237      if limits[0] > N.maximum.reduce(N.ravel(a)) or limits[1] < N.minimum.reduce(N.ravel(a)):
02238          raise ValueError, "No array values within given limits (atmean)."
02239      elif limits[0]==None and limits[1]<>None:
02240          mask = upperfcn(a,limits[1])
02241      elif limits[0]<>None and limits[1]==None:
02242          mask = lowerfcn(a,limits[0])
02243      elif limits[0]<>None and limits[1]<>None:
02244          mask = lowerfcn(a,limits[0])*upperfcn(a,limits[1])
02245      s = float(N.add.reduce(N.ravel(a*mask)))
02246      n = float(N.add.reduce(N.ravel(mask)))
02247      return s/n
02248 
02249 
02250  def atvar(a,limits=None,inclusive=(1,1)):
02251      """
02252 Returns the sample variance of values in an array, (i.e., using N-1),
02253 ignoring values strictly outside the sequence passed to 'limits'.  
02254 Note: either limit in the sequence, or the value of limits itself,
02255 can be set to None.  The inclusive list/tuple determines whether the lower
02256 and upper limiting bounds (respectively) are open/exclusive (0) or
02257 closed/inclusive (1). ASSUMES A FLAT ARRAY (OR ELSE PREFLATTENS).
02258 
02259 Usage:   atvar(a,limits=None,inclusive=(1,1))
02260 """
02261      a = a.astype(N.float_)
02262      if limits == None or limits == [None,None]:
02263          return avar(a)
02264      assert type(limits) in [ListType,TupleType,N.ndarray], "Wrong type for limits in atvar"
02265      if inclusive[0]:    lowerfcn = N.greater_equal
02266      else:               lowerfcn = N.greater
02267      if inclusive[1]:    upperfcn = N.less_equal
02268      else:               upperfcn = N.less
02269      if limits[0] > N.maximum.reduce(N.ravel(a)) or limits[1] < N.minimum.reduce(N.ravel(a)):
02270          raise ValueError, "No array values within given limits (atvar)."
02271      elif limits[0]==None and limits[1]<>None:
02272          mask = upperfcn(a,limits[1])
02273      elif limits[0]<>None and limits[1]==None:
02274          mask = lowerfcn(a,limits[0])
02275      elif limits[0]<>None and limits[1]<>None:
02276          mask = lowerfcn(a,limits[0])*upperfcn(a,limits[1])
02277 
02278      a = N.compress(mask,a)  # squish out excluded values
02279      return avar(a)
02280 
02281 
02282  def atmin(a,lowerlimit=None,dimension=None,inclusive=1):
02283      """
02284 Returns the minimum value of a, along dimension, including only values greater
02285 than (or equal to, if inclusive=1) lowerlimit.  If the limit is set to None,
02286 all values in the array are used.
02287 
02288 Usage:   atmin(a,lowerlimit=None,dimension=None,inclusive=1)
02289 """
02290      if inclusive:         lowerfcn = N.greater_equal
02291      else:               lowerfcn = N.greater
02292      if dimension == None:
02293          a = N.ravel(a)
02294          dimension = 0
02295      if lowerlimit == None:
02296          lowerlimit = N.minimum.reduce(N.ravel(a))-11
02297      biggest = N.maximum.reduce(N.ravel(a))
02298      ta = N.where(lowerfcn(a,lowerlimit),a,biggest)
02299      return N.minimum.reduce(ta,dimension)
02300 
02301 
02302  def atmax(a,upperlimit,dimension=None,inclusive=1):
02303      """
02304 Returns the maximum value of a, along dimension, including only values less
02305 than (or equal to, if inclusive=1) upperlimit.  If the limit is set to None,
02306 a limit larger than the max value in the array is used.
02307 
02308 Usage:   atmax(a,upperlimit,dimension=None,inclusive=1)
02309 """
02310      if inclusive:         upperfcn = N.less_equal
02311      else:               upperfcn = N.less
02312      if dimension == None:
02313          a = N.ravel(a)
02314          dimension = 0
02315      if upperlimit == None:
02316          upperlimit = N.maximum.reduce(N.ravel(a))+1
02317      smallest = N.minimum.reduce(N.ravel(a))
02318      ta = N.where(upperfcn(a,upperlimit),a,smallest)
02319      return N.maximum.reduce(ta,dimension)
02320 
02321 
02322  def atstdev(a,limits=None,inclusive=(1,1)):
02323      """
02324 Returns the standard deviation of all values in an array, ignoring values
02325 strictly outside the sequence passed to 'limits'.   Note: either limit
02326 in the sequence, or the value of limits itself, can be set to None.  The
02327 inclusive list/tuple determines whether the lower and upper limiting bounds
02328 (respectively) are open/exclusive (0) or closed/inclusive (1).
02329 
02330 Usage:   atstdev(a,limits=None,inclusive=(1,1))
02331 """
02332      return N.sqrt(tvar(a,limits,inclusive))
02333 
02334 
02335  def atsem(a,limits=None,inclusive=(1,1)):
02336      """
02337 Returns the standard error of the mean for the values in an array,
02338 (i.e., using N for the denominator), ignoring values strictly outside
02339 the sequence passed to 'limits'.   Note: either limit in the sequence,
02340 or the value of limits itself, can be set to None.  The inclusive list/tuple
02341 determines whether the lower and upper limiting bounds (respectively) are
02342 open/exclusive (0) or closed/inclusive (1).
02343 
02344 Usage:   atsem(a,limits=None,inclusive=(1,1))
02345 """
02346      sd = tstdev(a,limits,inclusive)
02347      if limits == None or limits == [None,None]:
02348          n = float(len(N.ravel(a)))
02349          limits = [min(a)-1, max(a)+1]
02350      assert type(limits) in [ListType,TupleType,N.ndarray], "Wrong type for limits in atsem"
02351      if inclusive[0]:         lowerfcn = N.greater_equal
02352      else:               lowerfcn = N.greater
02353      if inclusive[1]:         upperfcn = N.less_equal
02354      else:               upperfcn = N.less
02355      if limits[0] > N.maximum.reduce(N.ravel(a)) or limits[1] < N.minimum.reduce(N.ravel(a)):
02356          raise ValueError, "No array values within given limits (atsem)."
02357      elif limits[0]==None and limits[1]<>None:
02358          mask = upperfcn(a,limits[1])
02359      elif limits[0]<>None and limits[1]==None:
02360          mask = lowerfcn(a,limits[0])
02361      elif limits[0]<>None and limits[1]<>None:
02362          mask = lowerfcn(a,limits[0])*upperfcn(a,limits[1])
02363      term1 = N.add.reduce(N.ravel(a*a*mask))
02364      n = float(N.add.reduce(N.ravel(mask)))
02365      return sd/math.sqrt(n)
02366 
02367 
02368 #####################################
02369 ############  AMOMENTS  #############
02370 #####################################
02371 
02372  def amoment(a,moment=1,dimension=None):
02373     """
02374 Calculates the nth moment about the mean for a sample (defaults to the
02375 1st moment).  Generally used to calculate coefficients of skewness and
02376 kurtosis.  Dimension can equal None (ravel array first), an integer
02377 (the dimension over which to operate), or a sequence (operate over
02378 multiple dimensions).
02379 
02380 Usage:   amoment(a,moment=1,dimension=None)
02381 Returns: appropriate moment along given dimension
02382 """
02383     if dimension == None:
02384         a = N.ravel(a)
02385         dimension = 0
02386     if moment == 1:
02387         return 0.0
02388     else:
02389         mn = amean(a,dimension,1)  # 1=keepdims
02390         s = N.power((a-mn),moment)
02391         return amean(s,dimension)
02392 
02393 
02394  def avariation(a,dimension=None):
02395     """
02396 Returns the coefficient of variation, as defined in CRC Standard
02397 Probability and Statistics, p.6. Dimension can equal None (ravel array
02398 first), an integer (the dimension over which to operate), or a
02399 sequence (operate over multiple dimensions).
02400 
02401 Usage:   avariation(a,dimension=None)
02402 """
02403     return 100.0*asamplestdev(a,dimension)/amean(a,dimension)
02404 
02405 
02406  def askew(a,dimension=None): 
02407     """ 
02408 Returns the skewness of a distribution (normal ==> 0.0; >0 means extra
02409 weight in the right tail).  Use askewtest() to see if it's close enough.
02410 Dimension can equal None (ravel array first), an integer (the
02411 dimension over which to operate), or a sequence (operate over multiple
02412 dimensions).
02413 
02414 Usage:   askew(a, dimension=None)
02415 Returns: skew of vals in a along dimension, returning ZERO where all vals equal
02416 """
02417     denom = N.power(amoment(a,2,dimension),1.5)
02418     zero = N.equal(denom,0)
02419     if type(denom) == N.ndarray and asum(zero) <> 0:
02420         print "Number of zeros in askew: ",asum(zero)
02421     denom = denom + zero  # prevent divide-by-zero
02422     return N.where(zero, 0, amoment(a,3,dimension)/denom)
02423 
02424 
02425  def akurtosis(a,dimension=None):
02426     """
02427 Returns the kurtosis of a distribution (normal ==> 3.0; >3 means
02428 heavier in the tails, and usually more peaked).  Use akurtosistest()
02429 to see if it's close enough.  Dimension can equal None (ravel array
02430 first), an integer (the dimension over which to operate), or a
02431 sequence (operate over multiple dimensions).
02432 
02433 Usage:   akurtosis(a,dimension=None)
02434 Returns: kurtosis of values in a along dimension, and ZERO where all vals equal
02435 """
02436     denom = N.power(amoment(a,2,dimension),2)
02437     zero = N.equal(denom,0)
02438     if type(denom) == N.ndarray and asum(zero) <> 0:
02439         print "Number of zeros in akurtosis: ",asum(zero)
02440     denom = denom + zero  # prevent divide-by-zero
02441     return N.where(zero,0,amoment(a,4,dimension)/denom)
02442 
02443 
02444  def adescribe(inarray,dimension=None):
02445      """
02446 Returns several descriptive statistics of the passed array.  Dimension
02447 can equal None (ravel array first), an integer (the dimension over
02448 which to operate), or a sequence (operate over multiple dimensions).
02449 
02450 Usage:   adescribe(inarray,dimension=None)
02451 Returns: n, (min,max), mean, standard deviation, skew, kurtosis
02452 """
02453      if dimension == None:
02454          inarray = N.ravel(inarray)
02455          dimension = 0
02456      n = inarray.shape[dimension]
02457      mm = (N.minimum.reduce(inarray),N.maximum.reduce(inarray))
02458      m = amean(inarray,dimension)
02459      sd = astdev(inarray,dimension)
02460      skew = askew(inarray,dimension)
02461      kurt = akurtosis(inarray,dimension)
02462      return n, mm, m, sd, skew, kurt
02463 
02464 
02465 #####################################
02466 ########  NORMALITY TESTS  ##########
02467 #####################################
02468 
02469  def askewtest(a,dimension=None):
02470     """
02471 Tests whether the skew is significantly different from a normal
02472 distribution.  Dimension can equal None (ravel array first), an
02473 integer (the dimension over which to operate), or a sequence (operate
02474 over multiple dimensions).
02475 
02476 Usage:   askewtest(a,dimension=None)
02477 Returns: z-score and 2-tail z-probability
02478 """
02479     if dimension == None:
02480         a = N.ravel(a)
02481         dimension = 0
02482     b2 = askew(a,dimension)
02483     n = float(a.shape[dimension])
02484     y = b2 * N.sqrt(((n+1)*(n+3)) / (6.0*(n-2)) )
02485     beta2 = ( 3.0*(n*n+27*n-70)*(n+1)*(n+3) ) / ( (n-2.0)*(n+5)*(n+7)*(n+9) )
02486     W2 = -1 + N.sqrt(2*(beta2-1))
02487     delta = 1/N.sqrt(N.log(N.sqrt(W2)))
02488     alpha = N.sqrt(2/(W2-1))
02489     y = N.where(y==0,1,y)
02490     Z = delta*N.log(y/alpha + N.sqrt((y/alpha)**2+1))
02491     return Z, (1.0-zprob(Z))*2
02492 
02493 
02494  def akurtosistest(a,dimension=None):
02495     """
02496 Tests whether a dataset has normal kurtosis (i.e.,
02497 kurtosis=3(n-1)/(n+1)).  Valid only for n>20.  Dimension can equal None
02498 (ravel array first), an integer (the dimension over which to operate),
02499 or a sequence (operate over multiple dimensions).
02500 
02501 Usage:   akurtosistest(a,dimension=None)
02502 Returns: z-score and 2-tail z-probability, returns 0 for bad pixels
02503 """
02504     if dimension == None:
02505         a = N.ravel(a)
02506         dimension = 0
02507     n = float(a.shape[dimension])
02508     if n<20:
02509         print "akurtosistest only valid for n>=20 ... continuing anyway, n=",n
02510     b2 = akurtosis(a,dimension)
02511     E = 3.0*(n-1) /(n+1)
02512     varb2 = 24.0*n*(n-2)*(n-3) / ((n+1)*(n+1)*(n+3)*(n+5))
02513     x = (b2-E)/N.sqrt(varb2)
02514     sqrtbeta1 = 6.0*(n*n-5*n+2)/((n+7)*(n+9)) * N.sqrt((6.0*(n+3)*(n+5))/
02515                                                        (n*(n-2)*(n-3)))
02516     A = 6.0 + 8.0/sqrtbeta1 *(2.0/sqrtbeta1 + N.sqrt(1+4.0/(sqrtbeta1**2)))
02517     term1 = 1 -2/(9.0*A)
02518     denom = 1 +x*N.sqrt(2/(A-4.0))
02519     denom = N.where(N.less(denom,0), 99, denom)
02520     term2 = N.where(N.equal(denom,0), term1, N.power((1-2.0/A)/denom,1/3.0))
02521     Z = ( term1 - term2 ) / N.sqrt(2/(9.0*A))
02522     Z = N.where(N.equal(denom,99), 0, Z)
02523     return Z, (1.0-zprob(Z))*2
02524 
02525 
02526  def anormaltest(a,dimension=None):
02527     """
02528 Tests whether skew and/OR kurtosis of dataset differs from normal
02529 curve.  Can operate over multiple dimensions.  Dimension can equal
02530 None (ravel array first), an integer (the dimension over which to
02531 operate), or a sequence (operate over multiple dimensions).
02532 
02533 Usage:   anormaltest(a,dimension=None)
02534 Returns: z-score and 2-tail probability
02535 """
02536     if dimension == None:
02537         a = N.ravel(a)
02538         dimension = 0
02539     s,p = askewtest(a,dimension)
02540     k,p = akurtosistest(a,dimension)
02541     k2 = N.power(s,2) + N.power(k,2)
02542     return k2, achisqprob(k2,2)
02543 
02544 
02545 #####################################
02546 ######  AFREQUENCY FUNCTIONS  #######
02547 #####################################
02548 
02549  def aitemfreq(a):
02550     """
02551 Returns a 2D array of item frequencies.  Column 1 contains item values,
02552 column 2 contains their respective counts.  Assumes a 1D array is passed.
02553 @@@sorting OK?
02554 
02555 Usage:   aitemfreq(a)
02556 Returns: a 2D frequency table (col [0:n-1]=scores, col n=frequencies)
02557 """
02558     scores = pstat.aunique(a)
02559     scores = N.sort(scores)
02560     freq = N.zeros(len(scores))
02561     for i in range(len(scores)):
02562         freq[i] = N.add.reduce(N.equal(a,scores[i]))
02563     return N.array(pstat.aabut(scores, freq))
02564 
02565 
02566  def ascoreatpercentile (inarray, percent):
02567     """
02568 Usage:   ascoreatpercentile(inarray,percent)   0<percent<100
02569 Returns: score at given percentile, relative to inarray distribution
02570 """
02571     percent = percent / 100.0
02572     targetcf = percent*len(inarray)
02573     h, lrl, binsize, extras = histogram(inarray)
02574     cumhist = cumsum(h*1)
02575     for i in range(len(cumhist)):
02576         if cumhist[i] >= targetcf:
02577             break
02578     score = binsize * ((targetcf - cumhist[i-1]) / float(h[i])) + (lrl+binsize*i)
02579     return score
02580 
02581 
02582  def apercentileofscore (inarray,score,histbins=10,defaultlimits=None):
02583     """
02584 Note: result of this function depends on the values used to histogram
02585 the data(!).
02586 
02587 Usage:   apercentileofscore(inarray,score,histbins=10,defaultlimits=None)
02588 Returns: percentile-position of score (0-100) relative to inarray
02589 """
02590     h, lrl, binsize, extras = histogram(inarray,histbins,defaultlimits)
02591     cumhist = cumsum(h*1)
02592     i = int((score - lrl)/float(binsize))
02593     pct = (cumhist[i-1]+((score-(lrl+binsize*i))/float(binsize))*h[i])/float(len(inarray)) * 100
02594     return pct
02595 
02596 
02597  def ahistogram (inarray,numbins=10,defaultlimits=None,printextras=1):
02598     """
02599 Returns (i) an array of histogram bin counts, (ii) the smallest value
02600 of the histogram binning, and (iii) the bin width (the last 2 are not
02601 necessarily integers).  Default number of bins is 10.  Defaultlimits
02602 can be None (the routine picks bins spanning all the numbers in the
02603 inarray) or a 2-sequence (lowerlimit, upperlimit).  Returns all of the
02604 following: array of bin values, lowerreallimit, binsize, extrapoints.
02605 
02606 Usage:   ahistogram(inarray,numbins=10,defaultlimits=None,printextras=1)
02607 Returns: (array of bin counts, bin-minimum, bin-width, #-points-outside-range)
02608 """
02609     inarray = N.ravel(inarray)               # flatten any >1D arrays
02610     if (defaultlimits <> None):
02611         lowerreallimit = defaultlimits[0]
02612         upperreallimit = defaultlimits[1]
02613         binsize = (upperreallimit-lowerreallimit) / float(numbins)
02614     else:
02615         Min = N.minimum.reduce(inarray)
02616         Max = N.maximum.reduce(inarray)
02617         estbinwidth = float(Max - Min)/float(numbins) + 1e-6
02618         binsize = (Max-Min+estbinwidth)/float(numbins)
02619         lowerreallimit = Min - binsize/2.0  #lower real limit,1st bin
02620     bins = N.zeros(numbins)
02621     extrapoints = 0
02622     for num in inarray:
02623         try:
02624             if (num-lowerreallimit) < 0:
02625                 extrapoints = extrapoints + 1
02626             else:
02627                 bintoincrement = int((num-lowerreallimit) / float(binsize))
02628                 bins[bintoincrement] = bins[bintoincrement] + 1
02629         except:                           # point outside lower/upper limits
02630             extrapoints = extrapoints + 1
02631     if (extrapoints > 0 and printextras == 1):
02632         print '\nPoints outside given histogram range =',extrapoints
02633     return (bins, lowerreallimit, binsize, extrapoints)
02634 
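 # Hedged example (added for illustration; not part of the original module):
 # ten values binned into 5 bins; ahistogram also reports the lower real
 # limit of the first bin, the bin width, and the count of out-of-range points.
 def _example_histogram():
    """Illustrative only: ahistogram on a small 1D array."""
    data = N.array([1, 2, 2, 3, 3, 3, 4, 4, 5, 9], N.float_)
    return ahistogram(data, numbins=5)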
02635 
02636  def acumfreq(a,numbins=10,defaultreallimits=None):
02637     """
02638 Returns a cumulative frequency histogram, using the histogram function.
02639 Defaultreallimits can be None (use all data), or a 2-sequence containing
02640 lower and upper limits on values to include.
02641 
02642 Usage:   acumfreq(a,numbins=10,defaultreallimits=None)
02643 Returns: array of cumfreq bin values, lowerreallimit, binsize, extrapoints
02644 """
02645     h,l,b,e = histogram(a,numbins,defaultreallimits)
02646     cumhist = cumsum(h*1)
02647     return cumhist,l,b,e
02648 
02649 
02650  def arelfreq(a,numbins=10,defaultreallimits=None):
02651     """
02652 Returns a relative frequency histogram, using the histogram function.
02653 Defaultreallimits can be None (use all data), or a 2-sequence containing
02654 lower and upper limits on values to include.
02655 
02656 Usage:   arelfreq(a,numbins=10,defaultreallimits=None)
02657 Returns: array of relative freq bin values, lowerreallimit, binsize, extrapoints
02658 """
02659     h,l,b,e = histogram(a,numbins,defaultreallimits)
02660     h = N.array(h/float(a.shape[0]))
02661     return h,l,b,e
02662 
02663 
02664 #####################################
02665 ######  AVARIABILITY FUNCTIONS  #####
02666 #####################################
02667 
02668  def aobrientransform(*args):
02669     """
02670 Computes a transform on input data (any number of columns).  Used to
02671 test for homogeneity of variance prior to running one-way stats.  Each
02672 array in *args is one level of a factor.  If an F_oneway() run on the
02673 transformed data comes out significant, the variances are unequal.   From
02674 Maxwell and Delaney, p.112.
02675 
02676 Usage:   aobrientransform(*args)    *args = 1D arrays, one per level of factor
02677 Returns: transformed data for use in an ANOVA
02678 """
02679     TINY = 1e-10
02680     k = len(args)
02681     n = N.zeros(k,N.float_)
02682     v = N.zeros(k,N.float_)
02683     m = N.zeros(k,N.float_)
02684     nargs = []
02685     for i in range(k):
02686         nargs.append(args[i].astype(N.float_))
02687         n[i] = float(len(nargs[i]))
02688         v[i] = var(nargs[i])
02689         m[i] = mean(nargs[i])
02690     for j in range(k):
02691         for i in range(int(n[j])):
02692             t1 = (n[j]-1.5)*n[j]*(nargs[j][i]-m[j])**2
02693             t2 = 0.5*v[j]*(n[j]-1.0)
02694             t3 = (n[j]-1.0)*(n[j]-2.0)
02695             nargs[j][i] = (t1-t2) / float(t3)
02696     check = 1
02697     for j in range(k):
02698         if v[j] - mean(nargs[j]) > TINY:
02699             check = 0
02700     if check <> 1:
02701         raise ValueError, 'Lack of convergence in obrientransform.'
02702     else:
02703         return N.array(nargs)
02704 
02705 
02706  def asamplevar (inarray,dimension=None,keepdims=0):
02707     """
02708 Returns the sample variance of the values in the passed
02709 array (i.e., using N).  Dimension can equal None (ravel array first),
02710 an integer (the dimension over which to operate), or a sequence
02711 (operate over multiple dimensions).  Set keepdims=1 to return an array
02712 with the same number of dimensions as inarray.
02713 
02714 Usage:   asamplevar(inarray,dimension=None,keepdims=0)
02715 """
02716     if dimension == None:
02717         inarray = N.ravel(inarray)
02718         dimension = 0
02719     if dimension == 1:
02720         mn = amean(inarray,dimension)[:,N.newaxis]
02721     else:
02722         mn = amean(inarray,dimension,keepdims=1)
02723     deviations = inarray - mn 
02724     if type(dimension) == ListType:
02725         n = 1
02726         for d in dimension:
02727             n = n*inarray.shape[d]
02728     else:
02729         n = inarray.shape[dimension]
02730     svar = ass(deviations,dimension,keepdims) / float(n)
02731     return svar
02732 
02733 
02734  def asamplestdev (inarray, dimension=None, keepdims=0):
02735     """
02736 Returns the sample standard deviation of the values in the passed
02737 array (i.e., using N).  Dimension can equal None (ravel array first),
02738 an integer (the dimension over which to operate), or a sequence
02739 (operate over multiple dimensions).  Set keepdims=1 to return an array
02740 with the same number of dimensions as inarray.
02741 
02742 Usage:   asamplestdev(inarray,dimension=None,keepdims=0)
02743 """
02744     return N.sqrt(asamplevar(inarray,dimension,keepdims))
02745 
02746 
02747  def asignaltonoise(instack,dimension=0):
02748     """
02749 Calculates signal-to-noise.  Dimension can equal None (ravel array
02750 first), an integer (the dimension over which to operate), or a
02751 sequence (operate over multiple dimensions).
02752 
02753 Usage:   asignaltonoise(instack,dimension=0):
02754 Returns: array containing the value of (mean/stdev) along dimension,
02755          or 0 when stdev=0
02756 """
02757     m = mean(instack,dimension)
02758     sd = stdev(instack,dimension)
02759     return N.where(sd==0,0,m/sd)
02760 
02761 
02762  def acov (x,y, dimension=None,keepdims=0):
02763     """
02764 Returns the estimated covariance of the values in the passed
02765 array (i.e., N-1).  Dimension can equal None (ravel array first), an
02766 integer (the dimension over which to operate), or a sequence (operate
02767 over multiple dimensions).  Set keepdims=1 to return an array with the
02768 same number of dimensions as inarray.
02769 
02770 Usage:   acov(x,y,dimension=None,keepdims=0)
02771 """
02772     if dimension == None:
02773         x = N.ravel(x)
02774         y = N.ravel(y)
02775         dimension = 0
02776     xmn = amean(x,dimension,1)  # keepdims
02777     xdeviations = x - xmn
02778     ymn = amean(y,dimension,1)  # keepdims
02779     ydeviations = y - ymn
02780     if type(dimension) == ListType:
02781         n = 1
02782         for d in dimension:
02783             n = n*x.shape[d]
02784     else:
02785         n = x.shape[dimension]
02786     covar = N.sum(xdeviations*ydeviations)/float(n-1)
02787     return covar
02788 
02789 
02790  def avar (inarray, dimension=None,keepdims=0):
02791     """
02792 Returns the estimated population variance of the values in the passed
02793 array (i.e., N-1).  Dimension can equal None (ravel array first), an
02794 integer (the dimension over which to operate), or a sequence (operate
02795 over multiple dimensions).  Set keepdims=1 to return an array with the
02796 same number of dimensions as inarray.
02797 
02798 Usage:   avar(inarray,dimension=None,keepdims=0)
02799 """
02800     if dimension == None:
02801         inarray = N.ravel(inarray)
02802         dimension = 0
02803     mn = amean(inarray,dimension,1)
02804     deviations = inarray - mn
02805     if type(dimension) == ListType:
02806         n = 1
02807         for d in dimension:
02808             n = n*inarray.shape[d]
02809     else:
02810         n = inarray.shape[dimension]
02811     var = ass(deviations,dimension,keepdims)/float(n-1)
02812     return var
02813 
02814 
02815  def astdev (inarray, dimension=None, keepdims=0):
02816     """
02817 Returns the estimated population standard deviation of the values in
02818 the passed array (i.e., N-1).  Dimension can equal None (ravel array
02819 first), an integer (the dimension over which to operate), or a
02820 sequence (operate over multiple dimensions).  Set keepdims=1 to return
02821 an array with the same number of dimensions as inarray.
02822 
02823 Usage:   astdev(inarray,dimension=None,keepdims=0)
02824 """
02825     return N.sqrt(avar(inarray,dimension,keepdims))
02826 
02827 
02828  def asterr (inarray, dimension=None, keepdims=0):
02829     """
02830 Returns the estimated population standard error of the values in the
02831 passed array (i.e., N-1).  Dimension can equal None (ravel array
02832 first), an integer (the dimension over which to operate), or a
02833 sequence (operate over multiple dimensions).  Set keepdims=1 to return
02834 an array with the same number of dimensions as inarray.
02835 
02836 Usage:   asterr(inarray,dimension=None,keepdims=0)
02837 """
02838     if dimension == None:
02839         inarray = N.ravel(inarray)
02840         dimension = 0
02841     return astdev(inarray,dimension,keepdims) / float(N.sqrt(inarray.shape[dimension]))
02842 
02843 
02844  def asem (inarray, dimension=None, keepdims=0):
02845     """
02846 Returns the standard error of the mean (i.e., using N) of the values
02847 in the passed array.  Dimension can equal None (ravel array first), an
02848 integer (the dimension over which to operate), or a sequence (operate
02849 over multiple dimensions).  Set keepdims=1 to return an array with the
02850 same number of dimensions as inarray.
02851 
02852 Usage:   asem(inarray,dimension=None, keepdims=0)
02853 """
02854     if dimension == None:
02855         inarray = N.ravel(inarray)
02856         dimension = 0
02857     if type(dimension) == ListType:
02858         n = 1
02859         for d in dimension:
02860             n = n*inarray.shape[d]
02861     else:
02862         n = inarray.shape[dimension]
02863     s = asamplestdev(inarray,dimension,keepdims) / N.sqrt(n-1)
02864     return s
02865 
02866 
02867  def az (a, score):
02868     """
02869 Returns the z-score of a given input score, given the array from which
02870 that score came.  Not appropriate for population calculations, nor for
02871 arrays > 1D.
02872 
02873 Usage:   az(a, score)
02874 """
02875     z = (score-amean(a)) / asamplestdev(a)
02876     return z
02877 
02878 
02879  def azs (a):
02880     """
02881 Returns a 1D array of z-scores, one for each score in the passed array,
02882 computed relative to the passed array.
02883 
02884 Usage:   azs(a)
02885 """
02886     zscores = []
02887     for item in a:
02888         zscores.append(z(a,item))
02889     return N.array(zscores)
02890 
02891 
02892  def azmap (scores, compare, dimension=0):
02893     """
02894 Returns an array of z-scores the shape of scores (e.g., [x,y]), compared to
02895 array passed to compare (e.g., [time,x,y]).  Assumes collapsing over dim 0
02896 of the compare array.
02897 
02898 Usage:   azmap(scores, compare, dimension=0)
02899 """
02900     mns = amean(compare,dimension)
02901     sstd = asamplestdev(compare,dimension)
02902     return (scores - mns) / sstd
02903 
02904 
02905 #####################################
02906 #######  ATRIMMING FUNCTIONS  #######
02907 #####################################
02908 
02909 ## deleted around() as it's in numpy now
02910 
02911  def athreshold(a,threshmin=None,threshmax=None,newval=0):
02912     """
02913 Like Numeric.clip() except that values <threshmin or >threshmax are replaced
02914 by newval instead of by threshmin/threshmax (respectively).
02915 
02916 Usage:   athreshold(a,threshmin=None,threshmax=None,newval=0)
02917 Returns: a, with values <threshmin or >threshmax replaced with newval
02918 """
02919     mask = N.zeros(a.shape)
02920     if threshmin <> None:
02921         mask = mask + N.where(a<threshmin,1,0)
02922     if threshmax <> None:
02923         mask = mask + N.where(a>threshmax,1,0)
02924     mask = N.clip(mask,0,1)
02925     return N.where(mask,newval,a)
02926 
02927 
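# --- Editor's note: hedged usage sketch, not part of the original module ---
# Unlike N.clip(), athreshold() replaces out-of-range values with a single
# newval instead of saturating them at the bounds.  The helper name
# _example_athreshold is hypothetical.
 def _example_athreshold():
    a = N.array([1, 5, 10, 15, 20])
    clipped     = N.clip(a, 5, 15)                # -> [ 5  5 10 15 15]
    thresholded = athreshold(a, 5, 15, newval=0)  # -> [ 0  5 10 15  0]
    return clipped, thresholded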
02928  def atrimboth (a,proportiontocut):
02929     """
02930 Slices off the passed proportion of items from BOTH ends of the passed
02931 array (i.e., with proportiontocut=0.1, slices 'leftmost' 10% AND
02932 'rightmost' 10% of scores).  You must pre-sort the array if you want
02933 "proper" trimming.  Slices off LESS if proportion results in a
02934 non-integer slice index (i.e., conservatively slices off
02935 proportiontocut).
02936 
02937 Usage:   atrimboth (a,proportiontocut)
02938 Returns: trimmed version of array a
02939 """
02940     lowercut = int(proportiontocut*len(a))
02941     uppercut = len(a) - lowercut
02942     return a[lowercut:uppercut]
02943 
02944 
02945  def atrim1 (a,proportiontocut,tail='right'):
02946     """
02947 Slices off the passed proportion of items from ONE end of the passed
02948 array (i.e., if proportiontocut=0.1, slices off 'leftmost' or 'rightmost'
02949 10% of scores).  Slices off LESS if proportion results in a non-integer
02950 slice index (i.e., conservatively slices off proportiontocut).
02951 
02952 Usage:   atrim1(a,proportiontocut,tail='right')  or set tail='left'
02953 Returns: trimmed version of array a
02954 """
02955     if string.lower(tail) == 'right':
02956         lowercut = 0
02957         uppercut = len(a) - int(proportiontocut*len(a))
02958     elif string.lower(tail) == 'left':
02959         lowercut = int(proportiontocut*len(a))
02960         uppercut = len(a)
02961     return a[lowercut:uppercut]
02962 
02963 
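# --- Editor's note: hedged usage sketch, not part of the original module ---
# The trimming routines slice positionally, so sort the array first if a
# proper trimmed sample is wanted.  The helper name _example_atrim is
# hypothetical.
 def _example_atrim():
    a = N.sort(N.array([9., 1., 7., 3., 5., 2., 8., 4., 6., 10.]))
    both  = atrimboth(a, 0.1)        # drops the lowest and highest 10%
    right = atrim1(a, 0.2, 'right')  # drops only the highest 20%
    return both, right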
02964 #####################################
02965 #####  ACORRELATION FUNCTIONS  ######
02966 #####################################
02967 
02968  def acovariance(X):
02969     """
02970 Computes the covariance matrix of a matrix X.  Requires a 2D matrix input.
02971 
02972 Usage:   acovariance(X)
02973 Returns: covariance matrix of X
02974 """
02975     if len(X.shape) <> 2:
02976         raise TypeError, "acovariance requires 2D matrices"
02977     n = X.shape[0]
02978     mX = amean(X,0)
02979     return N.dot(N.transpose(X),X) / float(n) - N.multiply.outer(mX,mX)
02980 
02981 
02982  def acorrelation(X):
02983     """
02984 Computes the correlation matrix of a matrix X.  Requires a 2D matrix input.
02985 
02986 Usage:   acorrelation(X)
02987 Returns: correlation matrix of X
02988 """
02989     C = acovariance(X)
02990     V = N.diagonal(C)
02991     return C / N.sqrt(N.multiply.outer(V,V))
02992 
02993 
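# --- Editor's note: hedged usage sketch, not part of the original module ---
# acovariance()/acorrelation() expect a 2D matrix with observations in rows
# and variables in columns; acorrelation() returns a (vars x vars) matrix
# with 1.0 on the diagonal.  The helper name _example_acorrelation is
# hypothetical.
 def _example_acorrelation():
    X = N.array([[1., 2.0],
                 [2., 4.1],
                 [3., 5.9],
                 [4., 8.2]])
    return acorrelation(X)   # 2x2 correlation matrix of the two columns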
02994  def apaired(x,y):
02995     """
02996 Interactively determines the type of data in x and y, and then runs the
02997 appropriated statistic for paired group data.
02998 
02999 Usage:   apaired(x,y)     x,y = the two arrays of values to be compared
03000 Returns: appropriate statistic name, value, and probability
03001 """
03002     samples = ''
03003     while samples not in ['i','r','I','R','c','C']:
03004         print '\nIndependent or related samples, or correlation (i,r,c): ',
03005         samples = raw_input()
03006 
03007     if samples in ['i','I','r','R']:
03008         print '\nComparing variances ...',
03009 # USE O'BRIEN'S TEST FOR HOMOGENEITY OF VARIANCE, Maxwell & Delaney, p.112
03010         r = obrientransform(x,y)
03011         f,p = F_oneway(pstat.colex(r,0),pstat.colex(r,1))
03012         if p<0.05:
03013             vartype='unequal, p='+str(round(p,4))
03014         else:
03015             vartype='equal'
03016         print vartype
03017         if samples in ['i','I']:
03018             if vartype[0]=='e':
03019                 t,p = ttest_ind(x,y,None,0)
03020                 print '\nIndependent samples t-test:  ', round(t,4),round(p,4)
03021             else:
03022                 if len(x)>20 or len(y)>20:
03023                     z,p = ranksums(x,y)
03024                     print '\nRank Sums test (NONparametric, n>20):  ', round(z,4),round(p,4)
03025                 else:
03026                     u,p = mannwhitneyu(x,y)
03027                     print '\nMann-Whitney U-test (NONparametric, ns<20):  ', round(u,4),round(p,4)
03028 
03029         else:  # RELATED SAMPLES
03030             if vartype[0]=='e':
03031                 t,p = ttest_rel(x,y,0)
03032                 print '\nRelated samples t-test:  ', round(t,4),round(p,4)
03033             else:
03034                 t,p = wilcoxont(x,y)
03035                 print '\nWilcoxon T-test (NONparametric):  ', round(t,4),round(p,4)
03036     else:  # CORRELATION ANALYSIS
03037         corrtype = ''
03038         while corrtype not in ['c','C','r','R','d','D']:
03039             print '\nIs the data Continuous, Ranked, or Dichotomous (c,r,d): ',
03040             corrtype = raw_input()
03041         if corrtype in ['c','C']:
03042             m,b,r,p,see = linregress(x,y)
03043             print '\nLinear regression for continuous variables ...'
03044             lol = [['Slope','Intercept','r','Prob','SEestimate'],[round(m,4),round(b,4),round(r,4),round(p,4),round(see,4)]]
03045             pstat.printcc(lol)
03046         elif corrtype in ['r','R']:
03047             r,p = spearmanr(x,y)
03048             print '\nCorrelation for ranked variables ...'
03049             print "Spearman's r: ",round(r,4),round(p,4)
03050         else: # DICHOTOMOUS
03051             r,p = pointbiserialr(x,y)
03052             print '\nAssuming x contains a dichotomous variable ...'
03053             print 'Point Biserial r: ',round(r,4),round(p,4)
03054     print '\n\n'
03055     return None
03056 
03057 
03058  def dices(x,y):
03059     """
03060 Calculates Dice's coefficient ... (2*number of common terms)/(number of terms in x +
03061 number of terms in y). Returns a value between 0 (orthogonal) and 1.
03062 
03063 Usage:  dices(x,y)
03064 """
03065     import sets
03066     x = sets.Set(x)
03067     y = sets.Set(y)
03068     common = len(x.intersection(y))
03069     total = float(len(x) + len(y))
03070     return 2*common/total
03071 
03072 
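# --- Editor's note: hedged usage sketch, not part of the original module ---
# dices() works on the *sets* of terms, so duplicates are ignored and order
# does not matter.  The helper name _example_dices is hypothetical.
 def _example_dices():
    return dices(['a', 'b', 'c', 'd'], ['c', 'd', 'e'])   # 2*2/(4+3) ~ 0.571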
03073  def icc(x,y=None,verbose=0):
03074     """
03075 Calculates intraclass correlation coefficients using simple, Type I sums of squares.
03076 If only one variable is passed, it is assumed to be an Nx2 matrix.
03077 
03078 Usage:   icc(x,y=None,verbose=0)
03079 Returns: icc rho, prob ####PROB IS A GUESS BASED ON PEARSON
03080 """
03081     TINY = 1.0e-20
03082     if y:
03083         all = N.concatenate([x,y],0)
03084     else:
03085         all = x+0
03086         x = all[:,0]
03087         y = all[:,1]
03088     totalss = ass(all-mean(all))
03089     pairmeans = (x+y)/2.
03090     withinss = ass(x-pairmeans) + ass(y-pairmeans)
03091     withindf = float(len(x))
03092     betwdf = float(len(x)-1)
03093     withinms = withinss / withindf
03094     betweenms = (totalss-withinss) / betwdf
03095     rho = (betweenms-withinms)/(withinms+betweenms)
03096     t = rho*math.sqrt(betwdf/((1.0-rho+TINY)*(1.0+rho+TINY)))
03097     prob = abetai(0.5*betwdf,0.5,betwdf/(betwdf+t*t),verbose)
03098     return rho, prob
03099 
03100 
03101  def alincc(x,y):
03102     """
03103 Calculates Lin's concordance correlation coefficient.
03104 
03105 Usage:   alincc(x,y)    where x, y are equal-length arrays
03106 Returns: Lin's CC
03107 """
03108     x = N.ravel(x)
03109     y = N.ravel(y)
03110     covar = acov(x,y)*(len(x)-1)/float(len(x))  # correct denom to n
03111     xvar = avar(x)*(len(x)-1)/float(len(x))  # correct denom to n
03112     yvar = avar(y)*(len(y)-1)/float(len(y))  # correct denom to n
03113     lincc = (2 * covar) / ((xvar+yvar) +((amean(x)-amean(y))**2))
03114     return lincc
03115 
03116 
03117  def apearsonr(x,y,verbose=1):
03118     """
03119 Calculates a Pearson correlation coefficient and the associated p-value.  Taken
03120 from Heiman's Basic Statistics for the Behav. Sci (2nd), p.195.
03121 
03122 Usage:   apearsonr(x,y,verbose=1)      where x,y are equal length arrays
03123 Returns: Pearson's r, two-tailed p-value
03124 """
03125     TINY = 1.0e-20
03126     n = len(x)
03127     xmean = amean(x)
03128     ymean = amean(y)
03129     r_num = n*(N.add.reduce(x*y)) - N.add.reduce(x)*N.add.reduce(y)
03130     r_den = math.sqrt((n*ass(x) - asquare_of_sums(x))*(n*ass(y)-asquare_of_sums(y)))
03131     r = (r_num / r_den)
03132     df = n-2
03133     t = r*math.sqrt(df/((1.0-r+TINY)*(1.0+r+TINY)))
03134     prob = abetai(0.5*df,0.5,df/(df+t*t),verbose)
03135     return r,prob
03136 
03137 
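# --- Editor's note: hedged usage sketch, not part of the original module ---
# apearsonr() takes two equal-length float arrays and returns both r and the
# two-tailed p-value; verbose=0 silences abetacf()'s non-convergence notes.
# The helper name _example_apearsonr is hypothetical.
 def _example_apearsonr():
    x = N.array([1., 2., 3., 4., 5., 6.])
    y = N.array([2., 1., 4., 3., 7., 5.])
    r, p = apearsonr(x, y, verbose=0)
    return r, p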
03138  def aspearmanr(x,y):
03139     """
03140 Calculates a Spearman rank-order correlation coefficient.  Taken
03141 from Heiman's Basic Statistics for the Behav. Sci (1st), p.192.
03142 
03143 Usage:   aspearmanr(x,y)      where x,y are equal-length arrays
03144 Returns: Spearman's r, two-tailed p-value
03145 """
03146     TINY = 1e-30
03147     n = len(x)
03148     rankx = rankdata(x)
03149     ranky = rankdata(y)
03150     dsq = N.add.reduce((rankx-ranky)**2)
03151     rs = 1 - 6*dsq / float(n*(n**2-1))
03152     t = rs * math.sqrt((n-2) / ((rs+1.0)*(1.0-rs)))
03153     df = n-2
03154     probrs = abetai(0.5*df,0.5,df/(df+t*t))
03155 # probability values for rs are from part 2 of the spearman function in
03156 # Numerical Recipes, p.510.  They are close to table values, but not exact. (?)
03157     return rs, probrs
03158 
03159 
03160  def apointbiserialr(x,y):
03161     """
03162 Calculates a point-biserial correlation coefficient and the associated
03163 probability value.  Taken from Heiman's Basic Statistics for the Behav.
03164 Sci (1st), p.194.
03165 
03166 Usage:   apointbiserialr(x,y)      where x,y are equal length arrays
03167 Returns: Point-biserial r, two-tailed p-value
03168 """
03169     TINY = 1e-30
03170     categories = pstat.aunique(x)
03171     data = pstat.aabut(x,y)
03172     if len(categories) <> 2:
03173         raise ValueError, "Exactly 2 categories required (in x) for pointbiserialr()."
03174     else:   # there are 2 categories, continue
03175         codemap = pstat.aabut(categories,N.arange(2))
03176         recoded = pstat.arecode(data,codemap,0)
03177         x = pstat.alinexand(data,0,categories[0])
03178         y = pstat.alinexand(data,0,categories[1])
03179         xmean = amean(pstat.acolex(x,1))
03180         ymean = amean(pstat.acolex(y,1))
03181         n = len(data)
03182         adjust = math.sqrt((len(x)/float(n))*(len(y)/float(n)))
03183         rpb = (ymean - xmean)/asamplestdev(pstat.acolex(data,1))*adjust
03184         df = n-2
03185         t = rpb*math.sqrt(df/((1.0-rpb+TINY)*(1.0+rpb+TINY)))
03186         prob = abetai(0.5*df,0.5,df/(df+t*t))
03187         return rpb, prob
03188 
03189 
03190  def akendalltau(x,y):
03191     """
03192 Calculates Kendall's tau ... correlation of ordinal data.  Adapted
03193 from function kendl1 in Numerical Recipes.  Needs good test-cases.@@@
03194 
03195 Usage:   akendalltau(x,y)
03196 Returns: Kendall's tau, two-tailed p-value
03197 """
03198     n1 = 0
03199     n2 = 0
03200     iss = 0
03201     for j in range(len(x)-1):
03202         for k in range(j,len(y)):
03203             a1 = x[j] - x[k]
03204             a2 = y[j] - y[k]
03205             aa = a1 * a2
03206             if (aa):             # neither array has a tie
03207                 n1 = n1 + 1
03208                 n2 = n2 + 1
03209                 if aa > 0:
03210                     iss = iss + 1
03211                 else:
03212                     iss = iss -1
03213             else:
03214                 if (a1):
03215                     n1 = n1 + 1
03216                 else:
03217                     n2 = n2 + 1
03218     tau = iss / math.sqrt(n1*n2)
03219     svar = (4.0*len(x)+10.0) / (9.0*len(x)*(len(x)-1))
03220     z = tau / math.sqrt(svar)
03221     prob = erfcc(abs(z)/1.4142136)
03222     return tau, prob
03223 
03224 
03225  def alinregress(*args):
03226     """
03227 Calculates a regression line on two arrays, x and y, corresponding to x,y
03228 pairs.  If a single 2D array is passed, alinregress finds dim with 2 levels
03229 and splits data into x,y pairs along that dim.
03230 
03231 Usage:   alinregress(*args)    args=2 equal-length arrays, or one 2D array
03232 Returns: slope, intercept, r, two-tailed prob, sterr-of-the-estimate, n
03233 """
03234     TINY = 1.0e-20
03235     if len(args) == 1:  # more than 1D array?
03236         args = args[0]
03237         if len(args) == 2:
03238             x = args[0]
03239             y = args[1]
03240         else:
03241             x = args[:,0]
03242             y = args[:,1]
03243     else:
03244         x = args[0]
03245         y = args[1]
03246     n = len(x)
03247     xmean = amean(x)
03248     ymean = amean(y)
03249     r_num = n*(N.add.reduce(x*y)) - N.add.reduce(x)*N.add.reduce(y)
03250     r_den = math.sqrt((n*ass(x) - asquare_of_sums(x))*(n*ass(y)-asquare_of_sums(y)))
03251     r = r_num / r_den
03252     z = 0.5*math.log((1.0+r+TINY)/(1.0-r+TINY))
03253     df = n-2
03254     t = r*math.sqrt(df/((1.0-r+TINY)*(1.0+r+TINY)))
03255     prob = abetai(0.5*df,0.5,df/(df+t*t))
03256     slope = r_num / (float(n)*ass(x) - asquare_of_sums(x))
03257     intercept = ymean - slope*xmean
03258     sterrest = math.sqrt(1-r*r)*asamplestdev(y)
03259     return slope, intercept, r, prob, sterrest, n
03260 
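# --- Editor's note: hedged usage sketch, not part of the original module ---
# alinregress() accepts either two 1D arrays or a single (N x 2) array of
# x,y pairs; both calls below fit the same line.  The helper name
# _example_alinregress is hypothetical.
 def _example_alinregress():
    x = N.array([0., 1., 2., 3., 4.])
    y = N.array([1.1, 2.9, 5.2, 7.1, 8.8])
    slope, intercept, r, prob, sterr, n = alinregress(x, y)
    pairs = alinregress(N.transpose(N.array([x, y])))   # (N x 2) form
    return (slope, intercept), (pairs[0], pairs[1])     # same slope/intercept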
03261  def amasslinregress(*args):
03262     """
03263 Calculates a regression line on one 1D array (x) and one N-D array (y).
03264 
03265 Returns: slope, intercept, r, two-tailed prob, sterr-of-the-estimate, n
03266 """
03267     TINY = 1.0e-20
03268     if len(args) == 1:  # more than 1D array?
03269         args = args[0]
03270         if len(args) == 2:
03271             x = N.ravel(args[0])
03272             y = args[1]
03273         else:
03274             x = N.ravel(args[:,0])
03275             y = args[:,1]
03276     else:
03277         x = args[0]
03278         y = args[1]
03279     x = x.astype(N.float_)
03280     y = y.astype(N.float_)
03281     n = len(x)
03282     xmean = amean(x)
03283     ymean = amean(y,0)
03284     shp = N.ones(len(y.shape))
03285     shp[0] = len(x)
03286     x.shape = shp
03287     #print x.shape, y.shape   # leftover debug output, disabled
03288     r_num = n*(N.add.reduce(x*y,0)) - N.add.reduce(x)*N.add.reduce(y,0)
03289     r_den = N.sqrt((n*ass(x) - asquare_of_sums(x))*(n*ass(y,0)-asquare_of_sums(y,0)))
03290     zerodivproblem = N.equal(r_den,0)
03291     r_den = N.where(zerodivproblem,1,r_den)  # avoid zero-division in 1st place
03292     r = r_num / r_den  # need to do this nicely for matrix division
03293     r = N.where(zerodivproblem,0.0,r)
03294     z = 0.5*N.log((1.0+r+TINY)/(1.0-r+TINY))
03295     df = n-2
03296     t = r*N.sqrt(df/((1.0-r+TINY)*(1.0+r+TINY)))
03297     prob = abetai(0.5*df,0.5,df/(df+t*t))
03298 
03299     ss = float(n)*ass(x)-asquare_of_sums(x)
03300     s_den = N.where(ss==0,1,ss)  # avoid zero-division in 1st place
03301     slope = r_num / s_den
03302     intercept = ymean - slope*xmean
03303     sterrest = N.sqrt(1-r*r)*asamplestdev(y,0)
03304     return slope, intercept, r, prob, sterrest, n
03305 
03306 
03307 #####################################
03308 #####  AINFERENTIAL STATISTICS  #####
03309 #####################################
03310 
03311  def attest_1samp(a,popmean,printit=0,name='Sample',writemode='a'):
03312     """
03313 Calculates the t-value for a single-sample t-test on ONE group of
03314 scores a, given a population mean.  If printit=1, results are printed
03315 to the screen.  If printit='filename', the results are output to 'filename'
03316 using the given writemode (default=append).  Returns t-value, and prob.
03317 
03318 Usage:   attest_1samp(a,popmean,printit=0,name='Sample',writemode='a')
03319 Returns: t-value, two-tailed prob
03320 """
03321     if type(a) != N.ndarray:
03322         a = N.array(a)
03323     x = amean(a)
03324     v = avar(a)
03325     n = len(a)
03326     df = n-1
03327     svar = ((n-1)*v) / float(df)
03328     t = (x-popmean)/math.sqrt(svar*(1.0/n))
03329     prob = abetai(0.5*df,0.5,df/(df+t*t))
03330 
03331     if printit <> 0:
03332         statname = 'Single-sample T-test.'
03333         outputpairedstats(printit,writemode,
03334                           'Population','--',popmean,0,0,0,
03335                           name,n,x,v,N.minimum.reduce(N.ravel(a)),
03336                           N.maximum.reduce(N.ravel(a)),
03337                           statname,t,prob)
03338     return t,prob
03339 
03340 
03341  def attest_ind (a, b, dimension=None, printit=0, name1='Samp1', name2='Samp2',writemode='a'):
03342     """
03343 Calculates the t-obtained T-test on TWO INDEPENDENT samples of scores
03344 a, and b.  From Numerical Recipes, p.483.  If printit=1, results are
03345 printed to the screen.  If printit='filename', the results are output
03346 to 'filename' using the given writemode (default=append).  Dimension
03347 can equal None (ravel array first), or an integer (the dimension over
03348 which to operate on a and b).
03349 
03350 Usage:   attest_ind (a,b,dimension=None,printit=0,
03351                      Name1='Samp1',Name2='Samp2',writemode='a')
03352 Returns: t-value, two-tailed p-value
03353 """
03354     if dimension == None:
03355         a = N.ravel(a)
03356         b = N.ravel(b)
03357         dimension = 0
03358     x1 = amean(a,dimension)
03359     x2 = amean(b,dimension)
03360     v1 = avar(a,dimension)
03361     v2 = avar(b,dimension)
03362     n1 = a.shape[dimension]
03363     n2 = b.shape[dimension]
03364     df = n1+n2-2
03365     svar = ((n1-1)*v1+(n2-1)*v2) / float(df)
03366     zerodivproblem = N.equal(svar,0)
03367     svar = N.where(zerodivproblem,1,svar)  # avoid zero-division in 1st place
03368     t = (x1-x2)/N.sqrt(svar*(1.0/n1 + 1.0/n2))  # N-D COMPUTATION HERE!!!!!!
03369     t = N.where(zerodivproblem,1.0,t)     # replace NaN/wrong t-values with 1.0
03370     probs = abetai(0.5*df,0.5,float(df)/(df+t*t))
03371 
03372     if type(t) == N.ndarray:
03373         probs = N.reshape(probs,t.shape)
03374     if probs.shape == (1,):
03375         probs = probs[0]
03376         
03377     if printit <> 0:
03378         if type(t) == N.ndarray:
03379             t = t[0]
03380         if type(probs) == N.ndarray:
03381             probs = probs[0]
03382         statname = 'Independent samples T-test.'
03383         outputpairedstats(printit,writemode,
03384                           name1,n1,x1,v1,N.minimum.reduce(N.ravel(a)),
03385                           N.maximum.reduce(N.ravel(a)),
03386                           name2,n2,x2,v2,N.minimum.reduce(N.ravel(b)),
03387                           N.maximum.reduce(N.ravel(b)),
03388                           statname,t,probs)
03389         return
03390     return t, probs
03391 
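# --- Editor's note: hedged usage sketch, not part of the original module ---
# attest_ind() works element-wise along a dimension, so two (subjects x
# variables) arrays yield one t and one p per column.  The helper name
# _example_attest_ind is hypothetical.
 def _example_attest_ind():
    a = N.array([[10., 3.0], [12., 4.0], [11., 5.0], [13., 4.5]])
    b = N.array([[ 8., 3.1], [ 9., 4.2], [ 7., 4.9], [10., 4.4]])
    t, p = attest_ind(a, b, dimension=0)   # one test per column
    return t, p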
03392  def ap2t(pval,df):
03393     """
03394 Tries to compute a t-value from a p-value (or pval array) and associated df.
03395 SLOW for large numbers of elements(!) as it iteratively re-computes the
03396 p-values with progressively smaller step-sizes until it decides it's done.
03397 Keeps the signs of the input array.  Returns 1000 (or -1000) if t>100.
03398 
03399 Usage:  ap2t(pval,df)
03400 Returns: an array of t-values with the shape of pval
03401     """
03402     pval = N.array(pval)
03403     signs = N.sign(pval)
03404     pval = abs(pval)
03405     t = N.ones(pval.shape,N.float_)*50
03406     step = N.ones(pval.shape,N.float_)*25
03407     print "Initial ap2t() prob calc"
03408     prob = abetai(0.5*df,0.5,float(df)/(df+t*t))
03409     print 'ap2t() iter: ',
03410     for i in range(10):
03411         print i,' ',
03412         t = N.where(pval<prob,t+step,t-step)
03413         prob = abetai(0.5*df,0.5,float(df)/(df+t*t))
03414         step = step/2
03415     print
03416     # since this is an ugly hack, we get ugly boundaries
03417     t = N.where(t>99.9,1000,t)      # hit upper-boundary
03418     t = t+signs
03419     return t #, prob, pval
03420 
03421 
03422  def attest_rel (a,b,dimension=None,printit=0,name1='Samp1',name2='Samp2',writemode='a'):
03423     """
03424 Calculates the t-obtained T-test on TWO RELATED samples of scores, a
03425 and b.  From Numerical Recipes, p.483.  If printit=1, results are
03426 printed to the screen.  If printit='filename', the results are output
03427 to 'filename' using the given writemode (default=append).  Dimension
03428 can equal None (ravel array first), or an integer (the dimension over
03429 which to operate on a and b).
03430 
03431 Usage:   attest_rel(a,b,dimension=None,printit=0,
03432                     name1='Samp1',name2='Samp2',writemode='a')
03433 Returns: t-value, two-tailed p-value
03434 """
03435     if dimension == None:
03436         a = N.ravel(a)
03437         b = N.ravel(b)
03438         dimension = 0
03439     if len(a)<>len(b):
03440         raise ValueError, 'Unequal length arrays.'
03441     x1 = amean(a,dimension)
03442     x2 = amean(b,dimension)
03443     v1 = avar(a,dimension)
03444     v2 = avar(b,dimension)
03445     n = a.shape[dimension]
03446     df = float(n-1)
03447     d = (a-b).astype('d')
03448 
03449     denom = N.sqrt((n*N.add.reduce(d*d,dimension) - N.add.reduce(d,dimension)**2) /df)
03450     zerodivproblem = N.equal(denom,0)
03451     denom = N.where(zerodivproblem,1,denom)  # avoid zero-division in 1st place
03452     t = N.add.reduce(d,dimension) / denom      # N-D COMPUTATION HERE!!!!!!
03453     t = N.where(zerodivproblem,1.0,t)     # replace NaN/wrong t-values with 1.0
03454     probs = abetai(0.5*df,0.5,float(df)/(df+t*t))
03455     if type(t) == N.ndarray:
03456         probs = N.reshape(probs,t.shape)
03457     if probs.shape == (1,):
03458         probs = probs[0]
03459 
03460     if printit <> 0:
03461         statname = 'Related samples T-test.'
03462         outputpairedstats(printit,writemode,
03463                           name1,n,x1,v1,N.minimum.reduce(N.ravel(a)),
03464                           N.maximum.reduce(N.ravel(a)),
03465                           name2,n,x2,v2,N.minimum.reduce(N.ravel(b)),
03466                           N.maximum.reduce(N.ravel(b)),
03467                           statname,t,probs)
03468         return
03469     return t, probs
03470 
03471 
03472  def achisquare(f_obs,f_exp=None):
03473     """
03474 Calculates a one-way chi square for array of observed frequencies and returns
03475 the result.  If no expected frequencies are given, the total N is assumed to
03476 be equally distributed across all groups.
03477 @@@NOT RIGHT??
03478 
03479 Usage:   achisquare(f_obs, f_exp=None)   f_obs = array of observed cell freq.
03480 Returns: chisquare-statistic, associated p-value
03481 """
03482 
03483     k = len(f_obs)
03484     if f_exp == None:
03485         f_exp = N.array([sum(f_obs)/float(k)] * len(f_obs),N.float_)
03486     f_exp = f_exp.astype(N.float_)
03487     chisq = N.add.reduce((f_obs-f_exp)**2 / f_exp)
03488     return chisq, achisqprob(chisq, k-1)
03489 
03490 
03491  def aks_2samp (data1,data2):
03492     """
03493 Computes the Kolmogorov-Smirnov statistic on 2 samples.  Modified from
03494 Numerical Recipes in C, page 493.  Returns KS D-value, prob.  Not ufunc-
03495 like.
03496 
03497 Usage:   aks_2samp(data1,data2)  where data1 and data2 are 1D arrays
03498 Returns: KS D-value, p-value
03499 """
03500     j1 = 0    # N.zeros(data1.shape[1:]) TRIED TO MAKE THIS UFUNC-LIKE
03501     j2 = 0    # N.zeros(data2.shape[1:])
03502     fn1 = 0.0 # N.zeros(data1.shape[1:],N.float_)
03503     fn2 = 0.0 # N.zeros(data2.shape[1:],N.float_)
03504     n1 = data1.shape[0]
03505     n2 = data2.shape[0]
03506     en1 = n1*1
03507     en2 = n2*1
03508     d = N.zeros(data1.shape[1:],N.float_)
03509     data1 = N.sort(data1,0)
03510     data2 = N.sort(data2,0)
03511     while j1 < n1 and j2 < n2:
03512         d1=data1[j1]
03513         d2=data2[j2]
03514         if d1 <= d2:
03515             fn1 = (j1)/float(en1)
03516             j1 = j1 + 1
03517         if d2 <= d1:
03518             fn2 = (j2)/float(en2)
03519             j2 = j2 + 1
03520         dt = (fn2-fn1)
03521         if abs(dt) > abs(d):
03522             d = dt
03523 #    try:
03524     en = math.sqrt(en1*en2/float(en1+en2))
03525     prob = aksprob((en+0.12+0.11/en)*N.fabs(d))
03526 #    except:
03527 #        prob = 1.0
03528     return d, prob
03529 
03530 
03531  def amannwhitneyu(x,y):
03532     """
03533 Calculates a Mann-Whitney U statistic on the provided scores and
03534 returns the result.  Use only when the n in each condition is < 20 and
03535 you have 2 independent samples of ranks.  REMEMBER: Mann-Whitney U is
03536 significant if the u-obtained is LESS THAN or equal to the critical
03537 value of U.
03538 
03539 Usage:   amannwhitneyu(x,y)     where x,y are arrays of values for 2 conditions
03540 Returns: u-statistic, one-tailed p-value (i.e., p(z(U)))
03541 """
03542     n1 = len(x)
03543     n2 = len(y)
03544     ranked = rankdata(N.concatenate((x,y)))
03545     rankx = ranked[0:n1]       # get the x-ranks
03546     ranky = ranked[n1:]        # the rest are y-ranks
03547     u1 = n1*n2 + (n1*(n1+1))/2.0 - sum(rankx)  # calc U for x
03548     u2 = n1*n2 - u1                            # remainder is U for y
03549     bigu = max(u1,u2)
03550     smallu = min(u1,u2)
03551     T = math.sqrt(tiecorrect(ranked))  # correction factor for tied scores
03552     if T == 0:
03553         raise ValueError, 'All numbers are identical in amannwhitneyu'
03554     sd = math.sqrt(T*n1*n2*(n1+n2+1)/12.0)
03555     z = abs((bigu-n1*n2/2.0) / sd)  # normal approximation for prob calc
03556     return smallu, 1.0 - azprob(z)
03557 
03558 
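# --- Editor's note: hedged usage sketch, not part of the original module ---
# amannwhitneyu() is meant for two small (n < 20) independent samples of
# rankable scores; the returned probability is one-tailed.  The helper name
# _example_amannwhitneyu is hypothetical.
 def _example_amannwhitneyu():
    x = N.array([3., 4., 2., 6., 2., 5.])
    y = N.array([9., 7., 5., 10., 6., 8.])
    u, p = amannwhitneyu(x, y)
    return u, p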
03559  def atiecorrect(rankvals):
03560     """
03561 Tie-corrector for ties in Mann Whitney U and Kruskal Wallis H tests.
03562 See Siegel, S. (1956) Nonparametric Statistics for the Behavioral
03563 Sciences.  New York: McGraw-Hill.  Code adapted from |Stat rankind.c
03564 code.
03565 
03566 Usage:   atiecorrect(rankvals)
03567 Returns: T correction factor for U or H
03568 """
03569     sorted,posn = ashellsort(N.array(rankvals))
03570     n = len(sorted)
03571     T = 0.0
03572     i = 0
03573     while (i<n-1):
03574         if sorted[i] == sorted[i+1]:
03575             nties = 1
03576             while (i<n-1) and (sorted[i] == sorted[i+1]):
03577                 nties = nties +1
03578                 i = i +1
03579             T = T + nties**3 - nties
03580         i = i+1
03581     T = T / float(n**3-n)
03582     return 1.0 - T
03583 
03584 
03585  def aranksums(x,y):
03586     """
03587 Calculates the rank sums statistic on the provided scores and returns
03588 the result.
03589 
03590 Usage:   aranksums(x,y)     where x,y are arrays of values for 2 conditions
03591 Returns: z-statistic, two-tailed p-value
03592 """
03593     n1 = len(x)
03594     n2 = len(y)
03595     alldata = N.concatenate((x,y))
03596     ranked = arankdata(alldata)
03597     x = ranked[:n1]
03598     y = ranked[n1:]
03599     s = sum(x)
03600     expected = n1*(n1+n2+1) / 2.0
03601     z = (s - expected) / math.sqrt(n1*n2*(n1+n2+1)/12.0)
03602     prob = 2*(1.0 - azprob(abs(z)))
03603     return z, prob
03604 
03605 
03606  def awilcoxont(x,y):
03607     """
03608 Calculates the Wilcoxon T-test for related samples and returns the
03609 result.  A non-parametric T-test.
03610 
03611 Usage:   awilcoxont(x,y)     where x,y are equal-length arrays for 2 conditions
03612 Returns: t-statistic, two-tailed p-value
03613 """
03614     if len(x) <> len(y):
03615         raise ValueError, 'Unequal N in awilcoxont.  Aborting.'
03616     d = x-y
03617     d = N.compress(N.not_equal(d,0),d) # Keep all non-zero differences
03618     count = len(d)
03619     absd = abs(d)
03620     absranked = arankdata(absd)
03621     r_plus = 0.0
03622     r_minus = 0.0
03623     for i in range(len(absd)):
03624         if d[i] < 0:
03625             r_minus = r_minus + absranked[i]
03626         else:
03627             r_plus = r_plus + absranked[i]
03628     wt = min(r_plus, r_minus)
03629     mn = count * (count+1) * 0.25
03630     se =  math.sqrt(count*(count+1)*(2.0*count+1.0)/24.0)
03631     z = math.fabs(wt-mn) / se
03633     prob = 2*(1.0 -zprob(abs(z)))
03634     return wt, prob
03635 
03636 
03637  def akruskalwallish(*args):
03638     """
03639 The Kruskal-Wallis H-test is a non-parametric ANOVA for 3 or more
03640 groups, requiring at least 5 subjects in each group.  This function
03641 calculates the Kruskal-Wallis H and associated p-value for 3 or more
03642 independent samples.
03643 
03644 Usage:   akruskalwallish(*args)     args are separate arrays for 3+ conditions
03645 Returns: H-statistic (corrected for ties), associated p-value
03646 """
03647     assert len(args) >= 3, "Need at least 3 groups in stats.akruskalwallish()"
03648     args = list(args)
03649     n = [0]*len(args)
03650     n = map(len,args)
03651     all = []
03652     for i in range(len(args)):
03653         all = all + args[i].tolist()
03654     ranked = rankdata(all)
03655     T = tiecorrect(ranked)
03656     for i in range(len(args)):
03657         args[i] = ranked[0:n[i]]
03658         del ranked[0:n[i]]
03659     rsums = []
03660     for i in range(len(args)):
03661         rsums.append(sum(args[i])**2)
03662         rsums[i] = rsums[i] / float(n[i])
03663     ssbn = sum(rsums)
03664     totaln = sum(n)
03665     h = 12.0 / (totaln*(totaln+1)) * ssbn - 3*(totaln+1)
03666     df = len(args) - 1
03667     if T == 0:
03668         raise ValueError, 'All numbers are identical in akruskalwallish'
03669     h = h / float(T)
03670     return h, chisqprob(h,df)
03671 
03672 
03673  def afriedmanchisquare(*args):
03674     """
03675 Friedman Chi-Square is a non-parametric, one-way within-subjects
03676 ANOVA.  This function calculates the Friedman Chi-square test for
03677 repeated measures and returns the result, along with the associated
03678 probability value.  It assumes 3 or more repeated measures.  With only 3
03679 levels, a minimum of 10 subjects in the study is required.  Four levels
03680 requires 5 subjects per level(??).
03681 
03682 Usage:   afriedmanchisquare(*args)   args are separate arrays for 2+ conditions
03683 Returns: chi-square statistic, associated p-value
03684 """
03685     k = len(args)
03686     if k < 3:
03687         raise ValueError, '\nLess than 3 levels.  Friedman test not appropriate.\n'
03688     n = len(args[0])
03689     data = apply(pstat.aabut,args)
03690     data = data.astype(N.float_)
03691     for i in range(len(data)):
03692         data[i] = arankdata(data[i])
03693     ssbn = asum(asum(args,1)**2)
03694     chisq = 12.0 / (k*n*(k+1)) * ssbn - 3*n*(k+1)
03695     return chisq, achisqprob(chisq,k-1)
03696 
03697 
03698 #####################################
03699 ####  APROBABILITY CALCULATIONS  ####
03700 #####################################
03701 
03702  def achisqprob(chisq,df):
03703     """
03704 Returns the (1-tail) probability value associated with the provided chi-square
03705 value and df.  Heavily modified from chisq.c in Gary Perlman's |Stat.  Can
03706 handle multiple dimensions.
03707 
03708 Usage:   achisqprob(chisq,df)    chisq=chisquare stat., df=degrees of freedom
03709 """
03710     BIG = 200.0
03711     def ex(x):
03712         BIG = 200.0
03713         exponents = N.where(N.less(x,-BIG),-BIG,x)
03714         return N.exp(exponents)
03715 
03716     if type(chisq) == N.ndarray:
03717         arrayflag = 1
03718     else:
03719         arrayflag = 0
03720         chisq = N.array([chisq])
03721     if df < 1:
03722         return N.ones(chisq.shape,N.float)
03723     probs = N.zeros(chisq.shape,N.float_)
03724     probs = N.where(N.less_equal(chisq,0),1.0,probs)  # set prob=1 for chisq<0
03725     a = 0.5 * chisq
03726     if df > 1:
03727         y = ex(-a)
03728     if df%2 == 0:
03729         even = 1
03730         s = y*1
03731         s2 = s*1
03732     else:
03733         even = 0
03734         s = 2.0 * azprob(-N.sqrt(chisq))
03735         s2 = s*1
03736     if (df > 2):
03737         chisq = 0.5 * (df - 1.0)
03738         if even:
03739             z = N.ones(probs.shape,N.float_)
03740         else:
03741             z = 0.5 *N.ones(probs.shape,N.float_)
03742         if even:
03743             e = N.zeros(probs.shape,N.float_)
03744         else:
03745             e = N.log(N.sqrt(N.pi)) *N.ones(probs.shape,N.float_)
03746         c = N.log(a)
03747         mask = N.zeros(probs.shape)
03748         a_big = N.greater(a,BIG)
03749         a_big_frozen = -1 *N.ones(probs.shape,N.float_)
03750         totalelements = N.multiply.reduce(N.array(probs.shape))
03751         while asum(mask)<>totalelements:
03752             e = N.log(z) + e
03753             s = s + ex(c*z-a-e)
03754             z = z + 1.0
03755 #            print z, e, s
03756             newmask = N.greater(z,chisq)
03757             a_big_frozen = N.where(newmask*N.equal(mask,0)*a_big, s, a_big_frozen)
03758             mask = N.clip(newmask+mask,0,1)
03759         if even:
03760             z = N.ones(probs.shape,N.float_)
03761             e = N.ones(probs.shape,N.float_)
03762         else:
03763             z = 0.5 *N.ones(probs.shape,N.float_)
03764             e = 1.0 / N.sqrt(N.pi) / N.sqrt(a) * N.ones(probs.shape,N.float_)
03765         c = 0.0
03766         mask = N.zeros(probs.shape)
03767         a_notbig_frozen = -1 *N.ones(probs.shape,N.float_)
03768         while asum(mask)<>totalelements:
03769             e = e * (a/z.astype(N.float_))
03770             c = c + e
03771             z = z + 1.0
03772 #            print '#2', z, e, c, s, c*y+s2
03773             newmask = N.greater(z,chisq)
03774             a_notbig_frozen = N.where(newmask*N.equal(mask,0)*(1-a_big),
03775                                       c*y+s2, a_notbig_frozen)
03776             mask = N.clip(newmask+mask,0,1)
03777         probs = N.where(N.equal(probs,1),1,
03778                         N.where(N.greater(a,BIG),a_big_frozen,a_notbig_frozen))
03779         return probs
03780     else:
03781         return s
03782 
03783 
03784  def aerfcc(x):
03785     """
03786 Returns the complementary error function erfc(x) with fractional error
03787 everywhere less than 1.2e-7.  Adapted from Numerical Recipes.  Can
03788 handle multiple dimensions.
03789 
03790 Usage:   aerfcc(x)
03791 """
03792     z = abs(x)
03793     t = 1.0 / (1.0+0.5*z)
03794     ans = t * N.exp(-z*z-1.26551223 + t*(1.00002368+t*(0.37409196+t*(0.09678418+t*(-0.18628806+t*(0.27886807+t*(-1.13520398+t*(1.48851587+t*(-0.82215223+t*0.17087277)))))))))
03795     return N.where(N.greater_equal(x,0), ans, 2.0-ans)
03796 
03797 
03798  def azprob(z):
03799     """
03800 Returns the area under the normal curve 'to the left of' the given z value.
03801 Thus, 
03802     for z<0, zprob(z) = 1-tail probability
03803     for z>0, 1.0-zprob(z) = 1-tail probability
03804     for any z, 2.0*(1.0-zprob(abs(z))) = 2-tail probability
03805 Adapted from z.c in Gary Perlman's |Stat.  Can handle multiple dimensions.
03806 
03807 Usage:   azprob(z)    where z is a z-value
03808 """
03809     def yfunc(y):
03810         x = (((((((((((((-0.000045255659 * y
03811                          +0.000152529290) * y -0.000019538132) * y
03812                        -0.000676904986) * y +0.001390604284) * y
03813                      -0.000794620820) * y -0.002034254874) * y
03814                    +0.006549791214) * y -0.010557625006) * y
03815                  +0.011630447319) * y -0.009279453341) * y
03816                +0.005353579108) * y -0.002141268741) * y
03817              +0.000535310849) * y +0.999936657524
03818         return x
03819 
03820     def wfunc(w):
03821         x = ((((((((0.000124818987 * w
03822                     -0.001075204047) * w +0.005198775019) * w
03823                   -0.019198292004) * w +0.059054035642) * w
03824                 -0.151968751364) * w +0.319152932694) * w
03825               -0.531923007300) * w +0.797884560593) * N.sqrt(w) * 2.0
03826         return x
03827 
03828     Z_MAX = 6.0    # maximum meaningful z-value
03829     x = N.zeros(z.shape,N.float_) # initialize
03830     y = 0.5 * N.fabs(z)
03831     x = N.where(N.less(y,1.0),wfunc(y*y),yfunc(y-2.0)) # get x's
03832     x = N.where(N.greater(y,Z_MAX*0.5),1.0,x)          # kill those with big Z
03833     prob = N.where(N.greater(z,0),(x+1)*0.5,(1-x)*0.5)
03834     return prob
03835 
03836 
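# --- Editor's note: hedged usage sketch, not part of the original module ---
# azprob() expects an array argument (it indexes z.shape) and returns the
# area to the left of each z, so a two-tailed p is 2*(1-azprob(abs(z))).
# The helper name _example_azprob is hypothetical.
 def _example_azprob():
    z = N.array([-1.96, 0.0, 1.96])
    left_tail  = azprob(z)                        # ~ [0.025, 0.5, 0.975]
    two_tailed = 2.0*(1.0 - azprob(N.fabs(z)))    # ~ [0.05,  1.0, 0.05 ]
    return left_tail, two_tailed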
03837  def aksprob(alam):
03838      """
03839 Returns the probability value for a K-S statistic computed via ks_2samp.
03840 Adapted from Numerical Recipes.  Can handle multiple dimensions.
03841 
03842 Usage:   aksprob(alam)
03843 """
03844      if type(alam) == N.ndarray:
03845          frozen = -1 *N.ones(alam.shape,N.float64)
03846          alam = alam.astype(N.float64)
03847          arrayflag = 1
03848      else:
03849          frozen = N.array(-1.)
03850          alam = N.array(alam,N.float64)
03851          arrayflag = 1
03852      mask = N.zeros(alam.shape)
03853      fac = 2.0 *N.ones(alam.shape,N.float_)
03854      sum = N.zeros(alam.shape,N.float_)
03855      termbf = N.zeros(alam.shape,N.float_)
03856      a2 = N.array(-2.0*alam*alam,N.float64)
03857      totalelements = N.multiply.reduce(N.array(mask.shape))
03858      for j in range(1,201):
03859          if asum(mask) == totalelements:
03860              break
03861          exponents = (a2*j*j)
03862          overflowmask = N.less(exponents,-746)
03863          frozen = N.where(overflowmask,0,frozen)
03864          mask = mask+overflowmask
03865          term = fac*N.exp(exponents)
03866          sum = sum + term
03867          newmask = N.where(N.less_equal(abs(term),(0.001*termbf)) +
03868                            N.less(abs(term),1.0e-8*sum), 1, 0)
03869          frozen = N.where(newmask*N.equal(mask,0), sum, frozen)
03870          mask = N.clip(mask+newmask,0,1)
03871          fac = -fac
03872          termbf = abs(term)
03873      if arrayflag:
03874          return N.where(N.equal(frozen,-1), 1.0, frozen)  # 1.0 if doesn't converge
03875      else:
03876          return N.where(N.equal(frozen,-1), 1.0, frozen)[0]  # 1.0 if doesn't converge
03877 
03878 
03879  def afprob (dfnum, dfden, F):
03880     """
03881 Returns the 1-tailed significance level (p-value) of an F statistic
03882 given the degrees of freedom for the numerator (dfR-dfF) and the degrees
03883 of freedom for the denominator (dfF).  Can handle multiple dims for F.
03884 
03885 Usage:   afprob(dfnum, dfden, F)   where usually dfnum=dfbn, dfden=dfwn
03886 """
03887     if type(F) == N.ndarray:
03888         return abetai(0.5*dfden, 0.5*dfnum, dfden/(1.0*dfden+dfnum*F))
03889     else:
03890         return abetai(0.5*dfden, 0.5*dfnum, dfden/float(dfden+dfnum*F))
03891 
03892 
03893  def abetacf(a,b,x,verbose=1):
03894     """
03895 Evaluates the continued fraction form of the incomplete Beta function,
03896 betai.  (Adapted from: Numerical Recipes in C.)  Can handle multiple
03897 dimensions for x.
03898 
03899 Usage:   abetacf(a,b,x,verbose=1)
03900 """
03901     ITMAX = 200
03902     EPS = 3.0e-7
03903 
03904     arrayflag = 1
03905     if type(x) == N.ndarray:
03906         frozen = N.ones(x.shape,N.float_) *-1  #start out w/ -1s, should replace all
03907     else:
03908         arrayflag = 0
03909         frozen = N.array([-1])
03910         x = N.array([x])
03911     mask = N.zeros(x.shape)
03912     bm = az = am = 1.0
03913     qab = a+b
03914     qap = a+1.0
03915     qam = a-1.0
03916     bz = 1.0-qab*x/qap
03917     for i in range(ITMAX+1):
03918         if N.sum(N.ravel(N.equal(frozen,-1)))==0:
03919             break
03920         em = float(i+1)
03921         tem = em + em
03922         d = em*(b-em)*x/((qam+tem)*(a+tem))
03923         ap = az + d*am
03924         bp = bz+d*bm
03925         d = -(a+em)*(qab+em)*x/((qap+tem)*(a+tem))
03926         app = ap+d*az
03927         bpp = bp+d*bz
03928         aold = az*1
03929         am = ap/bpp
03930         bm = bp/bpp
03931         az = app/bpp
03932         bz = 1.0
03933         newmask = N.less(abs(az-aold),EPS*abs(az))
03934         frozen = N.where(newmask*N.equal(mask,0), az, frozen)
03935         mask = N.clip(mask+newmask,0,1)
03936     noconverge = asum(N.equal(frozen,-1))
03937     if noconverge <> 0 and verbose:
03938         print 'a or b too big, or ITMAX too small in Betacf for ',noconverge,' elements'
03939     if arrayflag:
03940         return frozen
03941     else:
03942         return frozen[0]
03943 
03944 
03945  def agammln(xx):
03946     """
03947 Returns the natural log of the gamma function of xx.
03948     Gamma(z) = Integral(0,infinity) of t^(z-1)exp(-t) dt.
03949 Adapted from: Numerical Recipes in C.  Can handle multiple dims ... but
03950 probably doesn't normally have to.
03951 
03952 Usage:   agammln(xx)
03953 """
03954     coeff = [76.18009173, -86.50532033, 24.01409822, -1.231739516,
03955              0.120858003e-2, -0.536382e-5]
03956     x = xx - 1.0
03957     tmp = x + 5.5
03958     tmp = tmp - (x+0.5)*N.log(tmp)
03959     ser = 1.0
03960     for j in range(len(coeff)):
03961         x = x + 1
03962         ser = ser + coeff[j]/x
03963     return -tmp + N.log(2.50662827465*ser)
03964 
03965 
03966  def abetai(a,b,x,verbose=1):
03967     """
03968 Returns the incomplete beta function:
03969 
03970     I-sub-x(a,b) = 1/B(a,b)*(Integral(0,x) of t^(a-1)(1-t)^(b-1) dt)
03971 
03972 where a,b>0 and B(a,b) = G(a)*G(b)/(G(a+b)) where G(a) is the gamma
03973 function of a.  The continued fraction formulation is implemented
03974 here, using the betacf function.  (Adapted from: Numerical Recipes in
03975 C.)  Can handle multiple dimensions.
03976 
03977 Usage:   abetai(a,b,x,verbose=1)
03978 """
03979     TINY = 1e-15
03980     if type(a) == N.ndarray:
03981         if asum(N.less(x,0)+N.greater(x,1)) <> 0:
03982             raise ValueError, 'Bad x in abetai'
03983     x = N.where(N.equal(x,0),TINY,x)
03984     x = N.where(N.equal(x,1.0),1-TINY,x)
03985 
03986     bt = N.where(N.equal(x,0)+N.equal(x,1), 0, -1)
03987     exponents = ( gammln(a+b)-gammln(a)-gammln(b)+a*N.log(x)+b*
03988                   N.log(1.0-x) )
03989     # 746 (below) is the MAX POSSIBLE BEFORE OVERFLOW
03990     exponents = N.where(N.less(exponents,-740),-740,exponents)
03991     bt = N.exp(exponents)
03992     if type(x) == N.ndarray:
03993         ans = N.where(N.less(x,(a+1)/(a+b+2.0)),
03994                       bt*abetacf(a,b,x,verbose)/float(a),
03995                       1.0-bt*abetacf(b,a,1.0-x,verbose)/float(b))
03996     else:
03997         if x<(a+1)/(a+b+2.0):
03998             ans = bt*abetacf(a,b,x,verbose)/float(a)
03999         else:
04000             ans = 1.0-bt*abetacf(b,a,1.0-x,verbose)/float(b)
04001     return ans
04002 
04003 
04004 #####################################
04005 #######  AANOVA CALCULATIONS  #######
04006 #####################################
04007 
04008  import numpy.linalg, operator
04009  LA = numpy.linalg
04010 
04011  def aglm(data,para):
04012     """
04013 Calculates a linear model fit ... anova/ancova/lin-regress/t-test/etc. Taken
04014 from:
04015     Peterson et al. Statistical limitations in functional neuroimaging
04016     I. Non-inferential methods and statistical models.  Phil Trans Royal Soc
04017     Lond B 354: 1239-1260.
04018 
04019 Usage:   aglm(data,para)
04020 Returns: statistic, p-value ???
04021 """
04022     if len(para) <> len(data):
04023         print "data and para must be same length in aglm"
04024         return
04025     n = len(para)
04026     p = pstat.aunique(para)
04027     x = N.zeros((n,len(p)))  # design matrix
04028     for l in range(len(p)):
04029         x[:,l] = N.equal(para,p[l])
04030     b = N.dot(N.dot(LA.inv(N.dot(N.transpose(x),x)),  # i.e., b=inv(X'X)X'Y
04031                     N.transpose(x)),
04032               data)
04033     diffs = (data - N.dot(x,b))
04034     s_sq = 1./(n-len(p)) * N.dot(N.transpose(diffs), diffs)
04035 
04036     if len(p) == 2:  # ttest_ind
04037         c = N.array([1,-1])
04038         df = n-2
04039         fact = asum(1.0/asum(x,0))  # i.e., 1/n1 + 1/n2 + 1/n3 ...
04040         t = N.dot(c,b) / N.sqrt(s_sq*fact)
04041         probs = abetai(0.5*df,0.5,float(df)/(df+t*t))
04042         return t, probs
04043 
04044 
04045  def aF_oneway(*args):
04046     """
04047 Performs a 1-way ANOVA, returning an F-value and probability given
04048 any number of groups.  From Heiman, pp.394-7.
04049 
04050 Usage:   aF_oneway (*args)    where *args is 2 or more arrays, one per
04051                                   treatment group
04052 Returns: f-value, probability
04053 """
04054     na = len(args)            # ANOVA on 'na' groups, each in its own array
04055     means = [0]*na
04056     vars = [0]*na
04057     ns = [0]*na
04058     alldata = []
04059     tmp = map(N.array,args)
04060     means = map(amean,tmp)
04061     vars = map(avar,tmp)
04062     ns = map(len,args)
04063     alldata = N.concatenate(args)
04064     bign = len(alldata)
04065     sstot = ass(alldata)-(asquare_of_sums(alldata)/float(bign))
04066     ssbn = 0
04067     for a in args:
04068         ssbn = ssbn + asquare_of_sums(N.array(a))/float(len(a))
04069     ssbn = ssbn - (asquare_of_sums(alldata)/float(bign))
04070     sswn = sstot-ssbn
04071     dfbn = na-1
04072     dfwn = bign - na
04073     msb = ssbn/float(dfbn)
04074     msw = sswn/float(dfwn)
04075     f = msb/msw
04076     prob = fprob(dfbn,dfwn,f)
04077     return f, prob
04078 
04079 
04080  def aF_value (ER,EF,dfR,dfF):
04081     """
04082 Returns an F-statistic given the following:
04083         ER  = error associated with the null hypothesis (the Restricted model)
04084         EF  = error associated with the alternate hypothesis (the Full model)
04085         dfR = degrees of freedom for the Restricted model
04086         dfF = degrees of freedom associated with the Full model
04087 """
04088     return ((ER-EF)/float(dfR-dfF) / (EF/float(dfF)))
04089 
04090 
04091  def outputfstats(Enum, Eden, dfnum, dfden, f, prob):
04092      Enum = round(Enum,3)
04093      Eden = round(Eden,3)
04094      dfnum = round(dfnum,3)
04095      dfden = round(dfden,3)
04096      f = round(f,3)
04097      prob = round(prob,3)
04098      suffix = ''                       # for *s after the p-value
04099      if  prob < 0.001:  suffix = '  ***'
04100      elif prob < 0.01:  suffix = '  **'
04101      elif prob < 0.05:  suffix = '  *'
04102      title = [['EF/ER','DF','Mean Square','F-value','prob','']]
04103      lofl = title+[[Enum, dfnum, round(Enum/float(dfnum),3), f, prob, suffix],
04104                    [Eden, dfden, round(Eden/float(dfden),3),'','','']]
04105      pstat.printcc(lofl)
04106      return
04107 
04108 
04109  def F_value_multivariate(ER, EF, dfnum, dfden):
04110      """
04111 Returns an F-statistic given the following:
04112         ER  = error associated with the null hypothesis (the Restricted model)
04113         EF  = error associated with the alternate hypothesis (the Full model)
04112         dfR = degrees of freedom for the Restricted model
04113         dfF = degrees of freedom associated with the Full model
04116 where ER and EF are matrices from a multivariate F calculation.
04117 """
04118      if type(ER) in [IntType, FloatType]:
04119          ER = N.array([[ER]])
04120      if type(EF) in [IntType, FloatType]:
04121          EF = N.array([[EF]])
04122      n_um = (LA.det(ER) - LA.det(EF)) / float(dfnum)
04123      d_en = LA.det(EF) / float(dfden)
04124      return n_um / d_en
04125 
04126 
04127 #####################################
04128 #######  ASUPPORT FUNCTIONS  ########
04129 #####################################
04130 
04131  def asign(a):
04132     """
04133 Usage:   asign(a)
04134 Returns: array shape of a, with -1 where a<0 and +1 where a>=0
04135 """
04136     a = N.asarray(a)
04137     if ((type(a) == type(1.4)) or (type(a) == type(1))):
04138         return a-a-N.less(a,0)+N.greater(a,0)
04139     else:
04140         return N.zeros(N.shape(a))-N.less(a,0)+N.greater(a,0)
04141 
04142 
04143  def asum (a, dimension=None,keepdims=0):
04144      """
04145 An alternative to the Numeric.add.reduce function, which allows one to
04146 (1) collapse over multiple dimensions at once, and/or (2) to retain
04147 all dimensions in the original array (squashing collapsed dims down to size 1).
04148 Dimension can equal None (ravel array first), an integer (the
04149 dimension over which to operate), or a sequence (operate over multiple
04150 dimensions).  If keepdims=1, the resulting array will have as many
04151 dimensions as the input array.
04152 
04153 Usage:   asum(a, dimension=None, keepdims=0)
04154 Returns: array summed along 'dimension'(s), same _number_ of dims if keepdims=1
04155 """
04156      if type(a) == N.ndarray and a.dtype in [N.int_, N.short, N.ubyte]:
04157          a = a.astype(N.float_)
04158      if dimension == None:
04159          s = N.sum(N.ravel(a))
04160      elif type(dimension) in [IntType,FloatType]:
04161          s = N.add.reduce(a, dimension)
04162          if keepdims == 1:
04163              shp = list(a.shape)
04164              shp[dimension] = 1
04165              s = N.reshape(s,shp)
04166      else: # must be a SEQUENCE of dims to sum over
04167         dims = list(dimension)
04168         dims.sort()
04169         dims.reverse()
04170         s = a *1.0
04171         for dim in dims:
04172             s = N.add.reduce(s,dim)
04173         if keepdims == 1:
04174             shp = list(a.shape)
04175             for dim in dims:
04176                 shp[dim] = 1
04177             s = N.reshape(s,shp)
04178      return s
04179 
04180 
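# --- Editor's note: hedged usage sketch, not part of the original module ---
# asum() generalizes N.add.reduce: it can collapse several dimensions at once
# and, with keepdims=1, keeps the collapsed dimensions as size-1 axes so the
# result still broadcasts against the input.  The helper name _example_asum
# is hypothetical.
 def _example_asum():
    a = N.ones((2, 3, 4), N.float_)
    total   = asum(a)                       # 24.0 (all elements)
    by_dims = asum(a, [0, 2], keepdims=1)   # shape (1, 3, 1), each value 8.0
    return total, by_dims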
04181  def acumsum (a,dimension=None):
04182     """
04183 Returns an array consisting of the cumulative sum of the items in the
04184 passed array.  Dimension can equal None (ravel array first), an
04185 integer (the dimension over which to operate), or a sequence (operate
04186 over multiple dimensions, but this last one just barely makes sense).
04187 
04188 Usage:   acumsum(a,dimension=None)
04189 """
04190     if dimension == None:
04191         a = N.ravel(a)
04192         dimension = 0
04193     if type(dimension) in [ListType, TupleType, N.ndarray]:
04194         dimension = list(dimension)
04195         dimension.sort()
04196         dimension.reverse()
04197         for d in dimension:
04198             a = N.add.accumulate(a,d)
04199         return a
04200     else:
04201         return N.add.accumulate(a,dimension)
04202 
04203 
04204  def ass(inarray, dimension=None, keepdims=0):
04205     """
04206 Squares each value in the passed array, adds these squares & returns
04207 the result.  Unfortunate function name. :-) Defaults to ALL values in
04208 the array.  Dimension can equal None (ravel array first), an integer
04209 (the dimension over which to operate), or a sequence (operate over
04210 multiple dimensions).  Set keepdims=1 to maintain the original number
04211 of dimensions.
04212 
04213 Usage:   ass(inarray, dimension=None, keepdims=0)
04214 Returns: sum-along-'dimension' for (inarray*inarray)
04215 """
04216     if dimension == None:
04217         inarray = N.ravel(inarray)
04218         dimension = 0
04219     return asum(inarray*inarray,dimension,keepdims)
04220 
04221 
04222  def asummult (array1,array2,dimension=None,keepdims=0):
04223     """
04224 Multiplies elements in array1 and array2, element by element, and
04225 returns the sum (along 'dimension') of all resulting multiplications.
04226 Dimension can equal None (ravel array first), an integer (the
04227 dimension over which to operate), or a sequence (operate over multiple
04228 dimensions).  A trivial function, but included for completeness.
04229 
04230 Usage:   asummult(array1,array2,dimension=None,keepdims=0)
04231 """
04232     if dimension == None:
04233         array1 = N.ravel(array1)
04234         array2 = N.ravel(array2)
04235         dimension = 0
04236     return asum(array1*array2,dimension,keepdims)
04237 
04238 
04239  def asquare_of_sums(inarray, dimension=None, keepdims=0):
04240     """
04241 Adds the values in the passed array, squares that sum, and returns the
04242 result.  Dimension can equal None (ravel array first), an integer (the
04243 dimension over which to operate), or a sequence (operate over multiple
04244 dimensions).  If keepdims=1, the returned array will have the same
04245 NUMBER of dimensions as the original.
04246 
04247 Usage:   asquare_of_sums(inarray, dimension=None, keepdims=0)
04248 Returns: the square of the sum over dim(s) in dimension
04249 """
04250     if dimension == None:
04251         inarray = N.ravel(inarray)
04252         dimension = 0
04253     s = asum(inarray,dimension,keepdims)
04254     if type(s) == N.ndarray:
04255         return s.astype(N.float_)*s
04256     else:
04257         return float(s)*s
04258 
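# A minimal usage sketch for asquare_of_sums:
# >>> asquare_of_sums(N.array([1, 2, 3]))   # 36.0, i.e. (1+2+3)**2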
04259 
04260  def asumdiffsquared(a,b, dimension=None, keepdims=0):
04261     """
04262 Takes pairwise differences of the values in arrays a and b, squares
04263 these differences, and returns the sum of these squares.  Dimension
04264 can equal None (ravel array first), an integer (the dimension over
04265 which to operate), or a sequence (operate over multiple dimensions).
04266 If keepdims=1, the returned array has the same number of dimensions as a and b.
04267 
04268 Usage:   asumdiffsquared(a,b,dimension=None,keepdims=0)
04269 Returns: sum-along-'dimension' for (a-b)**2
04270 """
04271     if dimension == None:
04272         a = N.ravel(a); b = N.ravel(b)
04273         dimension = 0
04274     return asum((a-b)**2,dimension,keepdims)
04275 
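# A minimal usage sketch for asumdiffsquared (sum of squared differences):
# >>> asumdiffsquared(N.array([1, 2, 3]), N.array([1, 1, 1]))   # 5.0, i.e. 0+1+4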
04276 
04277  def ashellsort(inarray):
04278     """
04279 Shellsort algorithm.  Sorts a 1D-array.
04280 
04281 Usage:   ashellsort(inarray)
04282 Returns: sorted-inarray, sorting-index-vector (for original array)
04283 """
04284     n = len(inarray)
04285     svec = inarray *1.0
04286     ivec = range(n)
04287     gap = n // 2   # integer division
04288     while gap >0:
04289         for i in range(gap,n):
04290             for j in range(i-gap,-1,-gap):
04291                 while j>=0 and svec[j]>svec[j+gap]:
04292                     temp        = svec[j]
04293                     svec[j]     = svec[j+gap]
04294                     svec[j+gap] = temp
04295                     itemp       = ivec[j]
04296                     ivec[j]     = ivec[j+gap]
04297                     ivec[j+gap] = itemp
04298         gap = gap // 2  # integer division
04299 #    svec is now the sorted input vector; ivec gives the original positions, i.e. svec[i] = inarray[ivec[i]]
04300     return svec, ivec
04301 
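# A minimal usage sketch for ashellsort; the second return value maps sorted
# positions back to the original array, i.e. svec[i] == inarray[ivec[i]]:
# >>> svec, ivec = ashellsort(N.array([3, 1, 2]))
# >>> svec   # array([ 1.,  2.,  3.])  -- sorted (float) copy
# >>> ivec   # [1, 2, 0]               -- original index of each sorted value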
04302 
04303  def arankdata(inarray):
04304     """
04305 Ranks the data in inarray, dealing with ties appropriately.  Assumes
04306 a 1D inarray.  Adapted from Gary Perlman's |Stat ranksort.
04307 
04308 Usage:   arankdata(inarray)
04309 Returns: array of length equal to inarray, containing rank scores
04310 """
04311     n = len(inarray)
04312     svec, ivec = ashellsort(inarray)
04313     sumranks = 0
04314     dupcount = 0
04315     newarray = N.zeros(n,N.float_)
04316     for i in range(n):
04317         sumranks = sumranks + i
04318         dupcount = dupcount + 1
04319         if i==n-1 or svec[i] != svec[i+1]:
04320             averank = sumranks / float(dupcount) + 1
04321             for j in range(i-dupcount+1,i+1):
04322                 newarray[ivec[j]] = averank
04323             sumranks = 0
04324             dupcount = 0
04325     return newarray
04326 
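# A minimal usage sketch for arankdata; tied values share the average of the
# ranks they occupy:
# >>> arankdata(N.array([3, 1, 4, 1]))   # array([ 3. ,  1.5,  4. ,  1.5])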
04327 
04328  def afindwithin(data):
04329     """
04330 Returns a binary vector, 1=within-subject factor, 0=between.  Input
04331 equals the entire data array (i.e., column 0 = subject/random factor, last
04332 column = measured values).
04333 
04334 Usage:   afindwithin(data)     data in |Stat format
04335 """
04336     numfact = len(data[0])-2
04337     withinvec = [0]*numfact
04338     for col in range(1,numfact+1):
04339         rows = pstat.linexand(data,col,pstat.unique(pstat.colex(data,1))[0])  # get 1 level of this factor
04340         if len(pstat.unique(pstat.colex(rows,0))) < len(rows):   # if fewer subjects than scores on this factor
04341             withinvec[col-1] = 1
04342     return withinvec
04343 
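# A hypothetical sketch of the |Stat-style layout afindwithin expects (column 0
# = subject identifier, middle columns = factor codes, last column = score);
# the returned vector has one 0/1 entry per factor column:
# >>> data = [[1, 'a1', 'b1', 2.0], [1, 'a1', 'b2', 3.0],
# ...         [2, 'a2', 'b1', 4.0], [2, 'a2', 'b2', 5.0]]
# >>> afindwithin(data)   # one entry per factor, 1 = within-subjects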
04344 
04345  #########################################################
04346  #########################################################
04347  ######  RE-DEFINE DISPATCHES TO INCLUDE ARRAYS  #########
04348  #########################################################
04349  #########################################################
04350 
04351 ## CENTRAL TENDENCY:
04352  geometricmean = Dispatch ( (lgeometricmean, (ListType, TupleType)),
04353                             (ageometricmean, (N.ndarray,)) )
04354  harmonicmean = Dispatch ( (lharmonicmean, (ListType, TupleType)),
04355                            (aharmonicmean, (N.ndarray,)) )
04356  mean = Dispatch ( (lmean, (ListType, TupleType)),
04357                    (amean, (N.ndarray,)) )
04358  median = Dispatch ( (lmedian, (ListType, TupleType)),
04359                      (amedian, (N.ndarray,)) )
04360  medianscore = Dispatch ( (lmedianscore, (ListType, TupleType)),
04361                           (amedianscore, (N.ndarray,)) )
04362  mode = Dispatch ( (lmode, (ListType, TupleType)),
04363                    (amode, (N.ndarray,)) )
04364  tmean = Dispatch ( (atmean, (N.ndarray,)) )
04365  tvar = Dispatch ( (atvar, (N.ndarray,)) )
04366  tstdev = Dispatch ( (atstdev, (N.ndarray,)) )
04367  tsem = Dispatch ( (atsem, (N.ndarray,)) )
04368 
04369 ## VARIATION:
04370  moment = Dispatch ( (lmoment, (ListType, TupleType)),
04371                      (amoment, (N.ndarray,)) )
04372  variation = Dispatch ( (lvariation, (ListType, TupleType)),
04373                         (avariation, (N.ndarray,)) )
04374  skew = Dispatch ( (lskew, (ListType, TupleType)),
04375                    (askew, (N.ndarray,)) )
04376  kurtosis = Dispatch ( (lkurtosis, (ListType, TupleType)),
04377                        (akurtosis, (N.ndarray,)) )
04378  describe = Dispatch ( (ldescribe, (ListType, TupleType)),
04379                        (adescribe, (N.ndarray,)) )
04380 
04381 ## DISTRIBUTION TESTS
04382 
04383  skewtest = Dispatch ( (askewtest, (ListType, TupleType)),
04384                        (askewtest, (N.ndarray,)) )
04385  kurtosistest = Dispatch ( (akurtosistest, (ListType, TupleType)),
04386                            (akurtosistest, (N.ndarray,)) )
04387  normaltest = Dispatch ( (anormaltest, (ListType, TupleType)),
04388                          (anormaltest, (N.ndarray,)) )
04389 
04390 ## FREQUENCY STATS:
04391  itemfreq = Dispatch ( (litemfreq, (ListType, TupleType)),
04392                        (aitemfreq, (N.ndarray,)) )
04393  scoreatpercentile = Dispatch ( (lscoreatpercentile, (ListType, TupleType)),
04394                                 (ascoreatpercentile, (N.ndarray,)) )
04395  percentileofscore = Dispatch ( (lpercentileofscore, (ListType, TupleType)),
04396                                  (apercentileofscore, (N.ndarray,)) )
04397  histogram = Dispatch ( (lhistogram, (ListType, TupleType)),
04398                         (ahistogram, (N.ndarray,)) )
04399  cumfreq = Dispatch ( (lcumfreq, (ListType, TupleType)),
04400                       (acumfreq, (N.ndarray,)) )
04401  relfreq = Dispatch ( (lrelfreq, (ListType, TupleType)),
04402                       (arelfreq, (N.ndarray,)) )
04403  
04404 ## VARIABILITY:
04405  obrientransform = Dispatch ( (lobrientransform, (ListType, TupleType)),
04406                               (aobrientransform, (N.ndarray,)) )
04407  samplevar = Dispatch ( (lsamplevar, (ListType, TupleType)),
04408                         (asamplevar, (N.ndarray,)) )
04409  samplestdev = Dispatch ( (lsamplestdev, (ListType, TupleType)),
04410                           (asamplestdev, (N.ndarray,)) )
04411  signaltonoise = Dispatch( (asignaltonoise, (N.ndarray,)),)
04412  var = Dispatch ( (lvar, (ListType, TupleType)),
04413                   (avar, (N.ndarray,)) )
04414  stdev = Dispatch ( (lstdev, (ListType, TupleType)),
04415                     (astdev, (N.ndarray,)) )
04416  sterr = Dispatch ( (lsterr, (ListType, TupleType)),
04417                     (asterr, (N.ndarray,)) )
04418  sem = Dispatch ( (lsem, (ListType, TupleType)),
04419                   (asem, (N.ndarray,)) )
04420  z = Dispatch ( (lz, (ListType, TupleType)),
04421                 (az, (N.ndarray,)) )
04422  zs = Dispatch ( (lzs, (ListType, TupleType)),
04423                  (azs, (N.ndarray,)) )
04424  
04425 ## TRIMMING FCNS:
04426  threshold = Dispatch( (athreshold, (N.ndarray,)),)
04427  trimboth = Dispatch ( (ltrimboth, (ListType, TupleType)),
04428                        (atrimboth, (N.ndarray,)) )
04429  trim1 = Dispatch ( (ltrim1, (ListType, TupleType)),
04430                     (atrim1, (N.ndarray,)) )
04431  
04432 ## CORRELATION FCNS:
04433  paired = Dispatch ( (lpaired, (ListType, TupleType)),
04434                      (apaired, (N.ndarray,)) )
04435  lincc = Dispatch ( (llincc, (ListType, TupleType)),
04436                        (alincc, (N.ndarray,)) )
04437  pearsonr = Dispatch ( (lpearsonr, (ListType, TupleType)),
04438                        (apearsonr, (N.ndarray,)) )
04439  spearmanr = Dispatch ( (lspearmanr, (ListType, TupleType)),
04440                         (aspearmanr, (N.ndarray,)) )
04441  pointbiserialr = Dispatch ( (lpointbiserialr, (ListType, TupleType)),
04442                              (apointbiserialr, (N.ndarray,)) )
04443  kendalltau = Dispatch ( (lkendalltau, (ListType, TupleType)),
04444                          (akendalltau, (N.ndarray,)) )
04445  linregress = Dispatch ( (llinregress, (ListType, TupleType)),
04446                          (alinregress, (N.ndarray,)) )
04447  
04448 ## INFERENTIAL STATS:
04449  ttest_1samp = Dispatch ( (lttest_1samp, (ListType, TupleType)),
04450                           (attest_1samp, (N.ndarray,)) )
04451  ttest_ind = Dispatch ( (lttest_ind, (ListType, TupleType)),
04452                         (attest_ind, (N.ndarray,)) )
04453  ttest_rel = Dispatch ( (lttest_rel, (ListType, TupleType)),
04454                         (attest_rel, (N.ndarray,)) )
04455  chisquare = Dispatch ( (lchisquare, (ListType, TupleType)),
04456                         (achisquare, (N.ndarray,)) )
04457  ks_2samp = Dispatch ( (lks_2samp, (ListType, TupleType)),
04458                        (aks_2samp, (N.ndarray,)) )
04459  mannwhitneyu = Dispatch ( (lmannwhitneyu, (ListType, TupleType)),
04460                            (amannwhitneyu, (N.ndarray,)) )
04461  tiecorrect = Dispatch ( (ltiecorrect, (ListType, TupleType)),
04462                          (atiecorrect, (N.ndarray,)) )
04463  ranksums = Dispatch ( (lranksums, (ListType, TupleType)),
04464                        (aranksums, (N.ndarray,)) )
04465  wilcoxont = Dispatch ( (lwilcoxont, (ListType, TupleType)),
04466                         (awilcoxont, (N.ndarray,)) )
04467  kruskalwallish = Dispatch ( (lkruskalwallish, (ListType, TupleType)),
04468                              (akruskalwallish, (N.ndarray,)) )
04469  friedmanchisquare = Dispatch ( (lfriedmanchisquare, (ListType, TupleType)),
04470                                 (afriedmanchisquare, (N.ndarray,)) )
04471  
04472 ## PROBABILITY CALCS:
04473  chisqprob = Dispatch ( (lchisqprob, (IntType, FloatType)),
04474                         (achisqprob, (N.ndarray,)) )
04475  zprob = Dispatch ( (lzprob, (IntType, FloatType)),
04476                     (azprob, (N.ndarray,)) )
04477  ksprob = Dispatch ( (lksprob, (IntType, FloatType)),
04478                      (aksprob, (N.ndarray,)) )
04479  fprob = Dispatch ( (lfprob, (IntType, FloatType)),
04480                     (afprob, (N.ndarray,)) )
04481  betacf = Dispatch ( (lbetacf, (IntType, FloatType)),
04482                      (abetacf, (N.ndarray,)) )
04483  betai = Dispatch ( (lbetai, (IntType, FloatType)),
04484                     (abetai, (N.ndarray,)) )
04485  erfcc = Dispatch ( (lerfcc, (IntType, FloatType)),
04486                     (aerfcc, (N.ndarray,)) )
04487  gammln = Dispatch ( (lgammln, (IntType, FloatType)),
04488                      (agammln, (N.ndarray,)) )
04489  
04490 ## ANOVA FUNCTIONS:
04491  F_oneway = Dispatch ( (lF_oneway, (ListType, TupleType)),
04492                        (aF_oneway, (N.ndarray,)) )
04493  F_value = Dispatch ( (lF_value, (ListType, TupleType)),
04494                       (aF_value, (N.ndarray,)) )
04495 
04496 ## SUPPORT FUNCTIONS:
04497  incr = Dispatch ( (lincr, (ListType, TupleType, N.ndarray)), )
04498  sum = Dispatch ( (lsum, (ListType, TupleType)),
04499                   (asum, (N.ndarray,)) )
04500  cumsum = Dispatch ( (lcumsum, (ListType, TupleType)),
04501                      (acumsum, (N.ndarray,)) )
04502  ss = Dispatch ( (lss, (ListType, TupleType)),
04503                  (ass, (N.ndarray,)) )
04504  summult = Dispatch ( (lsummult, (ListType, TupleType)),
04505                       (asummult, (N.ndarray,)) )
04506  square_of_sums = Dispatch ( (lsquare_of_sums, (ListType, TupleType)),
04507                              (asquare_of_sums, (N.ndarray,)) )
04508  sumdiffsquared = Dispatch ( (lsumdiffsquared, (ListType, TupleType)),
04509                              (asumdiffsquared, (N.ndarray,)) )
04510  shellsort = Dispatch ( (lshellsort, (ListType, TupleType)),
04511                         (ashellsort, (N.ndarray,)) )
04512  rankdata = Dispatch ( (lrankdata, (ListType, TupleType)),
04513                        (arankdata, (N.ndarray,)) )
04514  findwithin = Dispatch ( (lfindwithin, (ListType, TupleType)),
04515                          (afindwithin, (N.ndarray,)) )
04516 
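# A minimal sketch of how these re-bound dispatchers route calls once NumPy is
# available (list/tuple arguments go to the 'l' functions, array arguments to
# the 'a' functions):
# >>> stdev([1, 2, 3, 4])            # routed to lstdev
# >>> stdev(N.array([1, 2, 3, 4]))   # routed to astdev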
04517 ######################  END OF NUMERIC FUNCTION BLOCK  #####################
04518 
04519 ######################  END OF STATISTICAL FUNCTIONS  ######################
04520 
04521 except ImportError:
04522  pass

