""" **stats** --- Statistics functions module.
------------------------------------------
This module contains utility functions used for statistics in ADMIT.
"""
import numpy as np
import numpy.ma as ma
[docs]def rejecto1(data, f=1.5):
""" reject outliers from a distribution
using a hinges-fences style rejection,
using a mean.
Parameters
----------
data : array
f : float
The factor f, such that only data is retained
between mean - f*std and mean + f*std
Returns
-------
returns: an array with only data between
mean - f*std and mean + f*std
"""
u = np.mean(data)
s = np.std(data)
newdata = [e for e in data if (u - f * s < e < u + f * s)]
return newdata
[docs]def rejecto2(data, f=1.5):
""" reject outliers from a distribution
using a hinges-fences style rejection,
using a median.
Parameters
----------
data : array
f : float
The factor f, such that only data is retained
between median - f*std and median + f*std
Returns
-------
returns: an array with only data between
median - f*std and median + f*std
"""
d = np.abs(data - np.median(data))
mdev = np.median(d)
s = d/mdev if mdev else 0.
return data[s<f]
[docs]def mystats(data):
""" return raw and robust statistics for a distribution
Parameters
----------
data : array
The data array for which the statistics is returned.
Returns
-------
returns: N, mean, std for the raw and robust resp.
"""
m1 = data.mean()
s1 = data.std()
n1 = len(data)
d = rejecto2(data)
m2 = d.mean()
s2 = d.std()
n2 = len(d)
return (n1,m1,s1,n2,m2,s2)
[docs]def robust(data,f=1.5):
"""return a subset of the data with outliers robustly removed
data - can be masked
"""
if type(data) == np.ndarray:
d = np.sort(data)
else:
d = np.sort(data).compressed()
n= len(d)
n1 = n/4
n2 = n/2
n3 = (3*n)/4
q1 = d[n1]
q2 = d[n2]
q3 = d[n3]
d = q3-q1
f1 = q1-f*d
f3 = q3+f*d
# median = np.median(d)
# print "robust: Median: ", median
# print "robust: n,Q1,2,3:",n,q1,q2,q3
# print "robust: f1,f3:",f1,f3
dm = ma.masked_outside(data,f1,f3)
return dm
[docs]def reducedchisquared(data, model, numpar, noise=None):
""" Method to compute the reduced chi squared of a fit to data.
Parameters
----------
data : array like
The raw data the fit is based on, acceptable data types
are numpy array, masked array, and list
model : array like
The fit to the data, acceptable data types are numpy array,
masked array, and list. Must be of the same length as data,
no error checking is done
dof : int
Number of free parameters (degrees of freedom) in the model
noise : float
The noise/uncertainty of the data
Returns
-------
Float containing the reduced chi squared value, the closer to 1.0
the better
"""
if noise is None:
chisq = np.sum((data - model) ** 2)
else:
chisq = np.sum(((data - model)/noise)**2)
if isinstance(data, ma.masked_array):
nu = data.count() - numpar
elif isinstance(data, np.ndarray):
nu = data.shape[0] - numpar
else:
nu = len(data) - numpar
return chisq / nu
if __name__ == "__main__":
np.random.seed(123)
np.random.seed()
n = 1000
f = 1.5
if False:
a = np.random.normal(0.0,1.0,n) # gaussian
print "normal(0.0,1.0,%d)" % n
else:
a = np.random.random(n) # uniform
print "random(%d)" % n
a1 = rejecto1(a,f)
print "rejecto1: ",len(a1)
a2 = rejecto2(a,f)
print "rejecto2: ",len(a2)
print "mystats:",mystats(a)
ar = robust(a,f)
#print "robust: ",len(ar),ar
print "robust: len=",len(ar)