Source code for admit.util.stats

""" **stats** --- Statistics functions module.
    ------------------------------------------

    This module contains utility functions used for statistics in ADMIT.
"""

import numpy as np
import numpy.ma as ma


[docs]def rejecto1(data, f=1.5):
    """ reject outliers from a distribution
    using a hinges-fences style rejection,
    using a mean.
    
    Parameters
    ----------
    data : array

    f : float
        The factor f, such that only data is retained
        between mean - f*std and mean + f*std

    Returns
    -------
    returns: an array with only data between
    mean - f*std and mean + f*std
        
    """
    u = np.mean(data)
    s = np.std(data)
    newdata = [e for e in data if (u - f * s < e < u + f * s)]
    return newdata

[docs]def rejecto2(data, f=1.5):
    """ reject outliers from a distribution
        using a hinges-fences style rejection,
        using a median.

    Parameters
    ----------
    data : array

    f : float
        The factor f, such that only data is retained
        between median - f*std and median + f*std

    Returns
    -------
    returns: an array with only data between
    median - f*std and median + f*std
    """
    d = np.abs(data - np.median(data))
    mdev = np.median(d)
    s = d/mdev if mdev else 0.
    return data[s<f]

[docs]def mystats(data):
    """ return raw and robust statistics for a distribution

        Parameters
        ----------
        data : array 
            The data array for which the statistics is returned.

        Returns
        -------
        returns:   N, mean, std for the raw and robust resp.
    """
    m1 = data.mean()
    s1 = data.std()
    n1 = len(data)
    d = rejecto2(data)
    m2 = d.mean()
    s2 = d.std()
    n2 = len(d)
    return (n1,m1,s1,n2,m2,s2)
    
[docs]def robust(data,f=1.5):
    """return a subset of the data with outliers robustly removed
    data - can be masked
    """
    if type(data) == np.ndarray:
        d = np.sort(data)
    else:
        d = np.sort(data).compressed()
    n= len(d)
    n1 = n/4
    n2 = n/2
    n3 = (3*n)/4
    q1 = d[n1]
    q2 = d[n2]
    q3 = d[n3]
    d = q3-q1
    f1 = q1-f*d
    f3 = q3+f*d
    # median = np.median(d)
    # print "robust: Median: ", median
    # print "robust: n,Q1,2,3:",n,q1,q2,q3
    # print "robust: f1,f3:",f1,f3
    dm = ma.masked_outside(data,f1,f3)
    return dm

[docs]def reducedchisquared(data, model, numpar, noise=None):
    """ Method to compute the reduced chi squared of a fit to data.

        Parameters
        ----------
        data : array like
            The raw data the fit is based on, acceptable data types
            are numpy array, masked array, and list

        model : array like
            The fit to the data, acceptable data types are numpy array,
            masked array, and list. Must be of the same length as data,
            no error checking is done

        dof : int
            Number of free parameters (degrees of freedom) in the model

        noise : float
            The noise/uncertainty of the data

        Returns
        -------
        Float containing the reduced chi squared value, the closer to 1.0
        the better

    """
    if noise is None:
        chisq = np.sum((data - model) ** 2)
    else:
        chisq = np.sum(((data - model)/noise)**2)
    if isinstance(data, ma.masked_array):
        nu = data.count() - numpar
    elif isinstance(data, np.ndarray):
        nu = data.shape[0] - numpar
    else:
        nu = len(data) - numpar
    return chisq / nu


if __name__ == "__main__":
    np.random.seed(123)
    np.random.seed()
    n = 1000
    f = 1.5
    if False:
        a = np.random.normal(0.0,1.0,n)     # gaussian
        print "normal(0.0,1.0,%d)" % n
    else:
        a = np.random.random(n)             # uniform
        print "random(%d)" % n
    a1 = rejecto1(a,f)
    print "rejecto1: ",len(a1)
    a2 = rejecto2(a,f)
    print "rejecto2: ",len(a2)
    print "mystats:",mystats(a)
    ar = robust(a,f)
    #print "robust: ",len(ar),ar
    print "robust: len=",len(ar)