# Source code for pydivide.bin

# Copyright 2017 Regents of the University of Colorado. All Rights Reserved.
# Released under the MIT license.
# This software was developed at the University of Colorado's Laboratory for Atmospheric and Space Physics.
# Verify current version before use at: https://github.com/MAVENSDC/Pydivide

from .utilities import get_inst_obs_labels
from .utilities import initialize_list
from .utilities import place_values_in_list
from .utilities import get_values_from_list
import math
import numpy


def _as_sequence(value):
    '''Return value unchanged if it is sized (has __len__), otherwise wrap it in a list.'''
    if hasattr(value, "__len__"):
        return value
    return [value]


def _locate_bin(row, coords, mins, maxs, binsize, total_bins):
    '''
    Return the tuple of bin indexes for one row, or None if the row cannot be binned.

    A row is rejected as soon as any bin-by value is NaN or lies outside
    [mins, maxs] for its dimension.
    '''
    index = []
    for axis, coord in enumerate(coords):
        data_value = coord[row]
        if math.isnan(data_value) or data_value < mins[axis] or data_value > maxs[axis]:
            return None
        position = int(math.floor((data_value - mins[axis]) / binsize[axis]))
        # A value sitting exactly on the upper edge would otherwise index one
        # past the end whenever the span is a whole multiple of the bin size;
        # fold it into the last bin instead.
        index.append(min(position, total_bins[axis] - 1))
    return tuple(index)


def bin(kp,
        parameter=None,
        bin_by=None,
        mins=None,
        maxs=None,
        binsize=None,
        std=False,
        avg=False,
        density=False,
        median=False,
        unittest=False):
    '''
    Bins insitu Key Parameters by one or more other parameters found in the same
    data structure. At least one of avg, std, median, or density must be requested.

    Parameters:
        kp: struct
            KP insitu data structure read from file(s). It is accessed as
            kp[instrument][observation].
        parameter: int, str
            Key Parameter to be binned. Only one may be binned at a time; if a
            list is supplied only its first entry is binned.
        bin_by: int, str, list
            Parameter(s) (index or name) by which to bin the specified Key Parameter.
        mins: int, list
            Minimum value(s) for each binning scheme. Must have one entry per bin_by
            parameter. Defaults to the smallest non-NaN value of each bin_by parameter.
        maxs: int, list
            Maximum value(s) for each binning scheme. Must have one entry per bin_by
            parameter. Defaults to the largest non-NaN value of each bin_by parameter.
            Values exactly equal to the maximum are counted in the last bin.
        binsize: int, list
            Bin size for each binning dimension. Must have one entry per bin_by
            parameter, and every size must be greater than zero.
        std: bool
            Calculate the (population) standard deviation per bin.
        avg: bool
            Calculate average per bin.
        density: bool
            Returns number of items in each bin.
        median: bool
            Calculate median per bin.
        unittest: bool
            Unused; kept only so existing callers that pass it keep working.

    Returns:
        A list holding one array per requested statistic, always in the order
        median, avg, std, density (unrequested ones are simply omitted). Bins that
        received no data hold NaN (0 for density). Returns None, after printing a
        message, when the arguments are invalid or a bin_by parameter is all NaN.

    Examples:
    >>> # Bin STATIC O+ characteristic energy by spacecraft latitude (2° resolution) and longitude (1° resolution).
    >>> output_avg = pydivide.bin(insitu, parameter='static.oplus_char_energy', bin_by=['spacecraft.geo_latitude', 'spacecraft.geo_longitude'], avg=True,binsize=[2,1])[0]

    >>> # Bin SWIA H+ density by spacecraft altitude (10km resolution), return average value and standard deviation for each bin.
    >>> output_avg,output_std = pydivide.bin(insitu, parameter='swia.hplus_density', bin_by='spacecraft.altitude', binsize=10,avg=True,std=True)
    '''
    # ERROR CHECKING
    if parameter is None:
        print("Must provide an index (or name) for param to be plotted.")
        return

    # This test has to happen before bin_by is wrapped in a list, otherwise
    # None becomes [None] and slips through.
    if bin_by is None:
        print("Must provide parameters to be binned by.")
        return
    bin_by = list(bin_by) if isinstance(bin_by, (list, tuple)) else [bin_by]
    if not bin_by:
        print("Must provide parameters to be binned by.")
        return

    if not avg and not std and not median and not density:
        print("Must select array(s) to return (avg, std, median, density).")
        return

    if binsize is None:
        print("Must provide a bin size (binsize) for each bin_by parameter.")
        return
    binsize = _as_sequence(binsize)
    if mins is not None:
        mins = _as_sequence(mins)
    if maxs is not None:
        maxs = _as_sequence(maxs)

    total_fields = len(bin_by)
    for label, supplied in (("binsize", binsize), ("mins", mins), ("maxs", maxs)):
        if supplied is not None and len(supplied) < total_fields:
            print("Must provide one " + label + " value for each bin_by parameter.")
            return
    for axis in range(total_fields):
        size = binsize[axis]
        # "not size > 0" also rejects NaN bin sizes.
        if size is None or not size > 0:
            print("Bin sizes must be greater than zero.")
            return

    # Resolve the instrument and observation labels of the binned parameter
    if isinstance(parameter, (int, str)):
        parameter = [parameter]
    parameter_inst_obs = [tuple(get_inst_obs_labels(kp, param)) for param in parameter]
    if not parameter_inst_obs:
        print("Must provide an index (or name) for param to be plotted.")
        return
    param_inst, param_obs = parameter_inst_obs[0]

    # Resolve the instrument and observation labels of the "bin by" parameters
    bin_by_inst_obs = [tuple(get_inst_obs_labels(kp, param)) for param in bin_by]

    # Pull every column out once as a float array so rows are addressed by
    # position. NOTE(review): the previous code called .min(skipna=True) on these
    # columns, so they are presumably pandas Series - confirm against the reader.
    values = numpy.asarray(kp[param_inst][param_obs], dtype=float)
    coords = [numpy.asarray(kp[inst][obs], dtype=float) for inst, obs in bin_by_inst_obs]

    # Derive any limits the caller did not supply from the data itself
    if mins is None or maxs is None:
        for (_, obs), coord in zip(bin_by_inst_obs, coords):
            if numpy.isnan(coord).all():
                print("All " + obs + " data is NaN.  Cannot bin by this parameter.")
                return
        if mins is None:
            mins = [numpy.nanmin(coord) for coord in coords]
        if maxs is None:
            maxs = [numpy.nanmax(coord) for coord in coords]

    # Calculate the dimensions of the binned array
    # Using the min/max values and the bin sizes
    total_bins = []
    for axis in range(total_fields):
        span = maxs[axis] - mins[axis]
        if span < 0:
            print("ERROR: Minimum value of " + str(mins[axis]) + " is greater than the maximum value of " + str(maxs[axis]))
            print("for bin-by parameter " + bin_by_inst_obs[axis][1] + ".  Returning...")
            return
        # Always keep at least one bin so a zero-width range (min == max)
        # still has somewhere to put its data.
        total_bins.append(max(1, int(math.ceil(span / binsize[axis]))))

    # Group every usable value by the bin it falls in, counting as we go
    density_array = numpy.zeros(total_bins)
    binned_values = {}
    for row, value in enumerate(values):
        # Cannot do anything with NaNs.  Ignore them and continue.
        if math.isnan(value):
            continue
        index = _locate_bin(row, coords, mins, maxs, binsize, total_bins)
        if index is None:
            continue
        binned_values.setdefault(index, []).append(value)
        density_array[index] += 1

    # Create arrays based on keywords; bins with no data stay NaN
    if median:
        median_array = numpy.full(total_bins, numpy.nan)
    if avg:
        average_array = numpy.full(total_bins, numpy.nan)
    if std:
        std_array = numpy.full(total_bins, numpy.nan)

    # Each populated bin is summarised exactly once, now that all of its
    # values are known.
    for index, bin_values in binned_values.items():
        samples = numpy.asarray(bin_values)
        if median:
            median_array[index] = numpy.median(samples)
        if avg or std:
            # The mean is needed for std even when avg itself was not requested.
            mean = samples.sum() / samples.size
            if avg:
                average_array[index] = mean
            if std:
                std_array[index] = numpy.sqrt(numpy.sum((samples - mean) ** 2) / samples.size)

    # RETURN MEDIAN/AVERAGE/STANDARD DEVIATION
    return_list = []
    if median:
        return_list.append(median_array)
        print('Returning binned Medians')
    if avg:
        return_list.append(average_array)
        print('Returning binned Averages')
    if std:
        return_list.append(std_array)
        print('Returning binned standard deviations')
    if density:
        return_list.append(density_array)
        print('Returning binned densities')

    # Print out a little cheat sheet so people know what is in the array they're getting
    print('Now returning binned data')
    for dimension, (_, bin_by_obs) in enumerate(bin_by_inst_obs):
        print('Dimension ' + str(dimension) + ' is ' + bin_by_obs)
        print('    Range: [' + str(mins[dimension]) + ', ' + str(mins[dimension] + binsize[dimension]) +
              ', ... ' + str(mins[dimension] + (binsize[dimension] * (total_bins[dimension] - 2))) +
              ', ' + str(mins[dimension] + (binsize[dimension] * (total_bins[dimension] - 1))) + ']')

    return return_list