Source code for pydivide.read

# Copyright 2017 Regents of the University of Colorado. All Rights Reserved.
# Released under the MIT license.
# This software was developed at the University of Colorado's Laboratory for Atmospheric and Space Physics.
# Verify current version before use at: https://github.com/MAVENSDC/Pydivide

import calendar
import numpy as np
from .utilities import kp_regex
from .utilities import param_dict
from .utilities import remove_inst_tag
from .utilities import get_latest_files_from_date_range, read_iuvs_file, get_latest_iuvs_files_from_date_range
from .utilities import get_header_info
from .utilities import orbit_time
from collections import OrderedDict
import builtins
import os


def read(filename=None, input_time=None, instruments=None, insitu_only=False, specified_files_only=False):
    '''
    Read in a given in situ KP file (or files) into a dictionary object.
    Optional keywords may be used to downselect the instruments returned and the time window.

    Parameters:
        filename: str/list of str
            Name of the in situ KP file(s) to read in.
        input_time: list of str/int
            Set a time bounds/filter on the data; must be length 2, with the first value
            being the start time and the second value being the end time.
        instruments: str/list of str
            Optional keyword listing the instruments to include in the returned dictionary/structure.
        insitu_only: bool
            Optional keyword that allows you to specify that only in situ files should be read in
            (IUVS files are skipped).
        specified_files_only: bool
            Optional keyword that allows you to specify that only the filenames given in 'filename'
            are read in, not other files close in date/time as well.

    Returns:
        A dictionary (data structure) containing up to all of the columns included in a MAVEN
        in situ Key Parameter data file.

    Examples:
        >>> # Retrieve insitu and IUVS data for LPW and MAG on 2015-12-26.
        >>> insitu, iuvs = pydivide.read(input_time='2015-12-26', instruments=['lpw', 'mag'])

        >>> # Retrieve only insitu data for all instruments on 2017-06-19.
        >>> insitu = pydivide.read(input_time='2017-06-19', insitu_only=True)
    '''
    import pandas as pd
    import re
    from datetime import datetime, timedelta
    from dateutil.parser import parse

    filenames = []
    iuvs_filenames = []

    if instruments is not None:
        if not isinstance(instruments, builtins.list):
            instruments = [instruments]

    if filename is None and input_time is None:
        print('You must specify either a set of filenames to read in, or a time frame in which '
              'you want to search for downloaded files.')
        return

    if filename is not None:
        if not isinstance(filename, builtins.list):
            filename = [filename]

        dates = []
        for file in filename:
            date = re.findall(r'_(\d{8})', file)[0]
            dates.append(date)
            if 'iuvs' in file:
                iuvs_filenames.append(file)
            else:
                filenames.append(file)

        # To keep the rest of the code consistent, if someone gave a file, or files, to load
        # but no input_time, go ahead and create an 'input_time'
        if input_time is None:
            if len(dates) == 1:
                input_time = str(dates[0][:4]) + '-' + str(dates[0][4:6]) + '-' + str(dates[0][6:])
            else:
                beg_date = min(dates)
                end_date = max(dates)
                input_time = [str(beg_date[:4]) + '-' + str(beg_date[4:6]) + '-' + str(beg_date[6:]),
                              str(end_date[:4]) + '-' + str(end_date[4:6]) + '-' + str(end_date[6:])]

    # Check for orbit num rather than time string
    if isinstance(input_time, builtins.list):
        if isinstance(input_time[0], int):
            input_time = orbit_time(input_time[0], input_time[1])
    elif isinstance(input_time, int):
        input_time = orbit_time(input_time)

    # Turn string input into datetime objects
    if isinstance(input_time, list):
        if len(input_time[0]) <= 10:
            input_time[0] = input_time[0] + ' 00:00:00'
        if len(input_time[1]) <= 10:
            input_time[1] = input_time[1] + ' 00:00:00'
        date1 = parse(input_time[0])
        date2 = parse(input_time[1])
    else:
        if len(input_time) <= 10:
            input_time += ' 00:00:00'
        date1 = parse(input_time)
        date2 = date1 + timedelta(days=1)

    date1_unix = calendar.timegm(date1.timetuple())
    date2_unix = calendar.timegm(date2.timetuple())
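
    # Illustrative example of the logic above: a bare date string such as
    # input_time='2015-12-25' expands to the one-day window 2015-12-25 00:00:00
    # through 2015-12-26 00:00:00, while an orbit-number input such as
    # input_time=[1500, 1520] is first translated into time strings by orbit_time().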

    # Grab insitu and iuvs files for the specified/created date ranges
    date_range_filenames = get_latest_files_from_date_range(date1, date2)
    date_range_iuvs_filenames = get_latest_iuvs_files_from_date_range(date1, date2)

    # Add date range files to respective file lists if desired
    if not specified_files_only:
        filenames.extend(date_range_filenames)
        iuvs_filenames.extend(date_range_iuvs_filenames)

    if not date_range_filenames and not date_range_iuvs_filenames:
        if not filenames and not iuvs_filenames:
            print("No files found for the input date range, and no specific filenames were given. Exiting.")
            return

    # We may have searched a time range but also been handed specific files to load;
    # de-duplicate so that nothing is read in twice.
    filenames = list(set(filenames))
    iuvs_filenames = list(set(iuvs_filenames))
    iuvs_filenames.sort()

    kp_insitu = []
    if filenames:
        # Get column names
        names, inst = [], []
        crus_name, crus_inst = [], []
        c_found = False
        r_found = False
        for f in filenames:
            if kp_regex.match(os.path.basename(f)).group('description') == '_crustal' and not c_found:
                name, inss = get_header_info(f)
                # Strip off the first name for now (Time), and use that as the dataframe index.
                # Seems to make sense for now, but will it always?
                crus_name.extend(name[1:])
                crus_inst.extend(inss[1:])
                c_found = True
            elif kp_regex.match(os.path.basename(f)).group('description') == '' and not r_found:
                name, ins = get_header_info(f)
                # Strip off the first name for now (Time), and use that as the dataframe index.
                # Seems to make sense for now, but will it always?
                names.extend(name[1:])
                inst.extend(ins[1:])
                r_found = True

        all_names = names + crus_name
        all_inst = inst + crus_inst

        # Break the column names up into instrument groups
        lpw_group, euv_group, swe_group, swi_group, sta_group, sep_group, mag_group, \
            ngi_group, app_group, sc_group, crus_group = [], [], [], [], [], [], [], [], [], [], []
        for i, j in zip(all_inst, all_names):
            if re.match('^LPW$', i.strip()):
                lpw_group.append(j)
            elif re.match('^LPW-EUV$', i.strip()):
                euv_group.append(j)
            elif re.match('^SWEA$', i.strip()):
                swe_group.append(j)
            elif re.match('^SWIA$', i.strip()):
                swi_group.append(j)
            elif re.match('^STATIC$', i.strip()):
                sta_group.append(j)
            elif re.match('^SEP$', i.strip()):
                sep_group.append(j)
            elif re.match('^MAG$', i.strip()):
                mag_group.append(j)
            elif re.match('^NGIMS$', i.strip()):
                ngi_group.append(j)
            elif re.match('^MODELED_MAG$', i.strip()):
                crus_group.append(j)
            elif re.match('^SPICE$', i.strip()):
                # NB: need to split SPICE into APP and SPACECRAFT
                if re.match('(.+)APP(.+)', j):
                    app_group.append(j)
                else:
                    # Everything in SPICE that is not APP goes to SPACECRAFT,
                    # but do not include Orbit Number or Inbound/Outbound Flag here
                    if not re.match('(.+)(Orbit Number|Inbound Outbound Flag)', j):
                        sc_group.append(j)
            else:
                pass

        delete_groups = []
        if instruments is not None:
            if 'LPW' not in instruments and 'lpw' not in instruments:
                delete_groups += lpw_group
            if 'MAG' not in instruments and 'mag' not in instruments:
                delete_groups += mag_group
            if 'EUV' not in instruments and 'euv' not in instruments:
                delete_groups += euv_group
            if 'SWI' not in instruments and 'swi' not in instruments:
                delete_groups += swi_group
            if 'SWE' not in instruments and 'swe' not in instruments:
                delete_groups += swe_group
            if 'NGI' not in instruments and 'ngi' not in instruments:
                delete_groups += ngi_group
            if 'SEP' not in instruments and 'sep' not in instruments:
                delete_groups += sep_group
            if 'STA' not in instruments and 'sta' not in instruments:
                delete_groups += sta_group
            if 'MODELED_MAG' not in instruments and 'modeled_mag' not in instruments:
                delete_groups += crus_group
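
        # Illustrative example of the filtering above: with instruments=['lpw'], every
        # MAG/EUV/SWIA/SWEA/NGIMS/SEP/STATIC/MODELED_MAG column name lands in
        # delete_groups and is dropped from each per-file DataFrame below, while the
        # SPICE-derived APP and SPACECRAFT ephemeris columns are always retained.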

        # Read all relevant data into a pandas DataFrame called "temp"
        temp_data = []
        filenames.sort()
        for filename in filenames:
            # Determine the number of header lines
            nheader = 0
            with open(filename) as f:
                for line in f:
                    if line.startswith('#'):
                        nheader += 1

            if kp_regex.match(os.path.basename(filename)).group('description') == '_crustal':
                temp_data.append(pd.read_fwf(filename, skiprows=nheader, index_col=0,
                                             widths=[19] + len(crus_name) * [16], names=crus_name))
            else:
                temp_data.append(pd.read_fwf(filename, skiprows=nheader, index_col=0,
                                             widths=[19] + len(names) * [16], names=names))
            for i in delete_groups:
                del temp_data[-1][i]

        temp_unconverted = pd.concat(temp_data, axis=0)

        # Need to convert these columns explicitly.
        # This is kind of a hack, but I can't figure out a better way for now.
        if 'SWEA.Electron Spectrum Shape' in temp_unconverted and 'NGIMS.Density NO' in temp_unconverted:
            temp = temp_unconverted.astype(dtype={'SWEA.Electron Spectrum Shape': np.float64,
                                                  'NGIMS.Density NO': np.float64})
        elif 'SWEA.Electron Spectrum Shape' in temp_unconverted and 'NGIMS.Density NO' not in temp_unconverted:
            temp = temp_unconverted.astype(dtype={'SWEA.Electron Spectrum Shape': np.float64})
        elif 'SWEA.Electron Spectrum Shape' not in temp_unconverted and 'NGIMS.Density NO' in temp_unconverted:
            temp = temp_unconverted.astype(dtype={'NGIMS.Density NO': np.float64})
        else:
            temp = temp_unconverted

        # Cut out the times not included in the date range
        time_unix = [calendar.timegm(datetime.strptime(i, '%Y-%m-%dT%H:%M:%S').timetuple())
                     for i in temp.index]
        start_index = 0
        for t in time_unix:
            if t >= date1_unix:
                break
            start_index += 1
        end_index = 0
        for t in time_unix:
            if t >= date2_unix:
                break
            end_index += 1

        # Assign the first-level-only tags
        time_unix = time_unix[start_index:end_index]
        temp = temp[start_index:end_index]
        time = temp.index
        time_unix = pd.Series(time_unix)  # convert into a Series for consistency
        time_unix.index = temp.index

        if 'SPICE.Orbit Number' in list(temp):
            orbit = temp['SPICE.Orbit Number']
        else:
            orbit = None
        if 'SPICE.Inbound Outbound Flag' in list(temp):
            io_flag = temp['SPICE.Inbound Outbound Flag']
        else:
            io_flag = None
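
        # Note: in the returned dictionary, 'TimeString' holds this UTC index and
        # 'Time' holds the matching Unix timestamps, so insitu['TimeString'][0] and
        # insitu['Time'][0] describe the same sample.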

        # Build the sub-level DataFrames for the larger dictionary/structure
        app = temp[app_group]
        spacecraft = temp[sc_group]
        if instruments is not None:
            if 'LPW' in instruments or 'lpw' in instruments:
                lpw = temp[lpw_group]
            else:
                lpw = None
            if 'MAG' in instruments or 'mag' in instruments:
                mag = temp[mag_group]
            else:
                mag = None
            if 'EUV' in instruments or 'euv' in instruments:
                euv = temp[euv_group]
            else:
                euv = None
            if 'SWE' in instruments or 'swe' in instruments:
                swea = temp[swe_group]
            else:
                swea = None
            if 'SWI' in instruments or 'swi' in instruments:
                swia = temp[swi_group]
            else:
                swia = None
            if 'NGI' in instruments or 'ngi' in instruments:
                ngims = temp[ngi_group]
            else:
                ngims = None
            if 'SEP' in instruments or 'sep' in instruments:
                sep = temp[sep_group]
            else:
                sep = None
            if 'STA' in instruments or 'sta' in instruments:
                static = temp[sta_group]
            else:
                static = None
            if 'MODELED_MAG' in instruments or 'modeled_mag' in instruments:
                crus = temp[crus_group]
            else:
                crus = None
        else:
            lpw = temp[lpw_group]
            euv = temp[euv_group]
            swea = temp[swe_group]
            swia = temp[swi_group]
            static = temp[sta_group]
            sep = temp[sep_group]
            mag = temp[mag_group]
            ngims = temp[ngi_group]
            crus = temp[crus_group]

        # Strip out the duplicated instrument part of the column names
        # (this is a bit hardwired and can be improved)
        for i in [lpw, euv, swea, swia, sep, static, ngims, mag, crus, app, spacecraft]:
            if i is not None:
                i.columns = remove_inst_tag(i)

        if lpw is not None:
            lpw = lpw.rename(index=str, columns=param_dict)
        if euv is not None:
            euv = euv.rename(index=str, columns=param_dict)
        if swea is not None:
            swea = swea.rename(index=str, columns=param_dict)
        if swia is not None:
            swia = swia.rename(index=str, columns=param_dict)
        if sep is not None:
            sep = sep.rename(index=str, columns=param_dict)
        if static is not None:
            static = static.rename(index=str, columns=param_dict)
        if ngims is not None:
            ngims = ngims.rename(index=str, columns=param_dict)
        if mag is not None:
            mag = mag.rename(index=str, columns=param_dict)
        if crus is not None:
            crus = crus.rename(index=str, columns=param_dict)
        if app is not None:
            app = app.rename(index=str, columns=param_dict)
        if spacecraft is not None:
            spacecraft = spacecraft.rename(index=str, columns=param_dict)

        if orbit is not None and io_flag is not None:
            # Do not forget to save units
            # Define the list of first-level tag names
            tag_names = ['TimeString', 'Time', 'Orbit', 'IOflag',
                         'LPW', 'EUV', 'SWEA', 'SWIA', 'STATIC',
                         'SEP', 'MAG', 'NGIMS', 'MODELED_MAG', 'APP', 'SPACECRAFT']
            # Define the list of first-level data structures
            data_tags = [time, time_unix, orbit, io_flag,
                         lpw, euv, swea, swia, static,
                         sep, mag, ngims, crus, app, spacecraft]
        else:
            # Do not forget to save units
            # Define the list of first-level tag names
            tag_names = ['TimeString', 'Time',
                         'LPW', 'EUV', 'SWEA', 'SWIA', 'STATIC',
                         'SEP', 'MAG', 'NGIMS', 'MODELED_MAG', 'APP', 'SPACECRAFT']
            # Define the list of first-level data structures
            data_tags = [time, time_unix,
                         lpw, euv, swea, swia, static,
                         sep, mag, ngims, crus, app, spacecraft]

        kp_insitu = OrderedDict(zip(tag_names, data_tags))

    # Now for IUVS
    kp_iuvs = []
    if not insitu_only and iuvs_filenames:
        for file in iuvs_filenames:
            kp_iuvs.append(read_iuvs_file(file))

    if not kp_iuvs:
        return kp_insitu
    else:
        return kp_insitu, kp_iuvs
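

# Example usage (an illustrative sketch, not part of the module; it assumes the KP files
# for the requested dates have already been downloaded locally, e.g. via
# pydivide.download_files):
#
#     import pydivide
#
#     insitu = pydivide.read(input_time=['2016-02-01', '2016-02-02'],
#                            instruments=['swia', 'static'],
#                            insitu_only=True)
#     print(insitu['SWIA'].head())   # per-instrument pandas DataFrame
#     print(insitu['Time'][:5])      # Unix times aligned with insitu['TimeString']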