Source code for pydivide.read

# Copyright 2017 Regents of the University of Colorado. All Rights Reserved.
# Released under the MIT license.
# This software was developed at the University of Colorado's Laboratory for Atmospheric and Space Physics.
# Verify current version before use at: https://github.com/MAVENSDC/Pydivide

import calendar
import numpy as np
from .utilities import kp_regex
from .utilities import param_dict
from .utilities import remove_inst_tag
from .utilities import get_latest_files_from_date_range, read_iuvs_file, get_latest_iuvs_files_from_date_range
from .utilities import get_header_info
from .utilities import orbit_time
from collections import OrderedDict
import builtins
import os


def read(filename=None, input_time=None, instruments=None, insitu_only=False, specified_files_only=False):
    '''
    Read in a given in situ KP file (or files) into a dictionary object.
    Optional keywords may be used to downselect the instruments returned and the time window.

    Parameters:
        filename: str/list of str
            Name of the in situ KP file(s) to read in.
        input_time: list of str/int
            Set a time bounds/filter on the data; must be length 2, with the first value
            being the start time and the second value being the end time.
        instruments: str/list of str
            Optional keyword listing the instruments to include in the returned dictionary/structure.
        insitu_only: bool
            Optional keyword that allows you to specify that only in situ files should be read in
            (IUVS files are skipped).
        specified_files_only: bool
            Optional keyword that allows you to specify that only the filenames given in 'filename'
            are read in, not other files close in date/time as well.

    Returns:
        A dictionary (data structure) containing up to all of the columns included in a MAVEN
        in situ Key Parameter data file.

    Examples:
        >>> # Retrieve insitu and IUVS data for LPW and MAG on 2015-12-26.
        >>> insitu, iuvs = pydivide.read(input_time='2015-12-26', instruments=['lpw', 'mag'])

        >>> # Retrieve only insitu data for all instruments on 2017-06-19.
        >>> insitu = pydivide.read(input_time='2017-06-19', insitu_only=True)
    '''
    import pandas as pd
    import re
    from datetime import datetime, timedelta
    from dateutil.parser import parse

    filenames = []
    iuvs_filenames = []

    if instruments is not None:
        if not isinstance(instruments, builtins.list):
            instruments = [instruments]

    if filename is None and input_time is None:
        print('You must specify either a set of filenames to read in, or a time frame in which '
              'you want to search for downloaded files.')
        return

    if filename is not None:
        if not isinstance(filename, builtins.list):
            filename = [filename]

        dates = []
        for file in filename:
            date = re.findall(r'_(\d{8})', file)[0]
            dates.append(date)
            if 'iuvs' in file:
                iuvs_filenames.append(file)
            else:
                filenames.append(file)

        # To keep the rest of the code consistent, if someone gave a file, or files, to load
        # but no input_time, go ahead and create an 'input_time'
        if input_time is None:
            if len(dates) == 1:
                input_time = str(dates[0][:4]) + '-' + str(dates[0][4:6]) + '-' + str(dates[0][6:])
            else:
                beg_date = min(dates)
                end_date = max(dates)
                input_time = [str(beg_date[:4]) + '-' + str(beg_date[4:6]) + '-' + str(beg_date[6:]),
                              str(end_date[:4]) + '-' + str(end_date[4:6]) + '-' + str(end_date[6:])]

    # Check for orbit num rather than time string
    if isinstance(input_time, builtins.list):
        if isinstance(input_time[0], int):
            input_time = orbit_time(input_time[0], input_time[1])
    elif isinstance(input_time, int):
        input_time = orbit_time(input_time)

    # Turn string input into datetime objects
    if isinstance(input_time, list):
        if len(input_time[0]) <= 10:
            input_time[0] = input_time[0] + ' 00:00:00'
        if len(input_time[1]) <= 10:
            input_time[1] = input_time[1] + ' 00:00:00'
        date1 = parse(input_time[0])
        date2 = parse(input_time[1])
    else:
        if len(input_time) <= 10:
            input_time += ' 00:00:00'
        date1 = parse(input_time)
        date2 = date1 + timedelta(days=1)

    date1_unix = calendar.timegm(date1.timetuple())
    date2_unix = calendar.timegm(date2.timetuple())
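
    # Illustrative example of the logic above: a bare date string such as
    # input_time='2015-12-25' expands to the one-day window 2015-12-25 00:00:00
    # through 2015-12-26 00:00:00, while an orbit-number input such as
    # input_time=[1500, 1520] is first translated into time strings by orbit_time().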

    # Grab insitu and iuvs files for the specified/created date ranges
    date_range_filenames = get_latest_files_from_date_range(date1, date2)
    date_range_iuvs_filenames = get_latest_iuvs_files_from_date_range(date1, date2)

    # Add date range files to respective file lists if desired
    if not specified_files_only:
        filenames.extend(date_range_filenames)
        iuvs_filenames.extend(date_range_iuvs_filenames)

    if not date_range_filenames and not date_range_iuvs_filenames:
        if not filenames and not iuvs_filenames:
            print("No files found for the input date range, and no specific filenames were given. Exiting.")
            return

    # We may have searched a time range but also been handed specific files to load;
    # de-duplicate so that nothing is read in twice.
    filenames = list(set(filenames))
    iuvs_filenames = list(set(iuvs_filenames))
    iuvs_filenames.sort()

    kp_insitu = []
    if filenames:
        # Get column names
        names, inst = [], []
        crus_name, crus_inst = [], []
        c_found = False
        r_found = False
        for f in filenames:
            if kp_regex.match(os.path.basename(f)).group('description') == '_crustal' and not c_found:
                name, inss = get_header_info(f)
                # Strip off the first name for now (Time), and use that as the dataframe index.
                # Seems to make sense for now, but will it always?
                crus_name.extend(name[1:])
                crus_inst.extend(inss[1:])
                c_found = True
            elif kp_regex.match(os.path.basename(f)).group('description') == '' and not r_found:
                name, ins = get_header_info(f)
                # Strip off the first name for now (Time), and use that as the dataframe index.
                # Seems to make sense for now, but will it always?
                names.extend(name[1:])
                inst.extend(ins[1:])
                r_found = True

        all_names = names + crus_name
        all_inst = inst + crus_inst

        # Break the column names up into instrument groups
        lpw_group, euv_group, swe_group, swi_group, sta_group, sep_group, mag_group, \
            ngi_group, app_group, sc_group, crus_group = [], [], [], [], [], [], [], [], [], [], []
        for i, j in zip(all_inst, all_names):
            if re.match('^LPW$', i.strip()):
                lpw_group.append(j)
            elif re.match('^LPW-EUV$', i.strip()):
                euv_group.append(j)
            elif re.match('^SWEA$', i.strip()):
                swe_group.append(j)
            elif re.match('^SWIA$', i.strip()):
                swi_group.append(j)
            elif re.match('^STATIC$', i.strip()):
                sta_group.append(j)
            elif re.match('^SEP$', i.strip()):
                sep_group.append(j)
            elif re.match('^MAG$', i.strip()):
                mag_group.append(j)
            elif re.match('^NGIMS$', i.strip()):
                ngi_group.append(j)
            elif re.match('^MODELED_MAG$', i.strip()):
                crus_group.append(j)
            elif re.match('^SPICE$', i.strip()):
                # NB: need to split SPICE into APP and SPACECRAFT
                if re.match('(.+)APP(.+)', j):
                    app_group.append(j)
                else:
                    # Everything in SPICE that is not APP goes to SPACECRAFT,
                    # but do not include Orbit Number or Inbound/Outbound Flag here
                    if not re.match('(.+)(Orbit Number|Inbound Outbound Flag)', j):
                        sc_group.append(j)
            else:
                pass

        delete_groups = []
        if instruments is not None:
            if 'LPW' not in instruments and 'lpw' not in instruments:
                delete_groups += lpw_group
            if 'MAG' not in instruments and 'mag' not in instruments:
                delete_groups += mag_group
            if 'EUV' not in instruments and 'euv' not in instruments:
                delete_groups += euv_group
            if 'SWI' not in instruments and 'swi' not in instruments:
                delete_groups += swi_group
            if 'SWE' not in instruments and 'swe' not in instruments:
                delete_groups += swe_group
            if 'NGI' not in instruments and 'ngi' not in instruments:
                delete_groups += ngi_group
            if 'SEP' not in instruments and 'sep' not in instruments:
                delete_groups += sep_group
            if 'STA' not in instruments and 'sta' not in instruments:
                delete_groups += sta_group
            if 'MODELED_MAG' not in instruments and 'modeled_mag' not in instruments:
                delete_groups += crus_group
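
        # Illustrative example of the filtering above: with instruments=['lpw'], every
        # MAG/EUV/SWIA/SWEA/NGIMS/SEP/STATIC/MODELED_MAG column name lands in
        # delete_groups and is dropped from each per-file DataFrame below, while the
        # SPICE-derived APP and SPACECRAFT ephemeris columns are always retained.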

        # Read all relevant data into a pandas DataFrame called "temp"
        temp_data = []
        filenames.sort()
        for filename in filenames:
            # Determine the number of header lines
            nheader = 0
            with open(filename) as f:
                for line in f:
                    if line.startswith('#'):
                        nheader += 1

            if kp_regex.match(os.path.basename(filename)).group('description') == '_crustal':
                temp_data.append(pd.read_fwf(filename, skiprows=nheader, index_col=0,
                                             widths=[19] + len(crus_name) * [16], names=crus_name))
            else:
                temp_data.append(pd.read_fwf(filename, skiprows=nheader, index_col=0,
                                             widths=[19] + len(names) * [16], names=names))
            for i in delete_groups:
                del temp_data[-1][i]

        temp_unconverted = pd.concat(temp_data, axis=0)

        # Need to convert these columns explicitly.
        # This is kind of a hack, but I can't figure out a better way for now.
        if 'SWEA.Electron Spectrum Shape' in temp_unconverted and 'NGIMS.Density NO' in temp_unconverted:
            temp = temp_unconverted.astype(dtype={'SWEA.Electron Spectrum Shape': np.float64,
                                                  'NGIMS.Density NO': np.float64})
        elif 'SWEA.Electron Spectrum Shape' in temp_unconverted and 'NGIMS.Density NO' not in temp_unconverted:
            temp = temp_unconverted.astype(dtype={'SWEA.Electron Spectrum Shape': np.float64})
        elif 'SWEA.Electron Spectrum Shape' not in temp_unconverted and 'NGIMS.Density NO' in temp_unconverted:
            temp = temp_unconverted.astype(dtype={'NGIMS.Density NO': np.float64})
        else:
            temp = temp_unconverted

        # Cut out the times not included in the date range
        time_unix = [calendar.timegm(datetime.strptime(i, '%Y-%m-%dT%H:%M:%S').timetuple())
                     for i in temp.index]
        start_index = 0
        for t in time_unix:
            if t >= date1_unix:
                break
            start_index += 1
        end_index = 0
        for t in time_unix:
            if t >= date2_unix:
                break
            end_index += 1

        # Assign the first-level-only tags
        time_unix = time_unix[start_index:end_index]
        temp = temp[start_index:end_index]
        time = temp.index
        time_unix = pd.Series(time_unix)  # convert into a Series for consistency
        time_unix.index = temp.index

        if 'SPICE.Orbit Number' in list(temp):
            orbit = temp['SPICE.Orbit Number']
        else:
            orbit = None
        if 'SPICE.Inbound Outbound Flag' in list(temp):
            io_flag = temp['SPICE.Inbound Outbound Flag']
        else:
            io_flag = None
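
        # Note: in the returned dictionary, 'TimeString' holds this UTC index and
        # 'Time' holds the matching Unix timestamps, so insitu['TimeString'][0] and
        # insitu['Time'][0] describe the same sample.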

        # Build the sub-level DataFrames for the larger dictionary/structure
        app = temp[app_group]
        spacecraft = temp[sc_group]
        if instruments is not None:
            if 'LPW' in instruments or 'lpw' in instruments:
                lpw = temp[lpw_group]
            else:
                lpw = None
            if 'MAG' in instruments or 'mag' in instruments:
                mag = temp[mag_group]
            else:
                mag = None
            if 'EUV' in instruments or 'euv' in instruments:
                euv = temp[euv_group]
            else:
                euv = None
            if 'SWE' in instruments or 'swe' in instruments:
                swea = temp[swe_group]
            else:
                swea = None
            if 'SWI' in instruments or 'swi' in instruments:
                swia = temp[swi_group]
            else:
                swia = None
            if 'NGI' in instruments or 'ngi' in instruments:
                ngims = temp[ngi_group]
            else:
                ngims = None
            if 'SEP' in instruments or 'sep' in instruments:
                sep = temp[sep_group]
            else:
                sep = None
            if 'STA' in instruments or 'sta' in instruments:
                static = temp[sta_group]
            else:
                static = None
            if 'MODELED_MAG' in instruments or 'modeled_mag' in instruments:
                crus = temp[crus_group]
            else:
                crus = None
        else:
            lpw = temp[lpw_group]
            euv = temp[euv_group]
            swea = temp[swe_group]
            swia = temp[swi_group]
            static = temp[sta_group]
            sep = temp[sep_group]
            mag = temp[mag_group]
            ngims = temp[ngi_group]
            crus = temp[crus_group]

        # Strip out the duplicated instrument part of the column names
        # (this is a bit hardwired and can be improved)
        for i in [lpw, euv, swea, swia, sep, static, ngims, mag, crus, app, spacecraft]:
            if i is not None:
                i.columns = remove_inst_tag(i)

        if lpw is not None:
            lpw = lpw.rename(index=str, columns=param_dict)
        if euv is not None:
            euv = euv.rename(index=str, columns=param_dict)
        if swea is not None:
            swea = swea.rename(index=str, columns=param_dict)
        if swia is not None:
            swia = swia.rename(index=str, columns=param_dict)
        if sep is not None:
            sep = sep.rename(index=str, columns=param_dict)
        if static is not None:
            static = static.rename(index=str, columns=param_dict)
        if ngims is not None:
            ngims = ngims.rename(index=str, columns=param_dict)
        if mag is not None:
            mag = mag.rename(index=str, columns=param_dict)
        if crus is not None:
            crus = crus.rename(index=str, columns=param_dict)
        if app is not None:
            app = app.rename(index=str, columns=param_dict)
        if spacecraft is not None:
            spacecraft = spacecraft.rename(index=str, columns=param_dict)

        if orbit is not None and io_flag is not None:
            # Do not forget to save units
            # Define the list of first-level tag names
            tag_names = ['TimeString', 'Time', 'Orbit', 'IOflag',
                         'LPW', 'EUV', 'SWEA', 'SWIA', 'STATIC',
                         'SEP', 'MAG', 'NGIMS', 'MODELED_MAG', 'APP', 'SPACECRAFT']
            # Define the list of first-level data structures
            data_tags = [time, time_unix, orbit, io_flag,
                         lpw, euv, swea, swia, static,
                         sep, mag, ngims, crus, app, spacecraft]
        else:
            # Do not forget to save units
            # Define the list of first-level tag names
            tag_names = ['TimeString', 'Time',
                         'LPW', 'EUV', 'SWEA', 'SWIA', 'STATIC',
                         'SEP', 'MAG', 'NGIMS', 'MODELED_MAG', 'APP', 'SPACECRAFT']
            # Define the list of first-level data structures
            data_tags = [time, time_unix,
                         lpw, euv, swea, swia, static,
                         sep, mag, ngims, crus, app, spacecraft]

        kp_insitu = OrderedDict(zip(tag_names, data_tags))

    # Now for IUVS
    kp_iuvs = []
    if not insitu_only and iuvs_filenames:
        for file in iuvs_filenames:
            kp_iuvs.append(read_iuvs_file(file))

    if not kp_iuvs:
        return kp_insitu
    else:
        return kp_insitu, kp_iuvs
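

# Example usage (an illustrative sketch, not part of the module; it assumes the KP files
# for the requested dates have already been downloaded locally, e.g. via
# pydivide.download_files):
#
#     import pydivide
#
#     insitu = pydivide.read(input_time=['2016-02-01', '2016-02-02'],
#                            instruments=['swia', 'static'],
#                            insitu_only=True)
#     print(insitu['SWIA'].head())   # per-instrument pandas DataFrame
#     print(insitu['Time'][:5])      # Unix times aligned with insitu['TimeString']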