
# -*- coding: utf-8 -*-
"""
Download gridMET climatic time series for multiple variables, e.g. ETr,
temperature, 2 m wind speed, shortwave radiation, etc. Uses
`OPeNDAP <https://www.opendap.org>`_.

"""
import argparse
import datetime as dt
import logging
import os
import sys
import timeit
from time import sleep

import pandas as pd
pd.options.display.float_format = '{:,.10f}'.format
import refet
import xarray
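
# NOTE: the helper below is an illustrative sketch only and is not called
# by this module. It shows the OPeNDAP point-extraction pattern used in
# download_gridmet_opendap() for a single variable (alfalfa reference ET);
# the URL and variable name mirror the ``params`` mapping defined there.
# Assumes network access and an xarray backend that can read OPeNDAP URLs.
def _example_point_series(lon, lat):
    """Sketch: daily ETr series at the nearest gridMET cell via OPeNDAP."""
    url = ('http://thredds.northwestknowledge.net:8080/thredds/dodsC/'
           'agg_met_etr_1979_CurrentYear_CONUS.nc#fillmismatch')
    # select the grid cell nearest the point and pull the full daily record
    ds = xarray.open_dataset(url).sel(lon=lon, lat=lat, method='nearest')
    return ds['daily_mean_reference_evapotranspiration_alfalfa'].to_dataframe()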

def download_gridmet_opendap(input_csv, out_folder, year_filter='',
        update_data=False, optional_vars=None):
    """
    Download gridMET time series data for multiple climate variables for
    select gridMET cells as listed in ``input_csv``. Uses
    `OPeNDAP <https://www.opendap.org>`_.

    Arguments:
        input_csv (str): file path of input CSV produced by
            :mod:`prep_input.py`
        out_folder (str): directory path to save gridMET time series CSV
            files

    Keyword Arguments:
        year_filter (str): default ''. Single year or range to download
        update_data (bool): default False. Re-download existing data
        optional_vars (None or list): default None. List of additional
            gridMET variables to download using gridMET short names;
            currently only wind direction, named "th", is available.

    Returns:
        None

    Examples:
        Say we wanted to download data for 2016 through 2018, from the
        command line,

        .. code-block:: sh

            $ download_gridmet_opendap.py -i merged_input.csv -o gridmet_data -y 2016-2018

        note, "merged_input.csv" should have been created by first running
        :mod:`prep_input.py`. If the ``[-y, --years]`` option is not given
        the default behavior is to download gridMET data from 1979 up
        through yesterday.

        If the data for 2018 has changed since the last run, or for
        debugging purposes, you can re-download data for all or select
        years with the ``[-u, --update-data]`` option

        .. code-block:: sh

            $ download_gridmet_opendap.py -i merged_input.csv -o gridmet_data -y 2018 -u

        To download the same gridMET data within Python

        >>> from gridwxcomp import download_gridmet_opendap
        >>> download_gridmet_opendap('merged_input.csv', 'gridmet_data', '2016-2018')

        Running :func:`download_gridmet_opendap` also updates the CSV file
        produced from :mod:`prep_input.py` to include file paths to gridMET
        time series files that are paired with climate stations.

    """
    if not os.path.exists(out_folder):
        logging.info('\nCreating output folder: {}'.format(out_folder))
        os.makedirs(out_folder)

    # input CSV containing GRIDMET_ID, LAT, LON
    input_df = pd.read_csv(input_csv)

    # specify column order for output CSV variables
    output_order = ['date', 'year', 'month', 'day', 'centroid_lat',
                    'centroid_lon', 'elev_m', 'u2_ms', 'tmin_c', 'tmax_c',
                    'srad_wm2', 'ea_kpa', 'pair_kpa', 'prcp_mm', 'etr_mm',
                    'eto_mm']

    opendap_url = 'http://thredds.northwestknowledge.net:8080/thredds/dodsC'
    elev_nc = '{}/{}'.format(
        opendap_url, '/MET/elev/metdata_elevationdata.nc#fillmismatch')

    params = {
        'etr': {
            'nc': 'agg_met_etr_1979_CurrentYear_CONUS',
            'var': 'daily_mean_reference_evapotranspiration_alfalfa',
            'col': 'etr_mm'},
        'pet': {
            'nc': 'agg_met_pet_1979_CurrentYear_CONUS',
            'var': 'daily_mean_reference_evapotranspiration_grass',
            'col': 'eto_mm'},
        'pr': {
            'nc': 'agg_met_pr_1979_CurrentYear_CONUS',
            'var': 'precipitation_amount',
            'col': 'prcp_mm'},
        'sph': {
            'nc': 'agg_met_sph_1979_CurrentYear_CONUS',
            'var': 'daily_mean_specific_humidity',
            'col': 'q_kgkg'},
        'srad': {
            'nc': 'agg_met_srad_1979_CurrentYear_CONUS',
            'var': 'daily_mean_shortwave_radiation_at_surface',
            'col': 'srad_wm2'},
        'vs': {
            'nc': 'agg_met_vs_1979_CurrentYear_CONUS',
            'var': 'daily_mean_wind_speed',
            'col': 'u10_ms'},
        'tmmx': {
            'nc': 'agg_met_tmmx_1979_CurrentYear_CONUS',
            'var': 'daily_maximum_temperature',
            'col': 'tmax_k'},
        'tmmn': {
            'nc': 'agg_met_tmmn_1979_CurrentYear_CONUS',
            'var': 'daily_minimum_temperature',
            'col': 'tmin_k'},
        'th': {
            'nc': 'agg_met_th_1979_CurrentYear_CONUS',
            'var': 'daily_mean_wind_direction',
            'col': 'wdir_deg'}
    }

    extra_vars_available = ['th']
    default_vars = list(set(params.keys()).difference(extra_vars_available))

    if optional_vars is not None:
        if isinstance(optional_vars, list):
            pass
        elif isinstance(optional_vars, str):
            # comma separated string if coming from the CLI
            optional_vars = optional_vars.split(',')
        if not set(optional_vars).issubset(extra_vars_available):
            raise ValueError(
                'ERROR: one or more optional gridMET variables is not '
                'available, these are currently available: '
                f'{",".join(extra_vars_available)}'
            )
        # add the output column names of optional variables
        for el in optional_vars:
            output_order.append(params[el].get('col'))
    else:
        optional_vars = []

    # year filter
    if year_filter:
        year_list = sorted(list(_parse_int_set(year_filter)))
        logging.info('\nDownloading Years: {0}-{1}'.format(
            min(year_list), max(year_list)))
        date_list = pd.date_range(
            dt.datetime.strptime('{}-01-01'.format(min(year_list)),
                                 '%Y-%m-%d'),
            dt.datetime.strptime('{}-12-31'.format(max(year_list)),
                                 '%Y-%m-%d'))
    else:
        logging.info('\nDownloading full historical record (1979-present).')
        # create list of all dates; data collection ends yesterday
        end_date = (dt.datetime.today() - dt.timedelta(days=1)).date()
        date_list = pd.date_range(
            dt.datetime.strptime('1979-01-01', '%Y-%m-%d'), end_date)

    # loop through the input table row by row and grab the met data for
    # each gridMET cell based on lat/lon and start/end dates
    for index, row in input_df.iterrows():
        start_time = timeit.default_timer()
        # reset per-cell dataframes
        original_df = None
        export_df = None
        GRIDMET_ID_str = str(row.GRIDMET_ID)
        logging.info('Processing GRIDMET ID: {}'.format(GRIDMET_ID_str))
        output_name = 'gridmet_historical_' + GRIDMET_ID_str + '.csv'
        output_file = os.path.join(out_folder, output_name)

        if os.path.isfile(output_file):
            logging.info('{} exists. Checking for missing data.'.format(
                output_name))
            original_df = pd.read_csv(output_file, parse_dates=True)
            missing_dates = list(set(date_list) - set(pd.to_datetime(
                original_df['date'])))
            original_df.date = pd.to_datetime(
                original_df.date.astype(str), format='%Y-%m-%d')
            original_df['date'] = original_df.date.apply(
                lambda x: x.strftime('%Y-%m-%d'))
        else:
            logging.info('{} does not exist. Creating file.'.format(
                output_name))
            missing_dates = list(set(date_list))

        if not missing_dates and not update_data:
            logging.info('No missing data found. Skipping.')
            # add gridMET file path to input table if not already there
            input_df.loc[input_df.GRIDMET_ID == row.GRIDMET_ID,
                         'GRID_FILE_PATH'] = os.path.abspath(output_file)
            input_df.to_csv(input_csv, index=False)
            continue
        # option to re-download all data in the given year range
        elif not missing_dates and update_data:
            if min(date_list.year) == max(date_list.year):
                yr_rng = min(date_list.year)
            else:
                yr_rng = '{}-{}'.format(
                    min(date_list.year), max(date_list.year))
            logging.info('Updating data for years: {}'.format(yr_rng))
            start_date = min(date_list)
            end_date = max(date_list)
            # all years in the range are updated
            missing_years = sorted(list(set(date_list.year)))
        # otherwise only download years with missing data
        else:
            # min and max of missing dates (start: inclusive; end: exclusive)
            start_date = min(missing_dates)
            end_date = max(missing_dates)
            missing_years = []
            for date in missing_dates:
                missing_years = missing_years + [date.year]
            missing_years = sorted(list(set(missing_years)))
            logging.debug('  Missing Years: {}'.format(
                ', '.join(map(str, missing_years))))

        start_date_yr = start_date.year

        # calculate the grid cell centroid by snapping the station
        # coordinates to the gridMET grid; the constants are the gridMET
        # elevation asset's lower left corner coordinates and cell size
        gridmet_lon = -124.78749996666667
        gridmet_lat = 25.04583333333334
        gridmet_cs = 0.041666666666666664
        gridcell_lat = (int(abs(row.LAT - gridmet_lat) / gridmet_cs) *
                        gridmet_cs + gridmet_lat + 0.5 * gridmet_cs)
        gridcell_lon = (int(abs(row.LON - gridmet_lon) / gridmet_cs) *
                        gridmet_cs + gridmet_lon + 0.5 * gridmet_cs)
        gridmet_crs = 'EPSG:4326'
        gridmet_geo = [gridmet_cs, 0, gridmet_lon, 0, -gridmet_cs,
                       gridmet_lat + gridmet_cs * 585]
        logging.debug('  Latitude: {}'.format(gridcell_lat))
        logging.debug('  Longitude: {}'.format(gridcell_lon))

        # OpenDAP call for elevation
        elev_ds = xarray.open_dataset(elev_nc).sel(
            lon=gridcell_lon, lat=gridcell_lat, method='nearest')
        elev = elev_ds['elevation'].values[0]
        logging.debug('  Elevation: {}'.format(elev))

        # OpenDAP call for each variable
        met_df_list = []
        vars_to_download = default_vars + optional_vars
        for met_name in vars_to_download:
            logging.debug('  Variable: {}'.format(met_name))
            # connecting to the full time series and filtering afterwards
            # is faster than filtering by date within the OpenDAP call,
            # e.g. with day=pd.date_range(start=start_date, end=end_date).
            # "#fillmismatch" is appended so that any entries of a
            # mismatched type (presumably incorrectly filled null values
            # from the OpenDAP THREDDS server) are replaced with nulls, see
            # https://github.com/Unidata/netcdf-c/issues/1299
            met_nc = '{}/{}.nc#fillmismatch'.format(
                opendap_url, params[met_name]['nc'])
            met_ds = xarray.open_dataset(met_nc).sel(
                lon=gridcell_lon, lat=gridcell_lat, method='nearest'
            ).drop_vars(['crs', 'lat', 'lon']).rename({
                params[met_name]['var']: params[met_name]['col'],
                'day': 'date'})
            met_df = met_ds.to_dataframe()
            met_df_list.append(met_df)

        # this might need to be a merge call if the indices don't match
        export_df = pd.concat(met_df_list, axis=1)
        logging.debug(export_df.head())

        # if export_df is None skip to next ID
        if export_df is None:
            continue

        # reset index
        export_df = export_df.reset_index(drop=False)

        # convert date to datetime and create year, month, day columns
        export_df.date = pd.to_datetime(
            export_df.date.astype(str), format='%Y-%m-%d')
        export_df['year'] = export_df['date'].dt.year
        export_df['month'] = export_df['date'].dt.month
        export_df['day'] = export_df['date'].dt.day

        # format date for export
        export_df['date'] = export_df.date.apply(
            lambda x: x.strftime('%Y-%m-%d'))

        # filter to years with missing data; this is done here because
        # filtering by date on the OpenDAP call was very slow
        export_df = export_df.loc[export_df['year'].isin(missing_years)]

        # remove all negative precipitation values (gridMET bug)
        export_df.prcp_mm = export_df.prcp_mm.clip(lower=0)

        # convert 10 m wind speed to 2 m (ASCE Eqn. 33)
        zw = 10
        export_df['u2_ms'] = refet.calcs._wind_height_adjust(
            export_df.u10_ms, zw)
        # elevation from gridMET elevation layer
        export_df['elev_m'] = elev
        export_df['centroid_lat'] = gridcell_lat
        export_df['centroid_lon'] = gridcell_lon
        # air pressure from gridMET elevation using the refet module
        export_df['pair_kpa'] = refet.calcs._air_pressure(
            export_df.elev_m, method='asce')
        # actual vapor pressure (kPa) from specific humidity (kg/kg)
        export_df['ea_kpa'] = refet.calcs._actual_vapor_pressure(
            export_df.q_kgkg, export_df.pair_kpa)

        # unit conversions
        export_df['tmax_c'] = export_df.tmax_k - 273.15  # K to C
        export_df['tmin_c'] = export_df.tmin_k - 273.15  # K to C

        # add new data to original dataframe, remove duplicates
        export_df = pd.concat(
            [original_df, export_df], ignore_index=True, sort=True)
        export_df = export_df[output_order].drop_duplicates('date')
        export_df = export_df.sort_values(by=['year', 'month', 'day'])
        export_df = export_df.dropna()

        # add gridMET file path to input table
        input_df.loc[input_df.GRIDMET_ID == row.GRIDMET_ID,
                     'GRID_FILE_PATH'] = os.path.abspath(output_file)
        input_df.to_csv(input_csv, index=False)

        # write time series CSV to the output directory
        export_df.to_csv(output_file, columns=output_order, index=False,
                         float_format='%.10f')
        elapsed = timeit.default_timer() - start_time
        logging.info('\nDownload Time: {}'.format(elapsed))
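

# For reference, a minimal sketch of the 10 m to 2 m wind speed adjustment
# applied above through ``refet.calcs._wind_height_adjust`` (ASCE
# standardized Eqn. 33): u2 = uz * 4.87 / ln(67.8 * zw - 5.42). This
# helper is illustrative only and is not used by this module.
def _example_wind_height_adjust(uz, zw=10):
    """Sketch: adjust wind speed measured at zw meters to 2 m height."""
    import math
    return uz * 4.87 / math.log(67.8 * zw - 5.42)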
def _parse_int_set(nputstr=""):
    """
    Return a set of integers given a string of comma separated values
    and/or dash separated ranges, e.g. '2000,2002-2004'.

    http://thoughtsbyclayg.blogspot.com/2008/10/parsing-list-of-numbers-in-python.html
    """
    selection = set()
    invalid = set()
    # tokens are comma separated values
    tokens = [x.strip() for x in nputstr.split(',')]
    for i in tokens:
        try:
            # typically tokens are plain old integers
            selection.add(int(i))
        except ValueError:
            # if not, then it might be a range
            try:
                token = [int(k.strip()) for k in i.split('-')]
                if len(token) > 1:
                    token.sort()
                    # we have items separated by a dash, build a valid range
                    first = token[0]
                    last = token[len(token) - 1]
                    for x in range(first, last + 1):
                        selection.add(x)
            except ValueError:
                # not an int and not a range
                invalid.add(i)
    # report invalid tokens before returning valid selection
    if invalid:
        logging.warning('Invalid set: {}'.format(invalid))
    return selection


def arg_parse():
    """
    Command line usage of download_gridmet_opendap.py for downloading
    gridMET time series of several climatic variables using OpenDAP.
    """
    parser = argparse.ArgumentParser(
        description=arg_parse.__doc__,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    optional = parser._action_groups.pop()  # optionals listed second
    required = parser.add_argument_group('required arguments')
    required.add_argument(
        '-i', '--input', metavar='', required=True,
        help='Input file containing station and gridMET IDs created by '
             'prep_input.py')
    required.add_argument(
        '-o', '--out-dir', metavar='', required=True,
        help='Output directory to save time series CSVs of gridMET data')
    optional.add_argument(
        '-y', '--years', metavar='', default=None, type=str,
        help='Year(s) to download, single year (YYYY) or range (YYYY-YYYY)')
    optional.add_argument(
        '-u', '--update-data', required=False, default=False,
        action='store_true', help='Flag to re-download existing data')
    optional.add_argument(
        '-ov', '--optional-vars', required=False, default=None, type=str,
        help='Additional gridMET variables as a comma separated list')
    optional.add_argument(
        '--debug', default=logging.INFO, const=logging.DEBUG,
        help='Debug level logging', action='store_const', dest='loglevel')
    parser._action_groups.append(optional)  # to avoid optionals listed first
    args = parser.parse_args()
    return args


if __name__ == '__main__':
    args = arg_parse()

    logging.basicConfig(level=args.loglevel, format='%(message)s')
    logging.info('\n{}'.format('#' * 80))
    logging.info('{0:<20s} {1}'.format(
        'Run Time Stamp:', dt.datetime.now().isoformat(' ')))
    logging.info('{0:<20s} {1}'.format('Current Directory:', os.getcwd()))
    logging.info('{0:<20s} {1}'.format(
        'Script:', os.path.basename(sys.argv[0])))

    # pass optional_vars through so the -ov flag takes effect
    download_gridmet_opendap(
        input_csv=args.input, out_folder=args.out_dir,
        year_filter=args.years, update_data=args.update_data,
        optional_vars=args.optional_vars)
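

# Illustrative usage notes (hypothetical file names; assumes gridwxcomp is
# installed and "merged_input.csv" was created by prep_input.py):
#
#   _parse_int_set('2016-2018,2020') -> {2016, 2017, 2018, 2020}
#
#   from gridwxcomp import download_gridmet_opendap
#   download_gridmet_opendap('merged_input.csv', 'gridmet_data',
#                            year_filter='2018', optional_vars=['th'])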