# -*- coding: utf-8 -*-
"""
Download gridMET climatic time series for multiple variables, e.g. ETr,
air temperature, 2 m wind speed, and shortwave radiation. Uses
`OPeNDAP <https://www.opendap.org>`_.
"""
import argparse
import datetime as dt
import logging
import os
import sys
import timeit
from time import sleep
import pandas as pd
pd.options.display.float_format = '{:,.10f}'.format
import refet
import xarray
def download_gridmet_opendap(input_csv, out_folder, year_filter='',
update_data=False, optional_vars=None):
"""
Download gridMET time series data for multiple climate variables for
    select gridMET cells as listed in ``input_csv``. Uses `OPeNDAP <https://www.opendap.org>`_.

Arguments:
input_csv (str): file path of input CSV produced by
:mod:`prep_input.py`
        out_folder (str): directory path to save gridMET time series CSV files

Keyword Arguments:
        year_filter (str): default ''. Single year or range of years to
            download, e.g. '2016' or '2016-2018'
update_data (bool): default False. Re-download existing data
optional_vars (None or list): default None. List of additional gridMET
vars to download using gridMET short names, currently only
            wind direction named "th" is available.

Returns:
        None

Examples:
Say we wanted to download data for 2016 through 2018, from the command
        line,

        .. code-block:: sh

            $ download_gridmet_opendap.py -i merged_input.csv -o gridmet_data -y 2016-2018

        note, "merged_input.csv" should have been created by first running
        :mod:`prep_input.py`. If the ``[-y, --years]`` option is not given
        the default behaviour is to download gridMET data from 1979 up through
        yesterday.
If the data for 2018 has changed since the last run or for debugging
purposes you can re-download data for all or select years with the
``[-u, --update-data]`` option

        .. code-block:: sh

            $ download_gridmet_opendap.py -i merged_input.csv -o gridmet_data -y 2018 -u

        To download the same gridMET data within Python

>>> from gridwxcomp import download_gridmet_opendap
>>> download_gridmet_opendap('merged_input.csv',
'gridmet_data',
'2016-2018'
)
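
        Additional gridMET variables can also be downloaded with the
        ``[-ov, --optional-vars]`` option, e.g. daily mean wind direction
        ("th"), from the command line,

        .. code-block:: sh

            $ download_gridmet_opendap.py -i merged_input.csv -o gridmet_data -ov th
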
        Running :func:`download_gridmet_opendap` also updates the CSV file
produced from :mod:`prep_input.py` to include file paths to gridMET
time series files that are paired with climate stations.
"""
if not os.path.exists(out_folder):
logging.info('\nCreating output folder: {}'.format(out_folder))
os.makedirs(out_folder)
# Input .csv containing GRIDMET_ID, LAT, LON
input_df = pd.read_csv(input_csv)
# Specify column order for output .csv Variables:
output_order = ['date', 'year', 'month', 'day', 'centroid_lat',
'centroid_lon', 'elev_m', 'u2_ms', 'tmin_c', 'tmax_c',
'srad_wm2', 'ea_kpa', 'pair_kpa', 'prcp_mm', 'etr_mm',
'eto_mm']
opendap_url = 'http://thredds.northwestknowledge.net:8080/thredds/dodsC'
elev_nc = '{}/{}'.format(
opendap_url, '/MET/elev/metdata_elevationdata.nc#fillmismatch')
params = {
'etr': {
'nc': 'agg_met_etr_1979_CurrentYear_CONUS',
'var': 'daily_mean_reference_evapotranspiration_alfalfa',
'col': 'etr_mm'},
'pet': {
'nc': 'agg_met_pet_1979_CurrentYear_CONUS',
'var': 'daily_mean_reference_evapotranspiration_grass',
'col': 'eto_mm'},
'pr': {
'nc': 'agg_met_pr_1979_CurrentYear_CONUS',
'var': 'precipitation_amount',
'col': 'prcp_mm'},
'sph': {
'nc': 'agg_met_sph_1979_CurrentYear_CONUS',
'var': 'daily_mean_specific_humidity',
'col': 'q_kgkg'},
'srad': {
'nc': 'agg_met_srad_1979_CurrentYear_CONUS',
'var': 'daily_mean_shortwave_radiation_at_surface',
'col': 'srad_wm2'},
'vs': {
'nc': 'agg_met_vs_1979_CurrentYear_CONUS',
'var': 'daily_mean_wind_speed',
'col': 'u10_ms'},
'tmmx': {
'nc': 'agg_met_tmmx_1979_CurrentYear_CONUS',
'var': 'daily_maximum_temperature',
'col': 'tmax_k'},
'tmmn': {
'nc': 'agg_met_tmmn_1979_CurrentYear_CONUS',
'var': 'daily_minimum_temperature',
'col': 'tmin_k'},
'th': {
'nc': 'agg_met_th_1979_CurrentYear_CONUS',
'var': 'daily_mean_wind_direction',
'col': 'wdir_deg'}
}
extra_vars_available = ['th']
default_vars = list(set(params.keys()).difference(extra_vars_available))
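    # note: the set difference is unordered; default_vars holds the eight
    # non-optional variables ('etr', 'pet', 'pr', 'sph', 'srad', 'vs',
    # 'tmmx', 'tmmn') in arbitrary order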
if optional_vars is not None:
if isinstance(optional_vars, list):
pass
elif isinstance(optional_vars, str): # if using CLI
optional_vars = optional_vars.split(',')
if not set(optional_vars).issubset(extra_vars_available):
raise ValueError(
                'ERROR: one or more optional gridMET variables is not '
                'available, these are all currently available: '
f'{",".join(params.keys())}'
)
# add the written names to list
else:
for el in optional_vars:
output_order.append(params[el].get('col'))
else:
optional_vars = []
# Year Filter
if year_filter:
year_list = sorted(list(_parse_int_set(year_filter)))
logging.info('\nDownloading Years: {0}-{1}'.format(min(year_list),
max(year_list)))
date_list = pd.date_range(
dt.datetime.strptime('{}-01-01'.format(min(year_list)),
'%Y-%m-%d'),
dt.datetime.strptime('{}-12-31'.format(max(year_list)),
'%Y-%m-%d'))
else:
logging.info('\nDownloading full historical record (1979-present).')
# Create List of all dates
# determine end date of data collection
current_date = dt.datetime.today()
        # end date is yesterday (avoids an invalid date on the first of a month)
        end_date = current_date.date() - dt.timedelta(days=1)
date_list = pd.date_range(dt.datetime.strptime('1979-01-01',
'%Y-%m-%d'), end_date)
# Loop through dataframe row by row and grab desired met data from
# GRID collections based on Lat/Lon and Start/End dates
for index, row in input_df.iterrows():
start_time = timeit.default_timer()
# Reset original_df
original_df = None
export_df = None
GRIDMET_ID_str = str(row.GRIDMET_ID)
logging.info('Processing GRIDMET ID: {}'.format(GRIDMET_ID_str))
output_name = 'gridmet_historical_' + GRIDMET_ID_str + '.csv'
output_file = os.path.join(out_folder, output_name)
if os.path.isfile(output_file):
logging.info('{} exists. Checking for missing data.'.format(
output_name))
original_df = pd.read_csv(output_file, parse_dates=True)
missing_dates = list(set(date_list) - set(pd.to_datetime(
original_df['date'])))
original_df.date = pd.to_datetime(original_df.date.astype(str),
format='%Y-%m-%d')
original_df['date'] = original_df.date.apply(lambda x: x.strftime(
'%Y-%m-%d'))
else:
            logging.info('{} does not exist. Creating file.'.format(
output_name))
missing_dates = list(set(date_list))
if not missing_dates and not update_data:
logging.info('No missing data found. Skipping')
# Add gridMET file path to input table if not already there
input_df.loc[input_df.GRIDMET_ID == row.GRIDMET_ID,\
'GRID_FILE_PATH'] = os.path.abspath(output_file)
input_df.to_csv(input_csv, index=False)
continue
# for option to redownload all data in given year range
elif not missing_dates and update_data:
if min(date_list.year) == max(date_list.year):
yr_rng = min(date_list.year)
else:
yr_rng = '{}-{}'.format(
min(date_list.year), max(date_list.year))
logging.info('Updating data for years: {}'.format(yr_rng))
start_date = min(date_list)
end_date = max(date_list)
# missing years are all years for updating
missing_years = sorted(list(set(date_list.year)))
# only download years with missing data
else:
# Min and Max of Missing Dates (Start: Inclusive; End: Exclusive)
start_date = min(missing_dates)
end_date = max(missing_dates)
missing_years = []
for date in missing_dates:
missing_years = missing_years + [date.year]
missing_years = sorted(list(set(missing_years)))
logging.debug(' Missing Years: {}'.format(
', '.join(map(str, missing_years))))
        # Pull the full record for each variable, then filter to missing years
        # and convert units before writing the output CSV
start_date_yr = start_date.year
        # Calculate the gridMET cell centroid for the station's lat/lon
# gridMET elevation asset lower left corner coordinates
gridmet_lon = -124.78749996666667
gridmet_lat = 25.04583333333334
gridmet_cs = 0.041666666666666664
gridcell_lat = (int(abs(row.LAT - gridmet_lat) / gridmet_cs) * gridmet_cs +
gridmet_lat + 0.5 * gridmet_cs)
gridcell_lon = (int(abs(row.LON - gridmet_lon) / gridmet_cs) * gridmet_cs +
gridmet_lon + 0.5 * gridmet_cs)
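        # illustrative example: a station at LAT=40.0 falls in cell row
        # int(abs(40.0 - 25.0458333) / 0.0416667) = 358, giving a cell centroid
        # latitude of 358 * 0.0416667 + 25.0458333 + 0.0208333, i.e. ~39.9833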
gridmet_crs = 'EPSG:4326'
gridmet_geo = [gridmet_cs, 0, gridmet_lon,
0, -gridmet_cs, gridmet_lat + gridmet_cs * 585]
logging.debug(' Latitude: {}'.format(gridcell_lat))
logging.debug(' Longitude: {}'.format(gridcell_lon))
# OpenDAP call for elevation
elev_ds = xarray.open_dataset(elev_nc)\
.sel(lon=gridcell_lon, lat=gridcell_lat, method='nearest')
elev = elev_ds['elevation'].values[0]
logging.debug(' Elevation: {}'.format(elev))
# OpenDAP call for each variable
met_df_list = []
vars_to_download = default_vars + optional_vars
for met_name in vars_to_download:
logging.debug(' Variable: {}'.format(met_name))
            # connecting to the full time series and filtering afterwards is
            # faster than subsetting in the OpenDAP call itself, e.g. with
            # .sel(day=pd.date_range(start=start_date, end=end_date))
# had to add #fillmismatch to fill any data entries that are of
# a different type (presumably incorrectly filled null values
# by the OpenDap Thredds server) with null values.
# issue here: https://github.com/Unidata/netcdf-c/issues/1299
met_nc = '{}/{}.nc#fillmismatch'.format(
opendap_url, params[met_name]['nc']
)
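            # e.g. for 'etr' this resolves to:
            # http://thredds.northwestknowledge.net:8080/thredds/dodsC/agg_met_etr_1979_CurrentYear_CONUS.nc#fillmismatch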
met_ds = xarray.open_dataset(met_nc).sel(
lon=gridcell_lon, lat=gridcell_lat, method='nearest'
).drop_vars(['crs', 'lat', 'lon']).rename({
params[met_name]['var']:params[met_name]['col'],'day':'date'
})
met_df = met_ds.to_dataframe()
# logging.debug(met_df.head())
met_df_list.append(met_df)
# This might need to be a merge call if the indices don't match
export_df = pd.concat(met_df_list, axis=1)
logging.debug(export_df.head())
# End OpenDAP stuff
# If export_df is None (skip to next ID)
if export_df is None:
continue
# Reset Index
export_df = export_df.reset_index(drop=False)
        # Convert date strings to datetime and create year, month, day columns
export_df.date = pd.to_datetime(export_df.date.astype(str),
format='%Y-%m-%d')
export_df['year'] = export_df['date'].dt.year
export_df['month'] = export_df['date'].dt.month
export_df['day'] = export_df['date'].dt.day
# export_df['DOY'] = export_df['Date'].dt.dayofyear
# Format Date for export
export_df['date'] = export_df.date.apply(lambda x: x.strftime(
'%Y-%m-%d'))
# CGM - This is needed here since filtering by date on the OpenDAP call was really slow
# Should the dataframe be filtered to missing years or based on start/end date?
# export_df = export_df.loc[(export_df['date'] >= start_date.strftime('%Y-%m-%d')) &
# (export_df['date'] <= end_date.strftime('%Y-%m-%d'))]
export_df = export_df.loc[export_df['year'].isin(missing_years)]
# export_df = export_df.reset_index(drop=False)
# Remove all negative Prcp values (GRIDMET Bug)
export_df.prcp_mm = export_df.prcp_mm.clip(lower=0)
# Convert 10m windspeed to 2m (ASCE Eqn. 33)
zw = 10
export_df['u2_ms'] = refet.calcs._wind_height_adjust(
export_df.u10_ms, zw)
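        # ASCE Eq. 33 log wind profile: u2 = uz * 4.87 / ln(67.8 * zw - 5.42),
        # with zw = 10 m (gridMET wind measurement height)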
# elevation from gridMET elevation layer
export_df['elev_m'] = elev
export_df['centroid_lat'] = gridcell_lat
export_df['centroid_lon'] = gridcell_lon
# air pressure from gridmet elevation using refet module
export_df['pair_kpa'] = refet.calcs._air_pressure(
export_df.elev_m, method='asce')
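        # ASCE formulation: pair = 101.3 * ((293 - 0.0065 * elev_m) / 293)**5.26 (kPa)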
        # actual vapor pressure (kPa) from specific humidity (kg/kg) using refet module
export_df['ea_kpa'] = refet.calcs._actual_vapor_pressure(
export_df.q_kgkg, export_df.pair_kpa)
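        # i.e. ea = q * pair / (0.622 + 0.378 * q)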
# Unit Conversions
export_df['tmax_c'] = export_df.tmax_k - 273.15 # K to C
export_df['tmin_c'] = export_df.tmin_k - 273.15 # K to C
# export_df['Tavg_C'] = (export_df.Tmax_C + export_df.Tmin_C)/2
# Relative Humidity from gridMET min and max
# export_df['RH_avg'] = (export_df.RH_max + export_df.RH_min)/2
# Add new data to original dataframe, remove duplicates
export_df = pd.concat([original_df, export_df], ignore_index=True,
sort=True)
export_df = export_df[output_order].drop_duplicates('date')
export_df = export_df.sort_values(by=['year', 'month', 'day'])
export_df = export_df.dropna()
# Add gridMET file path to input table
input_df.loc[input_df.GRIDMET_ID == row.GRIDMET_ID,\
'GRID_FILE_PATH'] = os.path.abspath(output_file)
input_df.to_csv(input_csv, index=False)
# Write csv files to working directory
export_df.to_csv(output_file, columns=output_order, index=False,
float_format='%.10f')
elapsed = timeit.default_timer() - start_time
logging.info('\nDownload Time: {}'.format(elapsed))
def _parse_int_set(nputstr=""):
"""Return list of numbers given a string of ranges
http://thoughtsbyclayg.blogspot.com/2008/10/
parsing-list-of-numbers-in-python.html
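
    Example:
        >>> sorted(_parse_int_set('2016-2018,2020'))
        [2016, 2017, 2018, 2020]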
"""
selection = set()
invalid = set()
# tokens are comma separated values
tokens = [x.strip() for x in nputstr.split(',')]
for i in tokens:
try:
# typically tokens are plain old integers
selection.add(int(i))
        except ValueError:
# if not, then it might be a range
try:
token = [int(k.strip()) for k in i.split('-')]
if len(token) > 1:
token.sort()
                    # we have items separated by a dash
# try to build a valid range
first = token[0]
last = token[len(token) - 1]
for x in range(first, last + 1):
selection.add(x)
            except ValueError:
# not an int and not a range...
invalid.add(i)
# Report invalid tokens before returning valid selection
# print "Invalid set: " + str(invalid)
return selection
def arg_parse():
"""
Command line usage of download_gridmet_opendap.py for downloading
    gridMET time series of several climatic variables using OPeNDAP.
"""
parser = argparse.ArgumentParser(
description=arg_parse.__doc__,
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
optional = parser._action_groups.pop() # optionals listed second
required = parser.add_argument_group('required arguments')
required.add_argument(
'-i', '--input', metavar='', required=True,
help='Input file containing station and gridMET IDs created by '+\
'prep_input.py')
required.add_argument(
'-o', '--out-dir', metavar='', required=True,
help='Output directory to save time series CSVs of gridMET data')
optional.add_argument(
'-y', '--years', metavar='', default=None, type=str,
help='Year(s) to download, single year (YYYY) or range (YYYY-YYYY)')
optional.add_argument(
'-u','--update-data', required=False, default=False,
action='store_true', help='Flag to re-download existing data')
optional.add_argument(
'-ov','--optional-vars', required=False, default=None,
type=str, help='Additional gridMET vars as comma separated list')
optional.add_argument(
'--debug', default=logging.INFO, const=logging.DEBUG,
help='Debug level logging', action="store_const", dest="loglevel")
    parser._action_groups.append(optional)  # to avoid optionals listed first
args = parser.parse_args()
return args
if __name__ == '__main__':
args = arg_parse()
logging.basicConfig(level=args.loglevel, format='%(message)s')
logging.info('\n{}'.format('#' * 80))
logging.info('{0:<20s} {1}'.format(
'Run Time Stamp:', dt.datetime.now().isoformat(' ')))
logging.info('{0:<20s} {1}'.format('Current Directory:', os.getcwd()))
logging.info('{0:<20s} {1}'.format(
'Script:', os.path.basename(sys.argv[0])))
    download_gridmet_opendap(input_csv=args.input, out_folder=args.out_dir,
                             year_filter=args.years, update_data=args.update_data,
                             optional_vars=args.optional_vars)