from bokeh.layouts import gridplot
from bokeh.plotting import output_file, reset_output, save
import datetime as dt
from math import ceil
import numpy as np
import os
import pandas as pd
from agweatherqaqc import utils, calc_functions, input_functions, plot, qaqc_functions
from refet.calcs import _wind_height_adjust
import warnings
[docs]class WeatherQC:
"""
The WeatherQC class is a holistic package for the QC of agricultural weather data.
It requires the filepath to a config file, and from that file does everything necessary for to enable
the end user to QC the data from a station. The class handles the reading in of data, sanitizing of inputs,
plotting and logging the correction of said data, and finally saving the final data for in other analyses.
Usually, weather stations from the same source or network have the same file structure, so WeatherQC features
an optional input of the filepath to a metadata file to process multiple stations using the same input file.
# Example:
>>> from agweatherqaqc.agweatherqaqc import WeatherQC
>>> config_path = 'test_files/test_config.ini'
>>> metadata_path = 'test_files/test_metadata.xlsx'
>>> station_qaqc = WeatherQC(config_path, metadata_path, gridplot_columns=1)
>>> station_qaqc.process_station()
Both the config and metadata files have requirements on file structure and contents. The files
contained within the `./test_data/` directory can serve as templates to copy from or modify.
Please see the documentation of `WeatherQC.process_station()`, as well as the documentation for the functions
within `agweatherqaqc.qaqc_functions` and `agweatherqaqc.calc_functions` for more information on
the overall process and individual steps.
"""
def __init__(self, config_file_path='config.ini', metadata_file_path=None, gridplot_columns=1):
self.config_path = config_file_path
self.metadata_path = metadata_file_path
self.gridplot_columns = gridplot_columns
self.script_mode = 0
self.mc_iterations_pre_corrections = 100 # initially do only a few to save time
self.mc_iterations_post_corrections = 1000 # do MC approach after all data has been corrected
self.generate_bokeh = True
def _obtain_data(self):
"""
Obtain initial data and put it into a dataframe
"""
(self.data_df, self.column_ser, self.metadata_df, self.metadata_series, self.config_dict) = \
input_functions._obtain_data(self.config_path, self.metadata_path)
self.station_name = self.config_dict['station_name']
self.log_file = self.config_dict['log_file_path']
self.station_lat = self.config_dict['station_latitude']
self.station_lon = self.config_dict['station_longitude']
self.station_elev = self.config_dict['station_elevation']
self.ws_anemometer_height = self.config_dict['anemometer_height']
self.missing_fill_value = self.config_dict['missing_output_value']
self.folder_path = self.config_dict['folder_path']
self.auto_mode = self.config_dict['auto_flag']
self.fill_mode = self.config_dict['fill_flag']
print("\nSystem: Raw data successfully extracted from station file.")
# Extract individual variables from data frame back into to numpy arrays.
self.data_year = np.array(self.data_df.year)
self.data_month = np.array(self.data_df.month)
self.data_day = np.array(self.data_df.day)
self.data_tavg = np.array(self.data_df.tavg)
self.data_tmax = np.array(self.data_df.tmax)
self.data_tmin = np.array(self.data_df.tmin)
self.data_tdew = np.array(self.data_df.tdew)
self.data_ea = np.array(self.data_df.ea)
self.data_rhavg = np.array(self.data_df.rhavg)
self.data_rhmax = np.array(self.data_df.rhmax)
self.data_rhmin = np.array(self.data_df.rhmin)
self.data_rs = np.array(self.data_df.rs)
self.data_ws = np.array(self.data_df.ws)
self.data_precip = np.array(self.data_df.precip)
self.output_file_path = (self.folder_path +
"/correction_files/output_data/" + self.station_name + "_output" + ".xlsx")
def _calculate_secondary_vars(self):
"""
Calculate secondary variables from initial ones
"""
print("\nSystem: Now calculating secondary variables based on data provided.")
self.data_length = self.data_year.shape[0]
self.station_pressure = 101.3 * (((293 - (0.0065 * self.station_elev)) / 293) ** 5.26) # units kPa, EQ 3 ASCE
# Calculate DOY from Y/M/D values
self.data_doy = []
for i in range(self.data_length):
# Create list of string DOY values
self.data_doy.append(dt.date(self.data_year[i], self.data_month[i], self.data_day[i]).strftime("%j"))
self.data_doy = np.array(list(map(int, self.data_doy))) # Converts list of string values into ints
# Calculate tavg if it is not provided by dataset
if self.column_ser.tavg == -1:
# Tavg not provided
self.data_tavg = np.array((self.data_tmax + self.data_tmin) / 2.0)
else:
# Tavg is provided, no action needs to be taken
pass
# Figure out which humidity variables are provided and calculate Ea and TDew if needed
(self.data_ea, self.data_tdew) = calc_functions.\
calc_humidity_variables(self.data_tmax, self.data_tmin, self.data_tavg, self.data_ea, self.column_ser.ea,
self.data_tdew, self.column_ser.tdew, self.data_rhmax, self.column_ser.rhmax,
self.data_rhmin, self.column_ser.rhmin, self.data_rhavg, self.column_ser.rhavg)
# Calculates secondary temperature values and mean monthly counterparts
(self.delta_t, self.mm_delta_t, self.k_not, self.mm_k_not, self.mm_tmin, self.mm_tdew) = calc_functions. \
calc_temperature_variables(self.data_month, self.data_tmax, self.data_tmin, self.data_tdew)
'''
Tdew_ko will have all missing values of tdew filled in with tmin - Ko curve method, but will keep missing
values if the underlying tmin is also missing. Currently it is just a copy of TDew, but filling will occur
after temp/humidity has been corrected. If the user does not correct data then this will stay unfilled, but
that is an edge case and not the intent of this code
Tdew_ko is distinct from complete_tdew in that it only fills in an day's observation if there is a
tmin observation presentfor that day. Complete_tdew has a filled in observation for every day because it is
based on a tmin that has been filled with random samples from a normal distribution. Unless the user
specifically asks for this simulated data to be saved then it is only used to create a complete record of
Rso for Rs correction and then is discarded.
'''
self.data_tdew_ko = np.array(self.data_tdew)
'''
This script uses multiple vapor pressure (ea) variables. As a reference for readability:
data_ea = either ea data provided by the station or ea that has been calculated by whatever the 'best'
humidity variable was in the input file. If it is the latter, this ea will have all the gaps and
issues (like drift) that the underlying variable had.
compiled_ea = the script creates as complete a record as possible of ea values by calculating ea from
'worse' variables should there be gaps in the more preferred ones. It only samples from data
that has been provided by the dataset. By the end of this process, the only gaps that will exist
will match the gaps that exist within Tmin. The preference is as follows:
(provided) Ea > (provided) TDew > RH Max and Min > RH Avg > TDew generated from TMin - Ko curve
complete_ea = this is compiled ea but with all of the gaps filled in with TDew generated from
TMin - Ko Curve, and all gaps in TMin filled in by sampling from a monthly normal distribution
of values. Unless the user specifically wants to export this day (option is in config file)
then this data is only used to create a complete record of Rso values for Rs correction,
and then is discarded at the end.
'''
self.compiled_ea = calc_functions.calc_compiled_ea(self.data_tmax, self.data_tmin, self.data_tavg,
self.data_ea, self.data_tdew, self.column_ser.tdew,
self.data_rhmax, self.column_ser.rhmax, self.data_rhmin,
self.column_ser.rhmin, self.data_rhavg,
self.column_ser.rhavg, self.data_tdew_ko)
# Calculates rso and grass/alfalfa reference evapotranspiration from refet package
warnings.filterwarnings('ignore', 'invalid value encountered') # invalid value warning for nans
(self.rso, self.mm_rs, self.eto, self.etr, self.mm_eto, self.mm_etr) = calc_functions.\
calc_rso_and_refet(self.station_lat, self.station_elev, self.ws_anemometer_height, self.data_doy,
self.data_month, self.data_tmax, self.data_tmin, self.compiled_ea, self.data_ws,
self.data_rs)
# Calculate original and optimized Thornton Running solar radiation using a monte carlo approach.
# Only do a few number of iterations here as this will be recomputed once the data has been corrected
(self.orig_rs_tr, self.mm_orig_rs_tr, self.opt_rs_tr, self.mm_opt_rs_tr) = calc_functions. \
calc_org_and_opt_rs_tr(self.mc_iterations_pre_corrections, self.log_file, self.data_month,
self.delta_t, self.mm_delta_t, self.data_rs, self.rso)
warnings.resetwarnings() # reset warning filter to default
#########################
# Back up original data to later save it to output file
# Values are also used to generate delta values of corrected data - original data
self.original_df = self.data_df.copy(deep=True) # Create an unlinked copy of read-in values dataframe
self.original_df['rso'] = self.rso
self.original_df['etr'] = self.etr
self.original_df['eto'] = self.eto
self.original_df['compiled_ea'] = self.compiled_ea
# Create datetime variables that will be used by bokeh plot and correction functions
self.dt_array = []
for i in range(self.data_length):
self.dt_array.append(dt.datetime(self.data_year[i], self.data_month[i], self.data_day[i]))
self.dt_array = np.array(self.dt_array, dtype=np.datetime64)
self.mm_dt_array = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])
self.data_null = np.empty(self.data_length) * np.nan
self.mm_data_null = np.zeros(12) * np.nan
def _correct_data(self):
"""
Correct data
Loop where user selects an option, corrects it,
then the script recalculates all downstream variables,
and finally prompts the user again.
"""
print("\nSystem: Now beginning correction on data.")
# create a flag to check if composite ea has been adjusted or not before correcting solar radiation
self.humidity_adjusted = False
# Complete_vars are going to be filled for the whole record, which may be put into output file if user requests
self.complete_tmax = np.array(self.data_tmax)
self.complete_tmin = np.array(self.data_tmin)
self.complete_ea = np.array(self.compiled_ea)
self.complete_tdew = np.array(self.data_tdew)
# Create arrays that will track which values have been filled (replace missing data) by the script
self.fill_tmax = np.zeros(self.data_length)
self.fill_tmin = np.zeros(self.data_length)
self.fill_ea = np.zeros(self.data_length)
self.fill_tdew = np.zeros(self.data_length)
self.fill_rs = np.zeros(self.data_length)
self.fill_ws = np.zeros(self.data_length)
self.fill_rso = np.zeros(self.data_length)
# Begin loop for correcting variables
while True:
reset_output() # clears bokeh output, prevents ballooning file sizes
print('\nPlease select which of the following variables you want to correct'
'\n Enter 1 for TMax and TMin.'
'\n Enter 2 for TMin and TDew, if TDew was provided.'
'\n Enter 3 for Windspeed.'
'\n Enter 4 for Precipitation.'
'\n Enter 5 for Solar Radiation (Rs).'
'\n Enter 6 for Vapor Pressure (Ea), if it was provided.'
'\n Enter 7 for RH Maximum and Minimum, if they were provided.'
'\n Enter 8 for RH Average, if it was provided.'
'\n Enter 9 to adjust how compiled humidity is sourced.'
'\n Enter 0 to stop applying corrections.'
)
choice_loop = True
while choice_loop:
user = utils.get_int_input(0, 9, "\nEnter your selection: ")
# The following if statements check whether user tries to correct a variable that was not provided
# or make sure correction is being done in the ideal order
if user == 2 and self.column_ser.tdew == -1:
print('\nDewpoint temperature was not provided by the file, please choose a different option.')
elif user == 6 and self.column_ser.ea == -1:
print('\nVapor Pressure was not provided by the file, please choose a different option.')
elif user == 7 and (self.column_ser.rhmax == -1 or self.column_ser.rhmin == -1):
print('\nRHMax and RHMin were not provided by the file, please choose a different option.')
elif user == 8 and self.column_ser.rhavg == -1:
print('\nRHAvg was not provided by the file, please choose a different option.')
elif user == 5 and not self.humidity_adjusted:
print('\n\nBefore correcting solar radiation, did you want to adjust compiled humidity?.')
print('Doing so may allow you to get the best possible humidity record for Rs correction.')
print('\nEnter 1 to adjust compiled humidity or 0 to skip.')
humid_choice = utils.get_int_input(0, 1, 'Enter your selection: ')
if humid_choice == 1: # change original choice to the adjust humidity option
user = 9
choice_loop = False
else:
choice_loop = False
##########
# Correcting individual variables based on user choice
# Correcting Max/Min Temperature data
if user == 1:
(self.data_tmax, self.data_tmin) = qaqc_functions.\
correction(self.station_name, self.log_file, self.folder_path,
self.data_tmax, self.data_tmin, self.dt_array,
self.data_month, self.data_year, 1, self.auto_mode)
# Correcting Min/Dew Temperature data
elif user == 2:
(self.data_tmin, self.data_tdew) = qaqc_functions.\
correction(self.station_name, self.log_file, self.folder_path,
self.data_tmin, self.data_tdew, self.dt_array,
self.data_month, self.data_year, 2, self.auto_mode)
# Correcting Windspeed
elif user == 3:
(self.data_ws, self.data_null) = qaqc_functions.\
correction(self.station_name, self.log_file, self.folder_path,
self.data_ws, self.data_null, self.dt_array,
self.data_month, self.data_year, 3, self.auto_mode)
# Correcting Precipitation
elif user == 4:
(self.data_precip, self.data_null) = qaqc_functions.\
correction(self.station_name, self.log_file, self.folder_path,
self.data_precip, self.data_null, self.dt_array,
self.data_month, self.data_year, 4, self.auto_mode)
# Correcting Solar radiation
elif user == 5:
(self.data_rs, self.data_null) = qaqc_functions.\
correction(self.station_name, self.log_file, self.folder_path,
self.data_rs, self.rso, self.dt_array,
self.data_month, self.data_year, 5, self.auto_mode)
# Correcting Vapor Pressure
elif user == 6:
(self.data_ea, self.data_null) = qaqc_functions.\
correction(self.station_name, self.log_file, self.folder_path,
self.data_ea, self.data_null, self.dt_array,
self.data_month, self.data_year, 7, self.auto_mode)
# Correcting Relative Humidity Max and Min
elif user == 7:
(self.data_rhmax, self.data_rhmin) = qaqc_functions.\
correction(self.station_name, self.log_file, self.folder_path,
self.data_rhmax, self.data_rhmin, self.dt_array,
self.data_month, self.data_year, 8, self.auto_mode)
# Correcting Relative Humidity Average
elif user == 8:
(self.data_rhavg, self.data_null) = qaqc_functions.\
correction(self.station_name, self.log_file, self.folder_path,
self.data_rhavg, self.data_null, self.dt_array,
self.data_month, self.data_year, 9, self.auto_mode)
# Adjusting compiled_ea
elif user == 9:
self.compiled_ea = qaqc_functions.\
compiled_humidity_adjustment(self.station_name, self.log_file, self.folder_path, self.dt_array,
self.data_tmax, self.data_tmin, self.data_tavg, self.compiled_ea,
self.data_ea, self.column_ser.ea, self.data_tdew, self.column_ser.tdew,
self.data_tdew_ko, self.data_rhmax, self.column_ser.rhmax,
self.data_rhmin, self.column_ser.rhmin,
self.data_rhavg, self.column_ser.rhavg)
self.humidity_adjusted = True
else:
# user quits, exit out of loop
print('\nSystem: Now finishing up corrections.')
# Break here because all recalculations were done at the end of the last loop iteration
break
if 1 <= user <= 2 or 6 <= user <= 8:
if user == 1: # User has corrected temperature, so fill all missing values with a normal distribution
# Reset 'complete' vars as the underlying var has been changed.
self.complete_tmax = np.array(self.data_tmax)
self.complete_tmin = np.array(self.data_tmin)
# Remove corresponding TAvg observations after outliers have been removed from TMax and TMin
tmax_removed_indices = np.array(np.where(np.isnan(self.data_tmax))) # array of indices of nans
tmin_removed_indices = np.array(np.where(np.isnan(self.data_tmin))) # array of indices of nans
self.data_tavg[tmax_removed_indices] = np.nan
self.data_tavg[tmin_removed_indices] = np.nan
# Create mean monthly and standard deviation
self.mm_tmax = np.zeros(12)
self.mm_tmin = np.zeros(12)
self.std_tmax = np.zeros(12)
self.std_tmin = np.zeros(12)
for k in range(12):
temp_indexes = np.where(self.data_month == k+1)[0]
temp_indexes = np.array(temp_indexes, dtype=int)
self.mm_tmax[k] = np.nanmean(self.data_tmax[temp_indexes])
self.std_tmax[k] = np.nanstd(self.data_tmax[temp_indexes])
self.mm_tmin[k] = np.nanmean(self.data_tmin[temp_indexes])
self.std_tmin[k] = np.nanstd(self.data_tmin[temp_indexes])
# Fill missing observations with samples from a normal distribution with monthly mean and variance
for i in range(self.data_length):
if np.isnan(self.data_tmax[i]):
self.complete_tmax[i] = np.random.normal(self.mm_tmax[self.data_month[i] - 1],
self.std_tmax[self.data_month[i] - 1], 1)[0]
self.fill_tmax[i] = self.complete_tmax[i]
else:
pass
if np.isnan(self.data_tmin[i]):
self.complete_tmin[i] = np.random.normal(self.mm_tmin[self.data_month[i] - 1],
self.std_tmin[self.data_month[i] - 1], 1)[0]
self.fill_tmin[i] = self.complete_tmin[i]
else:
pass
if (self.complete_tmax[i] <= self.complete_tmin[i]) or \
(self.complete_tmax[i] - self.complete_tmin[i] <= 3):
# This is a logical check to make sure that tmax is sufficiently distant from tmin once
# they have been filled in, tmax needs to be warmer than tmin and daily temp isn't constant
# so there should be at least a small difference in tmax-tmin
# todo the below lines always provide a higher than average tmax
# and a lower than average tmin, this can be eventually improved
# Fill this observation in with mm observation with the difference of 1/2 of mm delta t
self.complete_tmax[i] = self.mm_tmax[self.data_month[i] - 1] + \
(0.5 * self.mm_delta_t[self.data_month[i] - 1])
self.fill_tmax[i] = self.complete_tmax[i]
self.complete_tmin[i] = self.mm_tmin[self.data_month[i] - 1] - \
(0.5 * self.mm_delta_t[self.data_month[i] - 1])
self.fill_tmin[i] = self.complete_tmin[i]
else:
# data is different enough to appear valid
pass
if self.fill_mode:
# we are filling in data, so copy all the filled versions onto the original temperature
self.data_tmax = np.array(self.complete_tmax)
self.data_tmin = np.array(self.complete_tmin)
else:
# if we are not filling, we will hold the copies to later fill in rso, but will reset fill
# tracking variables
self.fill_tmax = np.zeros(self.data_length)
self.fill_tmin = np.zeros(self.data_length)
else:
# user did not correct option 1
pass
# Figure out which humidity variables are provided and recalculate Ea and TDew if needed
# This function is safe to use after correcting because it tracks what variable was provided by the data
# and recalculates appropriately. It doesn't overwrite provided variables with calculated versions.
# Ex. if only TDew is provided, it recalculates ea while returning original provided tdew
(self.data_ea, self.data_tdew) = calc_functions.\
calc_humidity_variables(self.data_tmax, self.data_tmin, self.data_tavg, self.data_ea,
self.column_ser.ea, self.data_tdew, self.column_ser.tdew,
self.data_rhmax, self.column_ser.rhmax, self.data_rhmin,
self.column_ser.rhmin, self.data_rhavg, self.column_ser.rhavg)
# Recalculates secondary temperature values and mean monthly counterparts
(self.delta_t, self.mm_delta_t, self.k_not, self.mm_k_not, self.mm_tmin, self.mm_tdew) = \
calc_functions.calc_temperature_variables(self.data_month, self.data_tmax,
self.data_tmin, self.data_tdew)
# Since we are recalculating humidity variables, we also need to reset tdew_ko to ensure it matches the
# underlying unfilled tdew. It is filled later after this once the user corrects a humidity var
# so this reset is acceptable
self.data_tdew_ko = np.array(self.data_tdew)
if user == 2 or 6 <= user <= 8:
# Reset 'complete' version as underlying variable may have changed
self.complete_tdew = np.array(self.data_tdew)
'''
Fill in any missing tdew data with tmin - k0 curve.
As detailed above, data_tdew_ko only fills in missing tdew observations with real tmin obs,
while complete_tdew is a full record filled in using a filled in tmin.
Nothing occurs if this fill code is run a second time because vars are already filled unless
correction methods throw out data, in which case we need to refill for the complete record
that Rs correction requires.
'''
for i in range(self.data_length):
if np.isnan(self.data_tdew[i]):
# Tdew_ko will have gaps that match gaps in Tmin
# Complete_tdew will match complete_tmin in having no gaps
self.data_tdew_ko[i] = self.data_tmin[i] - self.mm_k_not[self.data_month[i] - 1]
self.complete_tdew[i] = self.complete_tmin[i] - self.mm_k_not[self.data_month[i] - 1]
self.fill_tdew[i] = self.complete_tdew[i]
else:
# If TDew isn't empty then nothing is required to be done.
pass
if self.fill_mode:
# we are filling in data, so copy all the filled versions onto the original arrays
self.data_tdew = np.array(self.complete_tdew)
else:
# if we are not filling, we will hold the copies to later fill in rso, but will reset fill
# tracking variables
self.fill_tdew = np.zeros(self.data_length)
else:
# user did not select option 2 or 6-8
pass
'''
Recreate the 'compiled' ea as temperature or humidity vars were corrected and may have changed the
data underlying the compiled ea. Once that is done we will fill in all the gaps with the
variable 'complete_tdew' so that a complete record of ea exists for rs correction
The gaps in compiled_ea are reset every time temperature or humidity is corrected so this code is
okay to run multiple times
'''
self.compiled_ea = calc_functions.calc_compiled_ea(self.data_tmax, self.data_tmin, self.data_tavg,
self.data_ea, self.data_tdew, self.column_ser.tdew,
self.data_rhmax, self.column_ser.rhmax,
self.data_rhmin, self.column_ser.rhmin,
self.data_rhavg, self.column_ser.rhavg,
self.data_tdew_ko)
# Reset 'complete' version as underlying variable may have changed.
self.complete_ea = np.array(self.compiled_ea)
for i in range(self.data_length):
if np.isnan(self.compiled_ea[i]):
self.complete_ea[i] = (0.6108 * np.exp((17.27 * self.complete_tdew[i]) / (self.complete_tdew[i]
+ 237.3)))
self.fill_ea[i] = self.complete_ea[i]
else:
# Ea is provided and the index is not empty, do nothing to avoid overwriting actual data
pass
if self.fill_mode:
# we are filling in data, so copy all the filled versions onto the original arrays
self.data_ea = np.array(self.complete_ea)
self.compiled_ea = np.array(self.complete_ea)
else:
# if we are not filling, we will hold the copies to later fill in rso, but will reset fill
# tracking variables
self.fill_ea = np.zeros(self.data_length)
elif user == 9: # User has adjusted how the compiled humidity is sourced, recreate complete_ea
self.complete_ea = np.array(self.compiled_ea)
for i in range(self.data_length):
if np.isnan(self.compiled_ea[i]):
self.complete_ea[i] = (0.6108 * np.exp((17.27 * self.complete_tdew[i]) /
(self.complete_tdew[i] + 237.3)))
self.fill_ea[i] = self.complete_ea[i]
else:
# the index is not empty, do nothing to avoid overwriting actual data
pass
if self.fill_mode:
# we are filling in data, so copy all the filled versions onto the original arrays
self.data_ea = np.array(self.complete_ea)
self.compiled_ea = np.array(self.complete_ea)
else:
# if we are not filling, we will hold the copies to later fill in rso, but will reset fill
# tracking variables
self.fill_ea = np.zeros(self.data_length)
else:
# user did not select options 1,2, 6, 7, 8, or 9.
pass
'''
Even if the user doesn't want to put filled data into their output file, we still need to use complete
records to get a complete record of Rso for use in Rs correction. This completed rso is only used for
this step and is not written as data to the output file
'''
if self.fill_mode:
'''
This recalculates Rso and ETr values using the filled 'completed_' versions to provide a complete
record of ETr values.
If this code is executing then 'data_' vars have already been replaced by their 'completed_'
versions so the code is accurate in calling them 'data_'
'''
warnings.filterwarnings('ignore', 'invalid value encountered') # catch invalid value warning, nans
(self.rso, self.mm_rs, self.eto, self.etr, self.mm_eto, self.mm_etr) = calc_functions. \
calc_rso_and_refet(self.station_lat, self.station_elev, self.ws_anemometer_height, self.data_doy,
self.data_month, self.data_tmax, self.data_tmin, self.data_ea, self.data_ws,
self.data_rs)
warnings.resetwarnings()
else:
'''
User doesn't want to keep filled in data, so use complete versions to create a filled version of
rso while saving the other outputs of calc_rso_and_refet as temporary names which are unused to
prevent them from impacting later calculations
'''
warnings.filterwarnings('ignore', 'invalid value encountered') # catch invalid value warning, nans
(self.rso, self._mm_rs, self._eto, self._etr, self._mm_eto, self._mm_etr) = \
calc_functions.calc_rso_and_refet(self.station_lat, self.station_elev, self.ws_anemometer_height,
self.data_doy, self.data_month, self.complete_tmax,
self.complete_tmin, self.complete_ea, self.data_ws, self.data_rs)
warnings.resetwarnings()
'''
At this point the user has finished correcting all variables they want to.
We now recalculate original and optimized Thornton-Running Solar Radiation.
Additionally, if the user is electing to fill data then we fill in all gaps of data_rs
with the optimized thornton running solar radiation.
Also, only if the user wants, we fill in all gaps of wind data using samples from a normal distribution.
Finally, we do one final calculation of Rso/ETr with all the final versions of variables. If the user does
NOT want to fill in data, this final calculation of Rso will replace the filled one used for Solar
Radiation correction with one using only real data.
'''
(self.orig_rs_tr, self.mm_orig_rs_tr, self.opt_rs_tr, self.mm_opt_rs_tr) = calc_functions. \
calc_org_and_opt_rs_tr(self.mc_iterations_post_corrections, self.log_file, self.data_month,
self.delta_t, self.mm_delta_t, self.data_rs, self.rso)
# This section provides for the filling of data should fill_mode be set to true
self.mm_ws = np.zeros(12)
self.std_ws = np.zeros(12)
for k in range(12):
temp_indexes = np.where(self.data_month == k + 1)[0]
temp_indexes = np.array(temp_indexes, dtype=int)
self.mm_ws[k] = np.nanmean(self.data_ws[temp_indexes])
self.std_ws[k] = np.nanmean(self.data_ws[temp_indexes])
if self.fill_mode:
for i in range(self.data_length):
# fill data_rs with rs_tr and data_ws with an exponential function centered on mm_ws for that month
if np.isnan(self.data_rs[i]):
self.data_rs[i] = self.opt_rs_tr[i]
self.fill_rs[i] = self.opt_rs_tr[i]
else:
# If rs isn't empty then nothing is required to be done.
pass
if np.isnan(self.data_ws[i]):
self.data_ws[i] = np.random.normal(self.mm_ws[self.data_month[i] - 1],
self.std_ws[self.data_month[i] - 1], 1)
if self.data_ws[i] < 0.2: # check to see if filled windspeed is lower than reasonable
self.data_ws[i] = 0.2
else:
pass
self.fill_ws[i] = self.data_ws[i]
else:
# If ws isn't empty then nothing is required to be done.
pass
else:
pass
# Recalculate eto and etr one final time
# This also overwrites the filled Rso, so we will create a copy for posterity
self.fill_rso = np.array(self.rso)
(self.rso, self.mm_rs, self.eto, self.etr, self.mm_eto, self.mm_etr) = calc_functions. \
calc_rso_and_refet(self.station_lat, self.station_elev, self.ws_anemometer_height, self.data_doy,
self.data_month, self.data_tmax, self.data_tmin, self.compiled_ea,
self.data_ws, self.data_rs)
def _create_plots(self):
"""
Makes and saves histogram and composite plots.
"""
#########################
# Histograms of original data
# Generates composite plot of specific variables before correction
# We fill these variables by sampling a normal distribution, so we use this plot mainly as evidence for that.
if self.generate_bokeh and self.script_mode == 0:
ws_hist = plot.histogram_plot(self.data_ws[~np.isnan(self.data_ws)],
'Windspeed', 'black', 'm/s')
tmax_hist = plot.histogram_plot(self.data_tmax[~np.isnan(self.data_tmax)],
'TMax', 'red', 'degrees C')
tmin_hist = plot.histogram_plot(self.data_tmin[~np.isnan(self.data_tmin)],
'TMin', 'blue', 'degrees C')
tavg_hist = plot.histogram_plot(self.data_tmin[~np.isnan(self.data_tmin)],
'TAvg', 'black', 'degrees C')
tdew_hist = plot.histogram_plot(self.data_tdew[~np.isnan(self.data_tdew)],
'TDew', 'black', 'degrees C')
k_not_hist = plot.histogram_plot(self.k_not[~np.isnan(self.k_not)],
'Ko', 'black', 'degrees C')
output_file(self.folder_path + "/correction_files/histograms/" + self.station_name + '_histograms.html',
title=self.station_name + ' histograms')
save(gridplot([ws_hist, tmax_hist, tmin_hist, tavg_hist, tdew_hist, k_not_hist], ncols=2,
width=400, height=400, toolbar_location=None))
#########################
# Generate bokeh composite plot
# Creates one large plot featuring all variables as subplots, used to get a concise overview of the full dataset
# This output will be generated twice, first to plot data before correction, and then again
# to plot data after correction
if self.generate_bokeh: # Flag to create graphs or not
reset_output()
plot_list = []
x_size = 1200
y_size = 400
if self.script_mode == 0:
output_file(self.folder_path + "/correction_files/before_graphs/" + self.station_name +
"_before_corrections_composite_graph.html")
print("\nSystem: Now creating pre-correction composite bokeh graph.")
elif self.script_mode == 1:
output_file(self.folder_path + "/correction_files/after_graphs/" + self.station_name +
"_after_corrections_composite_graph.html")
print("\nSystem: Now creating post-correction composite bokeh graph.")
else:
# Incorrect setup of script mode variable, raise an error
raise ValueError('Incorrect parameters: script mode is not set to a valid option.')
# Temperature Maximum and Minimum Plot
plot_tmax_tmin = plot.line_plot(x_size, y_size, self.dt_array, self.data_tmax,
self.data_tmin, 1, '')
plot_list.append(plot_tmax_tmin)
# Temperature Minimum and Dewpoint Plot
plot_tmin_tdew = plot.line_plot(x_size, y_size, self.dt_array, self.data_tmin,
self.data_tdew, 2, '', plot_tmax_tmin)
plot_list.append(plot_tmin_tdew)
# 'Completed' vapor pressure plot
plot_comp_ea = plot.line_plot(x_size, y_size, self.dt_array, self.compiled_ea, self.data_null,
7, 'Composite ', plot_tmax_tmin)
plot_list.append(plot_comp_ea)
# vapor pressure plot that was just the provided dataset
if self.column_ser.ea != -1:
plot_data_ea = plot.line_plot(x_size, y_size, self.dt_array, self.data_ea, self.data_null,
7, 'Provided ', plot_tmax_tmin)
plot_list.append(plot_data_ea)
# rh max and rh min plot if it was provided in dataset
if self.column_ser.rhmax != -1 and self.column_ser.rhmin != -1: # RH max and RH min
plot_rhmax_rhmin = plot.line_plot(x_size, y_size, self.dt_array, self.data_rhmax,
self.data_rhmin, 8, '', plot_tmax_tmin)
plot_list.append(plot_rhmax_rhmin)
# rh avg if it was provided in the dataset
if self.column_ser.rhavg != -1: # RH Avg
plot_rhavg = plot.line_plot(x_size, y_size, self.dt_array, self.data_rhavg,
self.data_null, 9, '', plot_tmax_tmin)
plot_list.append(plot_rhavg)
# Mean Monthly Temperature Minimum and Dewpoint
plot_mm_tmin_tdew = plot.line_plot(x_size, y_size, self.mm_dt_array, self.mm_tmin,
self.mm_tdew, 2, 'MM ')
plot_list.append(plot_mm_tmin_tdew)
# Mean Monthly k0 curve (Tmin-Tdew)
plot_mm_k_not = plot.line_plot(x_size, y_size, self.mm_dt_array, self.mm_k_not,
self.mm_data_null, 10, '', plot_mm_tmin_tdew)
plot_list.append(plot_mm_k_not)
# Solar radiation and clear sky solar radiation
plot_rs_rso = plot.line_plot(x_size, y_size, self.dt_array, self.data_rs, self.rso,
5, '', plot_tmax_tmin)
plot_list.append(plot_rs_rso)
# Windspeed
plot_ws = plot.line_plot(x_size, y_size, self.dt_array, self.data_ws, self.data_null,
3, '', plot_tmax_tmin)
plot_list.append(plot_ws)
# Precipitation
plot_precip = plot.line_plot(x_size, y_size, self.dt_array, self.data_precip, self.data_null,
4, '', plot_tmax_tmin)
plot_list.append(plot_precip)
# Optimized mean monthly Thornton-Running solar radiation and Mean Monthly solar radiation
plot_mm_opt_rs_tr = plot.line_plot(x_size, y_size, self.mm_dt_array, self.mm_rs,
self.mm_opt_rs_tr, 6, 'MM Optimized ', plot_mm_tmin_tdew)
plot_list.append(plot_mm_opt_rs_tr)
# Optimized mean monthly Thornton-Running solar radiation and Mean Monthly solar radiation
plot_mm_orig_rs_tr = plot.line_plot(x_size, y_size, self.mm_dt_array, self.mm_rs,
self.mm_orig_rs_tr, 6, 'MM Original ', plot_mm_tmin_tdew)
plot_list.append(plot_mm_orig_rs_tr)
# Now construct grid plot out of all the subplots
number_of_plots = len(plot_list)
number_of_rows = ceil(number_of_plots / self.gridplot_columns)
grid_of_plots = [([None] * 1) for i in range(number_of_rows)]
for i in range(number_of_rows):
for j in range(self.gridplot_columns):
if len(plot_list) > 0:
grid_of_plots[i][j] = plot_list.pop(0)
else:
pass
fig = gridplot(grid_of_plots, toolbar_location='left', sizing_mode='stretch_width')
save(fig)
def _write_outputs(self):
"""
Creates all the output files
"""
#########################
# Create necessary variables for generic metadata file, as well as
# generate and fill metadata file
record_start = pd.to_datetime(self.dt_array[0]).date()
record_end = pd.to_datetime(self.dt_array[-1]).date()
# Generate metadata output file that saves the site-specific data for each run (appended to end)
if not os.path.isfile('correction_metadata.xlsx'):
# file does not exist, create new one
metadata_info = pd.DataFrame({'Station': self.station_name, 'Latitude': self.station_lat,
'Longitude': self.station_lon, 'station_elev_m': self.station_elev,
'record_start': record_start, 'record_end': record_end,
'anemom_height_m': self.ws_anemometer_height,
'Filename': self.output_file_path}, index=np.array([1]))
with pd.ExcelWriter('correction_metadata.xlsx', date_format='YYYY-MM-DD',
datetime_format='YYYY-MM-DD HH:MM:SS', engine='openpyxl', mode='w') as writer:
metadata_info.to_excel(writer, header=True, index=False, sheet_name='Sheet1')
else:
# file is already created, so we need to read it in, append our new information to the bottom of it
# and then save the info
metadata_info = pd.read_excel('correction_metadata.xlsx', sheet_name=0, index_col=None,
engine='openpyxl', keep_default_na=False, verbose=True)
new_meta_info = pd.DataFrame({'Station': self.station_name, 'Latitude': self.station_lat,
'Longitude': self.station_lon, 'station_elev_m': self.station_elev,
'record_start': record_start, 'record_end': record_end,
'anemom_height_m': self.ws_anemometer_height,
'Filename': self.output_file_path}, index=np.array([1]))
output_metadata = pd.concat([metadata_info, new_meta_info], ignore_index=True)
with pd.ExcelWriter('correction_metadata.xlsx', date_format='YYYY-MM-DD',
datetime_format='YYYY-MM-DD HH:MM:SS', engine='openpyxl', mode='w') as writer:
output_metadata.to_excel(writer, header=True, index=False, sheet_name='Sheet1')
# if we are using a network-specific metadata file, update that another file has been processed
if self.metadata_path is not None:
current_row = self.metadata_df.processed.ne(1).idxmax() - 1
self.metadata_df.processed.iloc[current_row] = 1
self.metadata_df.record_start.iloc[current_row] = record_start
self.metadata_df.record_end.iloc[current_row] = record_end
self.metadata_df.output_path.iloc[current_row] = self.output_file_path
with pd.ExcelWriter(self.metadata_path, date_format='YYYY-MM-DD',
datetime_format='YYYY-MM-DD', engine='openpyxl', mode='w') as writer:
self.metadata_df.to_excel(writer, header=True, index=True, sheet_name='Sheet1')
#########################
# Generate output file
# Create any final variables, then create panda dataframes to save all the data
# Includes the following sheets:
# Corrected Data : Actual corrected values
# Delta : Magnitude of difference between original data and corrected data
# Filled Data : Tracks which data points have been filled by script generated values instead of provided
# Data that is provided and subsequently corrected by the script do not count as filled values.
print("\nSystem: Saving corrected data to .xslx file.")
# Create any individually-requested output data
ws_2m = _wind_height_adjust(uz=self.data_ws, zw=self.ws_anemometer_height)
# Create corrected-original delta numpy arrays
diff_tavg = np.array(self.data_tavg - self.original_df.tavg)
diff_tmax = np.array(self.data_tmax - self.original_df.tmax)
diff_tmin = np.array(self.data_tmin - self.original_df.tmin)
diff_tdew = np.array(self.data_tdew - self.original_df.tdew)
diff_ea = np.array(self.data_ea - self.original_df.ea)
diff_rhavg = np.array(self.data_rhavg - self.original_df.rhavg)
diff_rhmax = np.array(self.data_rhmax - self.original_df.rhmax)
diff_rhmin = np.array(self.data_rhmin - self.original_df.rhmin)
diff_rs = np.array(self.data_rs - self.original_df.rs)
diff_rs_tr = np.array(self.opt_rs_tr - self.orig_rs_tr)
diff_rso = np.array(self.rso - self.original_df.rso)
diff_ws = np.array(self.data_ws - self.original_df.ws)
diff_precip = np.array(self.data_precip - self.original_df.precip)
diff_etr = np.array(self.etr - self.original_df.etr)
diff_eto = np.array(self.eto - self.original_df.eto)
# Create k0 array to output values
k_not_vals = np.zeros(self.data_length)
k_not_vals[0:12] = self.mm_k_not[0:12]
# Create datetime for output dataframe
datetime_df = pd.DataFrame({'year': self.data_year, 'month': self.data_month, 'day': self.data_day})
datetime_df = pd.to_datetime(datetime_df[['month', 'day', 'year']])
# Create output dataframe
output_df = pd.DataFrame({'year': self.data_year, 'month': self.data_month,
'day': self.data_day, 'TAvg (C)': self.data_tavg, 'TMax (C)': self.data_tmax,
'TMin (C)': self.data_tmin, 'TDew (C)': self.data_tdew,
'Compiled Ea (kPa)': self.compiled_ea,
'Vapor Pres (kPa)': self.data_ea, 'RHAvg (%)': self.data_rhavg,
'RHMax (%)': self.data_rhmax, 'RHMin (%)': self.data_rhmin, 'Rs (w/m2)': self.data_rs,
'Opt_Rs_TR (w/m2)': self.opt_rs_tr, 'Rso (w/m2)': self.rso,
'Windspeed (m/s)': self.data_ws, 'Precip (mm)': self.data_precip,
'ETr (mm)': self.etr, 'ETo (mm)': self.eto, 'ws_2m (m/s)': ws_2m},
index=datetime_df)
# Creating difference dataframe to track amount of correction
delta_df = pd.DataFrame({'year': self.data_year, 'month': self.data_month,
'day': self.data_day, 'TAvg (C)': diff_tavg, 'TMax (C)': diff_tmax,
'TMin (C)': diff_tmin, 'TDew (C)': diff_tdew,
'Vapor Pres (kPa)': diff_ea, 'RHAvg (%)': diff_rhavg, 'RHMax (%)': diff_rhmax,
'RHMin (%)': diff_rhmin, 'Rs (w/m2)': diff_rs, 'Opt - Orig Rs_TR (w/m2)': diff_rs_tr,
'Rso (w/m2)': diff_rso, 'Windspeed (m/s)': diff_ws, 'Precip (mm)': diff_precip,
'ETr (mm)': diff_etr, 'ETo (mm)': diff_eto}, index=datetime_df)
# Creating a fill dataframe that tracks where missing data was filled in
fill_df = pd.DataFrame({'year': self.data_year, 'month': self.data_month,
'day': self.data_day, 'TMax (C)': self.fill_tmax, 'TMin (C)': self.fill_tmin,
'TDew (C)': self.fill_tdew, 'Vapor Pres (kPa)': self.fill_ea, 'Rs (w/m2)': self.fill_rs,
'Complete Record Rso (w/m2)': self.fill_rso, 'mm k0 values': k_not_vals},
index=datetime_df)
output_df.index.name = 'date'
delta_df.index.name = 'date'
fill_df.index.name = 'date'
# Open up pandas excel writer
output_writer = pd.ExcelWriter(self.output_file_path, engine='xlsxwriter')
# Convert data frames to xlsxwriter excel objects
output_df.to_excel(output_writer, sheet_name='Corrected Data', na_rep=self.missing_fill_value)
delta_df.to_excel(output_writer, sheet_name='Delta (Corr - Orig)', na_rep=self.missing_fill_value)
fill_df.to_excel(output_writer, sheet_name='Filled Data', na_rep=self.missing_fill_value)
# Save output file
output_writer.close()
logger = open(self.log_file, 'a')
if self.fill_mode == 1:
if np.isnan(self.eto).any() or np.isnan(self.etr).any():
print("\nSystem: After finishing corrections and filling data, "
"ETr and ETo still had missing observations.")
logger.write('After finishing corrections and filling data, '
'ETr and ETo still had missing observations. \n')
else:
logger.write('The output file for this station has a complete record of ETo and ETr observations. \n')
else:
pass
logger.write('\nThe file has been successfully processed and output files saved at %s.' %
dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
logger.close()
[docs] def process_station(self):
"""
This function serves as the structure for the overall workflow in
applying the QC process to an input data source. The standard process is as follows:
1. Read in the data.
2. Calculate any secondary variables (mean monthly values, clear-sky solar radiation, etc.).
3. Plot the data before any corrections are performed.
4. Allow the user to adjust/remove/QC data. Recompute any dependent secondary variables.
5. Plot the data after corrections are performed and save the output data.
Returns:
None
"""
self._obtain_data()
self._calculate_secondary_vars()
# first plot the data before correcting it
print("\nSystem: Plotting raw data.")
self._create_plots()
self.script_mode = 1
self._correct_data()
self._create_plots()
self._write_outputs()
# This is never run by itself
if __name__ == "__main__":
print("\nThis module does nothing if run by itself. "
"Please see the script \'qaqc_single_station.py\' for more info.")