Source code for agweatherqaqc.agweatherqaqc

from bokeh.layouts import gridplot
from bokeh.plotting import output_file, reset_output, save
import datetime as dt
from math import ceil
import numpy as np
import os
import pandas as pd
from agweatherqaqc import utils, calc_functions, input_functions, plot, qaqc_functions
from refet.calcs import _wind_height_adjust
import warnings


[docs]class WeatherQC:
    """
    The WeatherQC class is a holistic package for the QC of agricultural weather data.

    It requires the filepath to a config file, and from that file does everything necessary for to enable
    the end user to QC the data from a station. The class handles the reading in of data, sanitizing of inputs,
    plotting and logging the correction of said data, and finally saving the final data for in other analyses.

    Usually, weather stations from the same source or network have the same file structure, so WeatherQC features
    an optional input of the filepath to a metadata file to process multiple stations using the same input file.

    # Example:
        >>> from agweatherqaqc.agweatherqaqc import WeatherQC
        >>> config_path = 'test_files/test_config.ini'
        >>> metadata_path = 'test_files/test_metadata.xlsx'
        >>> station_qaqc = WeatherQC(config_path, metadata_path, gridplot_columns=1)
        >>> station_qaqc.process_station()

    Both the config and metadata files have requirements on file structure and contents. The files
    contained within the `./test_data/` directory can serve as templates to copy from or modify.

    Please see the documentation of `WeatherQC.process_station()`, as well as the documentation for the functions
    within `agweatherqaqc.qaqc_functions` and `agweatherqaqc.calc_functions` for more information on
    the overall process and individual steps.
    """
    def __init__(self, config_file_path='config.ini', metadata_file_path=None, gridplot_columns=1):
        self.config_path = config_file_path
        self.metadata_path = metadata_file_path
        self.gridplot_columns = gridplot_columns
        self.script_mode = 0
        self.mc_iterations_pre_corrections = 100  # initially do only a few to save time
        self.mc_iterations_post_corrections = 1000  # do MC approach after all data has been corrected
        self.generate_bokeh = True

    def _obtain_data(self):
        """
            Obtain initial data and put it into a dataframe
        """
        (self.data_df, self.column_ser, self.metadata_df, self.metadata_series, self.config_dict) = \
            input_functions._obtain_data(self.config_path, self.metadata_path)
        self.station_name = self.config_dict['station_name']
        self.log_file = self.config_dict['log_file_path']
        self.station_lat = self.config_dict['station_latitude']
        self.station_lon = self.config_dict['station_longitude']
        self.station_elev = self.config_dict['station_elevation']
        self.ws_anemometer_height = self.config_dict['anemometer_height']
        self.missing_fill_value = self.config_dict['missing_output_value']
        self.folder_path = self.config_dict['folder_path']
        self.auto_mode = self.config_dict['auto_flag']
        self.fill_mode = self.config_dict['fill_flag']

        print("\nSystem: Raw data successfully extracted from station file.")

        # Extract individual variables from data frame back into to numpy arrays.
        self.data_year = np.array(self.data_df.year)
        self.data_month = np.array(self.data_df.month)
        self.data_day = np.array(self.data_df.day)
        self.data_tavg = np.array(self.data_df.tavg)
        self.data_tmax = np.array(self.data_df.tmax)
        self.data_tmin = np.array(self.data_df.tmin)
        self.data_tdew = np.array(self.data_df.tdew)
        self.data_ea = np.array(self.data_df.ea)
        self.data_rhavg = np.array(self.data_df.rhavg)
        self.data_rhmax = np.array(self.data_df.rhmax)
        self.data_rhmin = np.array(self.data_df.rhmin)
        self.data_rs = np.array(self.data_df.rs)
        self.data_ws = np.array(self.data_df.ws)
        self.data_precip = np.array(self.data_df.precip)

        self.output_file_path = (self.folder_path +
                                 "/correction_files/output_data/" + self.station_name + "_output" + ".xlsx")

    def _calculate_secondary_vars(self):
        """
            Calculate secondary variables from initial ones
        """
        print("\nSystem: Now calculating secondary variables based on data provided.")
        self.data_length = self.data_year.shape[0]
        self.station_pressure = 101.3 * (((293 - (0.0065 * self.station_elev)) / 293) ** 5.26)  # units kPa, EQ 3 ASCE

        # Calculate DOY from Y/M/D values
        self.data_doy = []
        for i in range(self.data_length):
            # Create list of string DOY values
            self.data_doy.append(dt.date(self.data_year[i], self.data_month[i], self.data_day[i]).strftime("%j"))

        self.data_doy = np.array(list(map(int, self.data_doy)))  # Converts list of string values into ints

        # Calculate tavg if it is not provided by dataset
        if self.column_ser.tavg == -1:
            # Tavg not provided
            self.data_tavg = np.array((self.data_tmax + self.data_tmin) / 2.0)
        else:
            # Tavg is provided, no action needs to be taken
            pass

        # Figure out which humidity variables are provided and calculate Ea and TDew if needed
        (self.data_ea, self.data_tdew) = calc_functions.\
            calc_humidity_variables(self.data_tmax, self.data_tmin, self.data_tavg, self.data_ea, self.column_ser.ea,
                                    self.data_tdew, self.column_ser.tdew, self.data_rhmax, self.column_ser.rhmax,
                                    self.data_rhmin, self.column_ser.rhmin, self.data_rhavg, self.column_ser.rhavg)

        # Calculates secondary temperature values and mean monthly counterparts
        (self.delta_t, self.mm_delta_t, self.k_not, self.mm_k_not, self.mm_tmin, self.mm_tdew) = calc_functions. \
            calc_temperature_variables(self.data_month, self.data_tmax, self.data_tmin, self.data_tdew)

        '''
            Tdew_ko will have all missing values of tdew filled in with tmin - Ko curve method, but will keep missing
            values if the underlying tmin is also missing. Currently it is just a copy of TDew, but filling will occur
            after temp/humidity has been corrected. If the user does not correct data then this will stay unfilled, but
            that is an edge case and not the intent of this code
            
            Tdew_ko is distinct from complete_tdew in that it only fills in an day's observation if there is a 
            tmin observation presentfor that day. Complete_tdew has a filled in observation for every day because it is
            based on a tmin that has been filled with random samples from a normal distribution. Unless the user 
            specifically asks for this simulated data to be saved then it is only used to create a complete record of 
            Rso for Rs correction and then is discarded.
        '''
        self.data_tdew_ko = np.array(self.data_tdew)

        '''
            This script uses multiple vapor pressure (ea) variables. As a reference for readability:
    
            data_ea = either ea data provided by the station or ea that has been calculated by whatever the 'best'
                      humidity variable was in the input file. If it is the latter, this ea will have all the gaps and
                      issues (like drift) that the underlying variable had.
    
            compiled_ea = the script creates as complete a record as possible of ea values by calculating ea from 
                       'worse' variables should there be gaps in the more preferred ones. It only samples from data 
                       that has been provided by the dataset. By the end of this process, the only gaps that will exist 
                       will match the gaps that exist within Tmin. The preference is as follows:
    
                       (provided) Ea > (provided) TDew > RH Max and Min > RH Avg > TDew generated from TMin - Ko curve
    
            complete_ea = this is compiled ea but with all of the gaps filled in with TDew generated from 
                       TMin - Ko Curve, and all gaps in TMin filled in by sampling from a monthly normal distribution
                       of values. Unless the user specifically wants to export this day (option is in config file)
                       then this data is only used to create a complete record of Rso values for Rs correction,
                       and then is discarded at the end.
        '''
        self.compiled_ea = calc_functions.calc_compiled_ea(self.data_tmax, self.data_tmin, self.data_tavg,
                                                           self.data_ea, self.data_tdew, self.column_ser.tdew,
                                                           self.data_rhmax, self.column_ser.rhmax, self.data_rhmin,
                                                           self.column_ser.rhmin, self.data_rhavg,
                                                           self.column_ser.rhavg, self.data_tdew_ko)

        # Calculates rso and grass/alfalfa reference evapotranspiration from refet package
        warnings.filterwarnings('ignore', 'invalid value encountered')  # invalid value warning for nans
        (self.rso, self.mm_rs, self.eto, self.etr, self.mm_eto, self.mm_etr) = calc_functions.\
            calc_rso_and_refet(self.station_lat, self.station_elev, self.ws_anemometer_height, self.data_doy,
                               self.data_month, self.data_tmax, self.data_tmin, self.compiled_ea, self.data_ws,
                               self.data_rs)

        # Calculate original and optimized Thornton Running solar radiation using a monte carlo approach.
        # Only do a few number of iterations here as this will be recomputed once the data has been corrected
        (self.orig_rs_tr, self.mm_orig_rs_tr, self.opt_rs_tr, self.mm_opt_rs_tr) = calc_functions. \
            calc_org_and_opt_rs_tr(self.mc_iterations_pre_corrections, self.log_file, self.data_month,
                                   self.delta_t, self.mm_delta_t, self.data_rs, self.rso)

        warnings.resetwarnings()  # reset warning filter to default

        #########################
        # Back up original data to later save it to output file
        # Values are also used to generate delta values of corrected data - original data
        self.original_df = self.data_df.copy(deep=True)  # Create an unlinked copy of read-in values dataframe
        self.original_df['rso'] = self.rso
        self.original_df['etr'] = self.etr
        self.original_df['eto'] = self.eto
        self.original_df['compiled_ea'] = self.compiled_ea

        # Create datetime variables that will be used by bokeh plot and correction functions
        self.dt_array = []
        for i in range(self.data_length):
            self.dt_array.append(dt.datetime(self.data_year[i], self.data_month[i], self.data_day[i]))
        self.dt_array = np.array(self.dt_array, dtype=np.datetime64)
        self.mm_dt_array = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])
        self.data_null = np.empty(self.data_length) * np.nan
        self.mm_data_null = np.zeros(12) * np.nan

    def _correct_data(self):
        """
            Correct data
            Loop where user selects an option, corrects it,
            then the script recalculates all downstream variables,
            and finally prompts the user again.
        """

        print("\nSystem: Now beginning correction on data.")
        # create a flag to check if composite ea has been adjusted or not before correcting solar radiation
        self.humidity_adjusted = False

        # Complete_vars are going to be filled for the whole record, which may be put into output file if user requests
        self.complete_tmax = np.array(self.data_tmax)
        self.complete_tmin = np.array(self.data_tmin)
        self.complete_ea = np.array(self.compiled_ea)
        self.complete_tdew = np.array(self.data_tdew)

        # Create arrays that will track which values have been filled (replace missing data) by the script
        self.fill_tmax = np.zeros(self.data_length)
        self.fill_tmin = np.zeros(self.data_length)
        self.fill_ea = np.zeros(self.data_length)
        self.fill_tdew = np.zeros(self.data_length)
        self.fill_rs = np.zeros(self.data_length)
        self.fill_ws = np.zeros(self.data_length)
        self.fill_rso = np.zeros(self.data_length)

        # Begin loop for correcting variables
        while True:
            reset_output()  # clears bokeh output, prevents ballooning file sizes
            print('\nPlease select which of the following variables you want to correct'
                  '\n   Enter 1 for TMax and TMin.'
                  '\n   Enter 2 for TMin and TDew, if TDew was provided.'
                  '\n   Enter 3 for Windspeed.'
                  '\n   Enter 4 for Precipitation.'
                  '\n   Enter 5 for Solar Radiation (Rs).'
                  '\n   Enter 6 for Vapor Pressure (Ea), if it was provided.'
                  '\n   Enter 7 for RH Maximum and Minimum, if they were provided.'
                  '\n   Enter 8 for RH Average, if it was provided.'
                  '\n   Enter 9 to adjust how compiled humidity is sourced.'
                  '\n   Enter 0 to stop applying corrections.'
                  )

            choice_loop = True
            while choice_loop:
                user = utils.get_int_input(0, 9, "\nEnter your selection: ")
                # The following if statements check whether user tries to correct a variable that was not provided
                # or make sure correction is being done in the ideal order
                if user == 2 and self.column_ser.tdew == -1:
                    print('\nDewpoint temperature was not provided by the file, please choose a different option.')
                elif user == 6 and self.column_ser.ea == -1:
                    print('\nVapor Pressure was not provided by the file, please choose a different option.')
                elif user == 7 and (self.column_ser.rhmax == -1 or self.column_ser.rhmin == -1):
                    print('\nRHMax and RHMin were not provided by the file, please choose a different option.')
                elif user == 8 and self.column_ser.rhavg == -1:
                    print('\nRHAvg was not provided by the file, please choose a different option.')
                elif user == 5 and not self.humidity_adjusted:
                    print('\n\nBefore correcting solar radiation, did you want to adjust compiled humidity?.')
                    print('Doing so may allow you to get the best possible humidity record for Rs correction.')
                    print('\nEnter 1 to adjust compiled humidity or 0 to skip.')

                    humid_choice = utils.get_int_input(0, 1, 'Enter your selection: ')
                    if humid_choice == 1:  # change original choice to the adjust humidity option
                        user = 9
                    choice_loop = False
                else:
                    choice_loop = False

            ##########
            # Correcting individual variables based on user choice
            # Correcting Max/Min Temperature data
            if user == 1:
                (self.data_tmax, self.data_tmin) = qaqc_functions.\
                    correction(self.station_name, self.log_file, self.folder_path,
                               self.data_tmax, self.data_tmin, self.dt_array,
                               self.data_month, self.data_year, 1, self.auto_mode)
            # Correcting Min/Dew Temperature data
            elif user == 2:
                (self.data_tmin, self.data_tdew) = qaqc_functions.\
                    correction(self.station_name, self.log_file, self.folder_path,
                               self.data_tmin, self.data_tdew, self.dt_array,
                               self.data_month, self.data_year, 2, self.auto_mode)
            # Correcting Windspeed
            elif user == 3:
                (self.data_ws, self.data_null) = qaqc_functions.\
                    correction(self.station_name, self.log_file, self.folder_path,
                               self.data_ws, self.data_null, self.dt_array,
                               self.data_month, self.data_year, 3, self.auto_mode)
            # Correcting Precipitation
            elif user == 4:
                (self.data_precip, self.data_null) = qaqc_functions.\
                    correction(self.station_name, self.log_file, self.folder_path,
                               self.data_precip, self.data_null, self.dt_array,
                               self.data_month, self.data_year, 4, self.auto_mode)
            # Correcting Solar radiation
            elif user == 5:
                (self.data_rs, self.data_null) = qaqc_functions.\
                    correction(self.station_name, self.log_file, self.folder_path,
                               self.data_rs, self.rso, self.dt_array,
                               self.data_month, self.data_year, 5, self.auto_mode)
            # Correcting Vapor Pressure
            elif user == 6:
                (self.data_ea, self.data_null) = qaqc_functions.\
                    correction(self.station_name, self.log_file, self.folder_path,
                               self.data_ea, self.data_null, self.dt_array,
                               self.data_month, self.data_year, 7, self.auto_mode)
            # Correcting Relative Humidity Max and Min
            elif user == 7:
                (self.data_rhmax, self.data_rhmin) = qaqc_functions.\
                    correction(self.station_name, self.log_file, self.folder_path,
                               self.data_rhmax, self.data_rhmin, self.dt_array,
                               self.data_month, self.data_year, 8, self.auto_mode)
            # Correcting Relative Humidity Average
            elif user == 8:
                (self.data_rhavg, self.data_null) = qaqc_functions.\
                    correction(self.station_name, self.log_file, self.folder_path,
                               self.data_rhavg, self.data_null, self.dt_array,
                               self.data_month, self.data_year, 9, self.auto_mode)
            # Adjusting compiled_ea
            elif user == 9:
                self.compiled_ea = qaqc_functions.\
                    compiled_humidity_adjustment(self.station_name, self.log_file, self.folder_path, self.dt_array,
                                                 self.data_tmax, self.data_tmin, self.data_tavg, self.compiled_ea,
                                                 self.data_ea, self.column_ser.ea, self.data_tdew, self.column_ser.tdew,
                                                 self.data_tdew_ko, self.data_rhmax, self.column_ser.rhmax,
                                                 self.data_rhmin, self.column_ser.rhmin,
                                                 self.data_rhavg, self.column_ser.rhavg)
                self.humidity_adjusted = True
            else:
                # user quits, exit out of loop
                print('\nSystem: Now finishing up corrections.')
                # Break here because all recalculations were done at the end of the last loop iteration
                break

            if 1 <= user <= 2 or 6 <= user <= 8:
                if user == 1:  # User has corrected temperature, so fill all missing values with a normal distribution
                    # Reset 'complete' vars as the underlying var has been changed.
                    self.complete_tmax = np.array(self.data_tmax)
                    self.complete_tmin = np.array(self.data_tmin)

                    # Remove corresponding TAvg observations after outliers have been removed from TMax and TMin
                    tmax_removed_indices = np.array(np.where(np.isnan(self.data_tmax)))  # array of indices of nans
                    tmin_removed_indices = np.array(np.where(np.isnan(self.data_tmin)))  # array of indices of nans
                    self.data_tavg[tmax_removed_indices] = np.nan
                    self.data_tavg[tmin_removed_indices] = np.nan

                    # Create mean monthly and standard deviation
                    self.mm_tmax = np.zeros(12)
                    self.mm_tmin = np.zeros(12)
                    self.std_tmax = np.zeros(12)
                    self.std_tmin = np.zeros(12)

                    for k in range(12):
                        temp_indexes = np.where(self.data_month == k+1)[0]
                        temp_indexes = np.array(temp_indexes, dtype=int)
                        self.mm_tmax[k] = np.nanmean(self.data_tmax[temp_indexes])
                        self.std_tmax[k] = np.nanstd(self.data_tmax[temp_indexes])
                        self.mm_tmin[k] = np.nanmean(self.data_tmin[temp_indexes])
                        self.std_tmin[k] = np.nanstd(self.data_tmin[temp_indexes])

                    # Fill missing observations with samples from a normal distribution with monthly mean and variance
                    for i in range(self.data_length):
                        if np.isnan(self.data_tmax[i]):
                            self.complete_tmax[i] = np.random.normal(self.mm_tmax[self.data_month[i] - 1],
                                                                     self.std_tmax[self.data_month[i] - 1], 1)[0]
                            self.fill_tmax[i] = self.complete_tmax[i]
                        else:
                            pass

                        if np.isnan(self.data_tmin[i]):
                            self.complete_tmin[i] = np.random.normal(self.mm_tmin[self.data_month[i] - 1],
                                                                     self.std_tmin[self.data_month[i] - 1], 1)[0]
                            self.fill_tmin[i] = self.complete_tmin[i]
                        else:
                            pass

                        if (self.complete_tmax[i] <= self.complete_tmin[i]) or \
                                (self.complete_tmax[i] - self.complete_tmin[i] <= 3):
                            # This is a logical check to make sure that tmax is sufficiently distant from tmin once
                            # they have been filled in, tmax needs to be warmer than tmin and daily temp isn't constant
                            # so there should be at least a small difference in tmax-tmin

                            # todo the below lines always provide a higher than average tmax
                            #   and a lower than average tmin, this can be eventually improved
                            # Fill this observation in with  mm observation with the difference of 1/2 of mm delta t
                            self.complete_tmax[i] = self.mm_tmax[self.data_month[i] - 1] + \
                                                    (0.5 * self.mm_delta_t[self.data_month[i] - 1])
                            self.fill_tmax[i] = self.complete_tmax[i]

                            self.complete_tmin[i] = self.mm_tmin[self.data_month[i] - 1] - \
                                (0.5 * self.mm_delta_t[self.data_month[i] - 1])
                            self.fill_tmin[i] = self.complete_tmin[i]
                        else:
                            # data is different enough to appear valid
                            pass

                    if self.fill_mode:
                        # we are filling in data, so copy all the filled versions onto the original temperature
                        self.data_tmax = np.array(self.complete_tmax)
                        self.data_tmin = np.array(self.complete_tmin)
                    else:
                        # if we are not filling, we will hold the copies to later fill in rso, but will reset fill
                        # tracking variables
                        self.fill_tmax = np.zeros(self.data_length)
                        self.fill_tmin = np.zeros(self.data_length)
                else:
                    # user did not correct option 1
                    pass

                # Figure out which humidity variables are provided and recalculate Ea and TDew if needed
                # This function is safe to use after correcting because it tracks what variable was provided by the data
                # and recalculates appropriately. It doesn't overwrite provided variables with calculated versions.
                # Ex. if only TDew is provided, it recalculates ea while returning original provided tdew
                (self.data_ea, self.data_tdew) = calc_functions.\
                    calc_humidity_variables(self.data_tmax, self.data_tmin, self.data_tavg, self.data_ea,
                                            self.column_ser.ea, self.data_tdew, self.column_ser.tdew,
                                            self.data_rhmax, self.column_ser.rhmax, self.data_rhmin,
                                            self.column_ser.rhmin, self.data_rhavg, self.column_ser.rhavg)

                # Recalculates secondary temperature values and mean monthly counterparts
                (self.delta_t, self.mm_delta_t, self.k_not, self.mm_k_not, self.mm_tmin, self.mm_tdew) = \
                    calc_functions.calc_temperature_variables(self.data_month, self.data_tmax,
                                                              self.data_tmin, self.data_tdew)

                # Since we are recalculating humidity variables, we also need to reset tdew_ko to ensure it matches the
                # underlying unfilled tdew. It is filled later after this once the user corrects a humidity var
                # so this reset is acceptable
                self.data_tdew_ko = np.array(self.data_tdew)

                if user == 2 or 6 <= user <= 8:

                    # Reset 'complete' version as underlying variable may have changed
                    self.complete_tdew = np.array(self.data_tdew)
                    '''
                        Fill in any missing tdew data with tmin - k0 curve.
                        
                        As detailed above, data_tdew_ko only fills in missing tdew observations with real tmin obs,
                        while complete_tdew is a full record filled in using a filled in tmin.
                        
                        Nothing occurs if this fill code is run a second time because vars are already filled unless
                        correction methods throw out data, in which case we need to refill for the complete record
                        that Rs correction requires.
                    '''
                    for i in range(self.data_length):
                        if np.isnan(self.data_tdew[i]):

                            # Tdew_ko will have gaps that match gaps in Tmin
                            # Complete_tdew will match complete_tmin in having no gaps
                            self.data_tdew_ko[i] = self.data_tmin[i] - self.mm_k_not[self.data_month[i] - 1]
                            self.complete_tdew[i] = self.complete_tmin[i] - self.mm_k_not[self.data_month[i] - 1]
                            self.fill_tdew[i] = self.complete_tdew[i]
                        else:
                            # If TDew isn't empty then nothing is required to be done.
                            pass

                    if self.fill_mode:
                        # we are filling in data, so copy all the filled versions onto the original arrays
                        self.data_tdew = np.array(self.complete_tdew)
                    else:
                        # if we are not filling, we will hold the copies to later fill in rso, but will reset fill
                        # tracking variables
                        self.fill_tdew = np.zeros(self.data_length)
                else:
                    # user did not select option 2 or 6-8
                    pass

                '''
                    Recreate the 'compiled' ea as temperature or humidity vars were corrected and may have changed the
                    data underlying the compiled ea. Once that is done we will fill in all the gaps with the 
                    variable 'complete_tdew' so that a complete record of ea exists for rs correction
                    
                    The gaps in compiled_ea are reset every time temperature or humidity is corrected so this code is 
                    okay to run multiple times
                '''
                self.compiled_ea = calc_functions.calc_compiled_ea(self.data_tmax, self.data_tmin, self.data_tavg,
                                                                   self.data_ea, self.data_tdew, self.column_ser.tdew,
                                                                   self.data_rhmax, self.column_ser.rhmax,
                                                                   self.data_rhmin, self.column_ser.rhmin,
                                                                   self.data_rhavg, self.column_ser.rhavg,
                                                                   self.data_tdew_ko)

                # Reset 'complete' version as underlying variable may have changed.
                self.complete_ea = np.array(self.compiled_ea)

                for i in range(self.data_length):
                    if np.isnan(self.compiled_ea[i]):
                        self.complete_ea[i] = (0.6108 * np.exp((17.27 * self.complete_tdew[i]) / (self.complete_tdew[i]
                                                                                                  + 237.3)))
                        self.fill_ea[i] = self.complete_ea[i]
                else:
                    # Ea is provided and the index is not empty, do nothing to avoid overwriting actual data
                    pass

                if self.fill_mode:
                    # we are filling in data, so copy all the filled versions onto the original arrays
                    self.data_ea = np.array(self.complete_ea)
                    self.compiled_ea = np.array(self.complete_ea)
                else:
                    # if we are not filling, we will hold the copies to later fill in rso, but will reset fill
                    # tracking variables
                    self.fill_ea = np.zeros(self.data_length)

            elif user == 9:  # User has adjusted how the compiled humidity is sourced, recreate complete_ea
                self.complete_ea = np.array(self.compiled_ea)

                for i in range(self.data_length):
                    if np.isnan(self.compiled_ea[i]):
                        self.complete_ea[i] = (0.6108 * np.exp((17.27 * self.complete_tdew[i]) /
                                                               (self.complete_tdew[i] + 237.3)))
                        self.fill_ea[i] = self.complete_ea[i]
                else:
                    # the index is not empty, do nothing to avoid overwriting actual data
                    pass

                if self.fill_mode:
                    # we are filling in data, so copy all the filled versions onto the original arrays
                    self.data_ea = np.array(self.complete_ea)
                    self.compiled_ea = np.array(self.complete_ea)
                else:
                    # if we are not filling, we will hold the copies to later fill in rso, but will reset fill
                    # tracking variables
                    self.fill_ea = np.zeros(self.data_length)
            else:
                # user did not select options 1,2, 6, 7, 8, or 9.
                pass

            '''
                Even if the user doesn't want to put filled data into their output file, we still need to use complete
                records to get a complete record of Rso for use in Rs correction. This completed rso is only used for 
                this step and is not written as data to the output file
            '''
            if self.fill_mode:
                '''                  
                    This recalculates Rso and ETr values using the filled 'completed_' versions to provide a complete
                    record of ETr values.
                    
                    If this code is executing then 'data_' vars have already been replaced by their 'completed_' 
                    versions so the code is accurate in calling them 'data_'
                '''
                warnings.filterwarnings('ignore', 'invalid value encountered')  # catch invalid value warning, nans
                (self.rso, self.mm_rs, self.eto, self.etr, self.mm_eto, self.mm_etr) = calc_functions. \
                    calc_rso_and_refet(self.station_lat, self.station_elev, self.ws_anemometer_height, self.data_doy,
                                       self.data_month, self.data_tmax, self.data_tmin, self.data_ea, self.data_ws,
                                       self.data_rs)
                warnings.resetwarnings()
            else:
                '''
                    User doesn't want to keep filled in data, so use complete versions to create a filled version of
                    rso while saving the other outputs of calc_rso_and_refet as temporary names which are unused to 
                    prevent them from impacting later calculations
                '''
                warnings.filterwarnings('ignore', 'invalid value encountered')  # catch invalid value warning, nans
                (self.rso, self._mm_rs, self._eto, self._etr, self._mm_eto, self._mm_etr) = \
                    calc_functions.calc_rso_and_refet(self.station_lat, self.station_elev, self.ws_anemometer_height,
                                                      self.data_doy, self.data_month, self.complete_tmax,
                                                      self.complete_tmin, self.complete_ea, self.data_ws, self.data_rs)
                warnings.resetwarnings()

        '''
            At this point the user has finished correcting all variables they want to.
            
            We now recalculate original and optimized Thornton-Running Solar Radiation.
            Additionally, if the user is electing to fill data then we fill in all gaps of data_rs 
            with the optimized thornton running solar radiation.
            
            Also, only if the user wants, we fill in all gaps of wind data using samples from a normal distribution.
            
            Finally, we do one final calculation of Rso/ETr with all the final versions of variables. If the user does
            NOT want to fill in data, this final calculation of Rso will replace the filled one used for Solar
            Radiation correction with one using only real data.
        '''

        (self.orig_rs_tr, self.mm_orig_rs_tr, self.opt_rs_tr, self.mm_opt_rs_tr) = calc_functions. \
            calc_org_and_opt_rs_tr(self.mc_iterations_post_corrections, self.log_file, self.data_month,
                                   self.delta_t, self.mm_delta_t, self.data_rs, self.rso)

        # This section provides for the filling of data should fill_mode be set to true
        self.mm_ws = np.zeros(12)
        self.std_ws = np.zeros(12)
        for k in range(12):
            temp_indexes = np.where(self.data_month == k + 1)[0]
            temp_indexes = np.array(temp_indexes, dtype=int)
            self.mm_ws[k] = np.nanmean(self.data_ws[temp_indexes])
            self.std_ws[k] = np.nanmean(self.data_ws[temp_indexes])

        if self.fill_mode:
            for i in range(self.data_length):
                # fill data_rs with rs_tr and data_ws with an exponential function centered on mm_ws for that month
                if np.isnan(self.data_rs[i]):
                    self.data_rs[i] = self.opt_rs_tr[i]
                    self.fill_rs[i] = self.opt_rs_tr[i]
                else:
                    # If rs isn't empty then nothing is required to be done.
                    pass
                if np.isnan(self.data_ws[i]):
                    self.data_ws[i] = np.random.normal(self.mm_ws[self.data_month[i] - 1],
                                                       self.std_ws[self.data_month[i] - 1], 1)

                    if self.data_ws[i] < 0.2:  # check to see if filled windspeed is lower than reasonable
                        self.data_ws[i] = 0.2
                    else:
                        pass

                    self.fill_ws[i] = self.data_ws[i]
                else:
                    # If ws isn't empty then nothing is required to be done.
                    pass
        else:
            pass

        # Recalculate eto and etr one final time
        # This also overwrites the filled Rso, so we will create a copy for posterity
        self.fill_rso = np.array(self.rso)

        (self.rso, self.mm_rs, self.eto, self.etr, self.mm_eto, self.mm_etr) = calc_functions. \
            calc_rso_and_refet(self.station_lat, self.station_elev, self.ws_anemometer_height, self.data_doy,
                               self.data_month, self.data_tmax, self.data_tmin, self.compiled_ea,
                               self.data_ws, self.data_rs)

    def _create_plots(self):
        """
            Makes and saves histogram and composite plots.
        """
        #########################
        # Histograms of original data
        # Generates composite plot of specific variables before correction
        # We fill these variables by sampling a normal distribution, so we use this plot mainly as evidence for that.
        if self.generate_bokeh and self.script_mode == 0:
            ws_hist = plot.histogram_plot(self.data_ws[~np.isnan(self.data_ws)],
                                          'Windspeed', 'black', 'm/s')
            tmax_hist = plot.histogram_plot(self.data_tmax[~np.isnan(self.data_tmax)],
                                            'TMax', 'red', 'degrees C')
            tmin_hist = plot.histogram_plot(self.data_tmin[~np.isnan(self.data_tmin)],
                                            'TMin', 'blue', 'degrees C')
            tavg_hist = plot.histogram_plot(self.data_tmin[~np.isnan(self.data_tmin)],
                                            'TAvg', 'black', 'degrees C')
            tdew_hist = plot.histogram_plot(self.data_tdew[~np.isnan(self.data_tdew)],
                                            'TDew', 'black', 'degrees C')
            k_not_hist = plot.histogram_plot(self.k_not[~np.isnan(self.k_not)],
                                            'Ko', 'black', 'degrees C')

            output_file(self.folder_path + "/correction_files/histograms/" + self.station_name + '_histograms.html',
                        title=self.station_name + ' histograms')

            save(gridplot([ws_hist, tmax_hist, tmin_hist, tavg_hist, tdew_hist, k_not_hist], ncols=2,
                          width=400, height=400, toolbar_location=None))

        #########################
        # Generate bokeh composite plot
        # Creates one large plot featuring all variables as subplots, used to get a concise overview of the full dataset
        # This output will be generated twice, first to plot data before correction, and then again
        # to plot data after correction
        if self.generate_bokeh:  # Flag to create graphs or not
            reset_output()
            plot_list = []

            x_size = 1200
            y_size = 400

            if self.script_mode == 0:
                output_file(self.folder_path + "/correction_files/before_graphs/" + self.station_name +
                            "_before_corrections_composite_graph.html")
                print("\nSystem: Now creating pre-correction composite bokeh graph.")
            elif self.script_mode == 1:
                output_file(self.folder_path + "/correction_files/after_graphs/" + self.station_name +
                            "_after_corrections_composite_graph.html")
                print("\nSystem: Now creating post-correction composite bokeh graph.")
            else:
                # Incorrect setup of script mode variable, raise an error
                raise ValueError('Incorrect parameters: script mode is not set to a valid option.')

            # Temperature Maximum and Minimum Plot
            plot_tmax_tmin = plot.line_plot(x_size, y_size, self.dt_array, self.data_tmax,
                                            self.data_tmin, 1, '')
            plot_list.append(plot_tmax_tmin)
            # Temperature Minimum and Dewpoint Plot
            plot_tmin_tdew = plot.line_plot(x_size, y_size, self.dt_array, self.data_tmin,
                                            self.data_tdew, 2, '', plot_tmax_tmin)
            plot_list.append(plot_tmin_tdew)

            # 'Completed' vapor pressure plot
            plot_comp_ea = plot.line_plot(x_size, y_size, self.dt_array, self.compiled_ea, self.data_null,
                                          7, 'Composite ', plot_tmax_tmin)
            plot_list.append(plot_comp_ea)

            # vapor pressure plot that was just the provided dataset
            if self.column_ser.ea != -1:
                plot_data_ea = plot.line_plot(x_size, y_size, self.dt_array, self.data_ea, self.data_null,
                                              7, 'Provided ', plot_tmax_tmin)
                plot_list.append(plot_data_ea)

            # rh max and rh min plot if it was provided in dataset
            if self.column_ser.rhmax != -1 and self.column_ser.rhmin != -1:  # RH max and RH min
                plot_rhmax_rhmin = plot.line_plot(x_size, y_size, self.dt_array, self.data_rhmax,
                                                  self.data_rhmin, 8, '', plot_tmax_tmin)
                plot_list.append(plot_rhmax_rhmin)

            # rh avg if it was provided in the dataset
            if self.column_ser.rhavg != -1:  # RH Avg
                plot_rhavg = plot.line_plot(x_size, y_size, self.dt_array, self.data_rhavg,
                                            self.data_null, 9, '', plot_tmax_tmin)
                plot_list.append(plot_rhavg)

            # Mean Monthly Temperature Minimum and Dewpoint
            plot_mm_tmin_tdew = plot.line_plot(x_size, y_size, self.mm_dt_array, self.mm_tmin,
                                               self.mm_tdew, 2, 'MM ')
            plot_list.append(plot_mm_tmin_tdew)

            # Mean Monthly k0 curve (Tmin-Tdew)
            plot_mm_k_not = plot.line_plot(x_size, y_size, self.mm_dt_array, self.mm_k_not,
                                           self.mm_data_null, 10, '', plot_mm_tmin_tdew)
            plot_list.append(plot_mm_k_not)

            # Solar radiation and clear sky solar radiation
            plot_rs_rso = plot.line_plot(x_size, y_size, self.dt_array, self.data_rs, self.rso,
                                         5, '', plot_tmax_tmin)
            plot_list.append(plot_rs_rso)

            # Windspeed
            plot_ws = plot.line_plot(x_size, y_size, self.dt_array, self.data_ws, self.data_null,
                                     3, '', plot_tmax_tmin)
            plot_list.append(plot_ws)

            # Precipitation
            plot_precip = plot.line_plot(x_size, y_size, self.dt_array, self.data_precip, self.data_null,
                                         4, '', plot_tmax_tmin)
            plot_list.append(plot_precip)

            # Optimized mean monthly Thornton-Running solar radiation and Mean Monthly solar radiation
            plot_mm_opt_rs_tr = plot.line_plot(x_size, y_size, self.mm_dt_array, self.mm_rs,
                                               self.mm_opt_rs_tr, 6, 'MM Optimized ', plot_mm_tmin_tdew)
            plot_list.append(plot_mm_opt_rs_tr)

            # Optimized mean monthly Thornton-Running solar radiation and Mean Monthly solar radiation
            plot_mm_orig_rs_tr = plot.line_plot(x_size, y_size, self.mm_dt_array, self.mm_rs,
                                                self.mm_orig_rs_tr, 6, 'MM Original ', plot_mm_tmin_tdew)
            plot_list.append(plot_mm_orig_rs_tr)

            # Now construct grid plot out of all the subplots
            number_of_plots = len(plot_list)
            number_of_rows = ceil(number_of_plots / self.gridplot_columns)

            grid_of_plots = [([None] * 1) for i in range(number_of_rows)]

            for i in range(number_of_rows):
                for j in range(self.gridplot_columns):

                    if len(plot_list) > 0:
                        grid_of_plots[i][j] = plot_list.pop(0)
                    else:
                        pass

            fig = gridplot(grid_of_plots, toolbar_location='left', sizing_mode='stretch_width')
            save(fig)

    def _write_outputs(self):
        """
            Creates all the output files
        """

        #########################
        # Create necessary variables for generic metadata file, as well as
        # generate and fill metadata file
        record_start = pd.to_datetime(self.dt_array[0]).date()
        record_end = pd.to_datetime(self.dt_array[-1]).date()

        # Generate metadata output file that saves the site-specific data for each run (appended to end)
        if not os.path.isfile('correction_metadata.xlsx'):
            # file does not exist, create new one
            metadata_info = pd.DataFrame({'Station': self.station_name, 'Latitude': self.station_lat,
                                          'Longitude': self.station_lon, 'station_elev_m': self.station_elev,
                                          'record_start': record_start, 'record_end': record_end,
                                          'anemom_height_m': self.ws_anemometer_height,
                                          'Filename': self.output_file_path}, index=np.array([1]))

            with pd.ExcelWriter('correction_metadata.xlsx', date_format='YYYY-MM-DD',
                                datetime_format='YYYY-MM-DD HH:MM:SS', engine='openpyxl', mode='w') as writer:
                metadata_info.to_excel(writer, header=True,  index=False, sheet_name='Sheet1')
        else:
            # file is already created, so we need to read it in, append our new information to the bottom of it
            # and then save the info
            metadata_info = pd.read_excel('correction_metadata.xlsx', sheet_name=0, index_col=None,
                                          engine='openpyxl', keep_default_na=False, verbose=True)

            new_meta_info = pd.DataFrame({'Station': self.station_name, 'Latitude': self.station_lat,
                                          'Longitude': self.station_lon, 'station_elev_m': self.station_elev,
                                          'record_start': record_start, 'record_end': record_end,
                                          'anemom_height_m': self.ws_anemometer_height,
                                          'Filename': self.output_file_path}, index=np.array([1]))

            output_metadata = pd.concat([metadata_info, new_meta_info], ignore_index=True)

            with pd.ExcelWriter('correction_metadata.xlsx', date_format='YYYY-MM-DD',
                                datetime_format='YYYY-MM-DD HH:MM:SS', engine='openpyxl', mode='w') as writer:
                output_metadata.to_excel(writer, header=True, index=False, sheet_name='Sheet1')

        # if we are using a network-specific metadata file, update that another file has been processed
        if self.metadata_path is not None:
            current_row = self.metadata_df.processed.ne(1).idxmax() - 1
            self.metadata_df.processed.iloc[current_row] = 1
            self.metadata_df.record_start.iloc[current_row] = record_start
            self.metadata_df.record_end.iloc[current_row] = record_end
            self.metadata_df.output_path.iloc[current_row] = self.output_file_path

            with pd.ExcelWriter(self.metadata_path, date_format='YYYY-MM-DD',
                                datetime_format='YYYY-MM-DD', engine='openpyxl', mode='w') as writer:
                self.metadata_df.to_excel(writer, header=True, index=True, sheet_name='Sheet1')

        #########################
        # Generate output file
        # Create any final variables, then create panda dataframes to save all the data
        # Includes the following sheets:
        #     Corrected Data : Actual corrected values
        #     Delta : Magnitude of difference between original data and corrected data
        #     Filled Data : Tracks which data points have been filled by script generated values instead of provided
        # Data that is provided and subsequently corrected by the script do not count as filled values.
        print("\nSystem: Saving corrected data to .xslx file.")

        # Create any individually-requested output data
        ws_2m = _wind_height_adjust(uz=self.data_ws, zw=self.ws_anemometer_height)

        # Create corrected-original delta numpy arrays
        diff_tavg = np.array(self.data_tavg - self.original_df.tavg)
        diff_tmax = np.array(self.data_tmax - self.original_df.tmax)
        diff_tmin = np.array(self.data_tmin - self.original_df.tmin)
        diff_tdew = np.array(self.data_tdew - self.original_df.tdew)
        diff_ea = np.array(self.data_ea - self.original_df.ea)
        diff_rhavg = np.array(self.data_rhavg - self.original_df.rhavg)
        diff_rhmax = np.array(self.data_rhmax - self.original_df.rhmax)
        diff_rhmin = np.array(self.data_rhmin - self.original_df.rhmin)
        diff_rs = np.array(self.data_rs - self.original_df.rs)
        diff_rs_tr = np.array(self.opt_rs_tr - self.orig_rs_tr)
        diff_rso = np.array(self.rso - self.original_df.rso)
        diff_ws = np.array(self.data_ws - self.original_df.ws)
        diff_precip = np.array(self.data_precip - self.original_df.precip)
        diff_etr = np.array(self.etr - self.original_df.etr)
        diff_eto = np.array(self.eto - self.original_df.eto)

        # Create k0 array to output values
        k_not_vals = np.zeros(self.data_length)
        k_not_vals[0:12] = self.mm_k_not[0:12]

        # Create datetime for output dataframe
        datetime_df = pd.DataFrame({'year': self.data_year, 'month': self.data_month, 'day': self.data_day})
        datetime_df = pd.to_datetime(datetime_df[['month', 'day', 'year']])

        # Create output dataframe
        output_df = pd.DataFrame({'year': self.data_year, 'month': self.data_month,
                                  'day': self.data_day, 'TAvg (C)': self.data_tavg, 'TMax (C)': self.data_tmax,
                                  'TMin (C)': self.data_tmin, 'TDew (C)': self.data_tdew,
                                  'Compiled Ea (kPa)': self.compiled_ea,
                                  'Vapor Pres (kPa)': self.data_ea, 'RHAvg (%)': self.data_rhavg,
                                  'RHMax (%)': self.data_rhmax, 'RHMin (%)': self.data_rhmin, 'Rs (w/m2)': self.data_rs,
                                  'Opt_Rs_TR (w/m2)': self.opt_rs_tr, 'Rso (w/m2)': self.rso,
                                  'Windspeed (m/s)': self.data_ws, 'Precip (mm)': self.data_precip,
                                  'ETr (mm)': self.etr, 'ETo (mm)': self.eto, 'ws_2m (m/s)': ws_2m},
                                 index=datetime_df)

        # Creating difference dataframe to track amount of correction
        delta_df = pd.DataFrame({'year': self.data_year, 'month': self.data_month,
                                 'day': self.data_day, 'TAvg (C)': diff_tavg, 'TMax (C)': diff_tmax,
                                 'TMin (C)': diff_tmin, 'TDew (C)': diff_tdew,
                                 'Vapor Pres (kPa)': diff_ea, 'RHAvg (%)': diff_rhavg, 'RHMax (%)': diff_rhmax,
                                 'RHMin (%)': diff_rhmin, 'Rs (w/m2)': diff_rs, 'Opt - Orig Rs_TR (w/m2)': diff_rs_tr,
                                 'Rso (w/m2)': diff_rso, 'Windspeed (m/s)': diff_ws, 'Precip (mm)': diff_precip,
                                 'ETr (mm)': diff_etr, 'ETo (mm)': diff_eto}, index=datetime_df)

        # Creating a fill dataframe that tracks where missing data was filled in
        fill_df = pd.DataFrame({'year': self.data_year, 'month': self.data_month,
                                'day': self.data_day, 'TMax (C)': self.fill_tmax, 'TMin (C)': self.fill_tmin,
                                'TDew (C)': self.fill_tdew, 'Vapor Pres (kPa)': self.fill_ea, 'Rs (w/m2)': self.fill_rs,
                                'Complete Record Rso (w/m2)': self.fill_rso, 'mm k0 values': k_not_vals},
                               index=datetime_df)
        output_df.index.name = 'date'
        delta_df.index.name = 'date'
        fill_df.index.name = 'date'
        # Open up pandas excel writer
        output_writer = pd.ExcelWriter(self.output_file_path, engine='xlsxwriter')
        # Convert data frames to xlsxwriter excel objects
        output_df.to_excel(output_writer, sheet_name='Corrected Data', na_rep=self.missing_fill_value)
        delta_df.to_excel(output_writer, sheet_name='Delta (Corr - Orig)', na_rep=self.missing_fill_value)
        fill_df.to_excel(output_writer, sheet_name='Filled Data', na_rep=self.missing_fill_value)
        # Save output file
        output_writer.close()

        logger = open(self.log_file, 'a')
        if self.fill_mode == 1:
            if np.isnan(self.eto).any() or np.isnan(self.etr).any():
                print("\nSystem: After finishing corrections and filling data, "
                      "ETr and ETo still had missing observations.")
                logger.write('After finishing corrections and filling data, '
                             'ETr and ETo still had missing observations. \n')
            else:
                logger.write('The output file for this station has a complete record of ETo and ETr observations. \n')
        else:
            pass
        logger.write('\nThe file has been successfully processed and output files saved at %s.' %
                     dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
        logger.close()

[docs]    def process_station(self):
        """
            This function serves as the structure for the overall workflow in
            applying the QC process to an input data source. The standard process is as follows:

            1. Read in the data.

            2. Calculate any secondary variables (mean monthly values, clear-sky solar radiation, etc.).

            3. Plot the data before any corrections are performed.

            4. Allow the user to adjust/remove/QC data. Recompute any dependent secondary variables.

            5. Plot the data after corrections are performed and save the output data.

        Returns:
            None

        """
        self._obtain_data()
        self._calculate_secondary_vars()
        # first plot the data before correcting it
        print("\nSystem: Plotting raw data.")
        self._create_plots()
        self.script_mode = 1
        self._correct_data()
        self._create_plots()
        self._write_outputs()


# This is never run by itself
if __name__ == "__main__":
    print("\nThis module does nothing if run by itself. "
          "Please see the script \'qaqc_single_station.py\' for more info.")