Source code for gridwxcomp.interpgdal

# -*- coding: utf-8 -*-
"""
Contains objects for interfacing ``gridwxcomp`` with gdal command line tools
used for spatial interpolation of scattered point data.
"""
import os
import subprocess
import xml.etree.cElementTree as ET
from copy import copy
from pathlib import Path
from xml.dom import minidom

import numpy as np
import pandas as pd

# allows for CL script usage if gridwxcomp not installed
try:
    from .spatial import get_subgrid_bounds, zonal_stats, calc_pt_error
    from .plot import station_bar_plot
except:
    from spatial import get_subgrid_bounds, zonal_stats, calc_pt_error
    from plot import station_bar_plot

[docs]class InterpGdal(object):
    """
    Usage of gdal command line tools within ``gridwxcomp``, currently
    utilizes the `gdal_grid <https://www.gdal.org/gdal_grid.html>`_ 
    command line tool. 
    
    Arguments:
        summary_csv_path (str): path to [var]_summary_comp CSV file created 
            by :mod:`gridwxcomp.calc_bias_ratios` containing point bias ratios,
            lat, and long. 
            
    Attributes:
        CELL_SIZE (float): resolution of gridMET dataset in decimal degrees.
        summary_csv_path (:obj:`pathlib.Path`): absolute path object to 
            input ``summary_csv_path``.
        layers (list): list of layers in ``summary_csv_path`` to interpolate
            e.g. when using :meth:`InterpGdal.gdal_grid` with ``layer="all"``
            defaults to :attr:`InterpGdal.default_layers`.
        grid_bounds (tuple or None): default None. Extent for interpolation
            raster in order (min long, max long, min lat, max lat).
        interp_meth (str): default 'invdist'. gdal_grid interpolation method.
        interped_rasters (list): empty list that is appended with 
            :obj:`pathlib.Path` objects to interpolated rasters after using
            :meth:`InterpGdal.gdal_grid`.
        params (dict or None): default None. After :meth:`InterpGdal.gdal_grid`
            ``params`` is updated with the last used interpolation parameters
            in the form of a dictionary with parameter names as keys.

    Example:
        The :class:`InterpGdal` class is useful for experimenting with multiple 
        interpolation algorithms provided by gdal which are optimized and
        often sensitive to multiple parameters. We can use the object
        to loop over a range of parameter combinations to test how algorithms
        perform, we might pick a single layer to test, in this case the
        growing season bias ratios between station and gridMET reference
        evapotranspiration (etr_mm). Below is a routine to conduct a
        basic sensitivity analysis of the power and smooth parameters of the 
        inverse distance to a power interpolation method,
        
        >>> from gridwxcomp import InterpGdal
        >>> import os
        >>> # create instance variable from summary csv
        >>> summary_file = 'PATH/TO/etr_mm_summary_comp.csv'
        >>> # create a InterpGdal instance
        >>> test = InterpGdal(summary_file)
        >>> layer = 'growseason_mean' # could be a list
        >>> # run inverse distance interpolation with different params
        >>> for p in range(1,10):
        >>>     for s in [0, 0.5, 1, 1.5, 2]:
        >>>         # build output directory based on parameter sets
        >>>         out_dir = os.path.join('spatial', 'invdist', 
        >>>             'power={}_smooth={}'.format(p, s))
        >>>         params = {'power': p, 'smooth': s}
        >>>         test.gdal_grid(out_dir=out_dir, layer=layer, params=params)

        Note, we did not assign the interpolation method 'invdist' because it 
        is the default. To use another interpolation method we would
        assign the ``interp_meth`` kwarg to :meth:`InterpGdal.gdal_grid`.
        Similarly, we could experiment with other parameters which all can be
        found in the class attribute :attr:`InterpGdal.default_params`. The
        instance variable :attr:`InterpGdal.params` can also be used to save
        metadata on parameters used for each run.
        
    """
    # constant gridMET cell size in degrees
    GRIDMET_RES = 0.041666666666666664
    # gdal_grid interpolation methods
    interp_methods = ('average',
                     'invdist',
                     'invdistnn',
                     'linear',
                     'nearest')
    """
        interp_methods (tuple): gdal_grid interpolation algorithms.
    """
    # default layers calculated by calc bias ratios module
    default_layers = ('Jan_mean',
                     'Feb_mean',
                     'Mar_mean',
                     'Apr_mean',
                     'May_mean',
                     'Jun_mean',
                     'Jul_mean',
                     'Aug_mean',
                     'Sep_mean',
                     'Oct_mean',
                     'Nov_mean',
                     'Dec_mean',
                     'growseason_mean',
                     'annual_mean',
                     'summer_mean')
    """
    default_layers (tuple): Layers to interpolate created by 
        :mod:`gridwxcomp.calc_bias_ratios`, e.g. "Jan_mean", found in 
        ``summary_csv_path``.
    """

    # interp params, method key, param dic as values
    default_params = {
        'invdist':{
            'power': 2,
            'smoothing': 0,
            'radius1': 0,
            'radius2': 0,
            'angle': 0,
            'max_points': 0,
            'min_points': 0,
            'nodata': -999
        },
        'invdistnn':{
            'power': 2,
            'smoothing': 0,
            'radius': 10,
            'max_points': 12,
            'min_points': 0,
            'nodata': -999
        },
        'average':{
            'radius1': 0,
            'radius2': 0,
            'angle': 0,
            'min_points': 0,
            'nodata': -999
        },
        'linear':{
            'radius': -1,
            'nodata': -999
        },
        'nearest':{
            'radius1': 0,
            'radius2': 0,
            'angle': 0,
            'nodata': -999
        }
    }
    """
    default_params (dict): Dictionary with default parameters for each
        interpolation algorithm, slightly modified from GDAL defaults.
        Keys are interpolation method names, keys are dictionaries with
        parameter names keys and corresponding values.
    """

    # maps bias ratio variables names to residual names for point shapefile
    # which must be shorter (< 11 characters) to be stored as fields
    var_residual_names = {
        'annual_mean': 'annual_res',
        'growseason_mean': 'grow_res',
        'summer_mean': 'summer_res'
    }

    """
    var_residual_names (dict): Dictionary that maps names of bias ratios 
        to the name of their intepolated residual if they are too long for
        storing as a field name in an ESRI shapefile (i.e. > 10 chars).
    """

    def __init__(self, summary_csv_path):
        
        if not Path(summary_csv_path).is_file():
            raise FileNotFoundError(
                'Summary CSV file: {} not found!'.format(
                    Path(summary_csv_path).absolute())
            )
            
        self.summary_csv_path = Path(summary_csv_path).absolute()
        self.layers = list(self.default_layers) # mutable instance attr.
        self.grid_bounds = None
        self.interp_meth = 'invdist'
        self.interped_rasters = [] # appended with Path objects
        self.params = None # to hold last used interp. parameters as dict
        
        
    def _make_pt_vrt(self, layer_name, out_dir):
        """
        Make a vrt file for station point ratios in summary CSV for a 
        specific layer or field name. Save to out_dir. Used for gdal_grid 
        interpolation commands of scatter point data.
        """
        if not Path(out_dir).is_dir():
            os.makedirs(out_dir)
        
        point_data = Path(self.summary_csv_path).name.replace(
                '.csv', '_tmp.csv')

        # make tmp point data csv for given layer, drop missing values
        df = pd.read_csv(self.summary_csv_path)
        df = df[['STATION_LAT', 'STATION_LON', layer_name]]
        df = df[df[layer_name] != -999]
        tmp_out_path = str(Path(self.summary_csv_path).parent / point_data)
        df.to_csv(tmp_out_path, index=False)

        # if out_dir adjust summary CSV path by prepending parent dirs 
        tmp = copy(Path(out_dir))
        n_parent_dirs = 0
        while len(Path(tmp).parents) > 0:    
            if tmp.name == self.summary_csv_path.parent.name:
                break
            tmp = Path(tmp).parent
            n_parent_dirs+=1
        
        path_to_data = str(
            Path(
                '..{}'.format(os.sep)*n_parent_dirs
            ).joinpath(point_data)
        )
        
        out_file = '{}.vrt'.format(layer_name) # keep it simple just layer name

        # VRT format for reading CSV point data
        root = ET.Element('OGRVRTDataSource')
        OGRVRTLayer = ET.SubElement(root, 'OGRVRTLayer', 
                                    name=point_data.replace('.csv', ''))
        # set all fields, SRS WGS84, point geom
        ET.SubElement(OGRVRTLayer, 'SrcDataSource').text = path_to_data
        ET.SubElement(OGRVRTLayer, 'LayerSRS').text = 'epsg:4326'
        ET.SubElement(OGRVRTLayer, 'GeometryType').text = 'wkbPoint'
        ET.SubElement(OGRVRTLayer, 'GeometryField', encoding='PointFromColumns',
                     x='STATION_LON', y='STATION_LAT', z=layer_name)
        
        tree = ET.ElementTree(root)
        # indent xml, save to out_dir
        out_xml_str = _prettify(root)
        
        out_path = os.path.join(out_dir, out_file)
        with open(out_path, 'w') as outf:
            outf.write(out_xml_str)
        
    def _str_to_params(self, param_str):
        """ 
        Convert parameter string for gdal interpolation arguments
        into a dictionary. 
        
        Example:
        
            >>> in_str = ":power=2:smoothing=.1:nodata=-999"
            >>> _str_to_params(in_str)
                {'power': '2', 'smoothing': '.1', 'nodata': '-999'}
    
        """
        param_tmp = param_str.split(':')
        param_dict = {}
        for pair in param_tmp:
            if pair and pair.count('=') == 1:
                name, val = pair.split('=')[0], pair.split('=')[1]
                param_dict[name] = val
            elif not pair:
                continue
            else:
                raise ValueError('{} is not a valid interpolation argument'\
                        .format(param_str))
        
        return param_dict
    

[docs]    def gdal_grid(self, layer='all', out_dir='', interp_meth='invdist', 
                  params=None, bounds=None, nx_cells=None, ny_cells=None, 
                  scale_factor=1, z_stats=True, res_plot=True, grid_res=None,
                  grid_id_name='GRIDMET_ID', options=None):
        """
        Run gdal_grid command line tool to interpolate point ratios.
        
        For further information on theinterpolation algorithms including 
        their function, parameters, and options see 
        `gdal_grid <https://www.gdal.org/gdal_grid.html>`_.
        
        Keyword Arguments:
            layer (str or list): default 'all'. Name of summary file column
                to interpolate, e.g. 'Jan_mean', or list of names. If 'all'
                use all variables in mutable instance attribute "layers".
            out_dir (str): default ''. Output directory to save rasters and
                zonal stats CSV, always appended to the root dir of the
                input summary CSV parent path that contains point ratios.
            interp_meth (str): default 'invdist'. gdal interpolation algorithm
            params (dict, str, or None): default None. Parameters for 
                interpolation algorithm. See examples for format rules.
            bounds (tuple or None): default None. Tuple of bounding coordinates 
                in the following order (min long, max long, min lat, max lat) 
                which need to be in decimal degrees. If None, get extent from 
                locations of climate stations in input summary CSV with 25 cell
                buffer.
            nx_cells (int): default None. Number of pixels in x dim. of raster,
                if None calculated from ``bounds``.
            ny_cells (int): default None. Number of pixels in y dim. of raster,
                if None calculated from ``bounds``.
            scale_factor (float, int): default 1. Scaling factor to apply to 
                original gridMET fishnet to create resampling resolution. If 
                scale_factor = 0.1, the resolution will be one tenth gridMET 
                resolution or 400 m.
            z_stats (bool): default True. Calculate zonal means of 
                interpolated surface to gridMET cells in fishnet and save to a 
                CSV file. The CSV file will be saved to the same directory as 
                the interpolated raster file(s).
            res_plot (bool): default True. Make bar plot for residual (error)
                between interpolated and station value for ``layer``.
            options (str or None): default None. Extra command line options for
                gdal_grid spatial interpolation.
                
        Returns:
            None
                
        Examples:
            The default interpolation algorithm 'invdist' or inverse distance 
            weighting to a power to interpolate bias ratios in a summary CSV 
            file that was first created by :mod:`gridwxcomp.calc_bias_ratios`. 
            The default option will interpolate all layers in :obj:`InterpGdal.default_layers` 
            and calculate zonal statistics for all layers. It will also assume 
            the boundaries of the rasters are defined by the centroid locations 
            of the *outer* station locations in the input summary CSV plus a 25 
            gridMET cell buffer, the pixel size will be 400m (scale_factor=0.1).            We also limit the interpolation to two layers, growing season and 
            annual mean bias ratios,
            
            >>> from gridwxcomp import InterpGdal
            >>> summary_file = 'PATH/TO/[var]_summary_comp.csv'
            >>> out_dir = 'default_params'
            >>> layers = ['growseason_mean', 'annual_mean']
            >>> # create a InterpGdal instance
            >>> test = InterpGdal(summary_file)
            >>> # run inverse distance interpolation
            >>> test.gdal_grid(out_dir=out_dir, layer=layers)
            
            Note, zonal statistics to gridMET cells and interpolated residual 
            plots are computed by default. A gridMET fishnet must have been
            previously created using :func:`gridwxcomp.spatial.make_grid`. 
            
            After running the code above the following files will be created 
            in the 'default_params' directory which will be build in the same 
            location as the input summary CSV::

                default_params/           
                ├── annual_mean.tiff
                ├── annual_mean.vrt
                ├── etr_mm_summary_comp_all_yrs.csv
                ├── etr_mm_summary_pts.cpg
                ├── etr_mm_summary_pts.dbf
                ├── etr_mm_summary_pts.prj
                ├── etr_mm_summary_pts.shp
                ├── etr_mm_summary_pts.shx
                ├── zonal_stats.csv
                ├── growseason_mean.tiff
                ├── growseason_mean.vrt
                └── residual_plots/
                    ├── annual_res.html
                    └── grow_res.html

            GeoTiff interpolated raster files are now created for select layers
            as well as VRT (virtual raster) meta files that store info
            on each raster's data source. The file "gridMET_stats.csv" contains
            gridMET ID as an index and each layer zonal mean as columns. For
            example,
            
                ========== ================== ================== 
                GRIDMET_ID growseason_mean    annual_mean        
                ========== ================== ================== 
                511747     0.9650671287940088 0.9078723876243023 
                510361     0.9658465063428492 0.9097255715561022 
                508975     0.9667075970344162 0.9117676407214926 
                ========== ================== ================== 
                
            There are several :class:`InterpGdal` instance attributes that 
            may be useful, for example to see the parameters that were used 
            for the last call to :meth:`InterpGdal.gdal_grid`
            
            >>> test.params
            {'power': 2,
             'smoothing': 0,
             'radius1': 0,
             'radius2': 0,
             'angle': 0,
             'max_points': 0,
             'min_points': 0,
             'nodata': -999}
                 
            Or to find the paths to the interpolated raster files that
            have been created by the instance (all), the "interped_rasters"
            instance attribute is a list of all :obj:`pathlib.Path` objects
            of absolute paths of raster files. To get them as strings,
            
            >>> list(map(str, test.interped_rasters))
            ['PATH/TO/growseason_mean.tiff',
             'PATH/TO/annual_mean.tiff']
             
            Similary, the raster extent that was used and will be used again for
            any subsequent calls of :meth:`InterpGdal.gdal_grid` can be 
            retrieved by
             
            >>> test.grid_bounds 
            (-111.74583329966664,
             -108.74583330033335,
             38.21250000003333,
             40.462499999966674)
             
        Note:
            The ``nx_cells`` and ``ny_cells`` arguments are an option for 
            defining the output raster's resolution. If these arguments are 
            passed then the ``scale_factor`` argument has no effect. The latter
            assumes raster resolution is relative to gridMET (4 km). 
            
        Raises:
            KeyError: if interp_meth is not a valid gdal_grid interpolation
                algorithm name. 
        """
        
        cwd = os.getcwd()
        
        # user provided uniform grid, cell size should be in dec. degrees
        if grid_res is not None:
            CS = grid_res
        else: # assume gridMET
            CS = self.GRIDMET_RES
        self.grid_res = CS
        
        # grid resolution in decimal degrees
        res = scale_factor * CS
        # file structure to match gridwxcomp.spatial.interpolate
        grid_var = str(self.summary_csv_path).split('_summary')[0]
        
        # out_dir as subdir from dir with summary CSV 
        out_dir = (Path(self.summary_csv_path).parent/Path(out_dir)).resolve()

        if not out_dir.is_dir():
            print('{}\nDoes not exist, creating directory'.format(str(out_dir)))
            out_dir.mkdir(parents=True, exist_ok=True)
    
        source_file = Path(self.summary_csv_path).name
        source = source_file.replace('.csv', '_tmp')
        
        if interp_meth not in InterpGdal.interp_methods:
            err_msg = '{} not a valid interpolation method'.format(interp_meth)
            raise KeyError(err_msg)
        self.interp_meth = interp_meth
            
        # look up default parameters for interpolation method
        if not params:
            params = InterpGdal.default_params.get(self.interp_meth)
        # if run from command line, params is string to be parsed
        if isinstance(params, str):
            params = self._str_to_params(params)
        # avoid zero NA values for zonal_stats, set default NA rep to -999
        if not params.get('nodata'):
            params['nodata'] = -999
        # update instance parameters for later reference
        self.params = params
        # parameters to command line input str :name=value:name=value ...
        param_str = ':'+':'.join('{}={}'.format(key,val) 
                                 for (key,val) in params.items()) 
        # get boundary info and update instance attribute
        if not bounds and not self.grid_bounds:
            # use gridwxcomp.spatial function, assume 25 cell buffer
            self.grid_bounds = get_subgrid_bounds(self.summary_csv_path, 
                buffer=25, grid_res=CS)
        elif bounds:
            self.grid_bounds = bounds
        # raster extent
        xmin,xmax,ymin,ymax = self.grid_bounds

        # if not given get pixels in lon lat 
        # add one cell to avoid unfilled extent in case of large upscaling
        if not nx_cells:
            nx_cells = int(np.abs(xmin - xmax) / \
                    (CS * scale_factor)) + 1
        if not ny_cells:
            ny_cells = int(np.abs(ymin - ymax) / \
                    (CS * scale_factor)) + 1
        # fix any minor adjustments to make raster fit gridMET fishnet extent
        # if scale_factor=1 make sure raster pixels align exactly w/gridcells
        xmax = xmin + nx_cells * CS * scale_factor
        ymin = ymax - ny_cells * CS * scale_factor

        # to parse options, like --config GDAL_NUM_THREADS update here
        if not options:
            options = ''
                
        def _run_gdal_grid(layer):
            """reuse if running multiple layers"""
            existing_layers = pd.read_csv(self.summary_csv_path).columns
            if not layer in existing_layers:
                print('\nError: {} does not exist in input CSV:\n {}'.format(
                   layer, self.summary_csv_path),
                     '\nSkipping interpolation.'
               )
                return
            
            # move to out_dir to run gdal command
            os.chdir(out_dir)
            # build vrt files in out_dir 
            self._make_pt_vrt(layer, out_dir)
            
            vrt_file = '{}.vrt'.format(layer)
            tiff_file = '{}.tiff'.format(layer)
            # add raster path to instance if not already there (overwritten)
            out_file = out_dir.joinpath(tiff_file)
            # print message to console/logging about interpolation
            grid_var = Path(self.summary_csv_path).name.split('_summ')[0]
            _interp_msg(grid_var, layer, self.interp_meth, res, out_file)
            # build command line arguments
            cmd = (r'gdal_grid -a {meth}{p} -txe {xmin} {xmax} -tye {ymax}' 
                  ' {ymin} -outsize {nx} {ny} -of GTiff -ot Float32 -l {source}'
                  ' {vrt} {out} {options}'.format(meth=interp_meth, p=param_str,
                      xmin=xmin, xmax=xmax, ymin=ymin, ymax=ymax, nx=nx_cells, 
                      ny=ny_cells, source=source, vrt=vrt_file, out=tiff_file, 
                      options=options))
            # run gdal_grid with arguments, x-platform
            p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE)
            out, err = p.communicate()
            if err:
                print(err)
            else:
                if not out_file in self.interped_rasters:
                    self.interped_rasters.append(out_file)

            p.stdout.close()
            p.stderr.close()    
         
            os.chdir(cwd)

            # calculate interpolated values and error at stations
            # only calc residuals for mean bias ratios, i.e. not std dev, etc.
            if layer in InterpGdal.default_layers:
                calc_pt_error(self.summary_csv_path, out_dir, layer, grid_var,
                    grid_id_name=grid_id_name
                )
            else:
                # delete tmp summary csv used in interpgdal _make_pt_vrt method 
                # normally deleted in calc_pt_error
                tmp_csv = str(self.summary_csv_path).replace('.csv','_tmp.csv')
                if Path(tmp_csv).resolve().is_file():
                    Path(tmp_csv).resolve().unlink()

            # zonal means extracted to GRIDMET_ID (cell index) 
            if z_stats:
                zonal_stats(self.summary_csv_path, out_file,
                    grid_id_name=grid_id_name
                )
            # residual (error) bar plot, only for mean bias ratios
            if res_plot and layer in InterpGdal.default_layers:
                layer = self.var_residual_names.get(
                    layer, 
                    layer.replace('mean','res')
                )
                y_label = 'residual (interpolated minus station value)'
                title = 'layer: {} algorithm: {} (gdal_grid) res: {} deg.'.\
                        format(layer, self.interp_meth, res)
                res_plot_dir = Path(out_dir)/'residual_plots'
                subtitle = 'parameters: {}'.format(params)
                source_file =  Path(out_dir)/Path(self.summary_csv_path).name

                station_bar_plot(source_file, layer, out_dir=res_plot_dir,
                    y_label=y_label, title=title, subtitle=subtitle)

            
        # run interpolation and zonal statistics depending on layer kwarg
        if layer == 'all': # potential for multiprocessing 
            for l in self.layers:
                _run_gdal_grid(l)
        # run single field
        elif isinstance(layer, str):
            _run_gdal_grid(layer)
        # run select list or tuple of layers
        elif isinstance(layer, (list, tuple)):
            for l in layer:
                _run_gdal_grid(l)
                
                
def _prettify(elem):
    """
    Return an indented, multiline XML string for a XML element tree.
    """
    rough_string = ET.tostring(elem, 'utf-8', method='xml')
    reparsed = minidom.parseString(rough_string)
    return reparsed.toprettyxml(indent='  ')

def _interp_msg(grid_var, layer, function, res, out_file):
    print(
        '\nInterpolating {g} point bias ratios for: {t}\n'.\
            format(g=grid_var, t=layer),
        'Using the "{}" method\n'.format(function),
        'Resolution (pixel size) of output raster: {} degrees'.format(res)
    )
    print(
        'GeoTIFF raster will be saved to: \n',
        os.path.abspath(out_file)
    )