# Source code for rex.utilities.bc_utils

# -*- coding: utf-8 -*-
"""
rex bias correction utilities.
"""
import os
from concurrent.futures import ProcessPoolExecutor, as_completed
import logging
import numpy as np
import scipy

# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)


def sample_q_linear(n_samples):
    """Build an evenly spaced set of quantiles spanning 0 to 1 inclusive

    Parameters
    ----------
    n_samples : int
        Number of points to sample between 0 and 1

    Returns
    -------
    quantiles : np.ndarray
        1D array of evenly spaced samples from 0 to 1
    """
    lower, upper = 0, 1
    return np.linspace(lower, upper, num=n_samples)
def sample_q_log(n_samples, log_base):
    """Sample quantiles from 0 to 1 while concentrating samples near
    quantile=0

    Parameters
    ----------
    n_samples : int
        Number of points to sample between 0 and 1
    log_base : int | float
        Log base value. Must be positive. A higher value will concentrate
        more samples at the extreme sides of the distribution. A value of
        exactly 1 is the limiting case of the log spacing and yields evenly
        spaced (linear) quantiles.

    Returns
    -------
    quantiles : np.ndarray
        1D array of log-spaced samples from 0 to 1

    Raises
    ------
    ValueError
        If log_base is not a positive number.
    """
    if log_base <= 0:
        msg = ('log_base must be a positive number but received: {}'
               .format(log_base))
        raise ValueError(msg)

    if log_base == 1:
        # (b**t - 1) / (b - 1) -> t as b -> 1. Evaluating the formula
        # directly would divide 0 by 0 and return NaN for every quantile.
        return np.linspace(0, 1, n_samples)

    quantiles = np.logspace(0, 1, n_samples, base=log_base)
    # rescale from [1, log_base] onto [0, 1]
    quantiles = (quantiles - 1) / (log_base - 1)

    return quantiles
def sample_q_invlog(n_samples, log_base):
    """Sample quantiles from 0 to 1 while concentrating samples near
    quantile=1

    Parameters
    ----------
    n_samples : int
        Number of points to sample between 0 and 1
    log_base : int | float
        Log base value. Must be positive. A higher value will concentrate
        more samples at the extreme sides of the distribution. A value of
        exactly 1 is the limiting case of the log spacing and yields evenly
        spaced (linear) quantiles.

    Returns
    -------
    quantiles : np.ndarray
        1D array of log-spaced samples from 0 to 1

    Raises
    ------
    ValueError
        If log_base is not a positive number.
    """
    if log_base <= 0:
        msg = ('log_base must be a positive number but received: {}'
               .format(log_base))
        raise ValueError(msg)

    if log_base == 1:
        # Limiting case of the log spacing; the direct formula would be 0 / 0
        # and return NaN for every quantile.
        return np.linspace(0, 1, n_samples)

    quantiles = np.logspace(0, 1, n_samples, base=log_base)
    # rescale from [1, log_base] onto [0, 1]
    quantiles = (quantiles - 1) / (log_base - 1)
    # mirror about 0.5 so the dense end sits at quantile=1, then restore
    # ascending order
    quantiles = np.sort(1 - quantiles)

    return quantiles
def sample_cdf(quantiles, x_values, n_samples, rng=None):
    """Randomly draw a number of real values from a CDF.

    Uniform random quantiles are drawn on [0, 1) and mapped to real values by
    linear interpolation of the CDF defined by (quantiles, x_values).

    Parameters
    ----------
    quantiles : np.ndarray
        1D array of quantile values from 0 to 1. Must be monotonic.
    x_values : np.ndarray
        Values on the x-axis of a CDF corresponding to quantiles. Must be
        monotonic.
    n_samples : int
        Number of sample to draw
    rng : np.random.Generator | np.random.RandomState | None
        Optional random number generator used to draw the uniform samples,
        e.g. ``np.random.default_rng(seed)`` for reproducible draws. None
        (default) uses the global numpy random state via
        ``np.random.uniform``.

    Returns
    -------
    samples : np.ndarray
        1D array of real values sampled from the CDF made up by quantiles and
        x_values
    """
    uniform = np.random.uniform if rng is None else rng.uniform
    samples = uniform(0, 1, n_samples)
    samples = np.interp(samples, quantiles, x_values)

    return samples
class QuantileDeltaMapping:
    """Class for quantile delta mapping based on the method from
    Cannon et al., 2015

    Note that this is a utility class for implementing QDM and should not be
    requested directly as a method in the reV/rex bias correction table input

    Cannon, A. J., Sobie, S. R. & Murdock, T. Q. Bias Correction of GCM
    Precipitation by Quantile Mapping: How Well Do Methods Preserve Changes in
    Quantiles and Extremes? Journal of Climate 28, 6938–6959 (2015).
    """

    def __init__(self, params_oh, params_mh, params_mf, dist='empirical',
                 relative=True, sampling='linear', log_base=10,
                 delta_denom_min=None, delta_denom_zero=None,
                 delta_range=None):
        """
        Parameters
        ----------
        params_oh : np.ndarray
            2D array of **observed historical** distribution parameters
            created from a multi-year set of data where the shape is
            (space, N). This can be the output of a parametric distribution
            fit like ``scipy.stats.weibull_min.fit()`` where N is the number
            of parameters for that distribution, or this can define the
            x-values of N points from an empirical CDF that will be linearly
            interpolated between. If this is an empirical CDF, this must
            include the 0th and 100th percentile values and have even
            percentile spacing between values.
        params_mh : np.ndarray
            Same requirements as params_oh. This input arg is for the
            **modeled historical distribution**.
        params_mf : np.ndarray | None
            Same requirements as params_oh. This input arg is for the
            **modeled future distribution**. If this is None, this defaults to
            params_mh (no future data, just corrected to modeled historical
            distribution)
        dist : str | np.ndarray
            Probability distribution name to use to model the data which
            determines how the param args are used. This can "empirical" or
            any continuous distribution name from ``scipy.stats``. Can also be
            a 1D array of dist inputs if being used from reV, but they must
            all be the same option.
        relative : bool | np.ndarray
            Flag to preserve relative rather than absolute changes in
            quantiles. relative=True (default) will multiply by the relative
            change in quantiles while relative=False will add the absolute
            change. See Equations 4-6 from Cannon et al., 2015 for more
            details. Can also be a 1D array of dist inputs if being used from
            reV, but they must all be the same option.
        sampling : str | np.ndarray
            If dist="empirical", this is an option for how the quantiles were
            sampled to produce the params inputs, e.g., how to sample the
            y-axis of the distribution (see sampling functions in
            ``rex.utilities.bc_utils``). "linear" will do even spacing, "log"
            will concentrate samples near quantile=0, and "invlog" will
            concentrate samples near quantile=1. Can also be a 1D array of
            dist inputs if being used from reV, but they must all be the same
            option.
        log_base : int | float | np.ndarray
            Log base value if sampling is "log" or "invlog". A higher value
            will concentrate more samples at the extreme sides of the
            distribution. Can also be a 1D array of dist inputs if being used
            from reV, but they must all be the same option.
        delta_denom_min : float | None
            Option to specify a minimum value for the denominator term in the
            calculation of a relative delta value. This prevents division by a
            very small number making delta blow up and resulting in very large
            output bias corrected values. See equation 4 of Cannon et al.,
            2015 for the delta term.
        delta_denom_zero : float | None
            Option to specify a value to replace zeros in the denominator term
            in the calculation of a relative delta value. This prevents
            division by zero making delta blow up and resulting in infinite or
            undefined output bias corrected values. See equation 4 of Cannon
            et al., 2015 for the delta term.
        delta_range : tuple | None
            Option to set a (min, max) on the delta term in QDM. This can help
            prevent QDM from making non-realistic increases/decreases in
            otherwise physical values. See equation 4 of Cannon et al., 2015
            for the delta term.
        """
        # A bare ``import scipy`` is not guaranteed to load the ``stats``
        # submodule on every scipy version, so import it explicitly here.
        from scipy import stats as scipy_stats

        self.params_oh = params_oh
        self.params_mh = params_mh
        self.params_mf = params_mf if params_mf is not None else params_mh
        self.relative = bool(self._clean_kwarg(relative))
        self.dist_name = str(self._clean_kwarg(dist)).casefold()
        self.sampling = str(self._clean_kwarg(sampling)).casefold()
        self.log_base = float(self._clean_kwarg(log_base))
        self.scipy_dist = None
        self.delta_denom_min = delta_denom_min
        self.delta_denom_zero = delta_denom_zero
        self.delta_range = delta_range

        if self.dist_name != 'empirical':
            scipy_dist = getattr(scipy_stats, self.dist_name, None)

            # scipy.stats also exposes plain functions (e.g. "describe");
            # only accept objects that can actually run cdf() and ppf()
            usable = (scipy_dist is not None
                      and callable(getattr(scipy_dist, 'cdf', None))
                      and callable(getattr(scipy_dist, 'ppf', None)))

            if not usable:
                msg = ('Could not get requested distribution "{}" from '
                       '``scipy.stats``. Please double check your spelling '
                       'and select "empirical" or one of the continuous '
                       'distribution options from here: '
                       'https://docs.scipy.org/doc/scipy/reference/stats.html'
                       .format(self.dist_name))
                logger.error(msg)
                raise KeyError(msg)

            self.scipy_dist = scipy_dist

    @staticmethod
    def _clean_kwarg(inp):
        """Clean any kwargs inputs (e.g., dist, relative) that might be
        provided as an array and must be collapsed into a single string or
        boolean value

        Parameters
        ----------
        inp : str | bool | int | float | np.ndarray | list | tuple
            Single kwarg value or an array-like of identical values.

        Returns
        -------
        inp : str | bool | int | float
            The input itself if it was a scalar, otherwise the first (and
            only unique) element of the array-like input.
        """
        unique = np.unique(inp)
        msg = ('_QuantileDeltaMapping kwargs must have only one unique input '
               'even if being called with arrays as part of reV but found: {}'
               .format(unique))
        assert len(unique) == 1, msg

        if isinstance(inp, (np.ndarray, list, tuple)):
            # .flat[0] also works for 0-d arrays, where inp[0] would raise
            inp = np.asarray(inp).flat[0]

        return inp

    @staticmethod
    def _clean_params(params, arr_shape, scipy_dist):
        """Verify and clean 2D parameter arrays for passing into empirical
        distribution or scipy continuous distribution functions.

        Parameters
        ----------
        params : np.ndarray
            Input params shape should be (space, N) where N is the number of
            parameters for the distribution. A 1D array is treated as the
            parameters for a single site.
        arr_shape : tuple
            Array shape should be (time, space).
        scipy_dist : scipy.stats.rv_continuous | None
            Any continuous distribution class from ``scipy.stats`` or None if
            using an empirical distribution (taken from attribute
            ``QuantileDeltaMapping.scipy_dist``)

        Returns
        -------
        params : np.ndarray | list
            If a scipy continuous dist is set, this output will be params
            unpacked along axis=1 into a list so that the list entries
            represent the scipy distribution parameters (e.g., shape, scale,
            loc) and each list entry is of shape (space,). Otherwise this is
            the 2D params array of shape (space, N).
        """
        msg = f'params must be 2D array but received {type(params)}'
        assert hasattr(params, 'shape'), msg

        if len(params.shape) == 1:
            params = np.expand_dims(params, 0)

        msg = (f'params must be 2D array of shape ({arr_shape[1]}, N) '
               f'but received shape {params.shape}')
        assert len(params.shape) == 2, msg
        assert params.shape[0] == arr_shape[1], msg

        if scipy_dist is not None:
            params = [params[:, i] for i in range(params.shape[1])]

        return params

    @staticmethod
    def _get_quantiles(n_samples, sampling, log_base):
        """If dist='empirical', this will get the quantile values for the CDF
        x-values specified in the input params

        Parameters
        ----------
        n_samples : int
            Number of points in the empirical CDF.
        sampling : str
            One of "linear", "log", or "invlog".
        log_base : int | float
            Log base value used if sampling is "log" or "invlog".

        Returns
        -------
        quantiles : np.ndarray
            1D array of quantiles from 0 to 1 with length n_samples

        Raises
        ------
        KeyError
            If the sampling option is not recognized.
        """
        if sampling == 'linear':
            quantiles = sample_q_linear(n_samples)
        elif sampling == 'log':
            quantiles = sample_q_log(n_samples, log_base)
        elif sampling == 'invlog':
            quantiles = sample_q_invlog(n_samples, log_base)
        else:
            msg = ('sampling option must be linear, log, or invlog, but '
                   'received: {}'.format(sampling))
            logger.error(msg)
            raise KeyError(msg)

        return quantiles

    @staticmethod
    def _float_buffer(arr):
        """Allocate a zero-filled floating point array shaped like arr.

        The floating dtype of arr is kept if it has one; any other dtype
        (e.g. integer) is promoted to float64 so that interpolated values
        written into the buffer are not truncated.
        """
        dtype = getattr(arr, 'dtype', None)
        if dtype is None or not np.issubdtype(dtype, np.floating):
            dtype = np.float64
        return np.zeros(arr.shape, dtype=dtype)

    @classmethod
    def cdf(cls, x, params, scipy_dist, sampling, log_base):
        """Run the CDF function e.g., convert physical variable to quantile

        Parameters
        ----------
        x : np.ndarray
            2D array of physical values with one column per site.
        params : np.ndarray | list
            Cleaned distribution parameters (see ``_clean_params``): a 2D
            array with one row of empirical CDF x-values per site, or a list
            of per-parameter arrays if scipy_dist is set.
        scipy_dist : scipy.stats.rv_continuous | None
            scipy distribution, or None for an empirical distribution.
        sampling : str
            Quantile sampling option for the empirical distribution.
        log_base : int | float
            Log base for "log" / "invlog" sampling.

        Returns
        -------
        p : np.ndarray
            Quantiles corresponding to x, same shape as x.
        """
        if scipy_dist is None:
            # float buffer: an integer x must not truncate the quantiles
            p = cls._float_buffer(x)
            for idx in range(x.shape[1]):
                xp = params[idx, :]
                fp = cls._get_quantiles(len(xp), sampling, log_base)
                p[:, idx] = np.interp(x[:, idx], xp, fp)
        else:
            p = scipy_dist.cdf(x, *params)

        return p

    @classmethod
    def ppf(cls, p, params, scipy_dist, sampling, log_base):
        """Run the inverse CDF function (percent point function) e.g., convert
        quantile to physical variable

        Parameters
        ----------
        p : np.ndarray
            2D array of quantiles with one column per site.
        params : np.ndarray | list
            Cleaned distribution parameters (see ``_clean_params``): a 2D
            array with one row of empirical CDF x-values per site, or a list
            of per-parameter arrays if scipy_dist is set.
        scipy_dist : scipy.stats.rv_continuous | None
            scipy distribution, or None for an empirical distribution.
        sampling : str
            Quantile sampling option for the empirical distribution.
        log_base : int | float
            Log base for "log" / "invlog" sampling.

        Returns
        -------
        x : np.ndarray
            Physical values corresponding to p, same shape as p.
        """
        if scipy_dist is None:
            # float buffer: an integer p must not truncate the output values
            x = cls._float_buffer(p)
            for idx in range(p.shape[1]):
                fp = params[idx, :]
                xp = cls._get_quantiles(len(fp), sampling, log_base)
                x[:, idx] = np.interp(p[:, idx], xp, fp)
        else:
            x = scipy_dist.ppf(p, *params)

        return x

    @classmethod
    def run_qdm(cls, arr, params_oh, params_mh, params_mf, scipy_dist,
                relative, sampling, log_base, delta_denom_min,
                delta_denom_zero, delta_range):
        """Run the actual QDM operation from args without initializing the
        ``QuantileDeltaMapping`` object

        Parameters
        ----------
        arr : np.ndarray
            2D array of values in shape (time, space)
        params_oh : np.ndarray
            2D array of **observed historical** distribution parameters
            created from a multi-year set of data where the shape is
            (space, N). This can be the output of a parametric distribution
            fit like ``scipy.stats.weibull_min.fit()`` where N is the number
            of parameters for that distribution, or this can define the
            x-values of N points from an empirical CDF that will be linearly
            interpolated between. If this is an empirical CDF, this must
            include the 0th and 100th percentile values and have even
            percentile spacing between values.
        params_mh : np.ndarray
            Same requirements as params_oh. This input arg is for the
            **modeled historical distribution**.
        params_mf : np.ndarray
            Same requirements as params_oh. This input arg is for the
            **modeled future distribution**.
        scipy_dist : scipy.stats.rv_continuous | None
            Any continuous distribution class from ``scipy.stats`` or None if
            using an empirical distribution (taken from attribute
            ``QuantileDeltaMapping.scipy_dist``)
        relative : bool
            Flag to preserve relative rather than absolute changes in
            quantiles. relative=True will multiply by the relative change in
            quantiles while relative=False will add the absolute change. See
            Equations 4-6 from Cannon et al., 2015 for more details.
        sampling : str
            If dist="empirical", this is an option for how the quantiles were
            sampled to produce the params inputs, e.g., how to sample the
            y-axis of the distribution (see sampling functions in
            ``rex.utilities.bc_utils``). "linear" will do even spacing, "log"
            will concentrate samples near quantile=0, and "invlog" will
            concentrate samples near quantile=1.
        log_base : int | float
            Log base value if sampling is "log" or "invlog". A higher value
            will concentrate more samples at the extreme sides of the
            distribution.
        delta_denom_min : float | None
            Option to specify a minimum value for the denominator term in the
            calculation of a relative delta value. This prevents division by a
            very small number making delta blow up and resulting in very large
            output bias corrected values. See equation 4 of Cannon et al.,
            2015 for the delta term.
        delta_denom_zero : float | None
            Option to specify a value to replace zeros in the denominator term
            in the calculation of a relative delta value. This prevents
            division by zero making delta blow up and resulting in infinite or
            undefined output bias corrected values. See equation 4 of Cannon
            et al., 2015 for the delta term.
        delta_range : tuple | None
            Option to set a (min, max) on the delta term in QDM. This can help
            prevent QDM from making non-realistic increases/decreases in
            otherwise physical values. See equation 4 of Cannon et al., 2015
            for the delta term.

        Returns
        -------
        arr : np.ndarray
            Bias corrected copy of the input array with same shape.
        """

        params_oh = cls._clean_params(params_oh, arr.shape, scipy_dist)
        params_mh = cls._clean_params(params_mh, arr.shape, scipy_dist)
        params_mf = cls._clean_params(params_mf, arr.shape, scipy_dist)

        # Equation references are from Section 3 of Cannon et al 2015:
        # Cannon, A. J., Sobie, S. R. & Murdock, T. Q. Bias Correction of GCM
        # Precipitation by Quantile Mapping: How Well Do Methods Preserve
        # Changes in Quantiles and Extremes? Journal of Climate 28, 6938–6959
        # (2015).

        logger.debug('Computing CDF on modeled future data')
        # Eq.3: Tau_m_p = F_m_p(x_m_p)
        q_mf = cls.cdf(arr, params_mf, scipy_dist, sampling, log_base)

        logger.debug('Computing PPF on observed historical data')
        # Eq.5: x^_o:m_h:p = F-1_o_h(Tau_m_p)
        x_oh = cls.ppf(q_mf, params_oh, scipy_dist, sampling, log_base)

        logger.debug('Computing PPF on modeled historical data')
        # Eq.4 denom: F-1_m_h(Tau_m_p)
        x_mh_mf = cls.ppf(q_mf, params_mh, scipy_dist, sampling, log_base)

        logger.debug('Finished computing distributions.')

        if relative:
            if delta_denom_zero is not None:
                x_mh_mf[x_mh_mf == 0] = delta_denom_zero

            if delta_denom_min is not None:
                x_mh_mf = np.maximum(x_mh_mf, delta_denom_min)

            delta = arr / x_mh_mf  # Eq.4: x_m_p / F-1_m_h(Tau_m_p)
        else:
            delta = arr - x_mh_mf  # Eq.4: x_m_p - F-1_m_h(Tau_m_p)

        if delta_range is not None:
            delta = np.maximum(delta, np.min(delta_range))
            delta = np.minimum(delta, np.max(delta_range))

        if relative:
            arr_bc = x_oh * delta  # Eq.6: x^_m_p = x^_o:m_h:p * delta
        else:
            arr_bc = x_oh + delta  # Eq.6: x^_m_p = x^_o:m_h:p + delta

        return arr_bc

    def __call__(self, arr, max_workers=1):
        """Run the QDM function to bias correct an array

        Parameters
        ----------
        arr : np.ndarray
            2D array of values in shape (time, space)
        max_workers : int, None
            Number of parallel workers to use in QDM bias correction. 1 will
            run in serial (default), None will use all available cores. When
            running in parallel the sites are split into one contiguous chunk
            per worker.

        Returns
        -------
        arr : np.ndarray
            Bias corrected copy of the input array with same shape.
        """

        if len(arr.shape) == 1:
            arr = np.expand_dims(arr, 1)

        qdm_args = (self.scipy_dist, self.relative, self.sampling,
                    self.log_base, self.delta_denom_min,
                    self.delta_denom_zero, self.delta_range)
        n_sites = arr.shape[1]

        if max_workers == 1 or n_sites <= 1:
            # nothing to split across processes
            arr_bc = self.run_qdm(arr, self.params_oh, self.params_mh,
                                  self.params_mf, *qdm_args)
        else:
            max_workers = max_workers or os.cpu_count() or 1

            # Normalize params to 2D (space, N) *before* slicing by site so
            # that 1D params are handled the same way as in the serial path.
            params_oh = self._clean_params(self.params_oh, arr.shape, None)
            params_mh = self._clean_params(self.params_mh, arr.shape, None)
            params_mf = self._clean_params(self.params_mf, arr.shape, None)

            # one contiguous block of sites per worker
            n_chunks = min(max_workers, n_sites)
            chunks = np.array_split(np.arange(n_sites), n_chunks)
            sslices = [slice(idx[0], idx[-1] + 1) for idx in chunks]

            results = [None] * len(sslices)
            futures = {}
            with ProcessPoolExecutor(max_workers=max_workers) as exe:
                for ichunk, sslice in enumerate(sslices):
                    fut = exe.submit(self.run_qdm, arr[:, sslice],
                                     params_oh[sslice], params_mh[sslice],
                                     params_mf[sslice], *qdm_args)
                    futures[fut] = ichunk

                for future in as_completed(futures):
                    results[futures[future]] = future.result()

            # Concatenate instead of writing into a copy of arr so the output
            # dtype matches the serial path (no cast back to the input dtype)
            arr_bc = np.concatenate(results, axis=1)

        msg = ('Input shape {} does not match QDM bias corrected output '
               'shape {}!'.format(arr.shape, arr_bc.shape))
        assert arr.shape == arr_bc.shape, msg

        return arr_bc