# -*- coding: utf-8 -*-
"""
Temporal Statistics Extraction
"""
from concurrent.futures import as_completed
import gc
import logging
import numpy as np
import os
import pandas as pd

from rex.resource import Resource
from rex.utilities.bc_utils import (sample_q_linear, sample_q_log,
                                    sample_q_invlog)
from rex.utilities.execution import SpawnProcessPool
from rex.utilities.loggers import log_mem, log_versions, create_dirs
from rex.utilities.utilities import get_lat_lon_cols, slice_sites

logger = logging.getLogger(__name__)


def circular_mean(data, weights=None, degrees=True, axis=0,
                  exponential_weights=True):
    """
    Compute the circular average. If weights are provided, compute the
    weighted average with the given weights. For example, if averaging wind
    direction with wind speed as weights, wind directions that occur at
    higher wind speeds will have a larger weight in the final mean value.

    Parameters
    ----------
    data : ndarray
        Data to average
    weights : ndarray, optional
        Weights to apply to data during averaging, must be of the same
        shape as data, by default None
    degrees : bool, optional
        Flag indicating that data is in degrees and needs to be converted
        to/from radians during averaging. By default True
    axis : int, optional
        Axis to compute average along, by default 0 which will produce
        site averages
    exponential_weights : bool, optional
        Flag to convert weights to exponential, by default True

    Returns
    -------
    mean : ndarray
        Weighted circular mean along the given axis
    """
    if degrees:
        data = np.radians(data, dtype=np.float32)

    sin = np.sin(data)
    cos = np.cos(data)

    if weights is None:
        sin = np.nanmean(sin, axis=axis)
        cos = np.nanmean(cos, axis=axis)
    else:
        if exponential_weights:
            weights = np.exp(weights)

        if weights.shape != data.shape:
            msg = ('The shape of weights {} does not match the shape of the '
                   'data {} to which it is to be applied!'
                   .format(weights.shape, data.shape))
            logger.error(msg)
            raise RuntimeError(msg)

        n_weights = np.expand_dims(np.nansum(weights, axis=axis), axis)
        sin = np.nansum(sin * weights, axis=axis) / n_weights
        cos = np.nansum(cos * weights, axis=axis) / n_weights

    mean = np.arctan2(sin, cos)

    if degrees:
        mean = np.degrees(mean)
        mask = mean < 0
        if isinstance(mask, np.ndarray):
            mean[mask] += 360
        elif mask:
            mean += 360

    return mean

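# Usage sketch for ``circular_mean`` (illustrative only, not part of the
# module; the arrays below are made-up example data): naive averaging of
# wind directions that straddle north would give a misleading ~180 degrees,
# while the circular mean stays near 0 degrees.
#
#     dirs = np.array([[350.0, 350.0], [10.0, 10.0]])  # degrees, two sites
#     speeds = np.array([[5.0, 2.0], [5.0, 8.0]])      # wind speed weights
#     circular_mean(dirs)                   # -> array([0., 0.]), not 180
#     circular_mean(dirs, weights=speeds)   # speed-weighted circular mean
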
def cdf(data, n_samples=50, sampling='linear', log_base=10, decimals=None):
    """Get a number of x-values that define a CDF for the input data.

    Parameters
    ----------
    data : np.ndarray
        1D array of data to make a CDF for
    n_samples : int
        Number of points to fit the CDF
    sampling : str
        Option for quantile sampling (see sampling functions in
        ``rex.utilities.bc_utils``), e.g., how to sample the y-axis of the
        distribution. "linear" will do even spacing, "log" will concentrate
        samples near quantile=0, and "invlog" will concentrate samples near
        quantile=1
    log_base : int | float
        Log base value if sampling is "log" or "invlog". A higher value will
        concentrate more samples at the extreme sides of the distribution.
    decimals : int | None
        Precision to round output to (see docstring for np.round). None
        will not round outputs (default).

    Returns
    -------
    x_values : np.ndarray
        1D array of values with shape (n_samples,). Each value is in the
        same units as the input data argument. The x_values[0] is the
        minimum value of data (0th percentile) and x_values[-1] is the
        maximum (100th percentile). The values are spaced in quantile space
        (y-axis of the CDF) according to the sampling option (e.g., evenly
        spaced if sampling='linear').
    """
    nan_mask = np.isnan(data)
    if nan_mask.all():
        return np.zeros(n_samples)

    sampling = sampling.casefold()
    if sampling == 'linear':
        quantiles = sample_q_linear(n_samples)
    elif sampling == 'log':
        quantiles = sample_q_log(n_samples, log_base)
    elif sampling == 'invlog':
        quantiles = sample_q_invlog(n_samples, log_base)
    else:
        msg = ('sampling option must be linear, log, or invlog, but '
               'received: {}'.format(sampling))
        logger.error(msg)
        raise KeyError(msg)

    x_values = np.interp(quantiles, np.linspace(0, 1, len(data[~nan_mask])),
                         sorted(data[~nan_mask]))

    msg = (f'First and last x-value points defining the CDF '
           f'({x_values[0]}, {x_values[-1]}) '
           f'were not the min and max data values '
           f'({np.nanmin(data)}, {np.nanmax(data)}).')
    assert x_values[0] == np.nanmin(data), msg
    assert x_values[-1] == np.nanmax(data), msg

    if decimals is not None:
        x_values = np.round(x_values, decimals=decimals)

    return x_values

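# Usage sketch for ``cdf`` (illustrative only; assumes
# ``rex.utilities.bc_utils.sample_q_linear`` spaces quantiles evenly):
#
#     data = np.arange(100, dtype=float)
#     cdf(data, n_samples=5)
#     # -> roughly array([0., 24.75, 49.5, 74.25, 99.]); the endpoints are
#     # guaranteed (asserted) to be the data min and max
#     cdf(data, n_samples=5, sampling='invlog', log_base=10)
#     # concentrates samples near quantile=1 (the upper tail)
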
class TemporalStats:
    """
    Temporal Statistics from Resource Data
    """
    STATS = {'mean': {'func': np.nanmean, 'kwargs': {'axis': 0}},
             'median': {'func': np.nanmedian, 'kwargs': {'axis': 0}},
             'std': {'func': np.nanstd, 'kwargs': {'axis': 0}}}

    def __init__(self, res_h5, statistics='mean', res_cls=Resource,
                 hsds=False):
        """
        Parameters
        ----------
        res_h5 : str
            Path to resource h5 file(s)
        statistics : str | tuple | dict, optional
            Statistics to extract, either a key or tuple of keys in
            cls.STATS, or a dictionary of the form
            {'stat_name': {'func': *, 'kwargs': {**}}}, by default 'mean'
        res_cls : Class, optional
            Resource class to use to access res_h5, by default Resource
        hsds : bool, optional
            Boolean flag to use h5pyd to handle .h5 'files' hosted on AWS
            behind HSDS, by default False
        """
        log_versions(logger)
        self._res_h5 = res_h5
        self._stats = None
        self.statistics = statistics
        self._res_cls = res_cls
        self._hsds = hsds

        with res_cls(res_h5, hsds=self._hsds) as f:
            self._time_index = f.time_index
            self._meta = f.meta

    @property
    def res_h5(self):
        """
        Path to resource h5 file(s)

        Returns
        -------
        str
        """
        return self._res_h5

    @property
    def statistics(self):
        """
        Dictionary of statistic functions/kwargs to run

        Returns
        -------
        dict
        """
        return self._stats

    @statistics.setter
    def statistics(self, statistics):
        """
        Statistics to extract, either a key or tuple of keys in cls.STATS,
        or a dictionary of the form
        {'stat_name': {'func': *, 'kwargs': {**}}}

        Parameters
        ----------
        statistics : dict
        """
        self._stats = self._check_stats(statistics)

    @property
    def res_cls(self):
        """
        Resource class to use to access res_h5

        Returns
        -------
        Class
        """
        return self._res_cls

    @property
    def time_index(self):
        """
        Resource Datetimes

        Returns
        -------
        pandas.DatetimeIndex
        """
        return self._time_index

    @property
    def meta(self):
        """
        Resource meta-data table

        Returns
        -------
        pandas.DataFrame
        """
        return self._meta

    @property
    def lat_lon(self):
        """
        Resource (lat, lon) coordinates

        Returns
        -------
        pandas.DataFrame
        """
        lat_lon_cols = get_lat_lon_cols(self.meta)

        return self.meta[lat_lon_cols]

    @staticmethod
    def _format_grp_names(grp_names):
        """
        Format groupby index values

        Parameters
        ----------
        grp_names : list
            Group by index values, these correspond to each unique group in
            the groupby

        Returns
        -------
        out : ndarray
            2D array of grp index values properly formatted as strings
        """
        month_map = {1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May',
                     6: 'Jun', 7: 'Jul', 8: 'Aug', 9: 'Sep', 10: 'Oct',
                     11: 'Nov', 12: 'Dec'}

        # pylint: disable=unnecessary-lambda-assignment, unnecessary-lambda
        year = lambda s: "{}".format(s)
        month = lambda s: "{}".format(month_map[s])
        hour = lambda s: "{:02d}:00UTC".format(s)

        grp_names = np.array(grp_names).T
        if len(grp_names.shape) == 1:
            grp_names = np.expand_dims(grp_names, 0)

        out = []
        for grp_i in grp_names:  # pylint: disable=not-an-iterable
            grp_max = grp_i.max()
            if grp_max <= 12:
                out.append(list(map(month, grp_i)))
            elif grp_max <= 23:
                out.append(list(map(hour, grp_i)))
            else:
                out.append(list(map(year, grp_i)))

        return np.array(out).T

    @classmethod
    def _create_names(cls, groups, stats):
        """
        Generate statistics names

        Parameters
        ----------
        groups : list
            List of group names, some combination of year, month, hour
        stats : list
            Statistics to be computed

        Returns
        -------
        columns_map : dict
            Dictionary of column names to use for each statistic
        """
        group_names = cls._format_grp_names(groups)

        columns_map = {}
        for s in stats:  # pylint: disable=not-an-iterable
            cols = ['{}_{}'.format('-'.join(n), s) for n in group_names]
            columns_map[s] = cols

        return columns_map

    @staticmethod
    def _compute_weighted_stats(func, res_data, weights, column_names,
                                **kwargs):
        """
        Compute the weighted stats using the given function and kwargs

        Parameters
        ----------
        func : object
            Function to use to compute the weighted stats
        res_data : pandas.DataFrame | pandas.GroupBy
            Resource data to compute weighted stats from
        weights : pandas.DataFrame | pandas.GroupBy
            Weights to use for the weighted stats calculation,
            by default None
        column_names : list | str
            Either the stat name or the list of output stat names (based on
            the groupby names), used as output column names
        kwargs : dict
            Function kwargs
        """
        if isinstance(column_names, list):
            s_data = {}
            for c_name, (grp_name, res_grp) in zip(column_names, res_data):
                if weights is not None:
                    grp_w = weights.get_group(grp_name[0])
                else:
                    grp_w = None

                grp_s = func(res_grp, weights=grp_w, **kwargs)
                if grp_s.shape[0] == 1:
                    grp_s = grp_s[0]

                s_data[c_name] = grp_s

            s_data = pd.DataFrame(s_data)
        else:
            s_data = func(res_data, weights=weights, **kwargs)
            if s_data.shape[0] == 1:
                s_data = s_data[0]

            s_data = pd.DataFrame(s_data.flatten(),
                                  columns=[column_names])

        return s_data

    @classmethod
    def _compute_stats(cls, res_data, statistics, diurnal=False,
                       month=False):
        """
        Compute desired stats for desired time intervals from res_data

        Parameters
        ----------
        res_data : pandas.DataFrame
            DataFrame of resource data. Index is time_index, columns are
            sites
        statistics : dict
            Dictionary of statistic functions/kwargs to run
        diurnal : bool, optional
            Extract diurnal stats, by default False
        month : bool, optional
            Extract monthly stats, by default False

        Returns
        -------
        res_stats : pandas.DataFrame
            DataFrame of desired statistics at desired time intervals
        """
        groupby = []
        column_names = None
        if month:
            groupby.append(res_data.index.month)

        if diurnal:
            groupby.append(res_data.index.hour)

        if groupby:
            res_data = res_data.groupby(groupby)
            column_names = cls._create_names(list(res_data.groups),
                                             list(statistics))

        res_stats = []
        for name, stat in statistics.copy().items():
            func = stat['func']
            kwargs = stat.get('kwargs', {}).copy()
            if name.lower().startswith('weight'):
                weights = kwargs.pop('weights').copy()
                if groupby:
                    weights = weights.groupby(groupby)
                    weight_names = column_names[name]
                else:
                    weight_names = name

                s_data = cls._compute_weighted_stats(func, res_data,
                                                     weights, weight_names,
                                                     **kwargs)
            else:
                axis = kwargs.pop('axis', 0)
                s_data = res_data.aggregate(func, axis=axis, **kwargs)

                if groupby:
                    columns = column_names[name]
                    s_data = s_data.T
                    s_data.columns = columns
                elif not isinstance(s_data, pd.DataFrame):
                    s_data = s_data.to_frame(name=name)
                elif isinstance(s_data, pd.DataFrame) and len(s_data) > 1:
                    # e.g., if func is scipy.stats.beta.fit(), this collapses
                    # multiple output parameters into list
                    s_data['name'] = name
                    s_data = s_data.groupby('name').agg(list).T

            res_stats.append(s_data)

        res_stats = pd.concat(res_stats, axis=1)

        return res_stats

    @staticmethod
    def _create_index(sites_slice):
        """
        Create index from site slice

        Parameters
        ----------
        sites_slice : slice | list | ndarray
            Sites to build index from

        Returns
        -------
        idx : list
            site gids
        """
        if isinstance(sites_slice, slice) and sites_slice.stop:
            idx = list(range(*sites_slice.indices(sites_slice.stop)))
        elif isinstance(sites_slice, (list, np.ndarray)):
            idx = sites_slice

        return idx

    @staticmethod
    def _extract_weights(res, weights_dsets, sites_slice, time_index):
        """
        Extract weights datasets from resource and combine into weights to
        use for weighted stats

        Parameters
        ----------
        res : rex.Resource
            Open Resource class or sub-class to extract datasets from
        weights_dsets : str | list | tuple
            List of weight(s) datasets to extract and combine
        sites_slice : slice
            Subslice of sites to extract weights for
        time_index : pandas.DatetimeIndex
            Resource DatetimeIndex, needed for the output DataFrame index

        Returns
        -------
        weights : pandas.DataFrame
            Weights DataFrame to match res_data
        """
        if not isinstance(weights_dsets, (list, tuple)):
            weights_dsets = [weights_dsets]

        weights = None
        for dset in weights_dsets:
            if weights is None:
                weights = res[dset, :, sites_slice]
            else:
                weights *= res[dset, :, sites_slice]

        return pd.DataFrame(weights, index=time_index)

    @classmethod
    def _extract_stats(cls, res_h5, statistics, dataset, res_cls=Resource,
                       hsds=False, time_index=None, sites_slice=None,
                       diurnal=False, month=False, combinations=False,
                       mask_zeros=False):
        """
        Extract stats for given dataset, sites, and temporal extent

        Parameters
        ----------
        res_h5 : str
            Path to resource h5 file(s)
        statistics : dict
            Statistics to extract, a dictionary of the form
            {'stat_name': {'func': *, 'kwargs': {**}}}
        dataset : str
            Dataset to extract stats for
        res_cls : Class, optional
            Resource class to use to access res_h5, by default Resource
        hsds : bool, optional
            Boolean flag to use h5pyd to handle .h5 'files' hosted on AWS
            behind HSDS, by default False
        time_index : pandas.DatetimeIndex | None, optional
            Resource DatetimeIndex, if None extract from res_h5,
            by default None
        sites_slice : slice | None, optional
            Sites to extract, if None all, by default None
        diurnal : bool, optional
            Extract diurnal stats, by default False
        month : bool, optional
            Extract monthly stats, by default False
        combinations : bool, optional
            Extract all combinations of temporal stats, by default False
        mask_zeros : bool
            Flag to only calculate stats when all data is > 0 (useful for
            global horizontal irradiance).

        Returns
        -------
        res_stats : pandas.DataFrame
            DataFrame of desired statistics at desired time intervals
        """
        if sites_slice is None:
            sites_slice = slice(None, None, None)

        with res_cls(res_h5, hsds=hsds) as f:
            if time_index is None:
                time_index = f.time_index

            res_data = pd.DataFrame(f[dataset, :, sites_slice],
                                    index=time_index)
            if mask_zeros:
                res_data[(res_data == 0)] = np.nan

            for s, s_dict in statistics.items():
                weights = s_dict.get('kwargs', {}).get('weights')
                if weights is not None:
                    weights = cls._extract_weights(f, weights, sites_slice,
                                                   time_index)
                    statistics[s]['kwargs']['weights'] = weights

        if combinations:
            res_stats = [cls._compute_stats(res_data, statistics)]
            if month:
                res_stats.append(cls._compute_stats(res_data, statistics,
                                                    month=True))

            if diurnal:
                res_stats.append(cls._compute_stats(res_data, statistics,
                                                    diurnal=True))

            if month and diurnal:
                res_stats.append(cls._compute_stats(res_data, statistics,
                                                    month=True,
                                                    diurnal=True))

            res_stats = pd.concat(res_stats, axis=1)
        else:
            res_stats = cls._compute_stats(res_data, statistics,
                                           diurnal=diurnal, month=month)

        res_stats.index = cls._create_index(sites_slice)
        res_stats.index.name = 'gid'

        return res_stats

    def _get_slices(self, dataset, sites=None, chunks_per_slice=5):
        """
        Get slices to extract

        Parameters
        ----------
        dataset : str
            Dataset to extract data from
        sites : list | slice, optional
            Subset of sites to extract, by default None or all sites
        chunks_per_slice : int, optional
            Number of chunks to extract in each slice, by default 5

        Returns
        -------
        slices : list
            List of slices to extract
        """
        with self.res_cls(self.res_h5) as f:
            shape, _, chunks = f.get_dset_properties(dataset)

        if len(shape) != 2:
            msg = ('Cannot extract temporal stats for dataset {}, as it is '
                   'not a timeseries dataset!'.format(dataset))
            logger.error(msg)
            raise RuntimeError(msg)

        slices = slice_sites(shape, chunks, sites=sites,
                             chunks_per_slice=chunks_per_slice)

        return slices

    def _check_stats(self, statistics):
        """
        Check desired statistics to make sure inputs are valid

        Parameters
        ----------
        statistics : str | tuple | dict
            Statistics to extract, either a key or tuple of keys in
            cls.STATS, or a dictionary of the form
            {'stat_name': {'func': *, 'kwargs': {**}}}

        Returns
        -------
        stats : dict
            Dictionary of statistic functions/kwargs to run
        """
        if isinstance(statistics, str):
            statistics = (statistics, )

        if isinstance(statistics, (tuple, list)):
            statistics = {s: self.STATS[s] for s in statistics}

        for stat in statistics.values():
            msg = 'A "func"(tion) must be provided for each statistic'
            assert 'func' in stat, msg
            if 'kwargs' in stat:
                msg = 'statistic function kwargs must be a dictionary '
                assert isinstance(stat['kwargs'], dict), msg

        return statistics

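    # Sketch of a custom ``statistics`` mapping (illustrative; the
    # 'windspeed_100m' weights dataset is an assumption about the file
    # being processed). Stat names starting with 'weight' take the weighted
    # code path in ``_compute_stats``, and the 'weights' kwarg names the
    # dataset(s) that ``_extract_stats`` will pull and use as weights:
    #
    #     stats = {'mean': {'func': np.nanmean, 'kwargs': {'axis': 0}},
    #              'weighted_circular_mean': {
    #                  'func': circular_mean,
    #                  'kwargs': {'weights': 'windspeed_100m'}}}
    #     ts = TemporalStats('/path/to/res.h5', statistics=stats)
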
    def compute_statistics(self, dataset, sites=None, diurnal=False,
                           month=False, combinations=False,
                           max_workers=None, chunks_per_worker=5,
                           lat_lon_only=True, mask_zeros=False):
        """
        Compute statistics

        Parameters
        ----------
        dataset : str
            Dataset to extract stats for
        sites : list | slice, optional
            Subset of sites to extract, by default None or all sites
        diurnal : bool, optional
            Extract diurnal stats, by default False
        month : bool, optional
            Extract monthly stats, by default False
        combinations : bool, optional
            Extract all combinations of temporal stats, by default False
        max_workers : None | int, optional
            Number of workers to use, if 1 run in serial, if None use all
            available cores, by default None
        chunks_per_worker : int, optional
            Number of chunks to extract on each worker, by default 5
        lat_lon_only : bool, optional
            Only append lat, lon coordinates to stats, by default True
        mask_zeros : bool
            Flag to only calculate stats when all data is > 0 (useful for
            global horizontal irradiance).

        Returns
        -------
        res_stats : pandas.DataFrame
            DataFrame of desired statistics at desired time intervals
        """
        if max_workers is None:
            max_workers = os.cpu_count()

        slices = self._get_slices(dataset, sites,
                                  chunks_per_slice=chunks_per_worker)
        if len(slices) == 1:
            max_workers = 1

        if max_workers > 1:
            msg = ('Extracting {} for {} in parallel using {} workers'
                   .format(list(self.statistics), dataset, max_workers))
            logger.info(msg)

            loggers = [__name__, 'rex']
            with SpawnProcessPool(max_workers=max_workers,
                                  loggers=loggers) as exe:
                futures = []
                for sites_slice in slices:
                    future = exe.submit(self._extract_stats,
                                        self.res_h5, self.statistics,
                                        dataset, res_cls=self.res_cls,
                                        hsds=self._hsds,
                                        time_index=self.time_index,
                                        sites_slice=sites_slice,
                                        diurnal=diurnal,
                                        month=month,
                                        combinations=combinations,
                                        mask_zeros=mask_zeros)
                    futures.append(future)

                res_stats = []
                for i, future in enumerate(as_completed(futures)):
                    res_stats.append(future.result())
                    logger.debug('Completed {} out of {} workers'
                                 .format((i + 1), len(futures)))
        else:
            msg = ('Extracting {} for {} in serial'
                   .format(self.statistics.keys(), dataset))
            logger.info(msg)
            res_stats = []
            for i, sites_slice in enumerate(slices):
                res_stats.append(self._extract_stats(
                    self.res_h5, self.statistics, dataset,
                    res_cls=self.res_cls, hsds=self._hsds,
                    time_index=self.time_index, sites_slice=sites_slice,
                    diurnal=diurnal, month=month,
                    combinations=combinations, mask_zeros=mask_zeros))
                logger.debug('Completed {} out of {} sets of sites'
                             .format((i + 1), len(slices)))
                gc.collect()
                log_mem(logger)

        res_stats = pd.concat(res_stats)

        if lat_lon_only:
            meta = self.lat_lon
        else:
            meta = self.meta

        res_stats = meta.join(res_stats.sort_index(), how='inner')

        return res_stats

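    # Usage sketch for ``compute_statistics`` (illustrative; the file path
    # and dataset name are assumptions):
    #
    #     ts = TemporalStats('/path/to/res.h5', statistics=('mean', 'std'))
    #     monthly = ts.compute_statistics('windspeed_100m', month=True,
    #                                     max_workers=4)
    #     # -> columns like 'Jan_mean', ..., 'Dec_std', indexed by site
    #     #    gid and joined with (lat, lon) coordinates
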
    def full_stats(self, dataset, sites=None, max_workers=None,
                   chunks_per_worker=5, lat_lon_only=True,
                   mask_zeros=False):
        """
        Compute stats for entire temporal extent of file

        Parameters
        ----------
        dataset : str
            Dataset to extract stats for
        sites : list | slice, optional
            Subset of sites to extract, by default None or all sites
        max_workers : None | int, optional
            Number of workers to use, if 1 run in serial, if None use all
            available cores, by default None
        chunks_per_worker : int, optional
            Number of chunks to extract on each worker, by default 5
        lat_lon_only : bool, optional
            Only append lat, lon coordinates to stats, by default True
        mask_zeros : bool
            Flag to only calculate stats when all data is > 0 (useful for
            global horizontal irradiance).

        Returns
        -------
        full_stats : pandas.DataFrame
            DataFrame of statistics for the entire temporal extent of file
        """
        full_stats = self.compute_statistics(
            dataset, sites=sites,
            max_workers=max_workers,
            chunks_per_worker=chunks_per_worker,
            lat_lon_only=lat_lon_only,
            mask_zeros=mask_zeros)

        return full_stats

    def monthly_stats(self, dataset, sites=None, max_workers=None,
                      chunks_per_worker=5, lat_lon_only=True,
                      mask_zeros=False):
        """
        Compute monthly stats

        Parameters
        ----------
        dataset : str
            Dataset to extract stats for
        sites : list | slice, optional
            Subset of sites to extract, by default None or all sites
        max_workers : None | int, optional
            Number of workers to use, if 1 run in serial, if None use all
            available cores, by default None
        chunks_per_worker : int, optional
            Number of chunks to extract on each worker, by default 5
        lat_lon_only : bool, optional
            Only append lat, lon coordinates to stats, by default True
        mask_zeros : bool
            Flag to only calculate stats when all data is > 0 (useful for
            global horizontal irradiance).

        Returns
        -------
        monthly_stats : pandas.DataFrame
            DataFrame of monthly statistics
        """
        monthly_stats = self.compute_statistics(
            dataset, sites=sites, month=True,
            max_workers=max_workers,
            chunks_per_worker=chunks_per_worker,
            lat_lon_only=lat_lon_only,
            mask_zeros=mask_zeros)

        return monthly_stats

    def diurnal_stats(self, dataset, sites=None, max_workers=None,
                      chunks_per_worker=5, lat_lon_only=True,
                      mask_zeros=False):
        """
        Compute diurnal stats

        Parameters
        ----------
        dataset : str
            Dataset to extract stats for
        sites : list | slice, optional
            Subset of sites to extract, by default None or all sites
        max_workers : None | int, optional
            Number of workers to use, if 1 run in serial, if None use all
            available cores, by default None
        chunks_per_worker : int, optional
            Number of chunks to extract on each worker, by default 5
        lat_lon_only : bool, optional
            Only append lat, lon coordinates to stats, by default True
        mask_zeros : bool
            Flag to only calculate stats when all data is > 0 (useful for
            global horizontal irradiance).

        Returns
        -------
        diurnal_stats : pandas.DataFrame
            DataFrame of diurnal statistics
        """
        diurnal_stats = self.compute_statistics(
            dataset, sites=sites, diurnal=True,
            max_workers=max_workers,
            chunks_per_worker=chunks_per_worker,
            lat_lon_only=lat_lon_only,
            mask_zeros=mask_zeros)

        return diurnal_stats

    def monthly_diurnal_stats(self, dataset, sites=None, max_workers=None,
                              chunks_per_worker=5, lat_lon_only=True,
                              mask_zeros=False):
        """
        Compute monthly-diurnal stats

        Parameters
        ----------
        dataset : str
            Dataset to extract stats for
        sites : list | slice, optional
            Subset of sites to extract, by default None or all sites
        max_workers : None | int, optional
            Number of workers to use, if 1 run in serial, if None use all
            available cores, by default None
        chunks_per_worker : int, optional
            Number of chunks to extract on each worker, by default 5
        lat_lon_only : bool, optional
            Only append lat, lon coordinates to stats, by default True
        mask_zeros : bool
            Flag to only calculate stats when all data is > 0 (useful for
            global horizontal irradiance).

        Returns
        -------
        monthly_diurnal_stats : pandas.DataFrame
            DataFrame of monthly-diurnal statistics
        """
        diurnal_stats = self.compute_statistics(
            dataset, sites=sites, month=True, diurnal=True,
            max_workers=max_workers,
            chunks_per_worker=chunks_per_worker,
            lat_lon_only=lat_lon_only,
            mask_zeros=mask_zeros)

        return diurnal_stats

    def all_stats(self, dataset, sites=None, max_workers=None,
                  chunks_per_worker=5, lat_lon_only=True, mask_zeros=False):
        """
        Compute annual, monthly, monthly-diurnal, and diurnal stats

        Parameters
        ----------
        dataset : str
            Dataset to extract stats for
        sites : list | slice, optional
            Subset of sites to extract, by default None or all sites
        max_workers : None | int, optional
            Number of workers to use, if 1 run in serial, if None use all
            available cores, by default None
        chunks_per_worker : int, optional
            Number of chunks to extract on each worker, by default 5
        lat_lon_only : bool, optional
            Only append lat, lon coordinates to stats, by default True
        mask_zeros : bool
            Flag to only calculate stats when all data is > 0 (useful for
            global horizontal irradiance).

        Returns
        -------
        all_diurnal_stats : pandas.DataFrame
            DataFrame of temporal statistics
        """
        all_stats = self.compute_statistics(
            dataset, sites=sites, month=True, diurnal=True,
            combinations=True,
            max_workers=max_workers,
            chunks_per_worker=chunks_per_worker,
            lat_lon_only=lat_lon_only,
            mask_zeros=mask_zeros)

        return all_stats

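    # The wrappers above only differ in which temporal groupings they
    # request (illustrative calls; the dataset name is an assumption):
    #
    #     ts = TemporalStats('/path/to/res.h5')
    #     ts.monthly_stats('windspeed_100m')          # 12 monthly values
    #     ts.diurnal_stats('windspeed_100m')          # 24 hourly values
    #     ts.monthly_diurnal_stats('windspeed_100m')  # 12 x 24 month-hours
    #     ts.all_stats('windspeed_100m')              # all of the above,
    #                                                 # plus full-extent stats
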
    def save_stats(self, res_stats, out_path):
        """
        Save statistics to disk

        Parameters
        ----------
        res_stats : pandas.DataFrame
            Table of statistics to save
        out_path : str
            Directory, .csv, or .json path to save statistics to
        """
        if os.path.isdir(out_path):
            out_fpath = os.path.splitext(os.path.basename(self.res_h5))[0]
            out_fpath = os.path.join(out_path, out_fpath + '.csv')
        else:
            out_fpath = out_path

        # Drop any wild card values
        out_fpath = out_fpath.replace('*', '')

        out_dir = os.path.dirname(out_fpath)
        create_dirs(out_dir)

        logger.info('Writing temporal statistics to {}'.format(out_fpath))
        if out_fpath.endswith('.csv'):
            res_stats.to_csv(out_fpath)
        elif out_fpath.endswith('.json'):
            res_stats.to_json(out_fpath)
        else:
            msg = ("Cannot save statistics, expecting a directory, .csv, "
                   "or .json path, but got: {}".format(out_path))
            logger.error(msg)
            raise OSError(msg)

    @classmethod
    def run(cls, res_h5, dataset, sites=None, statistics='mean',
            diurnal=False, month=False, combinations=False,
            res_cls=Resource, hsds=False, max_workers=None,
            chunks_per_worker=5, lat_lon_only=True, mask_zeros=False,
            out_path=None):
        """
        Compute temporal stats, by default full temporal extent stats

        Parameters
        ----------
        res_h5 : str
            Path to resource h5 file(s)
        dataset : str
            Dataset to extract stats for
        sites : list | slice, optional
            Subset of sites to extract, by default None or all sites
        statistics : str | tuple | dict, optional
            Statistics to extract, either a key or tuple of keys in
            cls.STATS, or a dictionary of the form
            {'stat_name': {'func': *, 'kwargs': {**}}}, by default 'mean'
        diurnal : bool, optional
            Extract diurnal stats, by default False
        month : bool, optional
            Extract monthly stats, by default False
        combinations : bool, optional
            Extract all combinations of temporal stats, by default False
        res_cls : Class, optional
            Resource class to use to access res_h5, by default Resource
        hsds : bool, optional
            Boolean flag to use h5pyd to handle .h5 'files' hosted on AWS
            behind HSDS, by default False
        max_workers : None | int, optional
            Number of workers to use, if 1 run in serial, if None use all
            available cores, by default None
        chunks_per_worker : int, optional
            Number of chunks to extract on each worker, by default 5
        lat_lon_only : bool, optional
            Only append lat, lon coordinates to stats, by default True
        mask_zeros : bool
            Flag to only calculate stats when all data is > 0 (useful for
            global horizontal irradiance).
        out_path : str, optional
            Directory, .csv, or .json path to save statistics to,
            by default None

        Returns
        -------
        out_stats : pandas.DataFrame
            DataFrame of resource statistics
        """
        logger.info('Computing temporal stats for {} in {}'
                    .format(dataset, res_h5))
        logger.debug('Computing {} using:'
                     '\n-diurnal={}'
                     '\n-month={}'
                     '\n-combinations={}'
                     '\n-max workers={}'
                     '\n-chunks per worker={}'
                     '\n-output lat lons only={}'
                     .format(statistics, diurnal, month, combinations,
                             max_workers, chunks_per_worker, lat_lon_only))
        res_stats = cls(res_h5, statistics=statistics, res_cls=res_cls,
                        hsds=hsds)
        out_stats = res_stats.compute_statistics(
            dataset, sites=sites,
            diurnal=diurnal, month=month, combinations=combinations,
            max_workers=max_workers, chunks_per_worker=chunks_per_worker,
            lat_lon_only=lat_lon_only, mask_zeros=mask_zeros)
        if out_path is not None:
            res_stats.save_stats(out_stats, out_path)

        return out_stats

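    # One-shot usage sketch for ``run`` (illustrative; paths and dataset
    # name are assumptions). Equivalent to instantiating the class and
    # calling ``compute_statistics``, optionally saving the result:
    #
    #     stats = TemporalStats.run('/path/to/res.h5', 'windspeed_100m',
    #                               statistics=('mean', 'median'),
    #                               month=True, max_workers=4,
    #                               out_path='/path/to/stats.csv')
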
    @classmethod
    def monthly(cls, res_h5, dataset, sites=None, statistics='mean',
                res_cls=Resource, hsds=False, max_workers=None,
                chunks_per_worker=5, lat_lon_only=True, mask_zeros=False,
                out_path=None):
        """
        Compute monthly stats

        Parameters
        ----------
        res_h5 : str
            Path to resource h5 file(s)
        dataset : str
            Dataset to extract stats for
        sites : list | slice, optional
            Subset of sites to extract, by default None or all sites
        statistics : str | tuple | dict, optional
            Statistics to extract, either a key or tuple of keys in
            cls.STATS, or a dictionary of the form
            {'stat_name': {'func': *, 'kwargs': {**}}}, by default 'mean'
        res_cls : Class, optional
            Resource class to use to access res_h5, by default Resource
        hsds : bool, optional
            Boolean flag to use h5pyd to handle .h5 'files' hosted on AWS
            behind HSDS, by default False
        max_workers : None | int, optional
            Number of workers to use, if 1 run in serial, if None use all
            available cores, by default None
        chunks_per_worker : int, optional
            Number of chunks to extract on each worker, by default 5
        lat_lon_only : bool, optional
            Only append lat, lon coordinates to stats, by default True
        mask_zeros : bool
            Flag to only calculate stats when all data is > 0 (useful for
            global horizontal irradiance).
        out_path : str, optional
            Directory, .csv, or .json path to save statistics to,
            by default None

        Returns
        -------
        monthly_stats : pandas.DataFrame
            DataFrame of monthly statistics
        """
        monthly_stats = cls.run(res_h5, dataset, sites=sites,
                                statistics=statistics, diurnal=False,
                                month=True, combinations=False,
                                res_cls=res_cls, hsds=hsds,
                                max_workers=max_workers,
                                chunks_per_worker=chunks_per_worker,
                                lat_lon_only=lat_lon_only,
                                mask_zeros=mask_zeros,
                                out_path=out_path)

        return monthly_stats

    @classmethod
    def diurnal(cls, res_h5, dataset, sites=None, statistics='mean',
                res_cls=Resource, hsds=False, max_workers=None,
                chunks_per_worker=5, lat_lon_only=True, mask_zeros=False,
                out_path=None):
        """
        Compute diurnal stats

        Parameters
        ----------
        res_h5 : str
            Path to resource h5 file(s)
        dataset : str
            Dataset to extract stats for
        sites : list | slice, optional
            Subset of sites to extract, by default None or all sites
        statistics : str | tuple | dict, optional
            Statistics to extract, either a key or tuple of keys in
            cls.STATS, or a dictionary of the form
            {'stat_name': {'func': *, 'kwargs': {**}}}, by default 'mean'
        res_cls : Class, optional
            Resource class to use to access res_h5, by default Resource
        hsds : bool, optional
            Boolean flag to use h5pyd to handle .h5 'files' hosted on AWS
            behind HSDS, by default False
        max_workers : None | int, optional
            Number of workers to use, if 1 run in serial, if None use all
            available cores, by default None
        chunks_per_worker : int, optional
            Number of chunks to extract on each worker, by default 5
        lat_lon_only : bool, optional
            Only append lat, lon coordinates to stats, by default True
        mask_zeros : bool
            Flag to only calculate stats when all data is > 0 (useful for
            global horizontal irradiance).
        out_path : str, optional
            Directory, .csv, or .json path to save statistics to,
            by default None

        Returns
        -------
        diurnal_stats : pandas.DataFrame
            DataFrame of diurnal statistics
        """
        diurnal_stats = cls.run(res_h5, dataset, sites=sites,
                                statistics=statistics, diurnal=True,
                                month=False, combinations=False,
                                res_cls=res_cls, hsds=hsds,
                                max_workers=max_workers,
                                chunks_per_worker=chunks_per_worker,
                                lat_lon_only=lat_lon_only,
                                mask_zeros=mask_zeros,
                                out_path=out_path)

        return diurnal_stats

    @classmethod
    def monthly_diurnal(cls, res_h5, dataset, sites=None,
                        statistics='mean', res_cls=Resource, hsds=False,
                        max_workers=None, chunks_per_worker=5,
                        lat_lon_only=True, mask_zeros=False,
                        out_path=None):
        """
        Compute monthly-diurnal stats

        Parameters
        ----------
        res_h5 : str
            Path to resource h5 file(s)
        dataset : str
            Dataset to extract stats for
        sites : list | slice, optional
            Subset of sites to extract, by default None or all sites
        statistics : str | tuple | dict, optional
            Statistics to extract, either a key or tuple of keys in
            cls.STATS, or a dictionary of the form
            {'stat_name': {'func': *, 'kwargs': {**}}}, by default 'mean'
        res_cls : Class, optional
            Resource class to use to access res_h5, by default Resource
        hsds : bool, optional
            Boolean flag to use h5pyd to handle .h5 'files' hosted on AWS
            behind HSDS, by default False
        max_workers : None | int, optional
            Number of workers to use, if 1 run in serial, if None use all
            available cores, by default None
        chunks_per_worker : int, optional
            Number of chunks to extract on each worker, by default 5
        lat_lon_only : bool, optional
            Only append lat, lon coordinates to stats, by default True
        mask_zeros : bool
            Flag to only calculate stats when all data is > 0 (useful for
            global horizontal irradiance).
        out_path : str, optional
            Directory, .csv, or .json path to save statistics to,
            by default None

        Returns
        -------
        monthly_diurnal_stats : pandas.DataFrame
            DataFrame of monthly-diurnal statistics
        """
        monthly_diurnal_stats = cls.run(
            res_h5, dataset, sites=sites,
            statistics=statistics, diurnal=True,
            month=True, combinations=False,
            res_cls=res_cls, hsds=hsds,
            max_workers=max_workers,
            chunks_per_worker=chunks_per_worker,
            lat_lon_only=lat_lon_only,
            mask_zeros=mask_zeros,
            out_path=out_path)

        return monthly_diurnal_stats

    @classmethod
    def all(cls, res_h5, dataset, sites=None, statistics='mean',
            res_cls=Resource, hsds=False, max_workers=None,
            chunks_per_worker=5, lat_lon_only=True, mask_zeros=False,
            out_path=None):
        """
        Compute annual, monthly, monthly-diurnal, and diurnal stats

        Parameters
        ----------
        res_h5 : str
            Path to resource h5 file(s)
        dataset : str
            Dataset to extract stats for
        sites : list | slice, optional
            Subset of sites to extract, by default None or all sites
        statistics : str | tuple | dict, optional
            Statistics to extract, either a key or tuple of keys in
            cls.STATS, or a dictionary of the form
            {'stat_name': {'func': *, 'kwargs': {**}}}, by default 'mean'
        res_cls : Class, optional
            Resource class to use to access res_h5, by default Resource
        hsds : bool, optional
            Boolean flag to use h5pyd to handle .h5 'files' hosted on AWS
            behind HSDS, by default False
        max_workers : None | int, optional
            Number of workers to use, if 1 run in serial, if None use all
            available cores, by default None
        chunks_per_worker : int, optional
            Number of chunks to extract on each worker, by default 5
        lat_lon_only : bool, optional
            Only append lat, lon coordinates to stats, by default True
        mask_zeros : bool
            Flag to only calculate stats when all data is > 0 (useful for
            global horizontal irradiance).
        out_path : str, optional
            Directory, .csv, or .json path to save statistics to,
            by default None

        Returns
        -------
        all_stats : pandas.DataFrame
            DataFrame of temporal statistics
        """
        all_stats = cls.run(res_h5, dataset, sites=sites,
                            statistics=statistics, diurnal=True,
                            month=True, combinations=True,
                            res_cls=res_cls, hsds=hsds,
                            max_workers=max_workers,
                            chunks_per_worker=chunks_per_worker,
                            lat_lon_only=lat_lon_only,
                            out_path=out_path,
                            mask_zeros=mask_zeros)

        return all_stats