# Source code for reVX.hybrid_stats.hybrid_stats

# -*- coding: utf-8 -*-
"""
Module to compute hybrid solar-wind generation correlations
"""
from concurrent.futures import as_completed
import gc
import logging
import numpy as np
import os
import pandas as pd
from scipy.spatial import cKDTree
from scipy.stats import pearsonr, spearmanr, kendalltau

from reVX.utilities.utilities import log_versions
from rex.resource import Resource
from rex.utilities.execution import SpawnProcessPool
from rex.utilities.loggers import log_mem
from rex.utilities.utilities import (get_lat_lon_cols, roll_timeseries,
                                     parse_year)

logger = logging.getLogger(__name__)


def pearson_correlation(solar_ts, wind_ts):
    """
    Compute the Pearson correlation coefficient between the given solar
    and wind timeseries data. Return just the correlation coefficient.

    Parameters
    ----------
    solar_ts : ndarray
        Solar time-series vector for a single site
    wind_ts : ndarray
        Wind time-series vector for a single site

    Returns
    -------
    float
        Pearson correlation coefficient
    """
    # pearsonr returns (coefficient, p-value); only the coefficient is kept
    return pearsonr(solar_ts, wind_ts)[0]


def spearman_correlation(solar_ts, wind_ts):
    """
    Compute the Spearman correlation coefficient between the given solar
    and wind timeseries data. Return just the correlation coefficient.

    Parameters
    ----------
    solar_ts : ndarray
        Solar time-series vector for a single site
    wind_ts : ndarray
        Wind time-series vector for a single site

    Returns
    -------
    float
        Spearman correlation coefficient
    """
    # spearmanr returns (coefficient, p-value); only the coefficient is kept
    return spearmanr(solar_ts, wind_ts)[0]


def kendall_tau(solar_ts, wind_ts):
    """
    Compute Kendall's tau correlation between the given solar
    and wind timeseries data. Return just the correlation coefficient.

    Parameters
    ----------
    solar_ts : ndarray
        Solar time-series vector for a single site
    wind_ts : ndarray
        Wind time-series vector for a single site

    Returns
    -------
    float
        Kendall's tau
    """
    # kendalltau returns (tau, p-value); only tau is kept
    return kendalltau(solar_ts, wind_ts)[0]


class HybridStats:
    """
    Compute comparative correlations for co-located wind and solar generation
    """
    STATS = {'pearson': {'func': pearson_correlation},
             'spearman': {'func': spearman_correlation},
             'kendall': {'func': kendall_tau}}

    def __init__(self, solar_h5, wind_h5, statistics='pearson',
                 res_cls=Resource, year=None):
        """
        Parameters
        ----------
        solar_h5 : str
            Path to solar h5 file(s)
        wind_h5 : str
            Path to wind h5 file(s)
        statistics : str | tuple | dict, optional
            Statistics to extract, either a key or tuple of keys in
            cls.STATS, or a dictionary of the form
            {'stat_name': {'func': *, 'kwargs': {**}}},
            by default 'pearson'
        res_cls : Class, optional
            Resource class to use to access res_h5, by default Resource
        year : str | int, optional
            Year to extract time-index for if running on a multi-year file,
            by default None
        """
        log_versions(logger)
        self._solar_h5, self._wind_h5 = solar_h5, wind_h5
        self._res_cls = res_cls

        # Assign through the property so the request is validated and
        # expanded into the {'name': {'func': ...}} form
        self._stats = None
        self.statistics = statistics

        # The pre-flight check opens both files; it needs the paths and
        # resource class assigned above
        (self._meta, self._time_index, self._solar_time_slice,
         self._wind_time_slice) = self._pre_flight_check(year=year)

    def __repr__(self):
        msg = ('Computing {} between {} and {}'
               .format(list(self.statistics), self.solar_h5, self.wind_h5))

        return msg

    @property
    def solar_h5(self):
        """
        Path to solar h5 file(s)

        Returns
        -------
        str
        """
        return self._solar_h5

    @property
    def wind_h5(self):
        """
        Path to wind h5 file(s)

        Returns
        -------
        str
        """
        return self._wind_h5

    @property
    def statistics(self):
        """
        Dictionary of statistic functions/kwargs to run

        Returns
        -------
        dict
        """
        return self._stats

    @statistics.setter
    def statistics(self, statistics):
        """
         Statistics to extract, either a key or tuple of keys in
        cls.STATS, or a dictionary of the form
        {'stat_name': {'func': *, 'kwargs: {**}}}

        Parameters
        ----------
        statistics : dict
        """
        if isinstance(statistics, str):
            statistics = (statistics, )

        if isinstance(statistics, (tuple, list)):
            statistics = {s: self.STATS[s] for s in statistics}

        for stat in statistics.values():
            msg = 'A "func"(tion) must be provided for each statistic'
            assert 'func' in stat, msg
            if 'kwargs' in stat:
                msg = 'statistic function kwargs must be a dictionary '
                assert isinstance(stat['kwargs'], dict), msg

        self._stats = statistics

    @property
    def res_cls(self):
        """
        Resource class to use to access res_h5

        Returns
        -------
        Class
        """
        return self._res_cls

    @property
    def time_index(self):
        """
        Resource Datetimes

        Returns
        -------
        pandas.DatetimeIndex
        """
        return self._time_index

    @property
    def meta(self):
        """
        Resource meta-data table

        Returns
        -------
        pandas.DataFrame
        """
        return self._meta

    @property
    def lat_lon(self):
        """
        Latitude and longitude columns of the meta data table

        Returns
        -------
        pandas.DataFrame
        """
        # The coordinate column names are looked up from the table itself
        return self.meta[get_lat_lon_cols(self.meta)]

    @staticmethod
    def _clean_meta(meta, tech, drop=False):
        """
        Clean up input meta data:
        - rename and move index to table
        - rename capacity and any mean_* columns
        - drop all but capacity, gid and mean columns if `drop=True`

        Parameters
        ----------
        meta : pandas.DataFrame
            Sites meta data table for given technology
        tech : str
            Technology of meta data table, either 'solar' or 'wind'

        Returns
        -------
        meta : pandas.DataFrame
            updated site meta data table
        """
        meta.index.name = f"{tech}_gid"
        cols = {}
        for c in meta:
            if c.startswith('cap'):
                cols[c] = f'{tech}_cap'
            elif c.startswith('mean'):
                cols[c] = f'{tech}_res_gid'

        meta = meta.rename(columns=cols)
        if drop:
            meta = meta[cols.values()]

        return meta.reset_index()

    @classmethod
    def _map_sites(cls, solar_meta, wind_meta):
        """
        Map solar to wind sites, keeping the full meta data of whichever
        table has more sites. Each site of the larger table is matched to
        its nearest site in the smaller table with a cKDTree.

        Parameters
        ----------
        solar_meta : pandas.DataFrame
            Solar sites meta data table
        wind_meta : pandas.DataFrame
            Wind sites meta data table

        Returns
        -------
        meta : pandas.DataFrame
            Meta data table mapping the finer resolution data to the coarser
        """
        # Coordinates are read first: cleaning with drop=True removes them
        solar_coords = solar_meta[get_lat_lon_cols(solar_meta)].values
        wind_coords = wind_meta[get_lat_lon_cols(wind_meta)].values

        if len(solar_meta) < len(wind_meta):
            # Fewer solar sites: solar is the coarse table
            coarse = cls._clean_meta(solar_meta, 'solar', drop=True)
            fine = cls._clean_meta(wind_meta, 'wind')
            coarse_coords, fine_coords = solar_coords, wind_coords
        else:
            fine = cls._clean_meta(solar_meta, 'solar')
            coarse = cls._clean_meta(wind_meta, 'wind', drop=True)
            coarse_coords, fine_coords = wind_coords, solar_coords

        # pylint: disable=not-callable
        tree = cKDTree(coarse_coords)
        dist, pos = tree.query(fine_coords)

        # Discard fine sites whose nearest coarse site is further than
        # 1.5x the median nearest-neighbour distance
        keep = dist <= np.median(dist) * 1.5
        fine = fine.loc[keep].reset_index(drop=True)
        matched = coarse.iloc[pos[keep]].reset_index(drop=True)

        return pd.concat((fine, matched), axis=1)

    @staticmethod
    def _check_time_index(solar_time_index, wind_time_index):
        """
        Compare solar and wind time indexes. Determine coincident time_index
        and requisite time_slices to reduce solar and wind dataset coincident

        Parameters
        ----------
        solar_time_index : pandas.DatatimeIndex
            Datetime index for solar data, respresents the temporal resolution
            of the solar datasets
        wind_time_index : pands.DatatimeINdex
            Datetime index for wind data, respresents the temporal resolution
            of the wind datasets

        Returns
        -------
        time_index : pandas.DatatimeIndex
            Coincident datetime index between solar and wind datasets,
            i.e. datetime steps that are in both wind and solar data
        solar_time_slice : slice | ndarray
            slice or boolean index of the solar timesteps that are in the
            coincident time_index
        wind_time_slice : slice | ndarray
            slice or boolean index of the wind timesteps that are in the
            coincident time_index
        """
        solar_n = len(solar_time_index)
        solar_time_slice = slice(None)
        wind_n = len(wind_time_index)
        wind_time_slice = slice(None)
        if solar_n < wind_n:
            wind_time_slice = wind_time_index.isin(solar_time_index)
            time_index = solar_time_index
            if not wind_time_slice.all():
                msg = ("The following timesteps are not in both wind and "
                       "solar dataset:\n{}"
                       .format(wind_time_index[wind_time_slice]))
                logger.error(msg)
                raise RuntimeError(msg)
        elif wind_n < solar_n:
            solar_time_slice = solar_time_index.isin(wind_time_index)
            time_index = wind_time_slice
            if not solar_time_slice.all():
                msg = ("The following timesteps are not in both wind and "
                       "solar dataset:\n{}"
                       .format(solar_time_index[solar_time_slice]))
                logger.error(msg)
                raise RuntimeError(msg)
        else:
            time_index = solar_time_index

        return time_index, solar_time_slice, wind_time_slice

    @staticmethod
    def _groupby_data(data, annual=True, diurnal=False, doy=False,
                      month=False):
        """
        Groupby data by month and/or hour

        Parameters
        ----------
        data : pandas.DataFrame
            DataFrame of data where index is time_index, columns are sites
        annual : bool, optional
            Extract annaul stats, by default True
        diurnal : bool, optional
            Extract diurnal stats, by default False
        doy : bool, optional
            Extract doy-of-year stats, by default False
        month : bool, optional
            Extract monthly stats, by default False

        Returns
        -------
        data : pandas.Groupby
            Input DataFrame grouped by month and or hour if requested
        """
        groupby = []
        if annual:
            groupby.append(data.index.year)

        if month:
            groupby.append(data.index.month)

        if doy:
            groupby.append(data.index.dayofyear)

        if diurnal:
            groupby.append(data.index.hour)

        m = "Data must be groupby year, month, day of year or dirunal (hour)!"
        assert groupby, m

        data = data.groupby(groupby)

        return data

    @staticmethod
    def _format_grp_names(grp_names):
        """
        Format groupby index values

        Parameters
        ----------
        grp_names : list
            Group by index values, these correspond to each unique group in
            the groupby

        Returns
        -------
        out : ndarray
            2D array of grp index values properly formatted as strings
        """
        month_map = {1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May',
                     6: 'Jun', 7: 'Jul', 8: 'Aug', 9: 'Sep', 10: 'Oct',
                     11: 'Nov', 12: 'Dec'}

        # pylint: disable=unnecessary-lambda
        # pylint: disable=unnecessary-lambda-assignment
        year = lambda s: "{}".format(s)
        month = lambda s: "{}".format(month_map[s])
        hour = lambda s: "{:02d}:00UTC".format(s)

        grp_names = np.array(grp_names).T
        if len(grp_names.shape) == 1:
            grp_names = np.expand_dims(grp_names, 0)

        out = []
        for grp_i in grp_names:  # pylint: disable=not-an-iterable
            grp_max = grp_i.max()
            if grp_max <= 12:
                out.append(list(map(month, grp_i)))
            elif grp_max <= 23:
                out.append(list(map(hour, grp_i)))
            else:
                out.append(list(map(year, grp_i)))

        return np.array(out).T

    @classmethod
    def _create_names(cls, index, stats):
        """
        Generate statistics names

        Parameters
        ----------
        index : pandas.Index | pandas.MultiIndex
            Temporal index, either month, hour, or (month, hour)
        stats : list
            Statistics to be computed

        Returns
        -------
        columns_map : dict
            Dictionary of column names to use for each statistic
        columns : list
            Column names to use
        """
        column_names = cls._format_grp_names(index)

        columns_map = {}
        columns = []
        for s in stats:
            cols = {i: '{}_{}'.format('-'.join(n), s) for i, n
                    in zip(index, column_names)}
            columns_map[s] = cols
            columns.extend(list(cols.values()))

        return columns_map, columns

    @classmethod
    def _compute_correlations(cls, solar_data, wind_data, statistics,
                              annual=True, diurnal=False, doy=False,
                              month=False):
        """
        Compute desired correlations for desired time intervals between
        solar and wind data

        Parameters
        ----------
        solar_data : pandas.DataFrame
            DataFrame of solar data. Index is time_index, columns are sites
        wind_data : pandas.DataFrame
            DataFrame of wind data. Index is time_index, columns are sites
        statistics : dict
            Dictionary of statistic functions/kwargs to run
        annual : bool, optional,
            Extract stats annualy. To extract multi-year monthly stats set
            `annual=False` and month=True`, by default True
        diurnal : bool, optional
            Extract diurnal stats, by default False
        doy : bool, optional
            Extract doy-of-year stats, by default False
        month : bool, optional
            Extract monthly stats, by default False

        Returns
        -------
        out_stats : pandas.DataFrame
            DataFrame of desired statistics at desired time intervals
        """
        sites = solar_data.columns.values
        solar_data = cls._groupby_data(solar_data, annual=annual,
                                       diurnal=diurnal, doy=doy,
                                       month=month)
        wind_data = cls._groupby_data(wind_data, annual=annual,
                                      diurnal=diurnal, doy=doy,
                                      month=month)

        cols_map, col_names = cls._create_names(list(solar_data.groups),
                                                list(statistics))
        out_stats = pd.DataFrame(columns=col_names, index=sites,
                                 dtype=np.float32)
        for grp_name, solar_grp in solar_data:

            print(grp_name, wind_data)
            grp_name = grp_name[0] if len(grp_name) == 1 else grp_name
            wind_grp = wind_data.get_group(grp_name)
            msg = ('solar and wind data shapes do not match! {} != {}'
                   .format(solar_grp.shape, wind_grp.shape))
            assert solar_grp.shape == wind_grp.shape, msg
            for i in sites:
                solar_ts = solar_grp.iloc[:, i]
                wind_ts = wind_grp.iloc[:, i]
                for name, stat in statistics.items():
                    col = cols_map[name][grp_name]
                    func = stat['func']
                    kwargs = stat.get('kwargs', {})
                    out_stats.at[i, col] = func(solar_ts, wind_ts, **kwargs)

        return out_stats

    @classmethod
    def _extract_stats(cls, solar_h5, wind_h5, dataset, sites,
                       solar_time_slice, wind_time_slice, statistics=None,
                       time_index=None, res_cls=Resource, annual=True,
                       diurnal=False, doy=False, month=False,
                       combinations=False):
        """
        Extract stats for given dataset, sites, and temporal extent

        Parameters
        ----------
        solar_h5 : str
            Path to solar h5 file(s)
        wind_h5 : str
            Path to wind h5 file(s)
        dataset : tuple
            Datasets to compare, in the form: (solar_dataset, wind_dataset)
        sites : pandas.DataFrame
            Subset of meta DataFrame with sites to extract
        solar_time_slice : slice | ndarray
            slice or boolean index of the solar timesteps that are in the
            coincident time_index
        wind_time_slice : slice | ndarray
            slice or boolean index of the wind timesteps that are in the
            coincident time_index
        statistics : dict, optional
            Dictionary of statistic functions/kwargs to run, if None default
            to: {'pearson': {'func': pearson_correlation}},
            by default None
        time_index : pandas.DatetimeIndex, optional
            Timeseries DatetimeIndex, if None extract from wind_h5 and
            reduce it with wind_time_slice, by default None
        res_cls : Class, optional
            Resource class to use to access res_h5, by default Resource
        annual : bool, optional,
            Extract stats annually. To extract multi-year monthly stats set
            `annual=False` and `month=True`, by default True
        diurnal : bool, optional
            Extract diurnal stats, by default False
        doy : bool, optional
            Extract day-of-year stats, by default False
        month : bool, optional
            Extract monthly stats, by default False
        combinations : bool, optional
            Extract all combinations of temporal stats, by default False

        Returns
        -------
        out_stats : pandas.DataFrame
            DataFrame of desired statistics at desired time intervals
        """
        if statistics is None:
            statistics = {'pearson': {'func': pearson_correlation}}

        wind_sites = sites['wind_gid'].values
        solar_sites = sites['solar_gid'].values

        solar_dataset, wind_dataset = dataset
        with res_cls(wind_h5) as f:
            if time_index is None:
                # Apply the same slice used on the data below, otherwise
                # the index is longer than the rows it is meant to label
                time_index = f.time_index[wind_time_slice]

            wind_data = f[wind_dataset, wind_time_slice, wind_sites]
            wind_data = pd.DataFrame(wind_data, index=time_index)

        with res_cls(solar_h5) as f:
            solar_data = f[solar_dataset, solar_time_slice, solar_sites]
            solar_data = pd.DataFrame(solar_data, index=time_index)

        if combinations:
            # Annual stats are always included; each requested interval
            # adds its multi-year (annual=False) stats
            groupings = [{}]
            if month:
                groupings.append({'annual': False, 'month': True})

            if doy:
                groupings.append({'annual': False, 'doy': True})

            if diurnal:
                groupings.append({'annual': False, 'diurnal': True})

            if month and diurnal:
                groupings.append({'annual': False, 'month': True,
                                  'diurnal': True})

            out_stats = [cls._compute_correlations(solar_data, wind_data,
                                                   statistics, **grouping)
                         for grouping in groupings]
            out_stats = pd.concat(out_stats, axis=1)
        else:
            out_stats = cls._compute_correlations(solar_data,
                                                  wind_data,
                                                  statistics,
                                                  annual=annual,
                                                  diurnal=diurnal,
                                                  doy=doy,
                                                  month=month)

        # Re-label rows with the index of the requested sites table
        out_stats.index = sites.index.values
        out_stats.index.name = 'gid'

        return out_stats

[docs]    @staticmethod
    def save_stats(out_stats, out_fpath):
        """
        Save correlations to disk

        Parameters
        ----------
        out_stats : pandas.DataFrame
            Table of correlations to save
        out_path : str
            .csv, or .json path to save statistics too
        """
        logger.info('Saving hybrid stats to {}'.format(out_fpath))
        if out_fpath.endswith('.csv'):
            out_stats.to_csv(out_fpath)
        elif out_fpath.endswith('.json'):
            out_stats.to_json(out_fpath)
        else:
            msg = ("Cannot save statistics, expecting a .csv, or "
                   ".json path, but got: {}".format(out_fpath))
            logger.error(msg)
            raise OSError(msg)

    @staticmethod
    def _check_dataset(dataset):
        """
        Check user provided dataset for proper format

        Parameters
        ----------
        dataset : tuple | str
            Dataset to compare, if a string, extract the same
            dataset for both with and solar, other wise a tuple of the form:
            (solar_dataset, wind_dataset)

        Returns
        -------
        dataset : tuple
            Datasets to compare, in the form: (solar_dataset, wind_dataset)
        """
        if isinstance(dataset, str):
            dataset = (dataset, dataset)
        elif isinstance(dataset, (tuple, list)):
            if len(dataset) < 2:
                msg = ("Must supply a solar and wind dataset in the form: "
                       "(solar, wind)")
                logger.error(msg)
                raise ValueError(msg)

        return dataset

    @staticmethod
    def _parse_meta_time_index(h5_path, res_cls=Resource, year=None):
        """
        Parse meta data table and time_index from .h5 file. If the file has
        no 'time_index' dataset and 'year' is provided, extract the
        time_index for the given year.

        Parameters
        ----------
        h5_path : str
            Path to .h5 file to extract meta and time_index
        res_cls : Class, optional
            Resource class to use to access res_h5, by default Resource
        year : str | int, optional
            Year to extract time-index for if running on a multi-year file,
            by default None

        Returns
        -------
        meta : pandas.DataFrame
            Site meta data table
        time_index: pandas.DatetimeIndex
            Datetime Index

        Raises
        ------
        ValueError
            If the file has no 'time_index' dataset and no year is given
        """
        with res_cls(h5_path) as f:
            meta = f.meta
            if 'time_index' in f:
                time_index = f.time_index
            elif year is not None:
                # Multi-year files store one 'time_index-<year>' per year
                time_index = f[f'time_index-{year}']
            else:
                ti_dsets = [dset for dset in f.datasets
                            if dset.startswith('time_index')]
                msg = ("'time_index' is not available in {}. The following "
                       "potential annual time_index are available: {}. Please "
                       "specify a 'year' to use the 'time_index' for a "
                       "specific year.".format(h5_path, ti_dsets))
                # Log the same message that is raised
                logger.error(msg)
                raise ValueError(msg)

        return meta, time_index

    def _pre_flight_check(self, year=None):
        """
        Read the meta data and time index of the solar and wind files,
        reconcile the two time indexes and map the sites to each other

        Parameters
        ----------
        year : str | int, optional
            Year to extract time-index for if running on a multi-year file,
            by default None

        Returns
        -------
        meta : pandas.DataFrame
            Meta data table mapping the finer resolution data to the coarser
        time_index : pandas.DatetimeIndex
            Coincident datetime index between solar and wind datasets,
            i.e. datetime steps that are in both wind and solar data
        solar_time_slice : slice | ndarray
            slice or boolean index of the solar timesteps that are in the
            coincident time_index
        wind_time_slice : slice | ndarray
            slice or boolean index of the wind timesteps that are in the
            coincident time_index
        """
        parse_kwargs = {'res_cls': self._res_cls, 'year': year}
        solar_meta, solar_ti = self._parse_meta_time_index(self._solar_h5,
                                                           **parse_kwargs)
        wind_meta, wind_ti = self._parse_meta_time_index(self._wind_h5,
                                                         **parse_kwargs)

        # (time_index, solar_time_slice, wind_time_slice)
        time_info = self._check_time_index(solar_ti, wind_ti)
        meta = self._map_sites(solar_meta, wind_meta)

        return (meta, *time_info)

    def _compute_stats(self, dataset, max_workers=None, sites_per_worker=1000,
                       lat_lon_only=True, extract_stats_kwargs=None):
        """
        Compute correlations

        Parameters
        ----------
        dataset : tuple | str
            Dataset to compare, if a string, extract the same
            dataset for both wind and solar, otherwise a tuple of the form:
            (solar_dataset, wind_dataset)
        max_workers : None | int, optional
            Number of workers to use, if 1 run in serial, if None use all
            available cores, by default None
        sites_per_worker : int, optional
            Number of sites to extract on each worker, by default 1000
        lat_lon_only : bool, optional
            Only append lat, lon coordinates to stats, by default True
        extract_stats_kwargs : dict, optional
            Kwargs to pass to _extract_stats method, by default None

        Returns
        -------
        out_stats : pandas.DataFrame
            DataFrame of desired correlation coefficients at desired time
            intervals
        """
        if extract_stats_kwargs is None:
            extract_stats_kwargs = {'res_cls': self.res_cls}

        dataset = self._check_dataset(dataset)

        solar_h5 = self.solar_h5
        wind_h5 = self.wind_h5
        if max_workers is None:
            # os.cpu_count() returns None when the count is undetermined
            max_workers = os.cpu_count() or 1

        meta = self.meta
        n_chunks = len(meta) // sites_per_worker
        if n_chunks:
            # Split row positions instead of the DataFrame itself:
            # np.array_split on a DataFrame relies on the deprecated
            # DataFrame.swapaxes. The chunk boundaries are the same
            chunks = np.array_split(np.arange(len(meta)), n_chunks)
            slices = [meta.iloc[rows] for rows in chunks]
        else:
            # Fewer sites than one worker's share: run them in one go
            slices = [meta]
            max_workers = 1

        if max_workers > 1:
            msg = ('Extracting {} for {} in parallel using {} workers'
                   .format(self, dataset, max_workers))
            logger.info(msg)

            loggers = [__name__, 'reVX', 'rex']
            with SpawnProcessPool(max_workers=max_workers,
                                  loggers=loggers) as exe:
                futures = []
                for sites in slices:
                    future = exe.submit(self._extract_stats,
                                        solar_h5, wind_h5, dataset, sites,
                                        self._solar_time_slice,
                                        self._wind_time_slice,
                                        **extract_stats_kwargs)
                    futures.append(future)

                out_stats = []
                for i, future in enumerate(as_completed(futures)):
                    out_stats.append(future.result())
                    logger.debug('Completed {} out of {} workers'
                                 .format((i + 1), len(futures)))
        else:
            msg = ('Extracting {} for {} in serial'
                   .format(self, dataset))
            logger.info(msg)
            out_stats = []
            for i, sites in enumerate(slices):
                out_stats.append(self._extract_stats(
                    solar_h5, wind_h5, dataset, sites,
                    self._solar_time_slice,
                    self._wind_time_slice,
                    **extract_stats_kwargs))
                logger.debug('Completed {} out of {} sets of sites'
                             .format((i + 1), len(slices)))

        gc.collect()
        log_mem(logger)
        out_stats = pd.concat(out_stats)

        if lat_lon_only:
            meta = self.lat_lon
        else:
            meta = self.meta

        # Chunks finish in arbitrary order when run in parallel, so the
        # stats are sorted before they are joined to the meta data
        out_stats = meta.join(out_stats.sort_index(), how='inner')

        return out_stats

[docs]    def compute_stats(self, dataset, annual=True, diurnal=False, doy=False,
                      month=False, combinations=False, max_workers=None,
                      sites_per_worker=1000, lat_lon_only=True):
        """
        Compute correlations

        Parameters
        ----------
        dataset : tuple | str
            Dataset to compare, if a string, extract the same
            dataset for both with and solar, other wise a tuple of the form:
            (solar_dataset, wind_dataset)
        annual : bool, optional,
            Extract stats annualy. To extract multi-year monthly stats set
            `annual=False` and month=True`, by default True
        diurnal : bool, optional
            Extract diurnal stats, by default False
        doy : bool, optional
            Extract doy-of-year stats, by default False
        month : bool, optional
            Extract monthly stats, by default False
        combinations : bool, optional
            Extract all combinations of temporal stats, by default False
        max_workers : None | int, optional
            Number of workers to use, if 1 run in serial, if None use all
            available cores, by default None
        sites_per_worker : int, optional
            Number of sites to extract on each worker, by default 1000
        lat_lon_only : bool, optional
            Only append lat, lon coordinates to stats, by default True

        Returns
        -------
        res_stats : pandas.DataFrame
            DataFrame of desired statistics at desired time intervals
        """
        kwargs = {'res_cls': self.res_cls,
                  'annual': annual,
                  'diurnal': diurnal,
                  'doy': doy,
                  'month': month,
                  'combinations': combinations,
                  'statistics': self.statistics,
                  'time_index': self.time_index
                  }

        logger.info('Computing correlations from {}'.format(dataset))
        logger.debug('- Using the following options: {}'.format(kwargs))
        out_stats = self._compute_stats(dataset, max_workers=max_workers,
                                        sites_per_worker=sites_per_worker,
                                        lat_lon_only=lat_lon_only,
                                        extract_stats_kwargs=kwargs)

        return out_stats

[docs]    @classmethod
    def run(cls, solar_h5, wind_h5, dataset, statistics='pearson', annual=True,
            diurnal=False, doy=False, month=False, combinations=False,
            res_cls=Resource, year=None, max_workers=None,
            sites_per_worker=1000, lat_lon_only=True, out_path=None):
        """
        Compute temporal stats between solar and wind time-series at desired
        temporal scales

        Parameters
        ----------
        solar_h5 : str
            Path to solar h5 file(s)
        wind_h5 : str
            Path to wind h5 file(s)
        dataset : tuple | str
            Dataset to compare, if a string, extract the same
            dataset for both with and solar, other wise a tuple of the form:
            (solar_dataset, wind_dataset)
        statistics : str | tuple | dict, optional
            Statistics to extract, either a key or tuple of keys in
            cls.STATS, or a dictionary of the form
            {'stat_name': {'func': *, 'kwargs: {**}}},
            by default 'pearson'
        annual : bool, optional,
            Extract stats annualy. To extract multi-year monthly stats set
            `annual=False` and month=True`, by default True
        diurnal : bool, optional
            Extract diurnal stats, by default False
        doy : bool, optional
            Extract doy-of-year stats, by default False
        month : bool, optional
            Extract monthly stats, by default False
        combinations : bool, optional
            Extract all combinations of temporal stats, by default False
        res_cls : Class, optional
            Resource class to use to access res_h5, by default Resource
        year : str | int, optional
            Year to extract time-index for if running on a multi-year file,
            by default None
        max_workers : None | int, optional
            Number of workers to use, if 1 run in serial, if None use all
            available cores, by default None
        sites_per_worker : int, optional
            Number of sites to extract on each worker, by default 1000
        lat_lon_only : bool, optional
            Only append lat, lon coordinates to stats, by default True
        out_path : str, optional
            .csv, or .json path to save statistics too, by default None

        Returns
        -------
        out_stats : pandas.DataFrame
            DataFrame of resource statistics
        """
        if isinstance(dataset, str) and year is None:
            try:
                year = parse_year(dataset)
            except RuntimeError:
                year = None

        hybrid_stats = cls(solar_h5, wind_h5, statistics=statistics,
                           res_cls=res_cls, year=year)
        out_stats = hybrid_stats.compute_stats(
            dataset, annual=annual, diurnal=diurnal, doy=doy, month=month,
            combinations=combinations, max_workers=max_workers,
            sites_per_worker=sites_per_worker, lat_lon_only=lat_lon_only)
        if out_path is not None:
            hybrid_stats.save_stats(out_stats, out_path)

        return out_stats

[docs]    @classmethod
    def cf_profile(cls, solar_h5, wind_h5, statistics='pearson', annual=True,
                   diurnal=False, doy=False, month=False, combinations=False,
                   res_cls=Resource, max_workers=None,
                   sites_per_worker=1000, lat_lon_only=True, out_path=None):
        """
        Compute temporal stats on cf_profile dataset

        Parameters
        ----------
        solar_h5 : str
            Path to solar h5 file(s)
        wind_h5 : str
            Path to wind h5 file(s)
        statistics : str | tuple | dict, optional
            Statistics to extract, either a key or tuple of keys in
            cls.STATS, or a dictionary of the form
            {'stat_name': {'func': *, 'kwargs: {**}}},
            by default 'pearson'
        annual : bool, optional,
            Extract stats annualy. To extract multi-year monthly stats set
            `annual=False` and month=True`, by default True
        diurnal : bool, optional
            Extract diurnal stats, by default False
        doy : bool, optional
            Extract doy-of-year stats, by default False
        month : bool, optional
            Extract monthly stats, by default False
        combinations : bool, optional
            Extract all combinations of temporal stats, by default False
        res_cls : Class, optional
            Resource class to use to access res_h5, by default Resource
        max_workers : None | int, optional
            Number of workers to use, if 1 run in serial, if None use all
            available cores, by default None
        sites_per_worker : int, optional
            Number of sites to extract on each worker, by default 1000
        lat_lon_only : bool, optional
            Only append lat, lon coordinates to stats, by default True
        out_path : str, optional
            .csv, or .json path to save statistics too, by default None

        Returns
        -------
        out_stats : pandas.DataFrame
            DataFrame of resource statistics
        """
        out_stats = cls.run(solar_h5, wind_h5, 'cf_profile',
                            statistics=statistics, res_cls=res_cls,
                            annual=annual, diurnal=diurnal, doy=doy,
                            month=month, combinations=combinations,
                            max_workers=max_workers,
                            sites_per_worker=sites_per_worker,
                            lat_lon_only=lat_lon_only, out_path=out_path)

        return out_stats


class HybridCrossCorrelation(HybridStats):
    """
    Compute the temporal cross correlations for co-located wind and solar
    generation
    """

    def __init__(self, solar_h5, wind_h5, res_cls=Resource, year=None):
        """
        Parameters
        ----------
        solar_h5 : str
            Path to solar h5 file(s)
        wind_h5 : str
            Path to wind h5 file(s)
        res_cls : Class, optional
            Resource class to use to access res_h5, by default Resource
        year : str | int, optional
            Year to extract time-index for if running on a multi-year file,
            by default None
        """
        self._solar_h5 = solar_h5
        self._wind_h5 = wind_h5
        self._res_cls = res_cls
        # This class takes no `statistics` argument; the attribute is
        # simply initialised to None.
        self._stats = None

        # The pre-flight check returns four items: the meta data, the
        # time index and the solar / wind time slices.
        out = self._pre_flight_check(year=year)
        self._meta, self._time_index = out[:2]
        self._solar_time_slice, self._wind_time_slice = out[2:]

    def __repr__(self):
        msg = ('Computing cross-correlations between {} and {}'
               .format(self.solar_h5, self.wind_h5))

        return msg

    @staticmethod
    def cross_correlation(solar_data, wind_data, m):
        """
        Compute the cross-correlation between solar and wind time-series data
        with time-lag m

        Both inputs are standardised along axis 0 (subtract the mean,
        divide by the standard deviation) after the wind data has been
        rolled by `m` steps; the products are then summed along axis 0 and
        scaled by 1 / (n - 1), where n is the number of time-steps.

        Parameters
        ----------
        solar_data : ndarray
            Time-series solar data
        wind_data : ndarray
            Time-series wind data
        m : int
            Integer shift between solar and wind time-series, it is passed
            directly to np.roll, so it will be the number of time-steps that
            are shifted (the roll is circular). The lag time will be m * dt
            where dt is the time-step size.

        Returns
        -------
        corr : ndarray
            Cross-correlation coefficient for each solar, wind site pair
        """
        solar_u = solar_data.mean(axis=0)
        solar_s = solar_data.std(axis=0)
        solar_data = (solar_data - solar_u) / solar_s

        wind_data = np.roll(wind_data, m, axis=0)
        wind_u = wind_data.mean(axis=0)
        # Normalise by the standard deviation (not the mean) so the wind
        # series is standardised the same way as the solar series.
        wind_s = wind_data.std(axis=0)
        wind_data = (wind_data - wind_u) / wind_s

        # NOTE(review): np.std defaults to ddof=0 while the sum is scaled
        # by 1 / (n - 1); kept as-is to preserve the existing scaling.
        n = len(solar_data)
        corr = (1 / (n - 1)) * np.sum(solar_data * wind_data, axis=0)

        return corr

    @classmethod
    def _extract_stats(cls, solar_h5, wind_h5, dataset, sites,
                       solar_time_slice, wind_time_slice,
                       lag_range=(-50, 51, 1), res_cls=Resource):
        """
        Extract cross-correlations for the given datasets and sites over a
        range of time lags

        Parameters
        ----------
        solar_h5 : str
            Path to solar h5 file(s)
        wind_h5 : str
            Path to wind h5 file(s)
        dataset : tuple
            Datasets to compare, in the form: (solar_dataset, wind_dataset)
        sites : pandas.DataFrame
            Subset of meta DataFrame with sites to extract, must contain
            'solar_gid' and 'wind_gid' columns
        solar_time_slice : slice | ndarray
            slice or boolean index of the solar timesteps that are in the
            coincident time_index
        wind_time_slice : slice | ndarray
            slice or boolean index of the wind timesteps that are in the
            coincident time_index
        lag_range : tuple, optional
            The range of lag (m) values to compute the cross-correlation for
            (start, stop, step). Cross-correlation will be run for all lags in
            range(start, stop, step), each value in the
            range is the number of timesteps by which the time-series will be
            shifted to compute the cross-correlation.
            by default (-50, 51, 1)
        res_cls : Class, optional
            Resource class to use to access res_h5, by default Resource

        Returns
        -------
        out_stats : pandas.DataFrame
            One row per site (index named 'gid'), one column per lag value
            plus an 'optimal_m' column holding the lag with the largest
            cross-correlation for that site
        """
        wind_sites = sites['wind_gid'].values
        solar_sites = sites['solar_gid'].values

        solar_dataset, wind_dataset = dataset
        with res_cls(solar_h5) as f:
            solar_data = f[solar_dataset, solar_time_slice, solar_sites]

        with res_cls(wind_h5) as f:
            wind_data = f[wind_dataset, wind_time_slice, wind_sites]

        out_stats = {m: cls.cross_correlation(solar_data, wind_data, m)
                     for m in range(*lag_range)}

        index = pd.Index(sites.index.values, name='gid')
        out_stats = pd.DataFrame(out_stats, index=index)

        # At this point the columns are exactly the lag values, so the
        # row-wise argmax maps straight back to a lag.
        out_stats['optimal_m'] = \
            out_stats.columns[out_stats.values.argmax(axis=1)]

        return out_stats

    def compute_stats(self, dataset, lag_range=(-50, 51, 1),
                      max_workers=None, sites_per_worker=1000,
                      lat_lon_only=True):
        """
        Compute cross-correlations over a range of time lags

        Parameters
        ----------
        dataset : tuple | str
            Dataset to compare. If a string, the same dataset is extracted
            for both wind and solar, otherwise a tuple of the form:
            (solar_dataset, wind_dataset)
        lag_range : tuple, optional
            The range of lag (m) values to compute the cross-correlation for
            (start, stop, step). Cross-correlation will be run for all lags in
            range(start, stop, step), each value in the
            range is the number of timesteps by which the time-series will be
            shifted to compute the cross-correlation.
            by default (-50, 51, 1)
        max_workers : None | int, optional
            Number of workers to use, if 1 run in serial, if None use all
            available cores, by default None
        sites_per_worker : int, optional
            Number of sites to extract on each worker, by default 1000
        lat_lon_only : bool, optional
            Only append lat, lon coordinates to stats, by default True

        Returns
        -------
        res_stats : pandas.DataFrame
            DataFrame of desired statistics at desired time intervals
        """
        kwargs = {'res_cls': self.res_cls,
                  'lag_range': lag_range}

        logger.info('Computing cross correlations from {}'.format(dataset))
        logger.debug('- Using the following options: {}'.format(kwargs))
        out_stats = self._compute_stats(dataset, max_workers=max_workers,
                                        sites_per_worker=sites_per_worker,
                                        lat_lon_only=lat_lon_only,
                                        extract_stats_kwargs=kwargs)

        return out_stats

    @classmethod
    def run(cls, solar_h5, wind_h5, dataset, lag_range=(-50, 51, 1),
            res_cls=Resource, year=None, max_workers=None,
            sites_per_worker=1000, lat_lon_only=True, out_path=None):
        """
        Compute cross correlations between solar and wind time-series

        Parameters
        ----------
        solar_h5 : str
            Path to solar h5 file(s)
        wind_h5 : str
            Path to wind h5 file(s)
        dataset : tuple | str
            Dataset to compare. If a string, the same dataset is extracted
            for both wind and solar, otherwise a tuple of the form:
            (solar_dataset, wind_dataset)
        lag_range : tuple, optional
            The range of lag (m) values to compute the cross-correlation for
            (start, stop, step). Cross-correlation will be run for all lags in
            range(start, stop, step), each value in the
            range is the number of timesteps by which the time-series will be
            shifted to compute the cross-correlation.
            by default (-50, 51, 1)
        res_cls : Class, optional
            Resource class to use to access res_h5, by default Resource
        year : str | int, optional
            Year to extract time-index for if running on a multi-year file.
            If None and `dataset` is a string, an attempt is made to parse
            the year from the dataset name, by default None
        max_workers : None | int, optional
            Number of workers to use, if 1 run in serial, if None use all
            available cores, by default None
        sites_per_worker : int, optional
            Number of sites to extract on each worker, by default 1000
        lat_lon_only : bool, optional
            Only append lat, lon coordinates to stats, by default True
        out_path : str, optional
            .csv, or .json path to save statistics to, by default None

        Returns
        -------
        out_stats : pandas.DataFrame
            DataFrame of resource statistics
        """
        if isinstance(dataset, str) and year is None:
            try:
                year = parse_year(dataset)
            except RuntimeError:
                # parse_year raised, so no explicit year is imposed
                year = None

        hybrid_stats = cls(solar_h5, wind_h5, res_cls=res_cls, year=year)
        out_stats = hybrid_stats.compute_stats(
            dataset, lag_range=lag_range,
            max_workers=max_workers, sites_per_worker=sites_per_worker,
            lat_lon_only=lat_lon_only)
        if out_path is not None:
            hybrid_stats.save_stats(out_stats, out_path)

        return out_stats

    @classmethod
    def cf_profile(cls, solar_h5, wind_h5, lag_range=(-50, 51, 1),
                   res_cls=Resource, max_workers=None,
                   sites_per_worker=1000, lat_lon_only=True, out_path=None):
        """
        Compute cross correlations on the 'cf_profile' dataset. Convenience
        wrapper that calls `run` with `dataset='cf_profile'`.

        Parameters
        ----------
        solar_h5 : str
            Path to solar h5 file(s)
        wind_h5 : str
            Path to wind h5 file(s)
        lag_range : tuple, optional
            The range of lag (m) values to compute the cross-correlation for
            (start, stop, step). Cross-correlation will be run for all lags in
            range(start, stop, step), each value in the
            range is the number of timesteps by which the time-series will be
            shifted to compute the cross-correlation.
            by default (-50, 51, 1)
        res_cls : Class, optional
            Resource class to use to access res_h5, by default Resource
        max_workers : None | int, optional
            Number of workers to use, if 1 run in serial, if None use all
            available cores, by default None
        sites_per_worker : int, optional
            Number of sites to extract on each worker, by default 1000
        lat_lon_only : bool, optional
            Only append lat, lon coordinates to stats, by default True
        out_path : str, optional
            .csv, or .json path to save statistics to, by default None

        Returns
        -------
        out_stats : pandas.DataFrame
            DataFrame of resource statistics
        """
        out_stats = cls.run(solar_h5, wind_h5, 'cf_profile', res_cls=res_cls,
                            lag_range=lag_range,
                            max_workers=max_workers,
                            sites_per_worker=sites_per_worker,
                            lat_lon_only=lat_lon_only, out_path=out_path)

        return out_stats


[docs]class HybridStabilityCoefficient(HybridStats):
    """
    Compute the annual/monthly stability coefficient for co-located wind and
    solar
    """

    def __init__(self, solar_h5, wind_h5, res_cls=Resource, year=None):
        """
        Store the file paths and resource class, then run the pre-flight
        check to populate the meta data, time index and time slices

        Parameters
        ----------
        solar_h5 : str
            Path to solar h5 file(s)
        wind_h5 : str
            Path to wind h5 file(s)
        res_cls : Class, optional
            Resource class to use to access res_h5, by default Resource
        year : str | int, optional
            Year to extract time-index for if running on a multi-year file,
            by default None
        """
        self._solar_h5 = solar_h5
        self._wind_h5 = wind_h5
        self._res_cls = res_cls
        # This class takes no `statistics` argument; the attribute is
        # simply initialised to None.
        self._stats = None

        checked = self._pre_flight_check(year=year)
        # First two items: meta data and time index
        meta, time_index = checked[:2]
        # Remaining two items: solar and wind time slices
        solar_slice, wind_slice = checked[2:]

        self._meta = meta
        self._time_index = time_index
        self._solar_time_slice = solar_slice
        self._wind_time_slice = wind_slice

    def __repr__(self):
        """Describe the pair of files the stability coefficient is for"""
        template = 'Computing stability coefficient between {} and {}'

        return template.format(self.solar_h5, self.wind_h5)

    @staticmethod
    def _daily_variability(doy):
        """
        Compute the daily variability

        Parameters
        ----------
        doy : pandas.DataFrameGroupby
            Time-series DataFrame grouped by day-of-year

        Returns
        -------
        var : pandas.DataFrame
            Daily variablility by site
        """
        var = np.sqrt(np.sum((doy - doy.mean())**2))

        return var

[docs]    @classmethod
    def stability_coefficient(cls, mix, ref):
        """
        Compute average stability coefficient

        Parameters
        ----------
        mix : pandas.DataFrame
            DataFrame of mixed solar and wind time-series
        ref : pandas.DataFrame
            DataFrame of reference (solar or wind) time-series

        Returns
        -------
        stab : ndarray
            Vector of the average stability coefficient for all days in the
            provided time-series data. Averages are by site.
        """
        mix = mix.groupby(mix.index.dayofyear)
        mix_var = mix.apply(cls._daily_variability)

        ref = ref.groupby(ref.index.dayofyear)
        ref_var = ref.apply(cls._daily_variability)

        stab = 1 - ((mix_var / ref_var) * (ref.mean() / mix.mean()))

        mask = np.isfinite(stab)
        if not np.all(mask):
            stab[~mask] = np.nan

        return stab.mean().values.astype(np.float32)

    @classmethod
    def _compute_coefficients(cls, solar_data, wind_data, solar_cap=None,
                              wind_cap=None, annual=True, month=False,
                              reference='solar'):
        """
        Compute the average stability coefficient of solar and wind data
        over the desired time intervals, together with the mean solar,
        wind, reference and mixed values for the same intervals

        Parameters
        ----------
        solar_data : pandas.DataFrame
            DataFrame of solar data. Index is time_index, columns are sites
        wind_data : pandas.DataFrame
            DataFrame of wind data. Index is time_index, columns are sites
        solar_cap : ndarray | None, optional
            Weights multiplied into `solar_data` when building the mixed
            time-series. If either `solar_cap` or `wind_cap` is None the
            mix is the plain average of solar and wind, by default None
        wind_cap : ndarray | None, optional
            Weights multiplied into `wind_data` when building the mixed
            time-series, see `solar_cap`, by default None
        annual : bool, optional
            Extract stats annually. To extract multi-year monthly stats set
            `annual=False` and `month=True`, by default True
        month : bool, optional
            Extract monthly stats, by default False
        reference : str, optional
            Which data to use as the reference (denominator) when computing
            the stability coefficient. 'solar' (case-insensitive) selects
            the solar data, any other value selects wind,
            by default 'solar'

        Returns
        -------
        out_stats : pandas.DataFrame
            DataFrame of stability coefficients for given sites and desired
            time intervals
        """
        site_ids = solar_data.columns.values
        if solar_cap is not None and wind_cap is not None:
            # Weighted average of the two technologies
            mix = ((solar_data * solar_cap + wind_data * wind_cap)
                   / (solar_cap + wind_cap))
        else:
            mix = (solar_data + wind_data) / 2

        grouping = {'annual': annual, 'month': month}
        mix = cls._groupby_data(mix, **grouping)
        solar_data = cls._groupby_data(solar_data, **grouping)
        wind_data = cls._groupby_data(wind_data, **grouping)

        ref = solar_data if reference.lower() == 'solar' else wind_data

        cols_map, _ = cls._create_names(list(mix.groups), ['stability'])
        coefficients = {}
        for key, mix_grp in mix:
            # Unwrap single-element group keys
            key = key[0] if len(key) == 1 else key
            col = cols_map['stability'][key]
            ref_grp = ref.get_group(key)
            msg = ('mixed and reference data shapes do not match! {} != {}'
                   .format(mix_grp.shape, ref_grp.shape))
            assert mix_grp.shape == ref_grp.shape, msg
            coefficients[col] = cls.stability_coefficient(mix_grp, ref_grp)

        frames = [pd.DataFrame(coefficients, index=site_ids,
                               dtype=np.float32)]

        # Append the mean of each grouped series, transposed so that sites
        # are rows like the coefficients frame above.
        labelled = (('solar', solar_data),
                    ('wind', wind_data),
                    ('reference', ref),
                    ('mixed', mix))
        for name, grouped in labelled:
            _, cols = cls._create_names(list(grouped.groups),
                                        [f'{name}_cf'])
            averages = grouped.aggregate(np.nanmean).T.astype(np.float32)
            averages.columns = cols
            frames.append(averages)

        return pd.concat(frames, axis=1)

    @classmethod
    def _extract_stats(cls, solar_h5, wind_h5, dataset, sites,
                       solar_time_slice, wind_time_slice,
                       time_index=None, res_cls=Resource,
                       reference='solar', annual=True, month=False,
                       combinations=False):
        """
        Extract stability coefficients for the given datasets, sites, and
        temporal extent

        Parameters
        ----------
        solar_h5 : str
            Path to solar h5 file(s)
        wind_h5 : str
            Path to wind h5 file(s)
        dataset : tuple
            Datasets to compare, in the form: (solar_dataset, wind_dataset)
        sites : pandas.DataFrame
            Subset of meta DataFrame with sites to extract. Must contain
            'solar_gid', 'wind_gid' and 'timezone' columns; 'solar_cap' and
            'wind_cap' columns are used as mixing weights when present.
        solar_time_slice : slice | ndarray
            slice or boolean index of the solar timesteps that are in the
            coincident time_index
        wind_time_slice : slice | ndarray
            slice or boolean index of the wind timesteps that are in the
            coincident time_index
        time_index : pandas.DatetimeIndex, optional
            Timeseries DatetimeIndex, if None it is read from wind_h5 and
            restricted with `wind_time_slice`, by default None
        res_cls : Class, optional
            Resource class to use to access res_h5, by default Resource
        reference : str, optional
            Which data to use as the reference (denominator) when computing
            the stability coefficient, by default 'solar'
        annual : bool, optional
            Extract stats annually. To extract multi-year monthly stats set
            `annual=False` and `month=True`. Ignored when
            `combinations=True`, by default True
        month : bool, optional
            Extract monthly stats, by default False
        combinations : bool, optional
            Extract all combinations of temporal stats: the annual stats
            are always computed and, if `month` is True, the multi-year
            monthly stats are appended, by default False

        Returns
        -------
        out_stats : pandas.DataFrame
            DataFrame of desired statistics at desired time intervals,
            indexed by the index of `sites` (named 'gid')
        """
        solar_dataset, wind_dataset = dataset
        solar_sites = sites['solar_gid'].values
        wind_sites = sites['wind_gid'].values
        # NOTE(review): copied before being handed to roll_timeseries,
        # presumably to keep the sites table untouched - confirm whether
        # the helper modifies its arguments.
        tz = sites['timezone'].values.copy()

        solar_cap = sites['solar_cap'].values if 'solar_cap' in sites else None
        wind_cap = sites['wind_cap'].values if 'wind_cap' in sites else None

        with res_cls(solar_h5) as f:
            solar_data = f[solar_dataset, solar_time_slice, solar_sites]

        with res_cls(wind_h5) as f:
            if time_index is None:
                # Apply the same time selection as the data so the index
                # length always matches the extracted arrays.
                time_index = f.time_index[wind_time_slice]

            wind_data = f[wind_dataset, wind_time_slice, wind_sites]

        # Build both frames only once the time index is known, so solar
        # and wind share one index and align row by row when mixed.
        solar_data = pd.DataFrame(roll_timeseries(solar_data, tz),
                                  index=time_index)
        wind_data = pd.DataFrame(roll_timeseries(wind_data, tz),
                                 index=time_index)

        shared = {'solar_cap': solar_cap,
                  'wind_cap': wind_cap,
                  'reference': reference}
        if combinations:
            # Annual stats first (defaults of _compute_coefficients) ...
            out_stats = [cls._compute_coefficients(solar_data, wind_data,
                                                   **shared)]
            if month:
                # ... then the multi-year monthly stats
                out_stats.append(cls._compute_coefficients(solar_data,
                                                           wind_data,
                                                           annual=False,
                                                           month=True,
                                                           **shared))

            out_stats = pd.concat(out_stats, axis=1)
        else:
            out_stats = cls._compute_coefficients(solar_data,
                                                  wind_data,
                                                  annual=annual,
                                                  month=month,
                                                  **shared)

        out_stats.index = sites.index.values

        out_stats.index.name = 'gid'

        return out_stats

[docs]    def compute_stats(self, dataset, reference='solar', annual=True,
                      month=False, combinations=False, max_workers=None,
                      sites_per_worker=1000, lat_lon_only=True):
        """
        Compute stability coefficients

        Parameters
        ----------
        dataset : tuple | str
            Dataset to compare, if a string, extract the same
            dataset for both with and solar, other wise a tuple of the form:
            (solar_dataset, wind_dataset)
        reference : str, optional
            Which data to use as the reference (denominator) when computing
            the stability coefficient, by default 'solar'
        annual : bool, optional,
            Extract stats annualy. To extract multi-year monthly stats set
            `annual=False` and month=True`, by default True
        month : bool, optional
            Extract monthly stats, by default False
        combinations : bool, optional
            Extract all combinations of temporal stats, by default False
        max_workers : None | int, optional
            Number of workers to use, if 1 run in serial, if None use all
            available cores, by default None
        sites_per_worker : int, optional
            Number of sites to extract on each worker, by default 1000
        lat_lon_only : bool, optional
            Only append lat, lon coordinates to stats, by default True

        Returns
        -------
        res_stats : pandas.DataFrame
            DataFrame of desired statistics at desired time intervals
        """
        kwargs = {'time_index': self.time_index,
                  'res_cls': self.res_cls,
                  'annual': annual,
                  'month': month,
                  'combinations': combinations,
                  'reference': reference}

        logger.info('Computing stability coefficients from {}'.format(dataset))
        logger.debug('- Using the following options: {}'.format(kwargs))
        out_stats = self._compute_stats(dataset, max_workers=max_workers,
                                        sites_per_worker=sites_per_worker,
                                        lat_lon_only=lat_lon_only,
                                        extract_stats_kwargs=kwargs)

        return out_stats

[docs]    @classmethod
    def run(cls, solar_h5, wind_h5, dataset, reference='solar', annual=True,
            month=False, combinations=False, res_cls=Resource, year=None,
            max_workers=None, sites_per_worker=1000, lat_lon_only=True,
            out_path=None):
        """
        Compute the stability coefficient between solar and wind
        time-series and, if requested, save the result to disk.
        Time-series are shifted to local time before the daily stability
        coefficient is computed. The final data is the average of the daily
        stability coefficients for each month and/or year.

        Parameters
        ----------
        solar_h5 : str
            Path to solar h5 file(s)
        wind_h5 : str
            Path to wind h5 file(s)
        dataset : tuple | str
            Dataset to compare. If a string, the same dataset is extracted
            for both wind and solar, otherwise a tuple of the form:
            (solar_dataset, wind_dataset)
        reference : str, optional
            Which data to use as the reference (denominator) when computing
            the stability coefficient, by default 'solar'
        annual : bool, optional
            Extract stats annually. To extract multi-year monthly stats set
            `annual=False` and `month=True`, by default True
        month : bool, optional
            Extract monthly stats, by default False
        combinations : bool, optional
            Extract all combinations of temporal stats, by default False
        res_cls : Class, optional
            Resource class to use to access res_h5, by default Resource
        year : str | int, optional
            Year to extract time-index for if running on a multi-year file.
            If None and `dataset` is a string, the year is parsed from the
            dataset name when possible, by default None
        max_workers : None | int, optional
            Number of workers to use, if 1 run in serial, if None use all
            available cores, by default None
        sites_per_worker : int, optional
            Number of sites to extract on each worker, by default 1000
        lat_lon_only : bool, optional
            Only append lat, lon coordinates to stats, by default True
        out_path : str, optional
            .csv, or .json path to save statistics to, by default None

        Returns
        -------
        out_stats : pandas.DataFrame
            DataFrame of resource statistics
        """
        # An explicitly supplied year always wins; the dataset name is only
        # consulted when it is a single string and no year was given
        if year is None and isinstance(dataset, str):
            try:
                year = parse_year(dataset)
            except RuntimeError:
                # parse_year raised, so no year is taken from the name and
                # `year` stays None
                pass

        stats_kwargs = {
            'annual': annual,
            'month': month,
            'combinations': combinations,
            'reference': reference,
            'max_workers': max_workers,
            'sites_per_worker': sites_per_worker,
            'lat_lon_only': lat_lon_only,
        }

        stability = cls(solar_h5, wind_h5, res_cls=res_cls, year=year)
        out_stats = stability.compute_stats(dataset, **stats_kwargs)

        # Nothing to write when no output path was requested
        if out_path is None:
            return out_stats

        stability.save_stats(out_stats, out_path)

        return out_stats

[docs]    @classmethod
    def cf_profile(cls, solar_h5, wind_h5, reference='solar', annual=True,
                   month=False, combinations=False, res_cls=Resource,
                   max_workers=None, sites_per_worker=1000, lat_lon_only=True,
                   out_path=None):
        """
        Compute the stability coefficient between the solar and wind
        'cf_profile' time-series. This is a thin wrapper around `run` with
        the dataset fixed to 'cf_profile'; no `year` is forwarded, so `run`
        uses its default.
        Time-series are shifted to local time before the daily stability
        coefficient is computed. The final data is the average of the daily
        stability coefficients for each month and/or year.

        Parameters
        ----------
        solar_h5 : str
            Path to solar h5 file(s)
        wind_h5 : str
            Path to wind h5 file(s)
        reference : str, optional
            Which data to use as the reference (denominator) when computing
            the stability coefficient, by default 'solar'
        annual : bool, optional
            Extract stats annually. To extract multi-year monthly stats set
            `annual=False` and `month=True`, by default True
        month : bool, optional
            Extract monthly stats, by default False
        combinations : bool, optional
            Extract all combinations of temporal stats, by default False
        res_cls : Class, optional
            Resource class to use to access res_h5, by default Resource
        max_workers : None | int, optional
            Number of workers to use, if 1 run in serial, if None use all
            available cores, by default None
        sites_per_worker : int, optional
            Number of sites to extract on each worker, by default 1000
        lat_lon_only : bool, optional
            Only append lat, lon coordinates to stats, by default True
        out_path : str, optional
            .csv, or .json path to save statistics to, by default None

        Returns
        -------
        out_stats : pandas.DataFrame
            DataFrame of resource statistics
        """
        # Every option is passed straight through to `run`; only the dataset
        # name is supplied by this method
        run_kwargs = {
            'reference': reference,
            'annual': annual,
            'month': month,
            'combinations': combinations,
            'res_cls': res_cls,
            'max_workers': max_workers,
            'sites_per_worker': sites_per_worker,
            'lat_lon_only': lat_lon_only,
            'out_path': out_path,
        }

        return cls.run(solar_h5, wind_h5, 'cf_profile', **run_kwargs)