Source code for reVX.hybrid_stats.hybrid_stats

# -*- coding: utf-8 -*-
"""
Module to compute hybrid solar-wind generation correlations
"""
from concurrent.futures import as_completed
import gc
import logging
import numpy as np
import os
import pandas as pd
from scipy.spatial import cKDTree
from scipy.stats import pearsonr, spearmanr, kendalltau

from reVX.utilities.utilities import log_versions
from rex.resource import Resource
from rex.utilities.execution import SpawnProcessPool
from rex.utilities.loggers import log_mem
from rex.utilities.utilities import (get_lat_lon_cols, roll_timeseries,
                                     parse_year)

logger = logging.getLogger(__name__)


[docs]def pearson_correlation(solar_ts, wind_ts): """ Compute the Pearson correlation coefficient between the given solar and wind timeseries data. Return just the correlation coefficient. Parameters ---------- solar_ts : ndarray Solar time-series vector for a single site wind_ts : ndarray Wind time-series vector for a single site Returns ------- float Pearson correlation coefficient """ return pearsonr(solar_ts, wind_ts)[0]
[docs]def spearman_correlation(solar_ts, wind_ts): """ Compute the Spearman correlation coefficient between the given solar and wind timeseries data. Return just the correlation coefficient. Parameters ---------- solar_ts : ndarray Solar time-series vector for a single site wind_ts : ndarray Wind time-series vector for a single site Returns ------- float Spearman correlation coefficient """ return spearmanr(solar_ts, wind_ts)[0]
[docs]def kendall_tau(solar_ts, wind_ts): """ Compute Kendall's tau correlation between the given solar and wind timeseries data. Return just the correlation coefficient. Parameters ---------- solar_ts : ndarray Solar time-series vector for a single site wind_ts : ndarray Wind time-series vector for a single site Returns ------- float Kendall's tau """ return kendalltau(solar_ts, wind_ts)[0]
[docs]class HybridStats: """ Compute comparitive correlations for co-located wind and solar generation """ STATS = {'pearson': {'func': pearson_correlation}, 'spearman': {'func': spearman_correlation}, 'kendall': {'func': kendall_tau}} def __init__(self, solar_h5, wind_h5, statistics='pearson', res_cls=Resource, year=None): """ Parameters ---------- solar_h5 : str Path to solar h5 file(s) wind_h5 : str Path to wind h5 file(s) statistics : str | tuple | dict, optional Statistics to extract, either a key or tuple of keys in cls.STATS, or a dictionary of the form {'stat_name': {'func': *, 'kwargs: {**}}}, by default 'pearson' res_cls : Class, optional Resource class to use to access res_h5, by default Resource year : str | int, optional Year to extract time-index for if running on a multi-year file, by default None """ log_versions(logger) self._solar_h5 = solar_h5 self._wind_h5 = wind_h5 self._stats = None self.statistics = statistics self._res_cls = res_cls out = self._pre_flight_check(year=year) self._meta, self._time_index = out[:2] self._solar_time_slice, self._wind_time_slice = out[2:] def __repr__(self): msg = ('Computing {} between {} and {}' .format(list(self.statistics), self.solar_h5, self.wind_h5)) return msg @property def solar_h5(self): """ Path to solar h5 file(s) Returns ------- str """ return self._solar_h5 @property def wind_h5(self): """ Path to wind h5 file(s) Returns ------- str """ return self._wind_h5 @property def statistics(self): """ Dictionary of statistic functions/kwargs to run Returns ------- dict """ return self._stats @statistics.setter def statistics(self, statistics): """ Statistics to extract, either a key or tuple of keys in cls.STATS, or a dictionary of the form {'stat_name': {'func': *, 'kwargs: {**}}} Parameters ---------- statistics : dict """ if isinstance(statistics, str): statistics = (statistics, ) if isinstance(statistics, (tuple, list)): statistics = {s: self.STATS[s] for s in statistics} for stat in statistics.values(): msg = 'A "func"(tion) must be provided for each statistic' assert 'func' in stat, msg if 'kwargs' in stat: msg = 'statistic function kwargs must be a dictionary ' assert isinstance(stat['kwargs'], dict), msg self._stats = statistics @property def res_cls(self): """ Resource class to use to access res_h5 Returns ------- Class """ return self._res_cls @property def time_index(self): """ Resource Datetimes Returns ------- pandas.DatetimeIndex """ return self._time_index @property def meta(self): """ Resource meta-data table Returns ------- pandas.DataFrame """ return self._meta @property def lat_lon(self): """ Resource (lat, lon) coordinates Returns ------- pandas.DataFrame """ lat_lon_cols = get_lat_lon_cols(self.meta) return self.meta[lat_lon_cols] @staticmethod def _clean_meta(meta, tech, drop=False): """ Clean up input meta data: - rename and move index to table - rename capacity and any mean_* columns - drop all but capacity, gid and mean columns if `drop=True` Parameters ---------- meta : pandas.DataFrame Sites meta data table for given technology tech : str Technology of meta data table, either 'solar' or 'wind' Returns ------- meta : pandas.DataFrame updated site meta data table """ meta.index.name = f"{tech}_gid" cols = {} for c in meta: if c.startswith('cap'): cols[c] = f'{tech}_cap' elif c.startswith('mean'): cols[c] = f'{tech}_res_gid' meta = meta.rename(columns=cols) if drop: meta = meta[cols.values()] return meta.reset_index() @classmethod def _map_sites(cls, solar_meta, wind_meta): """ Map solar to wind sites retaining the higher resolution meta data. Mapping it done using a cKDTree to determine the closest coarser resolution site to each finer resolution site. Parameters ---------- solar_meta : pandas.DataFrame Solar sites meta data table wind_meta : pandas.DataFrame Wind sites meta data table Returns ------- meta : pandas.DataFrame Meta data table mapping the finer resolution data to the coarser """ solar_coords = solar_meta[get_lat_lon_cols(solar_meta)].values wind_coords = wind_meta[get_lat_lon_cols(wind_meta)].values solar_n = len(solar_meta) wind_n = len(wind_meta) # pylint: disable=not-callable if solar_n < wind_n: solar_meta = cls._clean_meta(solar_meta, 'solar', drop=True) wind_meta = cls._clean_meta(wind_meta, 'wind') tree = cKDTree(solar_coords) meta = wind_meta.copy() dist, pos = tree.query(wind_coords) mask = dist <= np.median(dist) * 1.5 pos = pos[mask] meta = meta.loc[mask].reset_index(drop=True) meta = pd.concat((meta, solar_meta.iloc[pos].reset_index(drop=True)), axis=1) else: solar_meta = cls._clean_meta(solar_meta, 'solar') wind_meta = cls._clean_meta(wind_meta, 'wind', drop=True) tree = cKDTree(wind_coords) meta = solar_meta.copy() dist, pos = tree.query(solar_coords) mask = dist <= np.median(dist) * 1.5 pos = pos[mask] meta = meta.loc[mask].reset_index(drop=True) meta = pd.concat((meta, wind_meta.iloc[pos].reset_index(drop=True)), axis=1) return meta @staticmethod def _check_time_index(solar_time_index, wind_time_index): """ Compare solar and wind time indexes. Determine coincident time_index and requisite time_slices to reduce solar and wind dataset coincident Parameters ---------- solar_time_index : pandas.DatatimeIndex Datetime index for solar data, respresents the temporal resolution of the solar datasets wind_time_index : pands.DatatimeINdex Datetime index for wind data, respresents the temporal resolution of the wind datasets Returns ------- time_index : pandas.DatatimeIndex Coincident datetime index between solar and wind datasets, i.e. datetime steps that are in both wind and solar data solar_time_slice : slice | ndarray slice or boolean index of the solar timesteps that are in the coincident time_index wind_time_slice : slice | ndarray slice or boolean index of the wind timesteps that are in the coincident time_index """ solar_n = len(solar_time_index) solar_time_slice = slice(None) wind_n = len(wind_time_index) wind_time_slice = slice(None) if solar_n < wind_n: wind_time_slice = wind_time_index.isin(solar_time_index) time_index = solar_time_index if not wind_time_slice.all(): msg = ("The following timesteps are not in both wind and " "solar dataset:\n{}" .format(wind_time_index[wind_time_slice])) logger.error(msg) raise RuntimeError(msg) elif wind_n < solar_n: solar_time_slice = solar_time_index.isin(wind_time_index) time_index = wind_time_slice if not solar_time_slice.all(): msg = ("The following timesteps are not in both wind and " "solar dataset:\n{}" .format(solar_time_index[solar_time_slice])) logger.error(msg) raise RuntimeError(msg) else: time_index = solar_time_index return time_index, solar_time_slice, wind_time_slice @staticmethod def _groupby_data(data, annual=True, diurnal=False, doy=False, month=False): """ Groupby data by month and/or hour Parameters ---------- data : pandas.DataFrame DataFrame of data where index is time_index, columns are sites annual : bool, optional Extract annaul stats, by default True diurnal : bool, optional Extract diurnal stats, by default False doy : bool, optional Extract doy-of-year stats, by default False month : bool, optional Extract monthly stats, by default False Returns ------- data : pandas.Groupby Input DataFrame grouped by month and or hour if requested """ groupby = [] if annual: groupby.append(data.index.year) if month: groupby.append(data.index.month) if doy: groupby.append(data.index.dayofyear) if diurnal: groupby.append(data.index.hour) m = "Data must be groupby year, month, day of year or dirunal (hour)!" assert groupby, m data = data.groupby(groupby) return data @staticmethod def _format_grp_names(grp_names): """ Format groupby index values Parameters ---------- grp_names : list Group by index values, these correspond to each unique group in the groupby Returns ------- out : ndarray 2D array of grp index values properly formatted as strings """ month_map = {1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun', 7: 'Jul', 8: 'Aug', 9: 'Sep', 10: 'Oct', 11: 'Nov', 12: 'Dec'} # pylint: disable=unnecessary-lambda # pylint: disable=unnecessary-lambda-assignment year = lambda s: "{}".format(s) month = lambda s: "{}".format(month_map[s]) hour = lambda s: "{:02d}:00UTC".format(s) grp_names = np.array(grp_names).T if len(grp_names.shape) == 1: grp_names = np.expand_dims(grp_names, 0) out = [] for grp_i in grp_names: # pylint: disable=not-an-iterable grp_max = grp_i.max() if grp_max <= 12: out.append(list(map(month, grp_i))) elif grp_max <= 23: out.append(list(map(hour, grp_i))) else: out.append(list(map(year, grp_i))) return np.array(out).T @classmethod def _create_names(cls, index, stats): """ Generate statistics names Parameters ---------- index : pandas.Index | pandas.MultiIndex Temporal index, either month, hour, or (month, hour) stats : list Statistics to be computed Returns ------- columns_map : dict Dictionary of column names to use for each statistic columns : list Column names to use """ column_names = cls._format_grp_names(index) columns_map = {} columns = [] for s in stats: cols = {i: '{}_{}'.format('-'.join(n), s) for i, n in zip(index, column_names)} columns_map[s] = cols columns.extend(list(cols.values())) return columns_map, columns @classmethod def _compute_correlations(cls, solar_data, wind_data, statistics, annual=True, diurnal=False, doy=False, month=False): """ Compute desired correlations for desired time intervals between solar and wind data Parameters ---------- solar_data : pandas.DataFrame DataFrame of solar data. Index is time_index, columns are sites wind_data : pandas.DataFrame DataFrame of wind data. Index is time_index, columns are sites statistics : dict Dictionary of statistic functions/kwargs to run annual : bool, optional, Extract stats annualy. To extract multi-year monthly stats set `annual=False` and month=True`, by default True diurnal : bool, optional Extract diurnal stats, by default False doy : bool, optional Extract doy-of-year stats, by default False month : bool, optional Extract monthly stats, by default False Returns ------- out_stats : pandas.DataFrame DataFrame of desired statistics at desired time intervals """ sites = solar_data.columns.values solar_data = cls._groupby_data(solar_data, annual=annual, diurnal=diurnal, doy=doy, month=month) wind_data = cls._groupby_data(wind_data, annual=annual, diurnal=diurnal, doy=doy, month=month) cols_map, col_names = cls._create_names(list(solar_data.groups), list(statistics)) out_stats = pd.DataFrame(columns=col_names, index=sites, dtype=np.float32) for grp_name, solar_grp in solar_data: print(grp_name, wind_data) grp_name = grp_name[0] if len(grp_name) == 1 else grp_name wind_grp = wind_data.get_group(grp_name) msg = ('solar and wind data shapes do not match! {} != {}' .format(solar_grp.shape, wind_grp.shape)) assert solar_grp.shape == wind_grp.shape, msg for i in sites: solar_ts = solar_grp.iloc[:, i] wind_ts = wind_grp.iloc[:, i] for name, stat in statistics.items(): col = cols_map[name][grp_name] func = stat['func'] kwargs = stat.get('kwargs', {}) out_stats.at[i, col] = func(solar_ts, wind_ts, **kwargs) return out_stats @classmethod def _extract_stats(cls, solar_h5, wind_h5, dataset, sites, solar_time_slice, wind_time_slice, statistics=None, time_index=None, res_cls=Resource, annual=True, diurnal=False, doy=False, month=False, combinations=False): """ Extract stats for given dataset, sites, and temporal extent Parameters ---------- solar_h5 : str Path to solar h5 file(s) wind_h5 : str Path to wind h5 file(s) dataset : tuple Datasets to compare, in the form: (solar_dataset, wind_dataset) sites : pandas.DataFrame Subset of meta DataFrame with sites to extract solar_time_slice : slice | ndarray slice or boolean index of the solar timesteps that are in the coincident time_index wind_time_slice : slice | ndarray slice or boolean index of the wind timesteps that are in the coincident time_index statistics : dict, optional Dictionary of statistic functions/kwargs to run, if None default to: {'pearson', {'func': pearson_correlation}}, by default None time_index : pandas.DatatimeIndex, optional Timeseries DatetimeIndex, if None extract from wind_h5, by default None res_cls : Class, optional Resource class to use to access res_h5, by default Resource annual : bool, optional, Extract stats annualy. To extract multi-year monthly stats set `annual=False` and month=True`, by default True diurnal : bool, optional Extract diurnal stats, by default False doy : bool, optional Extract doy-of-year stats, by default False month : bool, optional Extract monthly stats, by default False combinations : bool, optional Extract all combinations of temporal stats, by default False Returns ------- out_stats : pandas.DataFrame DataFrame of desired statistics at desired time intervals """ if statistics is None: statistics = {'pearson': {'func': pearson_correlation}} wind_sites = sites['wind_gid'].values solar_sites = sites['solar_gid'].values solar_dataset, wind_dataset = dataset with res_cls(wind_h5) as f: if time_index is None: time_index = f.time_index wind_data = f[wind_dataset, wind_time_slice, wind_sites] wind_data = pd.DataFrame(wind_data, index=time_index) with res_cls(solar_h5) as f: solar_data = f[solar_dataset, solar_time_slice, solar_sites] solar_data = pd.DataFrame(solar_data, index=time_index) if combinations: out_stats = [cls._compute_correlations(solar_data, wind_data, statistics)] if month: out_stats.append(cls._compute_correlations(solar_data, wind_data, statistics, annual=False, month=True)) if doy: out_stats.append(cls._compute_correlations(solar_data, wind_data, statistics, annual=False, doy=True)) if diurnal: out_stats.append(cls._compute_correlations(solar_data, wind_data, statistics, annual=False, diurnal=True)) if month and diurnal: out_stats.append(cls._compute_correlations(solar_data, wind_data, statistics, annual=False, month=True, diurnal=True)) out_stats = pd.concat(out_stats, axis=1) else: out_stats = cls._compute_correlations(solar_data, wind_data, statistics, annual=annual, diurnal=diurnal, doy=doy, month=month) out_stats.index = sites.index.values out_stats.index.name = 'gid' return out_stats
[docs] @staticmethod def save_stats(out_stats, out_fpath): """ Save correlations to disk Parameters ---------- out_stats : pandas.DataFrame Table of correlations to save out_path : str .csv, or .json path to save statistics too """ logger.info('Saving hybrid stats to {}'.format(out_fpath)) if out_fpath.endswith('.csv'): out_stats.to_csv(out_fpath) elif out_fpath.endswith('.json'): out_stats.to_json(out_fpath) else: msg = ("Cannot save statistics, expecting a .csv, or " ".json path, but got: {}".format(out_fpath)) logger.error(msg) raise OSError(msg)
@staticmethod def _check_dataset(dataset): """ Check user provided dataset for proper format Parameters ---------- dataset : tuple | str Dataset to compare, if a string, extract the same dataset for both with and solar, other wise a tuple of the form: (solar_dataset, wind_dataset) Returns ------- dataset : tuple Datasets to compare, in the form: (solar_dataset, wind_dataset) """ if isinstance(dataset, str): dataset = (dataset, dataset) elif isinstance(dataset, (tuple, list)): if len(dataset) < 2: msg = ("Must supply a solar and wind dataset in the form: " "(solar, wind)") logger.error(msg) raise ValueError(msg) return dataset @staticmethod def _parse_meta_time_index(h5_path, res_cls=Resource, year=None): """ Parse meta data table and time_index from .h5 file. If 'year' is provided extact time_index for given year. Parameters ---------- h5_path : str Path to .h5 file to extract meta and time_index res_cls : Class, optional Resource class to use to access res_h5, by default Resource year : str | int, optional Year to extract time-index for if running on a multi-year file, by default None Returns ------- meta : pandas.DataFrame Site meta data table time_index: pandas.DatatimeIndex Datetime Index """ with res_cls(h5_path) as f: meta = f.meta if 'time_index' in f: time_index = f.time_index elif year is not None: time_index = f[f'time_index-{year}'] else: ti_dsets = [dset for dset in f.datasets if dset.startswith('time_index')] msg = ("'time_index' is not available in {}. The following " "potential annual time_index are available: {}. Please " "specify a 'year' to use the 'time_index' for a " "specific year.".format(h5_path, ti_dsets)) logger.error(ti_dsets) raise ValueError(msg) return meta, time_index def _pre_flight_check(self, year=None): """ Compare solar and wind site meta data and time index to ensure they can be compared Parameters ---------- year : str | int, optional Year to extract time-index for if running on a multi-year file, by default None Returns ------- meta : pandas.DataFrame Meta data table mapping the finer resolution data to the coarser time_index : pandas.DatatimeIndex Coincident datetime index between solar and wind datasets, i.e. datetime steps that are in both wind and solar data solar_time_slice : slice | ndarray slice or boolean index of the solar timesteps that are in the coincident time_index wind_time_slice : slice | ndarray slice or boolean index of the wind timesteps that are in the coincident time_index """ solar_meta, solar_ti = self._parse_meta_time_index( self._solar_h5, res_cls=self._res_cls, year=year) wind_meta, wind_ti = self._parse_meta_time_index( self._wind_h5, res_cls=self._res_cls, year=year) time_index, solar_time_slice, wind_time_slice = \ self._check_time_index(solar_ti, wind_ti) meta = self._map_sites(solar_meta, wind_meta) return meta, time_index, solar_time_slice, wind_time_slice def _compute_stats(self, dataset, max_workers=None, sites_per_worker=1000, lat_lon_only=True, extract_stats_kwargs=None): """ Compute correlations Parameters ---------- dataset : tuple | str Dataset to compare, if a string, extract the same dataset for both with and solar, other wise a tuple of the form: (solar_dataset, wind_dataset) max_workers : None | int, optional Number of workers to use, if 1 run in serial, if None use all available cores, by default None sites_per_worker : int, optional Number of sites to extract on each worker, by default 1000 lat_lon_only : bool, optional Only append lat, lon coordinates to stats, by default True extract_stats_kwargs : dict, optional Kwargs to pass to _extract_stats method, by default None Returns ------- out_stats : pandas.DataFrame DataFrame of desired correlation coefficients at desired time intervals """ if extract_stats_kwargs is None: extract_stats_kwargs = {'res_cls': self.res_cls} dataset = self._check_dataset(dataset) solar_h5 = self.solar_h5 wind_h5 = self.wind_h5 if max_workers is None: max_workers = os.cpu_count() slices = len(self.meta) // sites_per_worker if slices: slices = np.array_split(self.meta, slices) else: slices = [self.meta] max_workers = 1 if max_workers > 1: msg = ('Extracting {} for {} in parallel using {} workers' .format(self, dataset, max_workers)) logger.info(msg) loggers = [__name__, 'reVX', 'rex'] with SpawnProcessPool(max_workers=max_workers, loggers=loggers) as exe: futures = [] for sites in slices: future = exe.submit(self._extract_stats, solar_h5, wind_h5, dataset, sites, self._solar_time_slice, self._wind_time_slice, **extract_stats_kwargs) futures.append(future) out_stats = [] for i, future in enumerate(as_completed(futures)): out_stats.append(future.result()) logger.debug('Completed {} out of {} workers' .format((i + 1), len(futures))) else: msg = ('Extracting {} for {} in serial' .format(self, dataset)) logger.info(msg) out_stats = [] for i, sites in enumerate(slices): out_stats.append(self._extract_stats( solar_h5, wind_h5, dataset, sites, self._solar_time_slice, self._wind_time_slice, **extract_stats_kwargs)) logger.debug('Completed {} out of {} sets of sites' .format((i + 1), len(slices))) gc.collect() log_mem(logger) out_stats = pd.concat(out_stats) if lat_lon_only: meta = self.lat_lon else: meta = self.meta out_stats = meta.join(out_stats.sort_index(), how='inner') return out_stats
[docs] def compute_stats(self, dataset, annual=True, diurnal=False, doy=False, month=False, combinations=False, max_workers=None, sites_per_worker=1000, lat_lon_only=True): """ Compute correlations Parameters ---------- dataset : tuple | str Dataset to compare, if a string, extract the same dataset for both with and solar, other wise a tuple of the form: (solar_dataset, wind_dataset) annual : bool, optional, Extract stats annualy. To extract multi-year monthly stats set `annual=False` and month=True`, by default True diurnal : bool, optional Extract diurnal stats, by default False doy : bool, optional Extract doy-of-year stats, by default False month : bool, optional Extract monthly stats, by default False combinations : bool, optional Extract all combinations of temporal stats, by default False max_workers : None | int, optional Number of workers to use, if 1 run in serial, if None use all available cores, by default None sites_per_worker : int, optional Number of sites to extract on each worker, by default 1000 lat_lon_only : bool, optional Only append lat, lon coordinates to stats, by default True Returns ------- res_stats : pandas.DataFrame DataFrame of desired statistics at desired time intervals """ kwargs = {'res_cls': self.res_cls, 'annual': annual, 'diurnal': diurnal, 'doy': doy, 'month': month, 'combinations': combinations, 'statistics': self.statistics, 'time_index': self.time_index } logger.info('Computing correlations from {}'.format(dataset)) logger.debug('- Using the following options: {}'.format(kwargs)) out_stats = self._compute_stats(dataset, max_workers=max_workers, sites_per_worker=sites_per_worker, lat_lon_only=lat_lon_only, extract_stats_kwargs=kwargs) return out_stats
[docs] @classmethod def run(cls, solar_h5, wind_h5, dataset, statistics='pearson', annual=True, diurnal=False, doy=False, month=False, combinations=False, res_cls=Resource, year=None, max_workers=None, sites_per_worker=1000, lat_lon_only=True, out_path=None): """ Compute temporal stats between solar and wind time-series at desired temporal scales Parameters ---------- solar_h5 : str Path to solar h5 file(s) wind_h5 : str Path to wind h5 file(s) dataset : tuple | str Dataset to compare, if a string, extract the same dataset for both with and solar, other wise a tuple of the form: (solar_dataset, wind_dataset) statistics : str | tuple | dict, optional Statistics to extract, either a key or tuple of keys in cls.STATS, or a dictionary of the form {'stat_name': {'func': *, 'kwargs: {**}}}, by default 'pearson' annual : bool, optional, Extract stats annualy. To extract multi-year monthly stats set `annual=False` and month=True`, by default True diurnal : bool, optional Extract diurnal stats, by default False doy : bool, optional Extract doy-of-year stats, by default False month : bool, optional Extract monthly stats, by default False combinations : bool, optional Extract all combinations of temporal stats, by default False res_cls : Class, optional Resource class to use to access res_h5, by default Resource year : str | int, optional Year to extract time-index for if running on a multi-year file, by default None max_workers : None | int, optional Number of workers to use, if 1 run in serial, if None use all available cores, by default None sites_per_worker : int, optional Number of sites to extract on each worker, by default 1000 lat_lon_only : bool, optional Only append lat, lon coordinates to stats, by default True out_path : str, optional .csv, or .json path to save statistics too, by default None Returns ------- out_stats : pandas.DataFrame DataFrame of resource statistics """ if isinstance(dataset, str) and year is None: try: year = parse_year(dataset) except RuntimeError: year = None hybrid_stats = cls(solar_h5, wind_h5, statistics=statistics, res_cls=res_cls, year=year) out_stats = hybrid_stats.compute_stats( dataset, annual=annual, diurnal=diurnal, doy=doy, month=month, combinations=combinations, max_workers=max_workers, sites_per_worker=sites_per_worker, lat_lon_only=lat_lon_only) if out_path is not None: hybrid_stats.save_stats(out_stats, out_path) return out_stats
[docs] @classmethod def cf_profile(cls, solar_h5, wind_h5, statistics='pearson', annual=True, diurnal=False, doy=False, month=False, combinations=False, res_cls=Resource, max_workers=None, sites_per_worker=1000, lat_lon_only=True, out_path=None): """ Compute temporal stats on cf_profile dataset Parameters ---------- solar_h5 : str Path to solar h5 file(s) wind_h5 : str Path to wind h5 file(s) statistics : str | tuple | dict, optional Statistics to extract, either a key or tuple of keys in cls.STATS, or a dictionary of the form {'stat_name': {'func': *, 'kwargs: {**}}}, by default 'pearson' annual : bool, optional, Extract stats annualy. To extract multi-year monthly stats set `annual=False` and month=True`, by default True diurnal : bool, optional Extract diurnal stats, by default False doy : bool, optional Extract doy-of-year stats, by default False month : bool, optional Extract monthly stats, by default False combinations : bool, optional Extract all combinations of temporal stats, by default False res_cls : Class, optional Resource class to use to access res_h5, by default Resource max_workers : None | int, optional Number of workers to use, if 1 run in serial, if None use all available cores, by default None sites_per_worker : int, optional Number of sites to extract on each worker, by default 1000 lat_lon_only : bool, optional Only append lat, lon coordinates to stats, by default True out_path : str, optional .csv, or .json path to save statistics too, by default None Returns ------- out_stats : pandas.DataFrame DataFrame of resource statistics """ out_stats = cls.run(solar_h5, wind_h5, 'cf_profile', statistics=statistics, res_cls=res_cls, annual=annual, diurnal=diurnal, doy=doy, month=month, combinations=combinations, max_workers=max_workers, sites_per_worker=sites_per_worker, lat_lon_only=lat_lon_only, out_path=out_path) return out_stats
[docs]class HybridCrossCorrelation(HybridStats): """ Compute the temporal cross correlations for co-located wind and solar generation """ def __init__(self, solar_h5, wind_h5, res_cls=Resource, year=None): """ Parameters ---------- solar_h5 : str Path to solar h5 file(s) wind_h5 : str Path to wind h5 file(s) res_cls : Class, optional Resource class to use to access res_h5, by default Resource year : str | int, optional Year to extract time-index for if running on a multi-year file, by default None """ self._solar_h5 = solar_h5 self._wind_h5 = wind_h5 self._res_cls = res_cls self._stats = None out = self._pre_flight_check(year=year) self._meta, self._time_index = out[:2] self._solar_time_slice, self._wind_time_slice = out[2:] def __repr__(self): msg = ('Computing cross-correlations between {} and {}' .format(self.solar_h5, self.wind_h5)) return msg
[docs] @staticmethod def cross_correlation(solar_data, wind_data, m): """ Compute the cross-correlation between solar and wind time-series data with time-lag m Parameters ---------- solar_data : ndarray Time-series solar data wind_data : ndarray Time-series wind data m : int Integer shift between solar and wind time-series, is pass directly to np.roll, so it will be the number of time-steps that are shifted. The lag time will be m * dt where dt is the time-step size. Returns ------- corr : ndarray Cross-correlation coefficient for each solar, wind site pair """ solar_u = solar_data.mean(axis=0) solar_s = solar_data.std(axis=0) solar_data = (solar_data - solar_u) / solar_s wind_data = np.roll(wind_data, m, axis=0) wind_u = wind_data.mean(axis=0) wind_s = wind_data.mean(axis=0) wind_data = (wind_data - wind_u) / wind_s n = len(solar_data) corr = (1 / (n - 1)) * np.sum(solar_data * wind_data, axis=0) return corr
@classmethod def _extract_stats(cls, solar_h5, wind_h5, dataset, sites, solar_time_slice, wind_time_slice, lag_range=(-50, 51, 1), res_cls=Resource): """ Extract stats for given dataset, sites, and temporal extent Parameters ---------- solar_h5 : str Path to solar h5 file(s) wind_h5 : str Path to wind h5 file(s) dataset : tuple Datasets to compare, in the form: (solar_dataset, wind_dataset) sites : pandas.DataFrame Subset of meta DataFrame with sites to extract solar_time_slice : slice | ndarray slice or boolean index of the solar timesteps that are in the coincident time_index wind_time_slice : slice | ndarray slice or boolean index of the wind timesteps that are in the coincident time_index lag_range : tuple, optional The range of lag (m) values to compute the cross-correlation for (start, stop, step). Cross-correlation will be run for all lags in range(start, stop, step), each value in the range is the number of timesteps by which the time-series will be shifted to compute the cross-correlation. by default (-50, 51, 1) res_cls : Class, optional Resource class to use to access res_h5, by default Resource Returns ------- out_stats : pandas.DataFrame DataFrame of desired statistics at desired time intervals """ wind_sites = sites['wind_gid'].values solar_sites = sites['solar_gid'].values solar_dataset, wind_dataset = dataset with res_cls(solar_h5) as f: solar_data = f[solar_dataset, solar_time_slice, solar_sites] with res_cls(wind_h5) as f: wind_data = f[wind_dataset, wind_time_slice, wind_sites] out_stats = {} for m in range(*lag_range): out_stats[m] = cls.cross_correlation(solar_data, wind_data, m) index = pd.Index(sites.index.values, name='gid') out_stats = pd.DataFrame(out_stats, index=index) out_stats['optimal_m'] = \ out_stats.columns[out_stats.values.argmax(axis=1)] return out_stats
[docs] def compute_stats(self, dataset, lag_range=(-50, 51, 1), max_workers=None, sites_per_worker=1000, lat_lon_only=True): """ Compute correlations Parameters ---------- dataset : tuple | str Dataset to compare, if a string, extract the same dataset for both with and solar, other wise a tuple of the form: (solar_dataset, wind_dataset) lag_range : tuple, optional The range of lag (m) values to compute the cross-correlation for (start, stop, step). Cross-correlation will be run for all lags in range(start, stop, step), each value in the range is the number of timesteps by which the time-series will be shifted to compute the cross-correlation. by default (-50, 51, 1) max_workers : None | int, optional Number of workers to use, if 1 run in serial, if None use all available cores, by default None sites_per_worker : int, optional Number of sites to extract on each worker, by default 1000 lat_lon_only : bool, optional Only append lat, lon coordinates to stats, by default True Returns ------- res_stats : pandas.DataFrame DataFrame of desired statistics at desired time intervals """ kwargs = {'res_cls': self.res_cls, 'lag_range': lag_range} logger.info('Computing cross correlations from {}'.format(dataset)) logger.debug('- Using the following options: {}'.format(kwargs)) out_stats = self._compute_stats(dataset, max_workers=max_workers, sites_per_worker=sites_per_worker, lat_lon_only=lat_lon_only, extract_stats_kwargs=kwargs) return out_stats
[docs] @classmethod def run(cls, solar_h5, wind_h5, dataset, lag_range=(-50, 51, 1), res_cls=Resource, year=None, max_workers=None, sites_per_worker=1000, lat_lon_only=True, out_path=None): """ Compute cross correlations between solar and wind time-series Parameters ---------- solar_h5 : str Path to solar h5 file(s) wind_h5 : str Path to wind h5 file(s) dataset : tuple | str Dataset to compare, if a string, extract the same dataset for both with and solar, other wise a tuple of the form: (solar_dataset, wind_dataset) year : str | int, optional Year to extract time-index for if running on a multi-year file, by default None lag_range : tuple, optional The range of lag (m) values to compute the cross-correlation for (start, stop, step). Cross-correlation will be run for all lags in range(start, stop, step), each value in the range is the number of timesteps by which the time-series will be shifted to compute the cross-correlation. by default (-50, 51, 1) res_cls : Class, optional Resource class to use to access res_h5, by default Resource max_workers : None | int, optional Number of workers to use, if 1 run in serial, if None use all available cores, by default None sites_per_worker : int, optional Number of sites to extract on each worker, by default 1000 lat_lon_only : bool, optional Only append lat, lon coordinates to stats, by default True out_path : str, optional .csv, or .json path to save statistics too, by default None Returns ------- out_stats : pandas.DataFrame DataFrame of resource statistics """ if isinstance(dataset, str) and year is None: try: year = parse_year(dataset) except RuntimeError: year = None hybrid_stats = cls(solar_h5, wind_h5, res_cls=res_cls, year=year) out_stats = hybrid_stats.compute_stats( dataset, lag_range=lag_range, max_workers=max_workers, sites_per_worker=sites_per_worker, lat_lon_only=lat_lon_only) if out_path is not None: hybrid_stats.save_stats(out_stats, out_path) return out_stats
[docs] @classmethod def cf_profile(cls, solar_h5, wind_h5, lag_range=(-50, 51, 1), res_cls=Resource, max_workers=None, sites_per_worker=1000, lat_lon_only=True, out_path=None): """ Compute cross correlations on cf_profile dataset Parameters ---------- solar_h5 : str Path to solar h5 file(s) wind_h5 : str Path to wind h5 file(s) dataset : str Dataset to extract stats for lag_range : tuple, optional The range of lag (m) values to compute the cross-correlation for (start, stop, step). Cross-correlation will be run for all lags in range(start, stop, step), each value in the range is the number of timesteps by which the time-series will be shifted to compute the cross-correlation. by default (-50, 51, 1) res_cls : Class, optional Resource class to use to access res_h5, by default Resource max_workers : None | int, optional Number of workers to use, if 1 run in serial, if None use all available cores, by default None sites_per_worker : int, optional Number of sites to extract on each worker, by default 1000 lat_lon_only : bool, optional Only append lat, lon coordinates to stats, by default True out_path : str, optional .csv, or .json path to save statistics too, by default None Returns ------- out_stats : pandas.DataFrame DataFrame of resource statistics """ out_stats = cls.run(solar_h5, wind_h5, 'cf_profile', res_cls=res_cls, lag_range=lag_range, max_workers=max_workers, sites_per_worker=sites_per_worker, lat_lon_only=lat_lon_only, out_path=out_path) return out_stats
[docs]class HybridStabilityCoefficient(HybridStats): """ Compute the annual/monthly stability coefficient for co-located wind and solar """ def __init__(self, solar_h5, wind_h5, res_cls=Resource, year=None): """ Parameters ---------- solar_h5 : str Path to solar h5 file(s) wind_h5 : str Path to wind h5 file(s) res_cls : Class, optional Resource class to use to access res_h5, by default Resource year : str | int, optional Year to extract time-index for if running on a multi-year file, by default None """ self._solar_h5 = solar_h5 self._wind_h5 = wind_h5 self._res_cls = res_cls self._stats = None out = self._pre_flight_check(year=year) self._meta, self._time_index = out[:2] self._solar_time_slice, self._wind_time_slice = out[2:] def __repr__(self): msg = ('Computing stability coefficient between {} and {}' .format(self.solar_h5, self.wind_h5)) return msg @staticmethod def _daily_variability(doy): """ Compute the daily variability Parameters ---------- doy : pandas.DataFrameGroupby Time-series DataFrame grouped by day-of-year Returns ------- var : pandas.DataFrame Daily variablility by site """ var = np.sqrt(np.sum((doy - doy.mean())**2)) return var
[docs] @classmethod def stability_coefficient(cls, mix, ref): """ Compute average stability coefficient Parameters ---------- mix : pandas.DataFrame DataFrame of mixed solar and wind time-series ref : pandas.DataFrame DataFrame of reference (solar or wind) time-series Returns ------- stab : ndarray Vector of the average stability coefficient for all days in the provided time-series data. Averages are by site. """ mix = mix.groupby(mix.index.dayofyear) mix_var = mix.apply(cls._daily_variability) ref = ref.groupby(ref.index.dayofyear) ref_var = ref.apply(cls._daily_variability) stab = 1 - ((mix_var / ref_var) * (ref.mean() / mix.mean())) mask = np.isfinite(stab) if not np.all(mask): stab[~mask] = np.nan return stab.mean().values.astype(np.float32)
@classmethod def _compute_coefficients(cls, solar_data, wind_data, solar_cap=None, wind_cap=None, annual=True, month=False, reference='solar'): """ Compute compute average stability coefficient of solar and wind data over desired time intervals Parameters ---------- solar_data : pandas.DataFrame DataFrame of solar data. Index is time_index, columns are sites wind_data : pandas.DataFrame DataFrame of wind data. Index is time_index, columns are sites annual : bool, optional, Extract stats annualy. To extract multi-year monthly stats set `annual=False` and month=True`, by default True month : bool, optional Extract monthly stats, by default False reference : str, optional Which data to use as the reference (denominator) when computing the stability coefficient, by default 'solar' Returns ------- out_stats : pandas.DataFrame DataFrame of stability coefficients for given sites and desired time intervals """ sites = solar_data.columns.values if solar_cap is None or wind_cap is None: mix = (solar_data + wind_data) / 2 else: mix = ((solar_data * solar_cap + wind_data * wind_cap) / (solar_cap + wind_cap)) mix = cls._groupby_data(mix, annual=annual, month=month) solar_data = cls._groupby_data(solar_data, annual=annual, month=month) wind_data = cls._groupby_data(wind_data, annual=annual, month=month) if reference.lower() == 'solar': ref = solar_data else: ref = wind_data cols_map, _ = cls._create_names(list(mix.groups), ['stability']) out_stats = {} for grp_name, mix_grp in mix: grp_name = grp_name[0] if len(grp_name) == 1 else grp_name col = cols_map['stability'][grp_name] ref_grp = ref.get_group(grp_name) msg = ('mixed and reference data shapes do not match! {} != {}' .format(mix_grp.shape, ref_grp.shape)) assert mix_grp.shape == ref_grp.shape, msg out_stats[col] = cls.stability_coefficient(mix_grp, ref_grp) out_stats = [pd.DataFrame(out_stats, index=sites, dtype=np.float32)] means = zip(['solar', 'wind', 'reference', 'mixed'], [solar_data, wind_data, ref, mix]) for name, data in means: _, cols = cls._create_names(list(data.groups), [f'{name}_cf']) mean_data = data.aggregate(np.nanmean).T.astype(np.float32) mean_data.columns = cols out_stats.append(mean_data) return pd.concat(out_stats, axis=1) @classmethod def _extract_stats(cls, solar_h5, wind_h5, dataset, sites, solar_time_slice, wind_time_slice, time_index=None, res_cls=Resource, reference='solar', annual=True, month=False, combinations=False): """ Extract stats for given dataset, sites, and temporal extent Parameters ---------- solar_h5 : str Path to solar h5 file(s) wind_h5 : str Path to wind h5 file(s) dataset : tuple Datasets to compare, in the form: (solar_dataset, wind_dataset) sites : pandas.DataFrame Subset of meta DataFrame with sites to extract solar_time_slice : slice | ndarray slice or boolean index of the solar timesteps that are in the coincident time_index wind_time_slice : slice | ndarray slice or boolean index of the wind timesteps that are in the coincident time_index time_index : pandas.DatatimeIndex, optional Timeseries DatetimeIndex, if None extract from wind_h5, by default None res_cls : Class, optional Resource class to use to access res_h5, by default Resource reference : str, optional Which data to use as the reference (denominator) when computing the stability coefficient, by default 'solar' annual : bool, optional, Extract stats annualy. To extract multi-year monthly stats set `annual=False` and month=True`, by default True month : bool, optional Extract monthly stats, by default False combinations : bool, optional Extract all combinations of temporal stats, by default False Returns ------- out_stats : pandas.DataFrame DataFrame of desired statistics at desired time intervals """ solar_dataset, wind_dataset = dataset solar_sites = sites['solar_gid'].values tz = sites['timezone'].values.copy() solar_cap = None if 'solar_cap' in sites: solar_cap = sites['solar_cap'].values with res_cls(solar_h5) as f: solar_data = f[solar_dataset, solar_time_slice, solar_sites] solar_data = roll_timeseries(solar_data, tz) solar_data = pd.DataFrame(solar_data, index=time_index) wind_sites = sites['wind_gid'].values wind_cap = None if 'wind_cap' in sites: wind_cap = sites['wind_cap'].values with res_cls(wind_h5) as f: if time_index is None: time_index = f.time_index wind_data = f[wind_dataset, wind_time_slice, wind_sites] wind_data = roll_timeseries(wind_data, tz) wind_data = pd.DataFrame(wind_data, index=time_index) if combinations: out_stats = [cls._compute_coefficients(solar_data, wind_data, solar_cap=solar_cap, wind_cap=wind_cap, reference=reference )] if month: out_stats.append(cls._compute_coefficients(solar_data, wind_data, solar_cap=solar_cap, wind_cap=wind_cap, reference=reference, annual=False, month=True)) out_stats = pd.concat(out_stats, axis=1) else: out_stats = cls._compute_coefficients(solar_data, wind_data, solar_cap=solar_cap, wind_cap=wind_cap, reference=reference, annual=annual, month=month) out_stats.index = sites.index.values out_stats.index.name = 'gid' return out_stats
[docs] def compute_stats(self, dataset, reference='solar', annual=True, month=False, combinations=False, max_workers=None, sites_per_worker=1000, lat_lon_only=True): """ Compute stability coefficients Parameters ---------- dataset : tuple | str Dataset to compare, if a string, extract the same dataset for both with and solar, other wise a tuple of the form: (solar_dataset, wind_dataset) reference : str, optional Which data to use as the reference (denominator) when computing the stability coefficient, by default 'solar' annual : bool, optional, Extract stats annualy. To extract multi-year monthly stats set `annual=False` and month=True`, by default True month : bool, optional Extract monthly stats, by default False combinations : bool, optional Extract all combinations of temporal stats, by default False max_workers : None | int, optional Number of workers to use, if 1 run in serial, if None use all available cores, by default None sites_per_worker : int, optional Number of sites to extract on each worker, by default 1000 lat_lon_only : bool, optional Only append lat, lon coordinates to stats, by default True Returns ------- res_stats : pandas.DataFrame DataFrame of desired statistics at desired time intervals """ kwargs = {'time_index': self.time_index, 'res_cls': self.res_cls, 'annual': annual, 'month': month, 'combinations': combinations, 'reference': reference} logger.info('Computing stability coefficients from {}'.format(dataset)) logger.debug('- Using the following options: {}'.format(kwargs)) out_stats = self._compute_stats(dataset, max_workers=max_workers, sites_per_worker=sites_per_worker, lat_lon_only=lat_lon_only, extract_stats_kwargs=kwargs) return out_stats
[docs] @classmethod def run(cls, solar_h5, wind_h5, dataset, reference='solar', annual=True, month=False, combinations=False, res_cls=Resource, year=None, max_workers=None, sites_per_worker=1000, lat_lon_only=True, out_path=None): """ Compute stability coefficient between solar and wind time-series. Time-series are shifted to local time before computing the daily stability coefficient. Final data is the average of daily stability coefficients for each month and/or year. Parameters ---------- solar_h5 : str Path to solar h5 file(s) wind_h5 : str Path to wind h5 file(s) dataset : tuple | str Dataset to compare, if a string, extract the same dataset for both with and solar, other wise a tuple of the form: (solar_dataset, wind_dataset) reference : str, optional Which data to use as the reference (denominator) when computing the stability coefficient, by default 'solar' annual : bool, optional, Extract stats annualy. To extract multi-year monthly stats set `annual=False` and month=True`, by default True month : bool, optional Extract monthly stats, by default False combinations : bool, optional Extract all combinations of temporal stats, by default False res_cls : Class, optional Resource class to use to access res_h5, by default Resource year : str | int, optional Year to extract time-index for if running on a multi-year file, by default None max_workers : None | int, optional Number of workers to use, if 1 run in serial, if None use all available cores, by default None sites_per_worker : int, optional Number of sites to extract on each worker, by default 1000 lat_lon_only : bool, optional Only append lat, lon coordinates to stats, by default True out_path : str, optional .csv, or .json path to save statistics too, by default None Returns ------- out_stats : pandas.DataFrame DataFrame of resource statistics """ if isinstance(dataset, str) and year is None: try: year = parse_year(dataset) except RuntimeError: year = None hybrid_stats = cls(solar_h5, wind_h5, res_cls=res_cls, year=year) out_stats = hybrid_stats.compute_stats( dataset, annual=annual, month=month, combinations=combinations, reference=reference, max_workers=max_workers, sites_per_worker=sites_per_worker, lat_lon_only=lat_lon_only) if out_path is not None: hybrid_stats.save_stats(out_stats, out_path) return out_stats
[docs] @classmethod def cf_profile(cls, solar_h5, wind_h5, reference='solar', annual=True, month=False, combinations=False, res_cls=Resource, max_workers=None, sites_per_worker=1000, lat_lon_only=True, out_path=None): """ Compute stability coefficient between solar and wind time-series. Time-series are shifted to local time before computing the daily stability coefficient. Final data is the average of daily stability coefficients for each month and/or year. Parameters ---------- solar_h5 : str Path to solar h5 file(s) wind_h5 : str Path to wind h5 file(s) reference : str, optional Which data to use as the reference (denominator) when computing the stability coefficient, by default 'solar' annual : bool, optional, Extract stats annualy. To extract multi-year monthly stats set `annual=False` and month=True`, by default True month : bool, optional Extract monthly stats, by default False combinations : bool, optional Extract all combinations of temporal stats, by default False res_cls : Class, optional Resource class to use to access res_h5, by default Resource max_workers : None | int, optional Number of workers to use, if 1 run in serial, if None use all available cores, by default None sites_per_worker : int, optional Number of sites to extract on each worker, by default 1000 lat_lon_only : bool, optional Only append lat, lon coordinates to stats, by default True out_path : str, optional .csv, or .json path to save statistics too, by default None Returns ------- out_stats : pandas.DataFrame DataFrame of resource statistics """ out_stats = cls.run(solar_h5, wind_h5, 'cf_profile', reference=reference, annual=annual, month=month, combinations=combinations, res_cls=res_cls, max_workers=max_workers, sites_per_worker=sites_per_worker, lat_lon_only=lat_lon_only, out_path=out_path) return out_stats