Source code for reV.rep_profiles.rep_profiles

# -*- coding: utf-8 -*-
"""Representative profile extraction utilities.

Created on Thu Oct 31 12:49:23 2019

@author: gbuster
"""
from abc import ABC, abstractmethod
from concurrent.futures import as_completed
from copy import deepcopy
import json
import logging
import numpy as np
import os
import pandas as pd
from scipy import stats
from warnings import warn


from reV.handlers.outputs import Outputs
from reV.utilities.exceptions import FileInputError, DataShapeError
from reV.utilities import log_versions

from rex.resource import Resource
from rex.utilities.execution import SpawnProcessPool
from rex.utilities.loggers import log_mem
from rex.utilities.utilities import parse_year, to_records_array

logger = logging.getLogger(__name__)


class RepresentativeMethods:
    """Class for organizing the methods to determine representative-ness"""

    def __init__(self, profiles, weights=None, rep_method='meanoid',
                 err_method='rmse'):
        """
        Parameters
        ----------
        profiles : np.ndarray
            (time, sites) timeseries array of cf profile data.
        weights : np.ndarray | list
            1D array of weighting factors (multiplicative) for profiles.
        rep_method : str
            Method identifier for calculation of the representative profile.
        err_method : str | None
            Method identifier for calculation of error from the representative
            profile (e.g. "rmse", "mae", "mbe"). If this is None, the
            representative meanoid / medianoid profile will be returned
            directly
        """
        self._rep_method = self.rep_methods[rep_method]
        self._err_method = self.err_methods[err_method]
        self._profiles = profiles
        self._weights = weights
        self._parse_weights()

    def _parse_weights(self):
        """Parse the weights attribute. Check shape and make np.array."""
        if isinstance(self._weights, (list, tuple)):
            self._weights = np.array(self._weights)

        if self._weights is not None:
            emsg = ('Weighting factors array of length {} does not match '
                    'profiles of shape {}'
                    .format(len(self._weights), self._profiles.shape[1]))
            # Kept as an assert so the exception type seen by existing
            # callers (AssertionError) does not change.
            assert len(self._weights) == self._profiles.shape[1], emsg

    @property
    def rep_methods(self):
        """Lookup table of representative methods"""
        methods = {'mean': self.meanoid,
                   'meanoid': self.meanoid,
                   'median': self.medianoid,
                   'medianoid': self.medianoid,
                   }

        return methods

    @property
    def err_methods(self):
        """Lookup table of error methods"""
        methods = {'mbe': self.mbe,
                   'mae': self.mae,
                   'rmse': self.rmse,
                   None: None,
                   }

        return methods

    @staticmethod
    def nargmin(arr, n):
        """Get the index of the Nth min value in arr.

        Parameters
        ----------
        arr : np.ndarray
            1D array.
        n : int
            If n is 0, this returns the location of the min value in arr.
            If n is 1, this returns the location of the 2nd min value in arr.
            If n is greater than or equal to the length of arr, the location
            of the max value is returned.

        Returns
        -------
        i : int
            Location of the Nth min value in arr.
        """
        return arr.argsort()[:(n + 1)][-1]

    @staticmethod
    def meanoid(profiles, weights=None):
        """Find the mean profile across all sites.

        Parameters
        ----------
        profiles : np.ndarray
            (time, sites) timeseries array of cf profile data.
        weights : np.ndarray | list
            1D array of weighting factors (multiplicative) for profiles.

        Returns
        -------
        arr : np.ndarray
            (time, 1) timeseries of the mean of all cf profiles across sites.
        """
        if weights is None:
            arr = profiles.mean(axis=1).reshape((len(profiles), 1))
        else:
            if not isinstance(weights, np.ndarray):
                weights = np.array(weights)

            arr = (profiles * weights).sum(axis=1) / weights.sum()

        if len(arr.shape) == 1:
            arr = np.expand_dims(arr, axis=1)

        return arr

    @staticmethod
    def medianoid(profiles, weights=None):
        """Find the median profile across all sites.

        Parameters
        ----------
        profiles : np.ndarray
            (time, sites) timeseries array of cf profile data.
        weights : np.ndarray | list | None
            Accepted so that this method can be called with the same
            signature as :meth:`meanoid` (``run`` passes ``weights``
            whenever they are set). Weights are NOT applied: the unweighted
            median is computed and a warning is issued if weights are given.

        Returns
        -------
        arr : np.ndarray
            (time, 1) timeseries of the median at every timestep of all cf
            profiles across sites.
        """
        if weights is not None:
            warn('Weights are not supported by the medianoid method and '
                 'will be ignored. Computing the unweighted median profile.')

        arr = np.median(profiles, axis=1)
        arr = arr.reshape((len(profiles), 1))

        return arr

    @classmethod
    def mbe(cls, profiles, baseline, i_profile=0):
        """Calculate the mean bias error of profiles vs. a baseline profile.

        Profiles are ranked by the magnitude of their mean bias so that the
        profile with the bias closest to zero is the most representative.

        Parameters
        ----------
        profiles : np.ndarray
            (time, sites) timeseries array of cf profile data.
        baseline : np.ndarray
            (time, 1) timeseries of the meanoid or medianoid to which
            cf profiles should be compared.
        i_profile : int
            The index of the representative profile being saved
            (for n_profiles). 0 is the most representative profile.

        Returns
        -------
        profile : np.ndarray
            (time, ) array for the most representative profile
        i_rep : int
            Column Index in profiles of the representative profile.
        """
        diff = profiles - baseline.reshape((len(baseline), 1))
        # Rank on |bias|: ranking on the signed bias would select the most
        # negatively-biased profile instead of the least-biased one.
        mbe = np.abs(diff.mean(axis=0))
        i_rep = cls.nargmin(mbe, i_profile)

        return profiles[:, i_rep], i_rep

    @classmethod
    def mae(cls, profiles, baseline, i_profile=0):
        """Calculate the mean absolute error of profiles vs. a baseline profile

        Parameters
        ----------
        profiles : np.ndarray
            (time, sites) timeseries array of cf profile data.
        baseline : np.ndarray
            (time, 1) timeseries of the meanoid or medianoid to which
            cf profiles should be compared.
        i_profile : int
            The index of the representative profile being saved
            (for n_profiles). 0 is the most representative profile.

        Returns
        -------
        profile : np.ndarray
            (time, ) array for the most representative profile
        i_rep : int
            Column Index in profiles of the representative profile.
        """
        diff = profiles - baseline.reshape((len(baseline), 1))
        mae = np.abs(diff).mean(axis=0)
        i_rep = cls.nargmin(mae, i_profile)

        return profiles[:, i_rep], i_rep

    @classmethod
    def rmse(cls, profiles, baseline, i_profile=0):
        """Calculate the RMSE of profiles vs. a baseline profile

        Parameters
        ----------
        profiles : np.ndarray
            (time, sites) timeseries array of cf profile data.
        baseline : np.ndarray
            (time, 1) timeseries of the meanoid or medianoid to which
            cf profiles should be compared.
        i_profile : int
            The index of the representative profile being saved
            (for n_profiles). 0 is the most representative profile.

        Returns
        -------
        profile : np.ndarray
            (time, ) array for the most representative profile
        i_rep : int
            Column Index in profiles of the representative profile.
        """
        rmse = profiles - baseline.reshape((len(baseline), 1))
        rmse **= 2
        rmse = np.sqrt(np.mean(rmse, axis=0))
        i_rep = cls.nargmin(rmse, i_profile)

        return profiles[:, i_rep], i_rep

    @classmethod
    def run(cls, profiles, weights=None, rep_method='meanoid',
            err_method='rmse', n_profiles=1):
        """Run representative profile methods.

        Parameters
        ----------
        profiles : np.ndarray
            (time, sites) timeseries array of cf profile data.
        weights : np.ndarray | list
            1D array of weighting factors (multiplicative) for profiles.
        rep_method : str
            Method identifier for calculation of the representative profile.
        err_method : str | None
            Method identifier for calculation of error from the representative
            profile (e.g. "rmse", "mae", "mbe"). If this is None, the
            representative meanoid / medianoid profile will be returned
            directly.
        n_profiles : int
            Number of representative profiles to save to fout.

        Returns
        -------
        profiles : np.ndarray
            (time, n_profiles) array for the most representative profile(s)
        i_reps : list
            List (length of n_profiles) with column Index in profiles of the
            representative profile(s). If err_method is None, this is the
            single-element list ``[None]``.
        """
        inst = cls(profiles, weights=weights, rep_method=rep_method,
                   err_method=err_method)

        if inst._weights is not None:
            baseline = inst._rep_method(inst._profiles, weights=inst._weights)
        else:
            baseline = inst._rep_method(inst._profiles)

        if err_method is None:
            # No error metric: the baseline itself is the result.
            profiles = baseline
            i_reps = [None]
        else:
            profiles = None
            i_reps = []
            for i in range(n_profiles):
                p, ir = inst._err_method(inst._profiles, baseline,
                                         i_profile=i)
                if profiles is None:
                    # Allocate lazily so the output dtype matches the data.
                    profiles = np.zeros((len(p), n_profiles), dtype=p.dtype)

                profiles[:, i] = p
                i_reps.append(ir)

        return profiles, i_reps
class RegionRepProfile:
    """Framework to handle rep profile for one resource region"""

    RES_GID_COL = 'res_gids'
    GEN_GID_COL = 'gen_gids'

    def __init__(self, gen_fpath, rev_summary, cf_dset='cf_profile',
                 rep_method='meanoid', err_method='rmse', weight='gid_counts',
                 n_profiles=1):
        """
        Parameters
        ----------
        gen_fpath : str
            Filepath to reV gen output file to extract "cf_profile" from.
        rev_summary : pd.DataFrame
            Aggregated rev supply curve summary file trimmed to just one
            region to get a rep profile for.
            Must include "res_gids", "gen_gids", and the "weight" column (if
            weight is not None)
        cf_dset : str
            Dataset name to pull generation profiles from.
        rep_method : str
            Method identifier for calculation of the representative profile.
        err_method : str | None
            Method identifier for calculation of error from the representative
            profile (e.g. "rmse", "mae", "mbe"). If this is None, the
            representative meanoid / medianoid profile will be returned
            directly
        weight : str | None
            Column in rev_summary used to apply weighted mean to profiles.
            The supply curve table data in the weight column should have
            weight values corresponding to the res_gids in the same row.
        n_profiles : int
            Number of representative profiles to retrieve.
        """
        self._gen_fpath = gen_fpath
        self._rev_summary = rev_summary
        self._cf_dset = cf_dset
        self._profiles = None
        self._source_profiles = None
        self._weights = None
        self._i_reps = None
        self._rep_method = rep_method
        self._err_method = err_method
        self._weight = weight
        self._n_profiles = n_profiles
        self._gen_gids = None
        self._res_gids = None

        self._init_profiles_weights()

    def _init_profiles_weights(self):
        """Initialize the base source profiles and weight arrays"""
        gen_gids = self._get_region_attr(self._rev_summary, self.GEN_GID_COL)
        res_gids = self._get_region_attr(self._rev_summary, self.RES_GID_COL)

        # Placeholder weights keep the sorting dataframe rectangular when no
        # weight column is requested; they are discarded again below.
        self._weights = np.ones(len(res_gids))
        if self._weight is not None:
            self._weights = self._get_region_attr(self._rev_summary,
                                                  self._weight)

        # Sort gen gids and weights together with the resource gids so all
        # three stay aligned with the profile columns extracted below.
        df = pd.DataFrame({self.GEN_GID_COL: gen_gids,
                           self.RES_GID_COL: res_gids,
                           'weights': self._weights})
        df = df.sort_values(self.RES_GID_COL)
        self._gen_gids = df[self.GEN_GID_COL].values
        self._res_gids = df[self.RES_GID_COL].values
        if self._weight is not None:
            self._weights = df['weights'].values
        else:
            self._weights = None

        with Resource(self._gen_fpath) as res:
            meta = res.meta

            assert 'gid' in meta
            source_res_gids = meta['gid'].values
            msg = ('Resource gids from "gid" column in meta data from "{}" '
                   'must be sorted! reV generation should always be run with '
                   'sequential project points.'.format(self._gen_fpath))
            assert np.all(source_res_gids[:-1] <= source_res_gids[1:]), msg

            missing = set(self._res_gids) - set(source_res_gids)
            msg = ('The following resource gids were found in the rev summary '
                   'supply curve file but not in the source generation meta '
                   'data: {}'.format(missing))
            # Test emptiness of the set, not truthiness of its members:
            # a missing gid of 0 must still trigger the error.
            assert not missing, msg

            # Map each (possibly repeated) resource gid onto the column of
            # the generation dataset that holds its profile.
            unique_res_gids, u_idxs = np.unique(self._res_gids,
                                                return_inverse=True)
            iloc = np.where(np.isin(source_res_gids, unique_res_gids))[0]
            self._source_profiles = res[self._cf_dset, :, iloc[u_idxs]]

    @property
    def source_profiles(self):
        """Retrieve the cf profile array from the source generation h5 file.

        Returns
        -------
        profiles : np.ndarray
            Timeseries array of cf profile data.
        """
        return self._source_profiles

    @property
    def weights(self):
        """Get the weights array

        Returns
        -------
        weights : np.ndarray | None
            Flat array of weight values from the weight column. The supply
            curve table data in the weight column should have a list of weight
            values corresponding to the gen_gids list in the same row.
        """
        return self._weights

    @staticmethod
    def _get_region_attr(rev_summary, attr_name):
        """Retrieve a flat list of attribute data from a col in rev summary.

        Parameters
        ----------
        rev_summary : pd.DataFrame
            Aggregated rev supply curve summary file trimmed to just one
            region to get a rep profile for.
            Must include "res_gids", "gen_gids", and the "weight" column (if
            weight is not None)
        attr_name : str
            Column label to extract flattened data from (gen_gids,
            gid_counts, etc...)

        Returns
        -------
        data : list
            Flat list of data from the column with label "attr_name".
            Either a list of numbers or strings. Lists of jsonified lists
            (or string representations of python lists / tuples) will be
            unpacked.
        """
        import ast

        def _load_sequence(text):
            """Parse a JSON list string, falling back to a python literal
            (e.g. a tuple repr, which is not valid JSON)."""
            try:
                return json.loads(text)
            except json.JSONDecodeError as json_err:
                try:
                    return ast.literal_eval(text)
                except (ValueError, SyntaxError):
                    # Neither JSON nor a python literal: surface the
                    # original JSON error as before.
                    raise json_err

        data = rev_summary[attr_name].values.tolist()

        if data:
            first = data[0]
            if isinstance(first, str):
                # Both the opening and closing bracket must be present;
                # the type of the first entry decides for the whole column.
                looks_like_list = '[' in first and ']' in first
                looks_like_tuple = '(' in first and ')' in first
                if looks_like_list or looks_like_tuple:
                    data = [_load_sequence(s) for s in data]

            if isinstance(data[0], (list, tuple)):
                data = [a for b in data for a in b]

        return data

    def _run_rep_methods(self):
        """Run the representative profile methods to find the meanoid/medianoid
        profile and find the profiles most similar."""

        if self.weights is not None:
            if len(self.weights) != self.source_profiles.shape[1]:
                e = ('Weights column "{}" resulted in {} weight scalars '
                     'which doesnt match gid column which yields '
                     'profiles with shape {}.'
                     .format(self._weight, len(self.weights),
                             self.source_profiles.shape))
                logger.debug('Gids from column "res_gids" with len {}: {}'
                             .format(len(self._res_gids), self._res_gids))
                logger.debug('Weights from column "{}" with len {}: {}'
                             .format(self._weight, len(self.weights),
                                     self.weights))
                logger.error(e)
                raise DataShapeError(e)

        self._profiles, self._i_reps = RepresentativeMethods.run(
            self.source_profiles, weights=self.weights,
            rep_method=self._rep_method, err_method=self._err_method,
            n_profiles=self._n_profiles)

    @property
    def rep_profiles(self):
        """Get the representative profiles of this region."""
        if self._profiles is None:
            self._run_rep_methods()

        return self._profiles

    @property
    def i_reps(self):
        """Get the representative profile index(es) of this region."""
        if self._i_reps is None:
            self._run_rep_methods()

        return self._i_reps

    @property
    def rep_gen_gids(self):
        """Get the representative profile gen gids of this region.

        Returns
        -------
        rep_gids : list | None
            Generation gid for every entry in i_reps, or None if no
            representative index is available (i_reps is ``[None]``).
        """
        gids = self._gen_gids
        if self.i_reps[0] is None:
            rep_gids = None
        else:
            rep_gids = [gids[i] for i in self.i_reps]

        return rep_gids

    @property
    def rep_res_gids(self):
        """Get the representative profile resource gids of this region.

        Returns
        -------
        rep_gids : list
            Resource gid for every entry in i_reps, or ``[None]`` if no
            representative index or no resource gids are available.
        """
        gids = self._res_gids
        if self.i_reps[0] is None or gids is None:
            rep_gids = [None]
        else:
            rep_gids = [gids[i] for i in self.i_reps]

        return rep_gids

    @classmethod
    def get_region_rep_profile(cls, gen_fpath, rev_summary,
                               cf_dset='cf_profile',
                               rep_method='meanoid',
                               err_method='rmse',
                               weight='gid_counts',
                               n_profiles=1):
        """Class method for parallelization of rep profile calc.

        Parameters
        ----------
        gen_fpath : str
            Filepath to reV gen output file to extract "cf_profile" from.
        rev_summary : pd.DataFrame
            Aggregated rev supply curve summary file trimmed to just one
            region to get a rep profile for.
            Must include "res_gids", "gen_gids", and the "weight" column (if
            weight is not None)
        cf_dset : str
            Dataset name to pull generation profiles from.
        rep_method : str
            Method identifier for calculation of the representative profile.
        err_method : str | None
            Method identifier for calculation of error from the representative
            profile (e.g. "rmse", "mae", "mbe"). If this is None, the
            representative meanoid / medianoid profile will be returned
            directly
        weight : str | None
            Column in rev_summary used to apply weighted mean to profiles.
            The supply curve table data in the weight column should have
            weight values corresponding to the res_gids in the same row.
        n_profiles : int
            Number of representative profiles to retrieve.

        Returns
        -------
        rep_profile : np.ndarray
            (time, n_profiles) array for the most representative profile(s)
        i_rep : list
            Column Index in profiles of the representative profile(s).
        gen_gid_reps : list | None
            Generation gid(s) of the representative profile(s).
        res_gid_reps : list
            Resource gid(s) of the representative profile(s).
        """
        r = cls(gen_fpath, rev_summary, cf_dset=cf_dset,
                rep_method=rep_method, err_method=err_method, weight=weight,
                n_profiles=n_profiles)

        return r.rep_profiles, r.i_reps, r.rep_gen_gids, r.rep_res_gids
class RepProfilesBase(ABC):
    """Abstract utility framework for representative profile run classes."""

    def __init__(self, gen_fpath, rev_summary, reg_cols=None,
                 cf_dset='cf_profile', rep_method='meanoid',
                 err_method='rmse', weight='gid_counts',
                 n_profiles=1):
        """
        Parameters
        ----------
        gen_fpath : str
            Filepath to reV gen output file to extract "cf_profile" from.
        rev_summary : str | pd.DataFrame
            Aggregated rev supply curve summary file. Str filepath or full df.
            Must include "res_gids", "gen_gids", and the "weight" column (if
            weight is not None)
        reg_cols : str | list | None
            Label(s) for a categorical region column(s) to extract profiles
            for. e.g. "state" will extract a rep profile for each unique entry
            in the "state" column in rev_summary.
        cf_dset : str
            Dataset name to pull generation profiles from.
        rep_method : str
            Method identifier for calculation of the representative profile.
        err_method : str | None
            Method identifier for calculation of error from the representative
            profile (e.g. "rmse", "mae", "mbe"). If this is None, the
            representative meanoid / medianoid profile will be returned
            directly
        weight : str | None
            Column in rev_summary used to apply weighted mean to profiles.
            The supply curve table data in the weight column should have
            weight values corresponding to the res_gids in the same row.
        n_profiles : int
            Number of representative profiles to save to fout.
        """

        logger.info('Running rep profiles with gen_fpath: "{}"'
                    .format(gen_fpath))
        logger.info('Running rep profiles with rev_summary: "{}"'
                    .format(rev_summary))
        logger.info('Running rep profiles with region columns: "{}"'
                    .format(reg_cols))
        logger.info('Running rep profiles with representative method: "{}"'
                    .format(rep_method))
        logger.info('Running rep profiles with error method: "{}"'
                    .format(err_method))
        logger.info('Running rep profiles with weight factor: "{}"'
                    .format(weight))

        self._weight = weight
        self._n_profiles = n_profiles
        self._cf_dset = cf_dset
        self._gen_fpath = gen_fpath
        self._reg_cols = reg_cols

        self._rev_summary = self._parse_rev_summary(rev_summary)

        # Validate all inputs up front, before any profile is computed.
        self._check_req_cols(self._rev_summary, self._reg_cols)
        self._check_req_cols(self._rev_summary, self._weight)
        self._check_req_cols(self._rev_summary, RegionRepProfile.RES_GID_COL)
        self._check_req_cols(self._rev_summary, RegionRepProfile.GEN_GID_COL)

        self._check_rev_gen(gen_fpath, cf_dset, self._rev_summary)
        self._time_index = None
        self._meta = None
        self._profiles = None
        self._rep_method = rep_method
        self._err_method = err_method

    @staticmethod
    def _parse_rev_summary(rev_summary):
        """Extract, parse, and check the rev summary table.

        Parameters
        ----------
        rev_summary : str | pd.DataFrame
            Aggregated rev supply curve summary file. Str filepath or full df.
            Must include "res_gids", "gen_gids", and the "weight" column (if
            weight is not None)

        Returns
        -------
        rev_summary : pd.DataFrame
            Aggregated rev supply curve summary file. Full df.
            Must include "res_gids", "gen_gids", and the "weight" column (if
            weight is not None)

        Raises
        ------
        FileInputError
            If a string input is not an existing .csv or .json file.
        TypeError
            If the input is neither a string nor a dataframe.
        """

        if isinstance(rev_summary, str):
            if os.path.exists(rev_summary) and rev_summary.endswith('.csv'):
                rev_summary = pd.read_csv(rev_summary)
            elif os.path.exists(rev_summary) and rev_summary.endswith('.json'):
                rev_summary = pd.read_json(rev_summary)
            else:
                e = 'Could not parse reV summary file: {}'.format(rev_summary)
                logger.error(e)
                raise FileInputError(e)
        elif not isinstance(rev_summary, pd.DataFrame):
            e = ('Bad input dtype for rev_summary input: {}'
                 .format(type(rev_summary)))
            logger.error(e)
            raise TypeError(e)

        return rev_summary

    @staticmethod
    def _check_req_cols(df, cols):
        """Check a dataframe for required columns.

        Parameters
        ----------
        df : pd.DataFrame
            Dataframe to check columns.
        cols : str | list | tuple | None
            Required columns in df. None skips the check.

        Raises
        ------
        KeyError
            If any of the required columns is not found in df.
        """
        if cols is not None:
            if isinstance(cols, str):
                cols = [cols]

            missing = [c for c in cols if c not in df]

            # Check list emptiness rather than any(): falsy labels such as
            # 0 or '' are still missing columns.
            if missing:
                e = ('Column labels not found in rev_summary table: {}'
                     .format(missing))
                logger.error(e)
                raise KeyError(e)

    @staticmethod
    def _check_rev_gen(gen_fpath, cf_dset, rev_summary):
        """Check rev gen file for requisite datasets.

        Parameters
        ----------
        gen_fpath : str
            Filepath to reV gen output file to extract "cf_profile" from.
        cf_dset : str
            Dataset name to pull generation profiles from.
        rev_summary : pd.DataFrame
            Aggregated rev supply curve summary file. Full df.
            Must include "res_gids", "gen_gids", and the "weight" column (if
            weight is not None)

        Raises
        ------
        KeyError
            If the cf dataset or a time index dataset is not in the file.
        """
        with Resource(gen_fpath) as res:
            dsets = res.datasets
            if cf_dset not in dsets:
                raise KeyError('reV gen file needs to have "{}" '
                               'dataset to calculate representative profiles!'
                               .format(cf_dset))

            # Substring match on the stringified dataset list so that any
            # dataset name containing "time_index" is accepted.
            if 'time_index' not in str(dsets):
                raise KeyError('reV gen file needs to have "time_index" '
                               'dataset to calculate representative profiles!')

            shape = res.get_dset_properties(cf_dset)[0]

        if len(rev_summary) > shape[1]:
            msg = ('WARNING: reV SC summary table has {} sc points and CF '
                   'dataset "{}" has {} profiles. There should never be more '
                   'SC points than CF profiles.'
                   .format(len(rev_summary), cf_dset, shape[1]))
            logger.warning(msg)
            warn(msg)

    def _init_profiles(self):
        """Initialize the output rep profiles attribute."""
        self._profiles = {k: np.zeros((len(self.time_index),
                                       len(self.meta)),
                                      dtype=np.float32)
                          for k in range(self._n_profiles)}

    @property
    def time_index(self):
        """Get the time index for the rep profiles.

        Returns
        -------
        time_index : pd.DatetimeIndex
            Time index sourced from the reV gen file.
        """
        if self._time_index is None:
            with Resource(self._gen_fpath) as res:
                ds = 'time_index'
                # If the cf dataset name carries a year, read the matching
                # year-suffixed time index dataset.
                if parse_year(self._cf_dset, option='bool'):
                    year = parse_year(self._cf_dset, option='raise')
                    ds += '-{}'.format(year)

                self._time_index = res._get_time_index(ds, slice(None))

        return self._time_index

    @property
    def meta(self):
        """Meta data for the representative profiles.

        Returns
        -------
        meta : pd.DataFrame
            Meta data for the representative profiles.
            At the very least, this has columns for the region and res class.
        """
        return self._meta

    @property
    def profiles(self):
        """Get the arrays of representative CF profiles corresponding to meta.

        Returns
        -------
        profiles : dict
            dict of n_profile-keyed arrays with shape (time, n) for the
            representative profiles for each region.
        """
        return self._profiles

    def _init_h5_out(self, fout, save_rev_summary=True,
                     scaled_precision=False):
        """Initialize an output h5 file for n_profiles

        Parameters
        ----------
        fout : str
            None or filepath to output h5 file.
        save_rev_summary : bool
            Flag to save full reV SC table to rep profile output.
        scaled_precision : bool
            Flag to scale cf_profiles by 1000 and save as uint16.
        """
        dsets = []
        shapes = {}
        attrs = {}
        chunks = {}
        dtypes = {}

        for i in range(self._n_profiles):
            dset = 'rep_profiles_{}'.format(i)
            dsets.append(dset)
            shapes[dset] = self.profiles[0].shape
            chunks[dset] = None

            if scaled_precision:
                attrs[dset] = {'scale_factor': 1000}
                dtypes[dset] = np.uint16
            else:
                attrs[dset] = None
                dtypes[dset] = self.profiles[0].dtype

        meta = self.meta.copy()
        for c in meta.columns:
            # Best-effort conversion: columns that cannot be made numeric
            # (strings, or non-scalar cells such as lists) are left as-is.
            try:
                meta[c] = pd.to_numeric(meta[c])
            except (ValueError, TypeError):
                pass

        Outputs.init_h5(fout, dsets, shapes, attrs, chunks, dtypes,
                        meta, time_index=self.time_index)

        if save_rev_summary:
            with Outputs(fout, mode='a') as out:
                rev_sum = to_records_array(self._rev_summary)
                out._create_dset('rev_summary', rev_sum.shape,
                                 rev_sum.dtype, data=rev_sum)

    def _write_h5_out(self, fout, save_rev_summary=True):
        """Write profiles and meta to an output file.

        Parameters
        ----------
        fout : str
            None or filepath to output h5 file.
        save_rev_summary : bool
            Flag to save full reV SC table to rep profile output.
        """
        with Outputs(fout, mode='a') as out:

            if 'rev_summary' in out.datasets and save_rev_summary:
                rev_sum = to_records_array(self._rev_summary)
                out['rev_summary'] = rev_sum

            for i in range(self._n_profiles):
                dset = 'rep_profiles_{}'.format(i)
                out[dset] = self.profiles[i]

    def save_profiles(self, fout, save_rev_summary=True,
                      scaled_precision=False):
        """Initialize fout and save profiles.

        Parameters
        ----------
        fout : str
            None or filepath to output h5 file.
        save_rev_summary : bool
            Flag to save full reV SC table to rep profile output.
        scaled_precision : bool
            Flag to scale cf_profiles by 1000 and save as uint16.
        """

        self._init_h5_out(fout, save_rev_summary=save_rev_summary,
                          scaled_precision=scaled_precision)
        self._write_h5_out(fout, save_rev_summary=save_rev_summary)

    @abstractmethod
    def _run_serial(self):
        """Abstract method for serial run method."""

    @abstractmethod
    def _run_parallel(self):
        """Abstract method for parallel run method."""

    @abstractmethod
    def run(self):
        """Abstract method for generic run method."""
class RepProfiles(RepProfilesBase):
    """RepProfiles"""

    def __init__(self, gen_fpath, rev_summary, reg_cols,
                 cf_dset='cf_profile',
                 rep_method='meanoid', err_method='rmse',
                 weight='gid_counts',
                 n_profiles=1, aggregate_profiles=False):
        """reV rep profiles class.

        ``reV`` rep profiles compute representative generation profiles
        for each supply curve point output by ``reV`` supply curve
        aggregation. Representative profiles can either be a spatial
        aggregation of generation profiles or actual generation profiles
        that most closely resemble an aggregated profile (selected based
        on an error metric).

        Parameters
        ----------
        gen_fpath : str
            Filepath to ``reV`` generation output HDF5 file to extract
            `cf_dset` dataset from.

            .. Note:: If executing ``reV`` from the command line, this
              path can contain brackets ``{}`` that will be filled in by
              the `analysis_years` input. Alternatively, this input can
              be set to ``"PIPELINE"``, which will parse this input from
              one of these preceding pipeline steps: ``multi-year``,
              ``collect``, ``generation``, or
              ``supply-curve-aggregation``. However, note that duplicate
              executions of any of these commands within the pipeline
              may invalidate this parsing, meaning the `gen_fpath` input
              will have to be specified manually.

        rev_summary : str | pd.DataFrame
            Aggregated ``reV`` supply curve summary file. Must include
            the following columns:

                - ``res_gids`` : string representation of python list
                  containing the resource GID values corresponding to
                  each supply curve point.
                - ``gen_gids`` : string representation of python list
                  containing the ``reV`` generation GID values
                  corresponding to each supply curve point.
                - weight column (name based on `weight` input) : string
                  representation of python list containing the resource
                  GID weights for each supply curve point.

            .. Note:: If executing ``reV`` from the command line, this
              input can be set to ``"PIPELINE"``, which will parse this
              input from one of these preceding pipeline steps:
              ``supply-curve-aggregation`` or ``supply-curve``.
              However, note that duplicate executions of any of these
              commands within the pipeline may invalidate this parsing,
              meaning the `rev_summary` input will have to be specified
              manually.

        reg_cols : str | list
            Label(s) for a categorical region column(s) to extract
            profiles for. For example, ``"state"`` will extract a rep
            profile for each unique entry in the ``"state"`` column in
            `rev_summary`. To get a profile for each supply curve point,
            try setting `reg_cols` to a primary key such as
            ``"sc_gid"``.
        cf_dset : str, optional
            Dataset name to pull generation profiles from. This dataset
            must be present in the `gen_fpath` HDF5 file. By default,
            ``"cf_profile"``

            .. Note:: If executing ``reV`` from the command line, this
              name can contain brackets ``{}`` that will be filled in by
              the `analysis_years` input (e.g. ``"cf_profile-{}"``).

        rep_method : {'mean', 'meanoid', 'median', 'medianoid'}, optional
            Method identifier for calculation of the representative
            profile. By default, ``'meanoid'``
        err_method : {'mbe', 'mae', 'rmse'}, optional
            Method identifier for calculation of error from the
            representative profile. If this input is ``None``, the
            representative meanoid / medianoid profile will be returned
            directly. By default, ``'rmse'``.
        weight : str, optional
            Column in `rev_summary` used to apply weights when computing
            mean profiles. The supply curve table data in the weight
            column should have weight values corresponding to the
            `res_gids` in the same row (i.e. string representation of
            python list containing weight values).

            .. Important:: You'll often want to set this value to
              something other than ``None`` (typically ``"gid_counts"``
              if running on standard ``reV`` outputs). Otherwise, the
              unique generation profiles within each supply curve point
              are weighted equally. For example, if you have a 64x64
              supply curve point, and one generation profile takes up
              4095 (99.98%) 90m cells while a second generation profile
              takes up only one 90m cell (0.02%), they will contribute
              *equally* to the meanoid profile unless these weights are
              specified.

            By default, ``'gid_counts'``.
        n_profiles : int, optional
            Number of representative profiles to save to the output
            file. By default, ``1``.
        aggregate_profiles : bool, optional
            Flag to calculate the aggregate (weighted meanoid) profile
            for each supply curve point. This behavior is in lieu of
            finding the single profile per region closest to the
            meanoid. If you set this flag to ``True``, the `rep_method`,
            `err_method`, and `n_profiles` inputs will be forcibly set
            to the default values. By default, ``False``.

        Raises
        ------
        ValueError
            If `reg_cols` is None.
        """

        log_versions(logger)
        logger.info('Finding representative profiles that are most similar '
                    'to the weighted meanoid for each supply curve region.')

        if reg_cols is None:
            e = ('Need to define "reg_cols"! If you want a profile for each '
                 'supply curve point, try setting "reg_cols" to a primary '
                 'key such as "sc_gid".')
            logger.error(e)
            raise ValueError(e)
        elif isinstance(reg_cols, str):
            reg_cols = [reg_cols]
        elif not isinstance(reg_cols, list):
            reg_cols = list(reg_cols)

        self._aggregate_profiles = aggregate_profiles
        if self._aggregate_profiles:
            logger.info("Aggregate profiles input set to `True`. Setting "
                        "'rep_method' to `'meanoid'`, 'err_method' to `None`, "
                        "and 'n_profiles' to `1`")
            rep_method = 'meanoid'
            err_method = None
            n_profiles = 1

        super().__init__(gen_fpath, rev_summary, reg_cols=reg_cols,
                         cf_dset=cf_dset,
                         rep_method=rep_method, err_method=err_method,
                         weight=weight, n_profiles=n_profiles)

        self._set_meta()
        self._init_profiles()

    def _set_meta(self):
        """Set the rep profile meta data with each row being a unique
        combination of the region columns."""
        if self._err_method is None:
            # Work on a copy so the columns added below never leak into
            # the supply curve table (which may be the caller's own
            # dataframe). The index is reset because meta index labels are
            # used as positional column indices into the profile arrays.
            self._meta = self._rev_summary.copy().reset_index(drop=True)
        else:
            self._meta = self._rev_summary.groupby(self._reg_cols)
            self._meta = (
                self._meta['timezone']
                .apply(lambda x: stats.mode(x, keepdims=True).mode[0])
            )
            self._meta = self._meta.reset_index()

        self._meta['rep_gen_gid'] = None
        self._meta['rep_res_gid'] = None

    def _get_mask(self, region_dict):
        """Get the mask for a given region and res class.

        Parameters
        ----------
        region_dict : dict
            Column-value pairs to filter the rev summary on.

        Returns
        -------
        mask : pd.Series | None
            Boolean mask to filter rev_summary to the appropriate
            region_dict values. None if region_dict is empty.
        """
        mask = None
        for k, v in region_dict.items():
            temp = (self._rev_summary[k] == v)
            if mask is None:
                mask = temp
            else:
                mask = (mask & temp)

        return mask

    def _save_region_result(self, i, profiles, ggids, rgids):
        """Store the rep profile results of one region in the outputs.

        Parameters
        ----------
        i : int
            Meta index label of the region, also used as the column index
            in the output profile arrays.
        profiles : np.ndarray
            (time, n_profiles) array of representative profile(s).
        ggids : list | None
            Generation gid(s) of the representative profile(s).
        rgids : list
            Resource gid(s) of the representative profile(s).
        """
        for n in range(profiles.shape[1]):
            self._profiles[n][:, i] = profiles[:, n]

        if ggids is None:
            self._meta.at[i, 'rep_gen_gid'] = None
            self._meta.at[i, 'rep_res_gid'] = None
        elif len(ggids) == 1:
            self._meta.at[i, 'rep_gen_gid'] = ggids[0]
            self._meta.at[i, 'rep_res_gid'] = rgids[0]
        else:
            # Multiple rep profiles: store the gid lists as strings so
            # they fit in a single meta cell.
            self._meta.at[i, 'rep_gen_gid'] = str(ggids)
            self._meta.at[i, 'rep_res_gid'] = str(rgids)

    def _run_serial(self):
        """Compute all representative profiles in serial."""

        logger.info('Running {} rep profile calculations in serial.'
                    .format(len(self.meta)))
        # Iterate over a static copy because self._meta is written to
        # inside the loop.
        meta_static = deepcopy(self.meta)
        for i, row in meta_static.iterrows():
            region_dict = {k: v for (k, v) in row.to_dict().items()
                           if k in self._reg_cols}
            mask = self._get_mask(region_dict)

            if not any(mask):
                logger.warning('Skipping profile {} out of {} '
                               'for region: {} with no valid mask.'
                               .format(i + 1, len(meta_static), region_dict))
            else:
                logger.debug('Working on profile {} out of {} for region: {}'
                             .format(i + 1, len(meta_static), region_dict))
                out = RegionRepProfile.get_region_rep_profile(
                    self._gen_fpath, self._rev_summary[mask],
                    cf_dset=self._cf_dset,
                    rep_method=self._rep_method,
                    err_method=self._err_method,
                    weight=self._weight,
                    n_profiles=self._n_profiles)
                profiles, _, ggids, rgids = out
                logger.info('Profile {} out of {} complete '
                            'for region: {}'
                            .format(i + 1, len(meta_static), region_dict))

                self._save_region_result(i, profiles, ggids, rgids)

    def _run_parallel(self, max_workers=None, pool_size=72):
        """Compute all representative profiles in parallel.

        Parameters
        ----------
        max_workers : int | None
            Number of parallel workers. 1 will run serial, None will use all
            available.
        pool_size : int
            Number of futures to submit to a single process pool for
            parallel futures.
        """

        logger.info('Kicking off {} rep profile futures.'
                    .format(len(self.meta)))

        n_chunks = int(np.ceil(len(self.meta) / pool_size))
        if n_chunks == 0:
            # Nothing to compute; np.array_split rejects zero sections.
            return

        iter_chunks = np.array_split(self.meta.index.values, n_chunks)
        n_complete = 0
        for iter_chunk in iter_chunks:
            logger.debug('Starting process pool...')
            futures = {}
            loggers = [__name__, 'reV']
            with SpawnProcessPool(max_workers=max_workers,
                                  loggers=loggers) as exe:
                for i in iter_chunk:
                    row = self.meta.loc[i, :]
                    region_dict = {k: v for (k, v) in row.to_dict().items()
                                   if k in self._reg_cols}

                    mask = self._get_mask(region_dict)

                    if not any(mask):
                        logger.info('Skipping profile {} out of {} '
                                    'for region: {} with no valid mask.'
                                    .format(i + 1, len(self.meta),
                                            region_dict))
                    else:
                        future = exe.submit(
                            RegionRepProfile.get_region_rep_profile,
                            self._gen_fpath, self._rev_summary[mask],
                            cf_dset=self._cf_dset,
                            rep_method=self._rep_method,
                            err_method=self._err_method,
                            weight=self._weight,
                            n_profiles=self._n_profiles)

                        futures[future] = [i, region_dict]

                for future in as_completed(futures):
                    i, region_dict = futures[future]
                    profiles, _, ggids, rgids = future.result()
                    n_complete += 1
                    logger.info('Future {} out of {} complete '
                                'for region: {}'
                                .format(n_complete, len(self.meta),
                                        region_dict))
                    log_mem(logger, log_level='DEBUG')

                    self._save_region_result(i, profiles, ggids, rgids)

    def run(self, fout=None, save_rev_summary=True, scaled_precision=False,
            max_workers=None):
        """
        Run representative profiles in serial or parallel and save to disc

        Parameters
        ----------
        fout : str, optional
            Filepath to output HDF5 file. If ``None``, output data are
            not written to a file. By default, ``None``.
        save_rev_summary : bool, optional
            Flag to save full ``reV`` supply curve table to rep profile
            output. By default, ``True``.
        scaled_precision : bool, optional
            Flag to scale `cf_profiles` by 1000 and save as uint16.
            By default, ``False``.
        max_workers : int, optional
            Number of parallel rep profile workers. ``1`` will run
            serial, while ``None`` will use all available.
            By default, ``None``.

        Returns
        -------
        fout : str | None
            The `fout` input, passed through unchanged.
        """

        if max_workers == 1:
            self._run_serial()
        else:
            self._run_parallel(max_workers=max_workers)

        if fout is not None:
            if self._aggregate_profiles:
                logger.info("Aggregate profiles input set to `True`. Setting "
                            "'save_rev_summary' input to `False`")
                save_rev_summary = False
            self.save_profiles(fout, save_rev_summary=save_rev_summary,
                               scaled_precision=scaled_precision)

        logger.info('Representative profiles complete!')

        return fout