Source code for reV.generation.base

# -*- coding: utf-8 -*-
"""
reV base gen and econ module.
"""
from abc import ABC, abstractmethod
import copy
from concurrent.futures import TimeoutError
import logging
import pandas as pd
import numpy as np
import os
import psutil
import json
import sys
from warnings import warn

from reV.config.output_request import SAMOutputRequest
from reV.config.project_points import ProjectPoints, PointsControl
from reV.handlers.outputs import Outputs
from reV.SAM.version_checker import PySamVersionChecker
from reV.utilities.exceptions import (OutputWarning, ExecutionError,
                                      ParallelExecutionWarning,
                                      OffshoreWindInputWarning)
from reV.utilities import log_versions, ModuleName

from rex.resource import Resource
from rex.utilities.execution import SpawnProcessPool

logger = logging.getLogger(__name__)


ATTR_DIR = os.path.dirname(os.path.realpath(__file__))
ATTR_DIR = os.path.join(ATTR_DIR, 'output_attributes')
with open(os.path.join(ATTR_DIR, 'other.json'), 'r') as f:
    OTHER_ATTRS = json.load(f)
with open(os.path.join(ATTR_DIR, 'lcoe_fcr.json'), 'r') as f:
    LCOE_ATTRS = json.load(f)
with open(os.path.join(ATTR_DIR, 'single_owner.json'), 'r') as f:
    SO_ATTRS = json.load(f)
with open(os.path.join(ATTR_DIR, 'windbos.json'), 'r') as f:
    BOS_ATTRS = json.load(f)
with open(os.path.join(ATTR_DIR, 'lcoe_fcr_inputs.json'), 'r') as f:
    LCOE_IN_ATTRS = json.load(f)


[docs]class BaseGen(ABC):
    """Base class for reV gen and econ classes to run SAM simulations."""

    # Mapping of reV requests to SAM objects that should be used for simulation
    OPTIONS = {}

    # Mapping of reV generation / econ outputs to scale factors and units.
    OUT_ATTRS = copy.deepcopy(OTHER_ATTRS)

    # Mapping of reV econ outputs to scale factors and units.
    # Type is scalar or array and corresponds to the SAM single-site output
    # This is the OUT_ATTRS class attr for Econ but should also be accessible
    # to rev generation
    ECON_ATTRS = copy.deepcopy(OTHER_ATTRS)
    ECON_ATTRS.update(LCOE_ATTRS)
    ECON_ATTRS.update(SO_ATTRS)
    ECON_ATTRS.update(BOS_ATTRS)
    ECON_ATTRS.update(LCOE_IN_ATTRS)

    # SAM argument names used to calculate LCOE
    # Note that system_capacity is not included here because it is never used
    # downstream and could be confused with the supply_curve point capacity
    LCOE_ARGS = ('fixed_charge_rate', 'capital_cost', 'fixed_operating_cost',
                 'variable_operating_cost')

    def __init__(self, points_control, output_request, site_data=None,
                 drop_leap=False, memory_utilization_limit=0.4,
                 scale_outputs=True):
        """
        Parameters
        ----------
        points_control : reV.config.project_points.PointsControl
            Project points control instance for site and SAM config spec.
        output_request : list | tuple
            Output variables requested from SAM.
        site_data : str | pd.DataFrame | None
            Site-specific input data for SAM calculation. String should be a
            filepath that points to a csv, DataFrame is pre-extracted data.
            Rows match sites, columns are input keys. Need a "gid" column.
            Input as None if no site-specific data.
        drop_leap : bool
            Drop leap day instead of final day of year during leap years.
        memory_utilization_limit : float
            Memory utilization limit (fractional). This sets how many site
            results will be stored in-memory at any given time before flushing
            to disk.
        scale_outputs : bool
            Flag to scale outputs in-place immediately upon Gen returning data.
        """
        log_versions(logger)
        self._points_control = points_control
        self._year = None
        self._site_limit = None
        self._site_mem = None
        self._out_fpath = None
        self._meta = None
        self._time_index = None
        self._sam_module = None
        self._sam_obj_default = None
        self._drop_leap = drop_leap
        self.mem_util_lim = memory_utilization_limit
        self.scale_outputs = scale_outputs

        self._run_attrs = {'points_control': str(points_control),
                           'output_request': output_request,
                           'site_data': str(site_data),
                           'drop_leap': str(drop_leap),
                           'memory_utilization_limit': self.mem_util_lim}

        self._site_data = self._parse_site_data(site_data)
        self.add_site_data_to_pp(self._site_data)
        output_request = SAMOutputRequest(output_request)
        self._output_request = self._parse_output_request(output_request)

        # pre-initialize output arrays to store results when available.
        self._out = {}
        self._finished_sites = []
        self._out_n_sites = 0
        self._out_chunk = ()
        self._check_sam_version_inputs()

    @property
    def output_request(self):
        """Get the output variables requested from the user.

        Returns
        -------
        output_request : list
            Output variables requested from SAM.
        """
        return self._output_request

    @property
    def out_chunk(self):
        """Get the current output chunk index range (INCLUSIVE).

        Returns
        -------
        _out_chunk : tuple
            Two entry tuple (start, end) indicies (inclusive) for where the
            current data in-memory belongs in the final output.
        """
        return self._out_chunk

    @property
    def site_data(self):
        """Get the site-specific inputs in dataframe format.

        Returns
        -------
        _site_data : pd.DataFrame
            Site-specific input data for gen or econ calculation. Rows match
            sites, columns are variables.
        """
        return self._site_data

    @property
    def site_limit(self):
        """Get the number of sites results that can be stored in memory at once

        Returns
        -------
        _site_limit : int
            Number of site result sets that can be stored in memory at once
            without violating memory limits.
        """

        if self._site_limit is None:
            tot_mem = psutil.virtual_memory().total / 1e6
            avail_mem = self.mem_util_lim * tot_mem
            self._site_limit = int(np.floor(avail_mem / self.site_mem))
            logger.info('Limited to storing {0} sites in memory '
                        '({1:.1f} GB total hardware, {2:.1f} GB available '
                        'with {3:.1f}% utilization).'
                        .format(self._site_limit, tot_mem / 1e3,
                                avail_mem / 1e3, self.mem_util_lim * 100))

        return self._site_limit

    @property
    def site_mem(self):
        """Get the memory (MB) required to store all results for a single site.

        Returns
        -------
        _site_mem : float
            Memory (MB) required to store all results in requested in
            output_request for a single site.
        """

        if self._site_mem is None:
            # average the memory usage over n sites
            # (for better understanding of array overhead)
            n = 100
            self._site_mem = 0
            for request in self.output_request:
                dtype = 'float32'
                if request in self.OUT_ATTRS:
                    dtype = self.OUT_ATTRS[request].get('dtype', 'float32')

                shape = self._get_data_shape(request, n)
                self._site_mem += sys.getsizeof(np.ones(shape, dtype=dtype))

            self._site_mem = self._site_mem / 1e6 / n
            logger.info('Output results from a single site are calculated to '
                        'use {0:.1f} KB of memory.'
                        .format(self._site_mem / 1000))

        return self._site_mem

    @property
    def points_control(self):
        """Get project points controller.

        Returns
        -------
        points_control : reV.config.project_points.PointsControl
            Project points control instance for site and SAM config spec.
        """
        return self._points_control

    @property
    def project_points(self):
        """Get project points

        Returns
        -------
        project_points : reV.config.project_points.ProjectPoints
            Project points from the points control instance.
        """
        return self._points_control.project_points

    @property
    def sam_configs(self):
        """Get the sam config dictionary.

        Returns
        -------
        sam_configs : dict
            SAM config from the project points instance.
        """
        return self.project_points.sam_inputs

    @property
    def sam_metas(self):
        """
        SAM configurations including runtime module

        Returns
        -------
        sam_metas : dict
            Nested dictionary of SAM configuration files with module used
            at runtime
        """
        sam_metas = self.sam_configs.copy()
        for v in sam_metas.values():
            v.update({'module': self._sam_module.MODULE})

        return sam_metas

    @property
    def sam_module(self):
        """Get the SAM module class to be used for SAM simulations.

        Returns
        -------
        sam_module : object
            SAM object like PySAM.Pvwattsv7 or PySAM.Lcoefcr
        """
        return self._sam_module

    @property
    def meta(self):
        """Get resource meta for all sites in project points.

        Returns
        -------
        meta : pd.DataFrame
            Meta data df for sites in project points. Column names are meta
            data variables, rows are different sites. The row index
            does not indicate the site number if the project points are
            non-sequential or do not start from 0, so a 'gid' column is added.
        """
        return self._meta

    @property
    def time_index(self):
        """Get the resource time index data.

        Returns
        -------
        _time_index : pandas.DatetimeIndex
            Time-series datetime index
        """
        return self._time_index

    @property
    def run_attrs(self):
        """
        Run time attributes (__init__ args and kwargs)

        Returns
        -------
        run_attrs : dict
            Dictionary of runtime args and kwargs
        """
        return self._run_attrs

    @property
    def year(self):
        """Get the resource year.

        Returns
        -------
        _year : int
            Year of the time-series datetime index.
        """

        if self._year is None and self.time_index is not None:
            self._year = int(self.time_index.year[0])

        return self._year

    @property
    def tech(self):
        """Get the reV technology string.

        Returns
        -------
        tech : str
            SAM technology to analyze (pvwattsv7, windpower, tcsmoltensalt,
            solarwaterheat, troughphysicalheat, lineardirectsteam, econ)
            The string should be lower-cased with spaces and _ removed.
        """
        return self.project_points.tech

    @property
    def out(self):
        """Get the reV gen or econ output results.

        Returns
        -------
        out : dict
            Dictionary of gen or econ results from SAM.
        """
        out = {}
        for k, v in self._out.items():
            if k in self.OUT_ATTRS:
                scale_factor = self.OUT_ATTRS[k].get('scale_factor', 1)
            else:
                scale_factor = 1

            if scale_factor != 1 and self.scale_outputs:
                v = v.astype('float32')
                v /= scale_factor

            out[k] = v

        return out

    @out.setter
    def out(self, result):
        """Set the output attribute, unpack futures, clear output from mem.

        Parameters
        ----------
        result : list | dict | None
            Gen or Econ results to set to output dictionary. Use cases:
             - List input is interpreted as a futures list, which is unpacked
               before setting to the output dict.
             - Dictionary input is interpreted as an already unpacked result.
             - None is interpreted as a signal to clear the output dictionary.
        """
        if isinstance(result, list):
            # unpack futures list to dictionary first
            result = self.unpack_futures(result)

        if isinstance(result, dict):

            # iterate through dict where sites are keys and values are
            # corresponding results
            for site_gid, site_output in result.items():

                # check that the sites are stored sequentially then add to
                # the finished site list
                if self._finished_sites:
                    if int(site_gid) < np.max(self._finished_sites):
                        raise Exception('Site results are non sequential!')

                # unpack site output object
                self.unpack_output(site_gid, site_output)

                # add site gid to the finished list after outputs are unpacked
                self._finished_sites.append(site_gid)

        elif isinstance(result, type(None)):
            self._out.clear()
            self._finished_sites.clear()
        else:
            raise TypeError('Did not recognize the type of output. '
                            'Tried to set output type "{}", but requires '
                            'list, dict or None.'.format(type(result)))

    @staticmethod
    def _output_request_type_check(req):
        """Output request type check and ensure list for manipulation.

        Parameters
        ----------
        req : list | tuple | str
            Output request of variable type.

        Returns
        -------
        output_request : list
            Output request.
        """

        if isinstance(req, list):
            output_request = req
        elif isinstance(req, tuple):
            output_request = list(req)
        elif isinstance(req, str):
            output_request = [req]
        else:
            raise TypeError('Output request must be str, list, or tuple but '
                            'received: {}'.format(type(req)))

        return output_request

[docs]    @staticmethod
    def handle_leap_ti(ti, drop_leap=False):
        """Handle a time index for a leap year by dropping a day.

        Parameters
        ----------
        ti : pandas.DatetimeIndex
            Time-series datetime index with or without leap days.
        drop_leap : bool
            Option to drop leap days (if True) or drop the last day of each
            leap year (if False).

        Returns
        -------
        ti : pandas.DatetimeIndex
            Time-series datetime index with length a multiple of 365.
        """

        # Drop leap day or last day
        leap_day = ((ti.month == 2) & (ti.day == 29))
        leap_year = ti.year % 4 == 0
        last_day = ((ti.month == 12) & (ti.day == 31)) * leap_year
        if drop_leap:
            # Preference is to drop leap day if exists
            ti = ti.drop(ti[leap_day])
        elif any(leap_day):
            # Leap day exists but preference is to drop last day of year
            ti = ti.drop(ti[last_day])

        if len(ti) % 365 != 0:
            raise ValueError('Bad time index with length not a multiple of '
                             '365: {}'.format(ti))

        return ti

    @staticmethod
    def _pp_to_pc(points, points_range, sam_configs, tech,
                  sites_per_worker=None, res_file=None, curtailment=None):
        """
        Create ProjectControl from ProjectPoints

        Parameters
        ----------
        points : int | slice | list | str | pandas.DataFrame
                 | reV.config.project_points.PointsControl
            Single site integer,
            or slice or list specifying project points,
            or string pointing to a project points csv,
            or a pre-loaded project points DataFrame,
            or a fully instantiated PointsControl object.
        points_range : list | None
            Optional two-entry list specifying the index range of the sites to
            analyze. To be taken from the reV.config.PointsControl.split_range
            property.
        sam_configs : dict | str | SAMConfig
            SAM input configuration ID(s) and file path(s). Keys are the SAM
            config ID(s) which map to the config column in the project points
            CSV. Values are either a JSON SAM config file or dictionary of SAM
            config inputs. Can also be a single config file path or a
            pre loaded SAMConfig object.
        tech : str
            SAM technology to analyze (pvwattsv7, windpower, tcsmoltensalt,
            solarwaterheat, troughphysicalheat, lineardirectsteam)
            The string should be lower-cased with spaces and _ removed.
        sites_per_worker : int
            Number of sites to run in series on a worker. None defaults to the
            resource file chunk size.
        res_file : str
            Filepath to single resource file, multi-h5 directory,
            or /h5_dir/prefix*suffix
        curtailment : NoneType | dict | str | config.curtailment.Curtailment
            Inputs for curtailment parameters. If not None, curtailment inputs
            are expected. Can be:
                - Explicit namespace of curtailment variables (dict)
                - Pointer to curtailment config json file with path (str)
                - Instance of curtailment config object
                  (config.curtailment.Curtailment)

        Returns
        -------
        pc : reV.config.project_points.PointsControl
            PointsControl object instance.
        """
        if hasattr(points, "df"):
            points = points.df

        pp = ProjectPoints(points, sam_configs, tech=tech, res_file=res_file,
                           curtailment=curtailment)

        #  make Points Control instance
        if points_range is not None:
            # PointsControl is for just a subset of the project points...
            # this is the case if generation is being initialized on one
            # of many HPC nodes in a large project
            pc = PointsControl.split(points_range[0], points_range[1], pp,
                                     sites_per_split=sites_per_worker)
        else:
            # PointsControl is for all of the project points
            pc = PointsControl(pp, sites_per_split=sites_per_worker)

        return pc

[docs]    @classmethod
    def get_pc(cls, points, points_range, sam_configs, tech,
               sites_per_worker=None, res_file=None, curtailment=None):
        """Get a PointsControl instance.

        Parameters
        ----------
        points : int | slice | list | str | pandas.DataFrame | PointsControl
            Single site integer,
            or slice or list specifying project points,
            or string pointing to a project points csv,
            or a pre-loaded project points DataFrame,
            or a fully instantiated PointsControl object.
        points_range : list | None
            Optional two-entry list specifying the index range of the sites to
            analyze. To be taken from the reV.config.PointsControl.split_range
            property.
        sam_configs : dict | str | SAMConfig
            SAM input configuration ID(s) and file path(s). Keys are the SAM
            config ID(s) which map to the config column in the project points
            CSV. Values are either a JSON SAM config file or dictionary of SAM
            config inputs. Can also be a single config file path or a
            pre loaded SAMConfig object.
        tech : str
            SAM technology to analyze (pvwattsv7, windpower, tcsmoltensalt,
            solarwaterheat, troughphysicalheat, lineardirectsteam)
            The string should be lower-cased with spaces and _ removed.
        sites_per_worker : int
            Number of sites to run in series on a worker. None defaults to the
            resource file chunk size.
        res_file : str
            Filepath to single resource file, multi-h5 directory,
            or /h5_dir/prefix*suffix
        curtailment : NoneType | dict | str | config.curtailment.Curtailment
            Inputs for curtailment parameters. If not None, curtailment inputs
            are expected. Can be:

                - Explicit namespace of curtailment variables (dict)
                - Pointer to curtailment config json file with path (str)
                - Instance of curtailment config object
                  (config.curtailment.Curtailment)


        Returns
        -------
        pc : reV.config.project_points.PointsControl
            PointsControl object instance.
        """

        if tech not in cls.OPTIONS and tech.lower() != ModuleName.ECON:
            msg = ('Did not recognize reV-SAM technology string "{}". '
                   'Technology string options are: {}'
                   .format(tech, list(cls.OPTIONS.keys())))
            logger.error(msg)
            raise KeyError(msg)

        if sites_per_worker is None:
            # get the optimal sites per split based on res file chunk size
            sites_per_worker = cls.get_sites_per_worker(res_file)

        logger.debug('Sites per worker being set to {} for '
                     'PointsControl.'.format(sites_per_worker))

        if isinstance(points, PointsControl):
            # received a pre-intialized instance of pointscontrol
            pc = points
        else:
            pc = cls._pp_to_pc(points, points_range, sam_configs, tech,
                               sites_per_worker=sites_per_worker,
                               res_file=res_file, curtailment=curtailment)

        return pc

[docs]    @staticmethod
    def get_sites_per_worker(res_file, default=100):
        """Get the nominal sites per worker (x-chunk size) for a given file.

        This is based on the concept that it is most efficient for one core to
        perform one read on one chunk of resource data, such that chunks will
        not have to be read into memory twice and no sites will be read
        redundantly.

        Parameters
        ----------
        res_file : str
            Filepath to single resource file, multi-h5 directory,
            or /h5_dir/prefix*suffix
        default : int
            Sites to be analyzed on a single core if the chunk size cannot be
            determined from res_file.

        Returns
        -------
        sites_per_worker : int
            Nominal sites to be analyzed per worker. This is set to the x-axis
            chunk size for windspeed and dni datasets for the WTK and NSRDB
            data, respectively.
        """
        if not res_file or not os.path.isfile(res_file):
            return default

        with Resource(res_file) as res:
            if 'wtk' in res_file.lower():
                for dset in res.datasets:
                    if 'speed' in dset:
                        # take nominal WTK chunks from windspeed
                        _, _, chunks = res.get_dset_properties(dset)
                        break
            elif 'nsrdb' in res_file.lower():
                # take nominal NSRDB chunks from dni
                _, _, chunks = res.get_dset_properties('dni')
            else:
                warn('Could not infer dataset chunk size as the resource type '
                     'could not be determined from the filename: {}'
                     .format(res_file))
                chunks = None

        if chunks is None:
            # if chunks not set, go to default
            sites_per_worker = default
            logger.debug('Sites per worker being set to {} (default) based on '
                         'no set chunk size in {}.'
                         .format(sites_per_worker, res_file))
        else:
            sites_per_worker = chunks[1]
            logger.debug('Sites per worker being set to {} based on chunk '
                         'size of {}.'.format(sites_per_worker, res_file))

        return sites_per_worker

[docs]    @staticmethod
    def unpack_futures(futures):
        """Combine list of futures results into their native dict format/type.

        Parameters
        ----------
        futures : list
            List of dictionary futures results.

        Returns
        -------
        out : dict
            Compiled results of the native future results type (dict).
        """

        out = {}
        for x in futures:
            out.update(x)

        return out

    @staticmethod
    @abstractmethod
    def _run_single_worker(points_control, tech=None, res_file=None,
                           output_request=None, scale_outputs=True):
        """Run a reV-SAM analysis based on the points_control iterator.

        Parameters
        ----------
        points_control : reV.config.PointsControl
            A PointsControl instance dictating what sites and configs are run.
        tech : str
            SAM technology to analyze (pvwattsv7, windpower, tcsmoltensalt,
            solarwaterheat, troughphysicalheat, lineardirectsteam)
            The string should be lower-cased with spaces and _ removed.
        res_file : str
            Filepath to single resource file, multi-h5 directory,
            or /h5_dir/prefix*suffix
        output_request : list | tuple
            Output variables requested from SAM.
        scale_outputs : bool
            Flag to scale outputs in-place immediately upon returning data.

        Returns
        -------
        out : dict
            Output dictionary from the SAM reV_run function. Data is scaled
            within this function to the datatype specified in cls.OUT_ATTRS.
        """

    def _parse_site_data(self, inp):
        """Parse site-specific data from input arg

        Parameters
        ----------
        inp : str | pd.DataFrame | None
            Site data in .csv or pre-extracted dataframe format. None signifies
            that there is no extra site-specific data and that everything is
            fully defined in the input h5 and SAM json configs.

        Returns
        -------
        site_data : pd.DataFrame
            Site-specific data for econ calculation. Rows correspond to sites,
            columns are variables.
        """

        if inp is None or inp is False:
            # no input, just initialize dataframe with site gids as index
            site_data = pd.DataFrame(index=self.project_points.sites)
            site_data.index.name = 'gid'
        else:
            # explicit input, initialize df
            if isinstance(inp, str):
                if inp.endswith('.csv'):
                    site_data = pd.read_csv(inp)
            elif isinstance(inp, pd.DataFrame):
                site_data = inp
            else:
                # site data was not able to be set. Raise error.
                raise Exception('Site data input must be .csv or '
                                'dataframe, but received: {}'.format(inp))

            if 'gid' not in site_data and site_data.index.name != 'gid':
                # require gid as column label or index
                raise KeyError('Site data input must have "gid" column '
                               'to match reV site gid.')

            # pylint: disable=no-member
            if site_data.index.name != 'gid':
                # make gid the dataframe index if not already
                site_data = site_data.set_index('gid', drop=True)

        if 'offshore' in site_data:
            if site_data['offshore'].sum() > 1:
                w = ('Found offshore sites in econ site data input. '
                     'This functionality has been deprecated. '
                     'Please run the reV offshore module to '
                     'calculate offshore wind lcoe.')
                warn(w, OffshoreWindInputWarning)
                logger.warning(w)

        return site_data

[docs]    def add_site_data_to_pp(self, site_data):
        """Add the site df (site-specific inputs) to project points dataframe.

        This ensures that only the relevant site's data will be passed through
        to parallel workers when points_control is iterated and split.

        Parameters
        ----------
        site_data : pd.DataFrame
            Site-specific data for econ calculation. Rows correspond to sites,
            columns are variables.
        """
        self.project_points.join_df(site_data, key=self.site_data.index.name)

    @abstractmethod
    def _parse_output_request(self, req):
        """Set the output variables requested from the user.

        Parameters
        ----------
        req : list | tuple
            Output variables requested from SAM.

        Returns
        -------
        output_request : list
            Output variables requested from SAM.
        """

    def _get_data_shape(self, dset, n_sites):
        """Get the output array shape based on OUT_ATTRS or PySAM.Outputs.

        Parameters
        ----------
        dset : str
            Variable name to get shape for.
        n_sites : int
            Number of sites for this data shape.

        Returns
        -------
        shape : tuple
            1D or 2D shape tuple for dset.
        """

        if dset in self.OUT_ATTRS:
            return self._get_data_shape_from_out_attrs(dset, n_sites)

        if dset in self.project_points.all_sam_input_keys:
            return self._get_data_shape_from_sam_config(dset, n_sites)

        return self._get_data_shape_from_pysam(dset, n_sites)

    def _get_data_shape_from_out_attrs(self, dset, n_sites):
        """Get data shape from ``OUT_ATTRS`` variable"""
        if self.OUT_ATTRS[dset]['type'] == 'array':
            return (len(self.time_index), n_sites)
        return (n_sites,)

    def _get_data_shape_from_sam_config(self, dset, n_sites):
        """Get data shape from SAM input config """
        data = list(self.project_points.sam_inputs.values())[0][dset]
        if isinstance(data, (list, tuple, np.ndarray)):
            return (*np.array(data).shape, n_sites)

        if isinstance(data, str):
            msg = ('Cannot pass through non-scalar SAM input key "{}" '
                   'as an output_request!'.format(dset))
            logger.error(msg)
            raise ExecutionError(msg)

        return (n_sites, )

    def _get_data_shape_from_pysam(self, dset, n_sites):
        """Get data shape from PySAM output object"""
        if self._sam_obj_default is None:
            self._sam_obj_default = self.sam_module.default()

        try:
            out_data = getattr(self._sam_obj_default.Outputs, dset)
        except AttributeError as e:
            msg = ('Could not get data shape for dset "{}" '
                   'from object "{}". '
                   'Received the following error: "{}"'
                   .format(dset, self._sam_obj_default, e))
            logger.error(msg)
            raise ExecutionError(msg) from e

        if isinstance(out_data, (int, float, str)):
            return (n_sites,)

        if len(out_data) % len(self.time_index) == 0:
            return (len(self.time_index), n_sites)

        return (len(out_data), n_sites)

    def _init_fpath(self, out_fpath, module):
        """Combine directory and filename, ensure .h5 ext., make out dirs."""
        if out_fpath is None:
            return

        project_dir, out_fn = os.path.split(out_fpath)

        # ensure output file is an h5
        if not out_fn.endswith('.h5'):
            out_fn += '.h5'

        if module not in out_fn:
            extension_with_module = "_{}.h5".format(module)
            out_fn = out_fn.replace(".h5", extension_with_module)

        # ensure year is in out_fpath
        if self.year is not None:
            extension_with_year = "_{}.h5".format(self.year)
            if extension_with_year not in out_fn:
                out_fn = out_fn.replace(".h5", extension_with_year)

        # create and use optional output dir
        if project_dir and not os.path.exists(project_dir):
            os.makedirs(project_dir, exist_ok=True)

        self._out_fpath = os.path.join(project_dir, out_fn)
        self._run_attrs['out_fpath'] = out_fpath

    def _init_h5(self, mode='w'):
        """Initialize the single h5 output file with all output requests.

        Parameters
        ----------
        mode : str
            Mode to instantiate h5py.File instance
        """

        if self._out_fpath is None:
            return

        if 'w' in mode:
            logger.info('Initializing full output file: "{}" with mode: {}'
                        .format(self._out_fpath, mode))
        elif 'a' in mode:
            logger.info('Appending data to output file: "{}" with mode: {}'
                        .format(self._out_fpath, mode))

        attrs = {d: {} for d in self.output_request}
        chunks = {}
        dtypes = {}
        shapes = {}

        # flag to write time index if profiles are being output
        write_ti = False

        for dset in self.output_request:

            tmp = 'other'
            if dset in self.OUT_ATTRS:
                tmp = dset

            attrs[dset]['units'] = self.OUT_ATTRS[tmp].get('units',
                                                           'unknown')
            attrs[dset]['scale_factor'] = \
                self.OUT_ATTRS[tmp].get('scale_factor', 1)
            chunks[dset] = self.OUT_ATTRS[tmp].get('chunks', None)
            dtypes[dset] = self.OUT_ATTRS[tmp].get('dtype', 'float32')
            shapes[dset] = self._get_data_shape(dset, len(self.meta))
            if len(shapes[dset]) > 1:
                write_ti = True

        # only write time index if profiles were found in output request
        if write_ti:
            ti = self.time_index
        else:
            ti = None

        Outputs.init_h5(self._out_fpath, self.output_request, shapes,
                        attrs, chunks, dtypes, self.meta, time_index=ti,
                        configs=self.sam_metas, run_attrs=self.run_attrs,
                        mode=mode)

    def _init_out_arrays(self, index_0=0):
        """Initialize output arrays based on the number of sites that can be
        stored in memory safely.

        Parameters
        ----------
        index_0 : int
            This is the site list index (not gid) for the first site in the
            output data. If a node cannot process all sites in-memory at once,
            this is used to segment the sites in the current output chunk.
        """

        self._out = {}
        self._finished_sites = []

        # Output chunk is the index range (inclusive) of this set of site outs
        self._out_chunk = (index_0, np.min((index_0 + self.site_limit,
                                            len(self.project_points) - 1)))
        self._out_n_sites = int(self.out_chunk[1] - self.out_chunk[0]) + 1

        logger.info('Initializing in-memory outputs for {} sites with gids '
                    '{} through {} inclusive (site list index {} through {})'
                    .format(self._out_n_sites,
                            self.project_points.sites[self.out_chunk[0]],
                            self.project_points.sites[self.out_chunk[1]],
                            self.out_chunk[0], self.out_chunk[1]))

        for request in self.output_request:
            dtype = 'float32'
            if request in self.OUT_ATTRS and self.scale_outputs:
                dtype = self.OUT_ATTRS[request].get('dtype', 'float32')

            shape = self._get_data_shape(request, self._out_n_sites)

            # initialize the output request as an array of zeros
            self._out[request] = np.zeros(shape, dtype=dtype)

    def _check_sam_version_inputs(self):
        """Check the PySAM version and input keys. Fix where necessary."""
        for key, parameters in self.project_points.sam_inputs.items():
            updated = PySamVersionChecker.run(self.tech, parameters)
            sam_obj = self._points_control._project_points._sam_config_obj
            sam_obj._inputs[key] = updated

[docs]    def unpack_output(self, site_gid, site_output):
        """Unpack a SAM SiteOutput object to the output attribute.

        Parameters
        ----------
        site_gid : int
            Resource-native site gid (index).
        site_output : dict
            SAM site output object.
        """

        # iterate through the site results
        for var, value in site_output.items():
            if var not in self._out:
                raise KeyError('Tried to collect output variable "{}", but it '
                               'was not yet initialized in the output '
                               'dictionary.')

            # get the index in the output array for the current site
            i = self.site_index(site_gid, out_index=True)

            # check to see if we have exceeded the current output chunk.
            # If so, flush data to disk and reset the output initialization
            if i + 1 > self._out_n_sites:
                self.flush()
                global_site_index = self.site_index(site_gid)
                self._init_out_arrays(index_0=global_site_index)
                i = self.site_index(site_gid, out_index=True)

            if isinstance(value, (list, tuple, np.ndarray)):
                if not isinstance(value, np.ndarray):
                    value = np.array(value)

                self._out[var][:, i] = value.T

            elif value != 0:
                self._out[var][i] = value

[docs]    def site_index(self, site_gid, out_index=False):
        """Get the index corresponding to the site gid.

        Parameters
        ----------
        site_gid : int
            Resource-native site index (gid).
        out_index : bool
            Option to get output index (if true) which is the column index in
            the current in-memory output array, or (if false) the global site
            index from the project points site list.

        Returns
        -------
        index : int
            Global site index if out_index=False, otherwise column index in
            the current in-memory output array.
        """

        # get the index for site_gid in the (global) project points site list.
        global_site_index = self.project_points.sites.index(site_gid)

        if not out_index:
            output_index = global_site_index
        else:
            output_index = global_site_index - self.out_chunk[0]
            if output_index < 0:
                raise ValueError('Attempting to set output data for site with '
                                 'gid {} to global site index {}, which was '
                                 'already set based on the current output '
                                 'index chunk of {}'
                                 .format(site_gid, global_site_index,
                                         self.out_chunk))

        return output_index

[docs]    def flush(self):
        """Flush the output data in self.out attribute to disk in .h5 format.

        The data to be flushed is accessed from the instance attribute
        "self.out". The disk target is based on the instance attributes
        "self._out_fpath". Data is not flushed if _fpath is None or if .out is
        empty.
        """

        # handle output file request if file is specified and .out is not empty
        if isinstance(self._out_fpath, str) and self._out:
            logger.info('Flushing outputs to disk, target file: "{}"'
                        .format(self._out_fpath))

            # get the slice of indices to write outputs to
            islice = slice(self.out_chunk[0], self.out_chunk[1] + 1)

            # open output file in append mode to add output results to
            with Outputs(self._out_fpath, mode='a') as f:

                # iterate through all output requests writing each as a dataset
                for dset, arr in self._out.items():
                    if len(arr.shape) == 1:
                        # write array of scalars
                        f[dset, islice] = arr
                    else:
                        # write 2D array of profiles
                        f[dset, :, islice] = arr

            logger.debug('Flushed output successfully to disk.')

    def _pre_split_pc(self, pool_size=None):
        """Pre-split project control iterator into sub chunks to further
        split the parallelization.

        Parameters
        ----------
        pool_size : int
            Number of futures to submit to a single process pool for
            parallel futures. If ``None``, the pool size is set to
            ``os.cpu_count() * 2``. By default, ``None``.

        Returns
        -------
        N : int
            Total number of points control split instances.
        pc_chunks : list
            List of lists of points control split instances.
        """
        N = 0
        pc_chunks = []
        i_chunk = []
        if pool_size is None:
            pool_size = os.cpu_count() * 2

        for i, split in enumerate(self.points_control):
            N += 1
            i_chunk.append(split)
            if (i + 1) % pool_size == 0:
                pc_chunks.append(i_chunk)
                i_chunk = []

        if i_chunk:
            pc_chunks.append(i_chunk)

        logger.debug('Pre-splitting points control into {} chunks with the '
                     'following chunk sizes: {}'
                     .format(len(pc_chunks), [len(x) for x in pc_chunks]))
        return N, pc_chunks

    # pylint: disable=unused-argument
    def _reduce_kwargs(self, pc, **kwargs):
        """Placeholder for functions that need to reduce the global kwargs that
        they send to workers to reduce memory footprint

        Parameters
        ----------
        pc : PointsControl
            PointsControl object for a single worker chunk
        kwargs : dict
            Kwargs for all gids that needs to be reduced before being sent to
            ``_run_single_worker()``

        Returns
        -------
        kwargs : dict
            Same as input but reduced just for the gids in pc
        """
        return kwargs

    def _parallel_run(self, max_workers=None, pool_size=None, timeout=1800,
                      **kwargs):
        """Execute parallel compute.

        Parameters
        ----------
        max_workers : None | int
            Number of workers. None will default to cpu count.
        pool_size : int
            Number of futures to submit to a single process pool for
            parallel futures. If ``None``, the pool size is set to
            ``os.cpu_count() * 2``. By default, ``None``.
        timeout : int | float
            Number of seconds to wait for parallel run iteration to complete
            before returning zeros.
        kwargs : dict
            Keyword arguments to self._run_single_worker().
        """

        if pool_size is None:
            pool_size = os.cpu_count() * 2
        if max_workers is None:
            max_workers = os.cpu_count()
        logger.info('Running parallel execution with max_workers={}'
                    .format(max_workers))
        i = 0
        N, pc_chunks = self._pre_split_pc(pool_size=pool_size)
        for j, pc_chunk in enumerate(pc_chunks):
            logger.debug('Starting process pool for points control '
                         'iteration {} out of {}'
                         .format(j + 1, len(pc_chunks)))

            failed_futures = False
            chunks = {}
            futures = []
            loggers = [__name__, 'reV.gen', 'reV.econ', 'reV']
            with SpawnProcessPool(max_workers=max_workers,
                                  loggers=loggers) as exe:
                for pc in pc_chunk:
                    pc_kwargs = self._reduce_kwargs(pc, **kwargs)
                    future = exe.submit(self._run_single_worker, pc,
                                        **pc_kwargs)
                    futures.append(future)
                    chunks[future] = pc

                for future in futures:
                    i += 1
                    try:
                        result = future.result(timeout=timeout)
                    except TimeoutError:
                        failed_futures = True
                        sites = chunks[future].project_points.sites
                        result = self._handle_failed_future(future, i, sites,
                                                            timeout)

                    self.out = result

                    mem = psutil.virtual_memory()
                    m = ('Parallel run at iteration {0} out of {1}. '
                         'Memory utilization is {2:.3f} GB out of {3:.3f} GB '
                         'total ({4:.1f}% used, intended limit of {5:.1f}%)'
                         .format(i, N, mem.used / 1e9, mem.total / 1e9,
                                 100 * mem.used / mem.total,
                                 100 * self.mem_util_lim))
                    logger.info(m)

                if failed_futures:
                    logger.info('Forcing pool shutdown after failed futures.')
                    exe.shutdown(wait=False)
                    logger.info('Forced pool shutdown complete.')

        self.flush()

    def _handle_failed_future(self, future, i, sites, timeout):
        """Handle a failed future and return zeros.

        Parameters
        ----------
        future : concurrent.futures.Future
            Failed future to cancel.
        i : int
            Iteration number for logging
        sites : list
            List of site gids belonging to this failed future.
        timeout : int
            Number of seconds to wait for parallel run iteration to complete
            before returning zeros.
        """

        w = ('Iteration {} hit the timeout limit of {} seconds! Passing zeros.'
             .format(i, timeout))
        logger.warning(w)
        warn(w, OutputWarning)

        site_out = {k: 0 for k in self.output_request}
        result = {site: site_out for site in sites}

        try:
            cancelled = future.cancel()
        except Exception as e:
            w = 'Could not cancel future! Received exception: {}'.format(e)
            logger.warning(w)
            warn(w, ParallelExecutionWarning)

        if not cancelled:
            w = 'Could not cancel future!'
            logger.warning(w)
            warn(w, ParallelExecutionWarning)

        return result