Source code for rex.outputs

# -*- coding: utf-8 -*-
"""
Classes to handle h5 output files.
"""
import json
import logging
import numpy as np
import pandas as pd
import time
import sys
import click
import h5py
import h5pyd
import scipy

from rex.version import __version__
from rex.utilities.exceptions import (HandlerRuntimeError, HandlerValueError,
                                      ResourceKeyError)
from rex.resource import BaseResource
from rex.utilities.parse_keys import parse_keys, parse_slice
from rex.utilities.utilities import to_records_array

logger = logging.getLogger(__name__)


class Outputs(BaseResource):
    """
    Base class to handle output data in .h5 format

    Examples
    --------
    The Outputs handler can be used to initialize h5 files in the standard
    reV/rex resource data format.

    >>> from rex import Outputs
    >>> import pandas as pd
    >>> import numpy as np
    >>>
    >>> meta = pd.DataFrame({'latitude': np.ones(100),
    >>>                      'longitude': np.ones(100)})
    >>>
    >>> time_index = pd.date_range('20210101', '20220101', freq='1h',
    >>>                            closed='right')
    >>>
    >>> with Outputs('test.h5', 'w') as f:
    >>>     f.meta = meta
    >>>     f.time_index = time_index

    You can also use the Outputs handler to read output h5 files from disk.
    The Outputs handler will automatically parse the meta data and time index
    into the expected pandas objects (DataFrame and DatetimeIndex,
    respectively).

    >>> with Outputs('test.h5') as f:
    >>>     print(f.meta.head())
    >>>
         latitude  longitude
    gid
    0         1.0        1.0
    1         1.0        1.0
    2         1.0        1.0
    3         1.0        1.0
    4         1.0        1.0

    >>> with Outputs('test.h5') as f:
    >>>     print(f.time_index)
    DatetimeIndex(['2021-01-01 01:00:00+00:00', '2021-01-01 02:00:00+00:00',
                   '2021-01-01 03:00:00+00:00', '2021-01-01 04:00:00+00:00',
                   '2021-01-01 05:00:00+00:00', '2021-01-01 06:00:00+00:00',
                   '2021-01-01 07:00:00+00:00', '2021-01-01 08:00:00+00:00',
                   '2021-01-01 09:00:00+00:00', '2021-01-01 10:00:00+00:00',
                   ...
                   '2021-12-31 15:00:00+00:00', '2021-12-31 16:00:00+00:00',
                   '2021-12-31 17:00:00+00:00', '2021-12-31 18:00:00+00:00',
                   '2021-12-31 19:00:00+00:00', '2021-12-31 20:00:00+00:00',
                   '2021-12-31 21:00:00+00:00', '2021-12-31 22:00:00+00:00',
                   '2021-12-31 23:00:00+00:00', '2022-01-01 00:00:00+00:00'],
                  dtype='datetime64[ns, UTC]', length=8760, freq=None)

    There are a few ways to use the Outputs handler to write data to a file.
    Here is one example using the pre-initialized file we created earlier.
    Note that the Outputs handler will automatically scale float data using
    the "scale_factor" attribute. The Outputs handler will unscale the data
    while being read unless the unscale kwarg is explicityly set to False.
    This behavior is intended to reduce disk storage requirements for big
    data and can be disabled by setting dtype=np.float32 or dtype=np.float64
    when writing data.

    >>> Outputs.add_dataset(h5_file='test.h5', dset_name='dset1',
    >>>                     dset_data=np.ones((8760, 100)) * 42.42,
    >>>                     attrs={'scale_factor': 100}, dtype=np.int32)


    >>> with Outputs('test.h5') as f:
    >>>     print(f['dset1'])
    >>>     print(f['dset1'].dtype)
    [[42.42 42.42 42.42 ... 42.42 42.42 42.42]
     [42.42 42.42 42.42 ... 42.42 42.42 42.42]
     [42.42 42.42 42.42 ... 42.42 42.42 42.42]
     ...
     [42.42 42.42 42.42 ... 42.42 42.42 42.42]
     [42.42 42.42 42.42 ... 42.42 42.42 42.42]
     [42.42 42.42 42.42 ... 42.42 42.42 42.42]]
    float32

    >>> with Outputs('test.h5', unscale=False) as f:
    >>>     print(f['dset1'])
    >>>     print(f['dset1'].dtype)
    [[4242 4242 4242 ... 4242 4242 4242]
     [4242 4242 4242 ... 4242 4242 4242]
     [4242 4242 4242 ... 4242 4242 4242]
     ...
     [4242 4242 4242 ... 4242 4242 4242]
     [4242 4242 4242 ... 4242 4242 4242]
     [4242 4242 4242 ... 4242 4242 4242]]
    int32

    Note that the Outputs handler is specifically designed to read and
    write spatiotemporal data. It is therefore important to initialize the meta
    data and time index objects even if your data is only spatial or only
    temporal. Furthermore, the Outputs handler will always assume that 1D
    datasets represent scalar data (non-timeseries) that corresponds to the
    meta data shape, and that 2D datasets represent spatiotemporal data whose
    shape corresponds to (len(time_index), len(meta)). You can see these
    constraints here:

    >>> Outputs.add_dataset(h5_file='test.h5', dset_name='bad_shape',
    >>>                     dset_data=np.ones((1, 100)) * 42.42,
    >>>                     attrs={'scale_factor': 100}, dtype=np.int32)
    HandlerValueError: 2D data with shape (1, 100) is not of the proper
    spatiotemporal shape: (8760, 100)

    >>> Outputs.add_dataset(h5_file='test.h5', dset_name='bad_shape',
    >>>                     dset_data=np.ones((8760,)) * 42.42,
    >>>                     attrs={'scale_factor': 100}, dtype=np.int32)
    HandlerValueError: 1D data with shape (8760,) is not of the proper
    spatial shape: (100,)
    """

    def __init__(self, h5_file, mode='r', unscale=True, str_decode=True,
                 group=None):
        """
        Parameters
        ----------
        h5_file : str
            Path to .h5 resource file
        mode : str, optional
            Mode to instantiate h5py.File instance, by default 'r'
        unscale : bool, optional
            Boolean flag to automatically unscale variables on extraction,
            by default True
        str_decode : bool, optional
            Boolean flag to decode the bytestring meta data into normal
            strings. Setting this to False will speed up the meta data read,
            by default True
        group : str, optional
            Group within .h5 resource file to open, by default None
        """
        super().__init__(h5_file, unscale=unscale, hsds=False,
                         str_decode=str_decode, group=group, mode=mode)
        self._mode = mode
        self._group = self._check_group(group)
        self._shape = None

        if self.writable:
            self.set_version_attr()

    def __len__(self):
        _len = 0
        if 'meta' in self.datasets:
            _len = self.h5['meta'].shape[0]

        return _len

    def __setitem__(self, keys, arr):
        if self.writable:
            ds, ds_slice = parse_keys(keys)

            slice_test = False
            if isinstance(ds_slice, tuple):
                slice_test = ds_slice[0] == slice(None, None, None)

            if ds.endswith('meta') and slice_test:
                self._set_meta(ds, arr)
            elif ds.endswith('time_index') and slice_test:
                self._set_time_index(ds, arr)
            else:
                self._set_ds_array(ds, arr, ds_slice)

    @property
    def full_version_record(self):
        """Get record of versions for dependencies

        Returns
        -------
        dict
            Dictionary of package versions for dependencies
        """
        versions = {'rex': __version__,
                    'pandas': pd.__version__,
                    'numpy': np.__version__,
                    'python': sys.version,
                    'click': click.__version__,
                    'h5py': h5py.__version__,
                    'h5pyd': h5pyd.__version__,
                    'scipy': scipy.__version__
                    }
        return versions

    def set_version_attr(self):
        """Set the version attribute to the h5 file."""
        self.h5.attrs['version'] = __version__
        self.h5.attrs['full_version_record'] = json.dumps(
            self.full_version_record)
        self.h5.attrs['package'] = 'rex'

    @property
    def version(self):
        """
        Version of package used to create file

        Returns
        -------
        str
        """
        return self.h5.attrs['version']

    @property
    def package(self):
        """
        Package used to create file

        Returns
        -------
        str
        """
        return self.h5.attrs['package']

    @property
    def source(self):
        """
        Package and version used to create file

        Returns
        -------
        str
        """
        out = ("{}_{}"
               .format(self.h5.attrs['package'], self.h5.attrs['version']))
        return out

    @property
    def shape(self):
        """
        Variable array shape from time_index and meta

        Returns
        -------
        tuple
            shape of variables arrays == (time, locations)
        """
        if self._shape is None:
            dsets = self.datasets
            if 'meta' in dsets:
                self._shape = self.h5['meta'].shape
                if 'time_index' in dsets:
                    self._shape = self.h5['time_index'].shape + self._shape

        return self._shape

    @property
    def writable(self):
        """
        Check to see if h5py.File instance is writable

        Returns
        -------
        is_writable : bool
            Flag if mode is writable
        """
        is_writable = True
        mode = ['a', 'w', 'w-', 'x']
        if self._mode not in mode:
            is_writable = False

        return is_writable

    @BaseResource.meta.setter  # pylint: disable-msg=E1101
    def meta(self, meta):
        """
        Write meta data to disk, convert type if necessary

        Parameters
        ----------
        meta : pandas.DataFrame | numpy.recarray
            Locational meta data
        """
        self._set_meta('meta', meta)

    @BaseResource.time_index.setter  # pylint: disable-msg=E1101
    def time_index(self, time_index):
        """
        Write time_index to disk, convert type if necessary

        Parameters
        ----------
        time_index : pandas.DatetimeIndex | ndarray
            Temporal index of timesteps
        """
        self._set_time_index('time_index', time_index)

    @property
    def SAM_configs(self):
        """
        SAM configuration JSONs used to create CF profiles

        Returns
        -------
        configs : dict
            Dictionary of SAM configuration JSONs
        """
        if 'meta' in self.datasets:
            configs = {k: json.loads(v)
                       for k, v in self.h5['meta'].attrs.items()}
        else:
            configs = {}

        return configs

    @property
    def run_attrs(self):
        """
        Runtime attributes stored at the global (file) level

        Returns
        -------
        global_attrs : dict
        """
        return self.global_attrs

    @run_attrs.setter
    def run_attrs(self, run_attrs):
        """
        Set runtime attributes as global (file) attributes

        Parameters
        ----------
        run_attrs : dict
            Dictionary of runtime attributes (args, kwargs)
        """
        if self.writable:
            for k, v in run_attrs.items():
                self.h5.attrs[k] = v

    @staticmethod
    def _check_data_dtype(dset_name, data, dtype, attrs=None):
        """
        Check data dtype and scale if needed

        Parameters
        ----------
        dset_name : str
            Name of dataset being written to disk
        data : ndarray
            Data to be written to disc
        dtype : str
            dtype of data on disc
        attrs : dict, optional
            Attributes to be set. May include 'scale_factor',
            by default None

        Returns
        -------
        data : ndarray
            Data ready for writing to disc:
            - Scaled and converted to dtype
        """
        if attrs is None:
            attrs = {}

        scale_factor = attrs.get('scale_factor', None)

        scale = (scale_factor is not None
                 and not np.issubdtype(data.dtype, np.integer))
        if scale:
            if scale_factor != 1 and not np.issubdtype(dtype, np.integer):
                msg = ('Output dtype for "{}" must be an integer in '
                       'order to apply scale factor {}".'
                       .format(dset_name, scale_factor))
                logger.error(msg)
                raise HandlerRuntimeError(msg)

            data_type_differs = not np.issubdtype(data.dtype, np.dtype(dtype))
            is_integer = np.issubdtype(dtype, np.integer)
            if data_type_differs and is_integer:
                # apply scale factor and dtype
                data = np.round(data * scale_factor).astype(dtype)

        elif (not np.issubdtype(data.dtype, np.dtype(dtype))
                and not np.issubdtype(np.dtype(dtype), np.floating)):
            msg = ('A scale_factor is needed to scale '
                   '"{}" of type "{}" to "{}".'
                   .format(dset_name, data.dtype, dtype))
            raise HandlerRuntimeError(msg)

        return data
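    # Scaling sketch (minimal, illustrative values and dataset name): float
    # data written with ``attrs={'scale_factor': 100}`` and an integer dtype
    # is stored as ``np.round(data * 100).astype(dtype)`` and divided back by
    # the scale factor on read when ``unscale=True``:
    #
    # >>> data = np.array([42.42, 13.137], dtype=np.float32)
    # >>> Outputs._check_data_dtype('dset1', data, np.int32,
    # >>>                           attrs={'scale_factor': 100})
    # array([4242, 1314], dtype=int32)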

    def _check_group(self, group):
        """
        Ensure group is in .h5 file

        Parameters
        ----------
        group : str
            Group of interest
        """
        if group is not None:
            if group not in self._h5:
                try:
                    if self.writable:
                        self._h5.create_group(group)
                except Exception as ex:
                    msg = ('Cannot create group {}: {}'
                           .format(group, ex))
                    raise HandlerRuntimeError(msg) from ex

        return group

    def _set_meta(self, ds, meta, attrs=None):
        """
        Write meta data to disk

        Parameters
        ----------
        ds : str
            meta dataset name
        meta : pandas.DataFrame | numpy.recarray
            Locational meta data
        attrs : dict
            Attributes to add to the meta data dataset
        """
        # pylint: disable=attribute-defined-outside-init
        self._meta = meta
        if isinstance(meta, pd.DataFrame):
            meta = to_records_array(meta)

        if ds in self.datasets:
            self.update_dset(ds, meta)
        else:
            self._create_dset(ds, meta.shape, meta.dtype, data=meta,
                              attrs=attrs)

    def _set_time_index(self, ds, time_index, attrs=None):
        """
        Write time index to disk

        Parameters
        ----------
        ds : str
            time index dataset name
        time_index : pandas.DatetimeIndex | ndarray
            Temporal index of timesteps
        attrs : dict
            Attributes to add to the meta data dataset
        """
        # pylint: disable=attribute-defined-outside-init
        self._time_index = time_index
        if isinstance(time_index, pd.DatetimeIndex):
            time_index = time_index.astype(str)
            dtype = "S{}".format(len(time_index[0]))
            time_index = np.array(time_index, dtype=dtype)

        if ds in self.datasets:
            self.update_dset(ds, time_index)
        else:
            self._create_dset(ds, time_index.shape, time_index.dtype,
                              data=time_index, attrs=attrs)

    def get_config(self, config_name):
        """
        Get SAM config

        Parameters
        ----------
        config_name : str
            Name of config

        Returns
        -------
        config : dict
            SAM config JSON as a dictionary
        """
        if 'meta' in self.datasets:
            config = json.loads(self.h5['meta'].attrs[config_name])
        else:
            config = None

        return config
    def set_configs(self, SAM_configs):
        """
        Set SAM configuration JSONs as attributes of 'meta'

        Parameters
        ----------
        SAM_configs : dict
            Dictionary of SAM configuration JSONs
        """
        if self.writable:
            for key, config in SAM_configs.items():
                if isinstance(config, dict):
                    config = json.dumps(config)

                if not isinstance(key, str):
                    key = str(key)

                self.h5['meta'].attrs[key] = config
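    # Example (minimal sketch; file name and config key are illustrative):
    # store SAM configuration JSONs as attributes on the 'meta' dataset and
    # read one back as a dictionary with get_config.
    #
    # >>> with Outputs('test.h5', mode='a') as f:
    # >>>     f.set_configs({'onshore': {'system_capacity': 20000}})
    # >>>     print(f.get_config('onshore'))
    # {'system_capacity': 20000}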
    def _set_ds_array(self, ds_name, arr, ds_slice):
        """
        Write ds to disk

        Parameters
        ----------
        ds_name : str
            Dataset name
        arr : ndarray
            Dataset data array
        ds_slice : tuple
            Dataset slicing that corresponds to arr
        """
        if ds_name not in self.datasets:
            msg = '{} must be initialized!'.format(ds_name)
            raise HandlerRuntimeError(msg)

        dtype = self.h5[ds_name].dtype
        attrs = self.get_attrs(ds_name)
        ds_slice = parse_slice(ds_slice)
        self.h5[ds_name][ds_slice] = self._check_data_dtype(
            ds_name, arr, dtype, attrs=attrs)

    def _check_chunks(self, chunks, data=None):
        """
        Convert dataset chunk size into valid tuple based on variable array
        shape

        Parameters
        ----------
        chunks : tuple
            Desired dataset chunk size
        data : ndarray
            Dataset array being chunked

        Returns
        -------
        ds_chunks : tuple | None
            dataset chunk size
        """
        if chunks is None:
            return None

        if data is not None:
            shape = data.shape
        else:
            shape = self.shape

        if len(shape) != len(chunks):
            msg = ('Shape dimensions ({}) are not the same length as chunks '
                   '({}). Please provide a single chunk value for each '
                   'dimension!'.format(shape, chunks))
            logger.error(msg)
            raise HandlerRuntimeError(msg)

        return tuple(np.min((s, s if c is None else c))
                     for s, c in zip(shape, chunks))

    def _create_dset(self, ds_name, shape, dtype, chunks=None, attrs=None,
                     data=None, replace=True):
        """
        Initialize dataset

        Parameters
        ----------
        ds_name : str
            Dataset name
        shape : tuple
            Dataset shape
        dtype : str
            Dataset numpy dtype
        chunks : tuple
            Dataset chunk size
        attrs : dict
            Dataset attributes
        data : ndarray
            Dataset data array
        replace : bool
            If previous dataset exists with the same name, it will be
            replaced.
        """
        ds = None
        if self.writable:
            if ds_name in self.datasets and replace:
                del self.h5[ds_name]

            elif ds_name in self.datasets:
                old_shape, old_dtype, _ = self.get_dset_properties(ds_name)
                if old_shape != shape or old_dtype != dtype:
                    e = ('Trying to create dataset "{}", but already exists '
                         'with mismatched shape and dtype. New shape/dtype '
                         'is {}/{}, previous shape/dtype is {}/{}'
                         .format(ds_name, shape, dtype, old_shape, old_dtype))
                    logger.error(e)
                    raise HandlerRuntimeError(e)

            if ds_name not in self.datasets:
                chunks = self._check_chunks(chunks, data=data)
                try:
                    ds = self.h5.create_dataset(ds_name, shape=shape,
                                                dtype=dtype, chunks=chunks)
                except Exception as e:
                    msg = ('Could not create dataset "{}" in file!'
                           .format(ds_name))
                    logger.error(msg)
                    raise IOError(msg) from e

            if attrs is not None:
                self._create_ds_attrs(ds, ds_name, attrs)

            if data is not None:
                ds[...] = data

    @staticmethod
    def _create_ds_attrs(ds, ds_name, attrs):
        """Create dataset attributes.

        Parameters
        ----------
        ds : h5py.Dataset
            Dataset object to write attributes to.
        ds_name : str
            Dataset name for logging / debugging
        attrs : dict | None
            Dataset attributes to write (None if no attributes to write).
        """
        if attrs is not None:
            for key, value in attrs.items():
                try:
                    ds.attrs[key] = value
                except Exception as e:
                    msg = ('Could not save dataset "{}" attribute "{}" '
                           'to value: {}'.format(ds_name, key, value))
                    logger.error(msg)
                    raise IOError(msg) from e

    def _check_dset_shape(self, dset_name, dset_data):
        """
        Check to ensure that dataset array is of the proper shape

        Parameters
        ----------
        dset_name : str
            Dataset name being written to disk.
        dset_data : ndarray
            Dataset data array
        """
        dset_shape = dset_data.shape
        if len(dset_shape) == 1:
            possible_shapes = {}
            try:
                possible_shapes["spatial"] = (len(self.meta),)
            except ResourceKeyError:
                pass
            try:
                possible_shapes["temporal"] = (len(self.time_index),)
            except ResourceKeyError:
                pass

            if not possible_shapes:
                msg = ("Please load either 'meta' or 'time_index' before "
                       "loading a 1D dataset.")
                logger.error(msg)
                raise HandlerRuntimeError(msg)

            if dset_shape not in possible_shapes.values():
                possible_shapes_str = " or ".join(
                    ["{} {}".format(k, v)
                     for k, v in possible_shapes.items()])
                msg = ('1D dataset "{}" with shape {} is not of '
                       'the proper {} shape!'
                       .format(dset_name, dset_shape, possible_shapes_str))
                logger.error(msg)
                raise HandlerValueError(msg)
        else:
            shape = self.shape
            if shape:
                if dset_shape != shape:
                    msg = ('2D dataset "{}" with shape {} is not of the '
                           'proper spatiotemporal shape: {}'
                           .format(dset_name, dset_shape, shape))
                    logger.error(msg)
                    raise HandlerValueError(msg)
            else:
                msg = ("'meta' and 'time_index' have not been loaded")
                logger.error(msg)
                raise HandlerRuntimeError(msg)

    def _add_dset(self, dset_name, data, dtype, chunks=None, attrs=None):
        """
        Write dataset to disk. Dataset is created in .h5 file and data is
        scaled if needed.

        Parameters
        ----------
        dset_name : str
            Name of dataset to be added to h5 file.
        data : ndarray
            Data to be added to h5 file.
        dtype : str
            Intended dataset datatype after scaling.
        chunks : tuple
            Chunk size for capacity factor means dataset.
        attrs : dict
            Attributes to be set. May include 'scale_factor'.
        """
        self._check_dset_shape(dset_name, data)

        data = self._check_data_dtype(dset_name, data, dtype, attrs=attrs)

        self._create_dset(dset_name, data.shape, dtype, chunks=chunks,
                          attrs=attrs, data=data)
    def update_dset(self, dset, dset_array, dset_slice=None):
        """
        Check to see if dset needs to be updated on disk
        If so write dset_array to disk

        Parameters
        ----------
        dset : str
            dataset to update
        dset_array : ndarray
            dataset array
        dset_slice : tuple
            slice of dataset to update, if None update all
        """
        if dset_slice is None:
            dset_slice = (slice(None, None, None), )

        keys = (dset, ) + dset_slice

        arr = self.__getitem__(keys)
        if not np.array_equal(arr, dset_array):
            self._set_ds_array(dset, dset_array, dset_slice)
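    # Example (minimal sketch; assumes 'test.h5' and 'dset1' from the class
    # docstring above): overwrite an existing dataset in place. The array is
    # only written if it differs from what is already stored on disk.
    #
    # >>> with Outputs('test.h5', mode='a') as f:
    # >>>     f.update_dset('dset1', np.ones((8760, 100), dtype=np.float32))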
    def write_dataset(self, dset_name, data, dtype, chunks=None, attrs=None):
        """
        Write dataset to disk. Dataset is created in .h5 file and data is
        scaled if needed.

        Parameters
        ----------
        dset_name : str
            Name of dataset to be added to h5 file.
        data : ndarray
            Data to be added to h5 file.
        dtype : str
            Intended dataset datatype after scaling.
        chunks : tuple
            Chunk size for capacity factor means dataset.
        attrs : dict
            Attributes to be set. May include 'scale_factor'.
        """
        self._add_dset(dset_name, data, dtype, chunks=chunks, attrs=attrs)
    @classmethod
    def write_profiles(cls, h5_file, meta, time_index, dset_name, profiles,
                       dtype, attrs=None, SAM_configs=None,
                       chunks=(None, 100), unscale=True, mode='w-',
                       str_decode=True, group=None):
        """
        Write profiles to disk

        Parameters
        ----------
        h5_file : str
            Path to .h5 resource file
        meta : pandas.DataFrame
            Locational meta data
        time_index : pandas.DatetimeIndex
            Temporal timesteps
        dset_name : str
            Name of the target dataset (should identify the profiles).
        profiles : ndarray
            output result timeseries profiles
        dtype : str
            Intended dataset datatype after scaling.
        attrs : dict, optional
            Attributes to be set. May include 'scale_factor', by default None
        SAM_configs : dict, optional
            Dictionary of SAM configuration JSONs used to compute cf means,
            by default None
        chunks : tuple, optional
            Chunk size for capacity factor means dataset,
            by default (None, 100)
        unscale : bool, optional
            Boolean flag to automatically unscale variables on extraction,
            by default True
        mode : str, optional
            Mode to instantiate h5py.File instance, by default 'w-'
        str_decode : bool, optional
            Boolean flag to decode the bytestring meta data into normal
            strings. Setting this to False will speed up the meta data read,
            by default True
        group : str, optional
            Group within .h5 resource file to open, by default None
        """
        logger.info("Saving profiles ({}) to {}".format(dset_name, h5_file))
        if profiles.shape != (len(time_index), len(meta)):
            raise HandlerValueError("Profile dimensions do not match "
                                    "'time_index' and 'meta'")

        ts = time.time()
        kwargs = {"unscale": unscale, "mode": mode,
                  "str_decode": str_decode, "group": group}
        with cls(h5_file, **kwargs) as f:
            # Save time index
            f['time_index'] = time_index
            logger.debug("\t- 'time_index' saved to disc")
            # Save meta
            f['meta'] = meta
            logger.debug("\t- 'meta' saved to disc")
            # Add SAM configurations as attributes to meta
            if SAM_configs is not None:
                f.set_configs(SAM_configs)
                logger.debug("\t- SAM configurations saved as attributes "
                             "on 'meta'")

            # Write dset to disk
            f._add_dset(dset_name, profiles, dtype,
                        chunks=chunks, attrs=attrs)
            logger.debug("\t- '{}' saved to disc".format(dset_name))

        tt = (time.time() - ts) / 60
        logger.info('{} is complete'.format(h5_file))
        logger.debug('\t- Saving to disc took {:.4f} minutes'
                     .format(tt))
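    # Example (minimal sketch; file and dataset names are illustrative):
    # write an (8760, 100) profiles array along with its meta and time_index
    # (as built in the class docstring examples) to a new file in one call.
    #
    # >>> profiles = np.ones((8760, 100), dtype=np.float32) * 42.42
    # >>> Outputs.write_profiles('profiles.h5', meta, time_index,
    # >>>                        'cf_profile', profiles, np.float32,
    # >>>                        chunks=(None, 100))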
    @classmethod
    def write_means(cls, h5_file, meta, dset_name, means, dtype, attrs=None,
                    SAM_configs=None, chunks=None, unscale=True, mode='w-',
                    str_decode=True, group=None):
        """
        Write means array to disk

        Parameters
        ----------
        h5_file : str
            Path to .h5 resource file
        meta : pandas.DataFrame
            Locational meta data
        dset_name : str
            Name of the target dataset (should identify the means).
        means : ndarray
            output means array.
        dtype : str
            Intended dataset datatype after scaling.
        attrs : dict, optional
            Attributes to be set. May include 'scale_factor', by default None
        SAM_configs : dict, optional
            Dictionary of SAM configuration JSONs used to compute cf means,
            by default None
        chunks : tuple, optional
            Chunk size for capacity factor means dataset, by default None
        unscale : bool, optional
            Boolean flag to automatically unscale variables on extraction,
            by default True
        mode : str, optional
            Mode to instantiate h5py.File instance, by default 'w-'
        str_decode : bool, optional
            Boolean flag to decode the bytestring meta data into normal
            strings. Setting this to False will speed up the meta data read,
            by default True
        group : str, optional
            Group within .h5 resource file to open, by default None
        """
        logger.info("Saving means ({}) to {}".format(dset_name, h5_file))
        if len(means) != len(meta):
            msg = 'Number of means does not match meta'
            raise HandlerValueError(msg)

        ts = time.time()
        kwargs = {"unscale": unscale, "mode": mode,
                  "str_decode": str_decode, "group": group}
        with cls(h5_file, **kwargs) as f:
            # Save meta
            f['meta'] = meta
            logger.debug("\t- 'meta' saved to disc")
            # Add SAM configurations as attributes to meta
            if SAM_configs is not None:
                f.set_configs(SAM_configs)
                logger.debug("\t- SAM configurations saved as attributes "
                             "on 'meta'")

            # Write dset to disk
            f._add_dset(dset_name, means, dtype,
                        chunks=chunks, attrs=attrs)
            logger.debug("\t- '{}' saved to disc".format(dset_name))

        tt = (time.time() - ts) / 60
        logger.info('{} is complete'.format(h5_file))
        logger.debug('\t- Saving to disc took {:.4f} minutes'
                     .format(tt))
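    # Example (minimal sketch; file and dataset names are illustrative):
    # write a 1D means array with one value per site in ``meta``.
    #
    # >>> means = np.ones(100, dtype=np.float32) * 0.42
    # >>> Outputs.write_means('means.h5', meta, 'cf_mean', means, np.float32)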
    @classmethod
    def add_dataset(cls, h5_file, dset_name, dset_data, dtype, attrs=None,
                    chunks=None, unscale=True, mode='a', str_decode=True,
                    group=None):
        """
        Add dataset to h5_file

        Parameters
        ----------
        h5_file : str
            Path to .h5 resource file
        dset_name : str
            Name of dataset to be added to h5 file
        dset_data : ndarray
            Data to be added to h5 file
        dtype : str
            Intended dataset datatype after scaling.
        attrs : dict, optional
            Attributes to be set. May include 'scale_factor', by default None
        chunks : tuple, optional
            Chunk size for capacity factor means dataset, by default None
        unscale : bool, optional
            Boolean flag to automatically unscale variables on extraction,
            by default True
        mode : str, optional
            Mode to instantiate h5py.File instance, by default 'a'
        str_decode : bool, optional
            Boolean flag to decode the bytestring meta data into normal
            strings. Setting this to False will speed up the meta data read,
            by default True
        group : str, optional
            Group within .h5 resource file to open, by default None
        """
        logger.info("Adding {} to {}".format(dset_name, h5_file))
        ts = time.time()
        kwargs = {"unscale": unscale, "mode": mode,
                  "str_decode": str_decode, "group": group}
        with cls(h5_file, **kwargs) as f:
            f._add_dset(dset_name, dset_data, dtype,
                        chunks=chunks, attrs=attrs)

        tt = (time.time() - ts) / 60
        logger.info('{} added'.format(dset_name))
        logger.debug('\t- Saving to disc took {:.4f} minutes'
                     .format(tt))
    @classmethod
    def init_h5(cls, h5_file, dsets, shapes, attrs, chunks, dtypes, meta,
                time_index=None, configs=None, unscale=True, mode='w',
                str_decode=True, group=None, run_attrs=None):
        """Init a full output file with the final intended shape without data.

        Parameters
        ----------
        h5_file : str
            Full h5 output filepath.
        dsets : list
            List of strings of dataset names to initialize (does not include
            meta or time_index).
        shapes : dict
            Dictionary of dataset shapes (keys correspond to dsets).
        attrs : dict
            Dictionary of dataset attributes (keys correspond to dsets).
        chunks : dict
            Dictionary of chunk tuples (keys correspond to dsets).
        dtypes : dict
            Dictionary of numpy datatypes (keys correspond to dsets).
        meta : pd.DataFrame
            Full meta data.
        time_index : pd.DatetimeIndex | None
            Full pandas datetime index. None implies that only 1D results
            (no site profiles) are being written.
        configs : dict | None
            Optional input configs to set as attr on meta.
        unscale : bool
            Boolean flag to automatically unscale variables on extraction
        mode : str
            Mode to instantiate h5py.File instance
        str_decode : bool
            Boolean flag to decode the bytestring meta data into normal
            strings. Setting this to False will speed up the meta data read.
        group : str
            Group within .h5 resource file to open
        run_attrs : dict | NoneType
            Runtime attributes (args, kwargs) to add as global (file)
            attributes
        """
        logger.debug("Initializing output file: {}".format(h5_file))
        kwargs = {"unscale": unscale, "mode": mode,
                  "str_decode": str_decode, "group": group}
        with cls(h5_file, **kwargs) as f:
            if run_attrs is not None:
                f.run_attrs = run_attrs

            f['meta'] = meta

            if time_index is not None:
                f['time_index'] = time_index

            for dset in dsets:
                if dset not in ('meta', 'time_index'):
                    # initialize each dset to disk
                    f._create_dset(dset, shapes[dset], dtypes[dset],
                                   chunks=chunks[dset], attrs=attrs[dset])

            if configs is not None:
                f.set_configs(configs)
                logger.debug("\t- Configurations saved as attributes "
                             "on 'meta'")

        logger.debug('Output file has been initialized.')
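    # Example (minimal sketch; names and values are illustrative): initialize
    # a file with meta, time_index, and one empty spatiotemporal dataset that
    # can be filled in later, e.g. by workers writing slices with __setitem__.
    #
    # >>> dsets = ['cf_profile']
    # >>> shapes = {'cf_profile': (8760, 100)}
    # >>> attrs = {'cf_profile': {'scale_factor': 1000, 'units': 'unitless'}}
    # >>> chunks = {'cf_profile': (None, 100)}
    # >>> dtypes = {'cf_profile': np.uint16}
    # >>> Outputs.init_h5('init.h5', dsets, shapes, attrs, chunks, dtypes,
    # >>>                 meta, time_index=time_index)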