# -*- coding: utf-8 -*-
"""
Classes to handle h5 output files.
"""
import json
import logging
import numpy as np
import pandas as pd
import time
import sys
import click
import h5py
import h5pyd
import scipy
from rex.version import __version__
from rex.utilities.exceptions import (HandlerRuntimeError, HandlerValueError,
ResourceKeyError)
from rex.resource import BaseResource
from rex.utilities.parse_keys import parse_keys, parse_slice
from rex.utilities.utilities import to_records_array
logger = logging.getLogger(__name__)
class Outputs(BaseResource):
"""
Base class to handle output data in .h5 format
Examples
--------
The Outputs handler can be used to initialize h5 files in the standard
reV/rex resource data format.
>>> from rex import Outputs
>>> import pandas as pd
>>> import numpy as np
>>>
>>> meta = pd.DataFrame({'latitude': np.ones(100),
>>> 'longitude': np.ones(100)})
>>>
>>> time_index = pd.date_range('20210101', '20220101', freq='1h',
>>> closed='right')
>>>
>>> with Outputs('test.h5', 'w') as f:
>>> f.meta = meta
>>> f.time_index = time_index
You can also use the Outputs handler to read output h5 files from disk.
The Outputs handler will automatically parse the meta data and time index
into the expected pandas objects (DataFrame and DatetimeIndex,
respectively).
>>> with Outputs('test.h5') as f:
>>> print(f.meta.head())
>>>
latitude longitude
gid
0 1.0 1.0
1 1.0 1.0
2 1.0 1.0
3 1.0 1.0
4 1.0 1.0
>>> with Outputs('test.h5') as f:
>>> print(f.time_index)
DatetimeIndex(['2021-01-01 01:00:00+00:00', '2021-01-01 02:00:00+00:00',
'2021-01-01 03:00:00+00:00', '2021-01-01 04:00:00+00:00',
'2021-01-01 05:00:00+00:00', '2021-01-01 06:00:00+00:00',
'2021-01-01 07:00:00+00:00', '2021-01-01 08:00:00+00:00',
'2021-01-01 09:00:00+00:00', '2021-01-01 10:00:00+00:00',
...
'2021-12-31 15:00:00+00:00', '2021-12-31 16:00:00+00:00',
'2021-12-31 17:00:00+00:00', '2021-12-31 18:00:00+00:00',
'2021-12-31 19:00:00+00:00', '2021-12-31 20:00:00+00:00',
'2021-12-31 21:00:00+00:00', '2021-12-31 22:00:00+00:00',
'2021-12-31 23:00:00+00:00', '2022-01-01 00:00:00+00:00'],
dtype='datetime64[ns, UTC]', length=8760, freq=None)
There are a few ways to use the Outputs handler to write data to a file.
Here is one example using the pre-initialized file we created earlier.
Note that the Outputs handler will automatically scale float data using
the "scale_factor" attribute. The Outputs handler will unscale the data
while being read unless the unscale kwarg is explicityly set to False.
This behavior is intended to reduce disk storage requirements for big
data and can be disabled by setting dtype=np.float32 or dtype=np.float64
when writing data.
>>> Outputs.add_dataset(h5_file='test.h5', dset_name='dset1',
>>> dset_data=np.ones((8760, 100)) * 42.42,
>>> attrs={'scale_factor': 100}, dtype=np.int32)
>>> with Outputs('test.h5') as f:
>>> print(f['dset1'])
>>> print(f['dset1'].dtype)
[[42.42 42.42 42.42 ... 42.42 42.42 42.42]
[42.42 42.42 42.42 ... 42.42 42.42 42.42]
[42.42 42.42 42.42 ... 42.42 42.42 42.42]
...
[42.42 42.42 42.42 ... 42.42 42.42 42.42]
[42.42 42.42 42.42 ... 42.42 42.42 42.42]
[42.42 42.42 42.42 ... 42.42 42.42 42.42]]
float32
>>> with Outputs('test.h5', unscale=False) as f:
>>> print(f['dset1'])
>>> print(f['dset1'].dtype)
[[4242 4242 4242 ... 4242 4242 4242]
[4242 4242 4242 ... 4242 4242 4242]
[4242 4242 4242 ... 4242 4242 4242]
...
[4242 4242 4242 ... 4242 4242 4242]
[4242 4242 4242 ... 4242 4242 4242]
[4242 4242 4242 ... 4242 4242 4242]]
int32
Note that the Outputs handler is specifically designed to read and
write spatiotemporal data. It is therefore important to initialize the
meta data and time index objects even if your data is only spatial or
only temporal. Furthermore, the Outputs handler will always assume that
1D datasets match either the meta data shape (scalar, non-timeseries
data) or the time index shape (single-site timeseries data), and that
2D datasets represent spatiotemporal data whose shape corresponds to
(len(time_index), len(meta)). You can see these constraints here:
>>> Outputs.add_dataset(h5_file='test.h5', dset_name='bad_shape',
>>>                     dset_data=np.ones((1, 100)) * 42.42,
>>>                     attrs={'scale_factor': 100}, dtype=np.int32)
HandlerValueError: 2D dataset "bad_shape" with shape (1, 100) is not of
the proper spatiotemporal shape: (8760, 100)
>>> Outputs.add_dataset(h5_file='test.h5', dset_name='bad_shape',
>>>                     dset_data=np.ones((50,)) * 42.42,
>>>                     attrs={'scale_factor': 100}, dtype=np.int32)
HandlerValueError: 1D dataset "bad_shape" with shape (50,) is not of the
proper spatial (100,) or temporal (8760,) shape!
"""
def __init__(self, h5_file, mode='r', unscale=True, str_decode=True,
group=None):
"""
Parameters
----------
h5_file : str
Path to .h5 resource file
mode : str, optional
Mode to instantiate h5py.File instance, by default 'r'
unscale : bool, optional
Boolean flag to automatically unscale variables on extraction,
by default True
str_decode : bool, optional
Boolean flag to decode the bytestring meta data into normal
strings. Setting this to False will speed up the meta data read,
by default True
group : str, optional
Group within .h5 resource file to open, by default None
"""
super().__init__(h5_file, unscale=unscale, hsds=False,
str_decode=str_decode, group=group, mode=mode)
self._mode = mode
self._group = self._check_group(group)
self._shape = None
if self.writable:
self.set_version_attr()
def __len__(self):
_len = 0
if 'meta' in self.datasets:
_len = self.h5['meta'].shape[0]
return _len
def __setitem__(self, keys, arr):
if self.writable:
ds, ds_slice = parse_keys(keys)
slice_test = False
if isinstance(ds_slice, tuple):
slice_test = ds_slice[0] == slice(None, None, None)
if ds.endswith('meta') and slice_test:
self._set_meta(ds, arr)
elif ds.endswith('time_index') and slice_test:
self._set_time_index(ds, arr)
else:
self._set_ds_array(ds, arr, ds_slice)
@property
def full_version_record(self):
"""Get record of versions for dependencies
Returns
-------
dict
Dictionary of package versions for dependencies
"""
versions = {'rex': __version__,
'pandas': pd.__version__,
'numpy': np.__version__,
'python': sys.version,
'click': click.__version__,
'h5py': h5py.__version__,
'h5pyd': h5pyd.__version__,
'scipy': scipy.__version__
}
return versions
def set_version_attr(self):
"""Set the version attribute to the h5 file."""
self.h5.attrs['version'] = __version__
self.h5.attrs['full_version_record'] = json.dumps(
self.full_version_record)
self.h5.attrs['package'] = 'rex'
@property
def version(self):
"""
Version of package used to create file
Returns
-------
str
"""
return self.h5.attrs['version']
@property
def package(self):
"""
Package used to create file
Returns
-------
str
"""
return self.h5.attrs['package']
@property
def source(self):
"""
Package and version used to create file
Returns
-------
str
"""
out = ("{}_{}"
.format(self.h5.attrs['package'], self.h5.attrs['version']))
return out
@property
def shape(self):
"""
Variable array shape from time_index and meta
Returns
-------
tuple
shape of variable arrays == (time, locations)
"""
if self._shape is None:
dsets = self.datasets
if 'meta' in dsets:
self._shape = self.h5['meta'].shape
if 'time_index' in dsets:
self._shape = self.h5['time_index'].shape + self._shape
return self._shape
@property
def writable(self):
"""
Check to see if h5py.File instance is writable
Returns
-------
is_writable : bool
Flag if mode is writable
"""
is_writable = True
mode = ['a', 'w', 'w-', 'x']
if self._mode not in mode:
is_writable = False
return is_writable
@BaseResource.meta.setter # pylint: disable-msg=E1101
def meta(self, meta):
"""
Write meta data to disk, convert type if necessary
Parameters
----------
meta : pandas.DataFrame | numpy.recarray
Locational meta data
"""
self._set_meta('meta', meta)
@BaseResource.time_index.setter # pylint: disable-msg=E1101
def time_index(self, time_index):
"""
Write time_index to disk, convert type if necessary
Parameters
----------
time_index : pandas.DatetimeIndex | ndarray
Temporal index of timesteps
"""
self._set_time_index('time_index', time_index)
@property
def SAM_configs(self):
"""
SAM configuration JSONs used to create CF profiles
Returns
-------
configs : dict
Dictionary of SAM configuration JSONs
"""
if 'meta' in self.datasets:
configs = {k: json.loads(v)
for k, v in self.h5['meta'].attrs.items()}
else:
configs = {}
return configs
@property
def run_attrs(self):
"""
Runtime attributes stored at the global (file) level
Returns
-------
global_attrs : dict
"""
return self.global_attrs
@run_attrs.setter
def run_attrs(self, run_attrs):
"""
Set runtime attributes as global (file) attributes
Parameters
----------
run_attrs : dict
Dictionary of runtime attributes (args, kwargs)
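Examples
--------
A minimal sketch (attribute names and values are illustrative; assumes
a writable file like the class examples above):

>>> with Outputs('test.h5', mode='a') as f:
>>>     f.run_attrs = {'config': 'run_1', 'year': 2021}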
"""
if self.writable:
for k, v in run_attrs.items():
self.h5.attrs[k] = v
@staticmethod
def _check_data_dtype(dset_name, data, dtype, attrs=None):
"""
Check data dtype and scale if needed
Parameters
----------
dset_name : str
Name of dataset being written to disk
data : ndarray
Data to be written to disc
dtype : str
dtype of data on disc
attrs : dict, optional
Attributes to be set. May include 'scale_factor',
by default None
Returns
-------
data : ndarray
Data ready for writing to disc:
- Scaled and converted to dtype
"""
if attrs is None:
attrs = {}
scale_factor = attrs.get('scale_factor', None)
scale = (scale_factor is not None
and not np.issubdtype(data.dtype, np.integer))
if scale:
if scale_factor != 1 and not np.issubdtype(dtype, np.integer):
msg = ('Output dtype for "{}" must be an integer in '
'order to apply scale factor {}.'
.format(dset_name, scale_factor))
logger.error(msg)
raise HandlerRuntimeError(msg)
data_type_differs = not np.issubdtype(data.dtype, np.dtype(dtype))
is_integer = np.issubdtype(dtype, np.integer)
if data_type_differs and is_integer:
# apply scale factor and dtype
data = np.round(data * scale_factor).astype(dtype)
elif (not np.issubdtype(data.dtype, np.dtype(dtype))
and not np.issubdtype(np.dtype(dtype), np.floating)):
msg = ('A scale_factor is needed to scale '
'"{}" of type "{}" to "{}".'
.format(dset_name, data.dtype, dtype))
raise HandlerRuntimeError(msg)
return data
def _check_group(self, group):
"""
Ensure group is in .h5 file
Parameters
----------
group : str
Group of interest
"""
if group is not None:
if group not in self._h5:
try:
if self.writable:
self._h5.create_group(group)
except Exception as ex:
msg = ('Cannot create group {}: {}'
.format(group, ex))
raise HandlerRuntimeError(msg) from ex
return group
def _set_meta(self, ds, meta, attrs=None):
"""
Write meta data to disk
Parameters
----------
ds : str
meta dataset name
meta : pandas.DataFrame | numpy.recarray
Locational meta data
attrs : dict
Attributes to add to the meta data dataset
"""
# pylint: disable=attribute-defined-outside-init
self._meta = meta
if isinstance(meta, pd.DataFrame):
meta = to_records_array(meta)
if ds in self.datasets:
self.update_dset(ds, meta)
else:
self._create_dset(ds, meta.shape, meta.dtype, data=meta,
attrs=attrs)
def _set_time_index(self, ds, time_index, attrs=None):
"""
Write time index to disk
Parameters
----------
ds : str
time index dataset name
time_index : pandas.DatetimeIndex | ndarray
Temporal index of timesteps
attrs : dict
Attributes to add to the meta data dataset
"""
# pylint: disable=attribute-defined-outside-init
self._time_index = time_index
if isinstance(time_index, pd.DatetimeIndex):
time_index = time_index.astype(str)
dtype = "S{}".format(len(time_index[0]))
time_index = np.array(time_index, dtype=dtype)
if ds in self.datasets:
self.update_dset(ds, time_index)
else:
self._create_dset(ds, time_index.shape, time_index.dtype,
data=time_index, attrs=attrs)
def get_config(self, config_name):
"""
Get SAM config
Parameters
----------
config_name : str
Name of config
Returns
-------
config : dict
SAM config JSON as a dictionary
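Examples
--------
A minimal sketch (assumes a config named 'default' was previously
saved to 'test.h5' with set_configs; names are illustrative):

>>> with Outputs('test.h5') as f:
>>>     config = f.get_config('default')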
"""
if 'meta' in self.datasets:
config = json.loads(self.h5['meta'].attrs[config_name])
else:
config = None
return config
def set_configs(self, SAM_configs):
"""
Set SAM configuration JSONs as attributes of 'meta'
Parameters
----------
SAM_configs : dict
Dictionary of SAM configuration JSONs
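Examples
--------
A minimal sketch (assumes 'meta' has already been written to the file;
the config name and contents are illustrative):

>>> sam_configs = {'default': {'system_capacity': 20000}}
>>> with Outputs('test.h5', mode='a') as f:
>>>     f.set_configs(sam_configs)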
"""
if self.writable:
for key, config in SAM_configs.items():
if isinstance(config, dict):
config = json.dumps(config)
if not isinstance(key, str):
key = str(key)
self.h5['meta'].attrs[key] = config
def _set_ds_array(self, ds_name, arr, ds_slice):
"""
Write ds to disk
Parameters
----------
ds_name : str
Dataset name
arr : ndarray
Dataset data array
ds_slice : tuple
Dataset slicing that corresponds to arr
"""
if ds_name not in self.datasets:
msg = '{} must be initialized!'.format(ds_name)
raise HandlerRuntimeError(msg)
dtype = self.h5[ds_name].dtype
attrs = self.get_attrs(ds_name)
ds_slice = parse_slice(ds_slice)
self.h5[ds_name][ds_slice] = self._check_data_dtype(
ds_name, arr, dtype, attrs=attrs)
def _check_chunks(self, chunks, data=None):
"""
Convert dataset chunk size into valid tuple based on variable array
shape
Parameters
----------
chunks : tuple
Desired dataset chunk size
data : ndarray
Dataset array being chunked
Returns
-------
ds_chunks : tuple | None
dataset chunk size
"""
if chunks is None:
return None
if data is not None:
shape = data.shape
else:
shape = self.shape
if len(shape) != len(chunks):
msg = ('Shape dimensions ({}) are not the same length as chunks '
'({}). Please provide a single chunk value for each '
'dimension!'
.format(shape, chunks))
logger.error(msg)
raise HandlerRuntimeError(msg)
return tuple(np.min((s, s if c is None else c))
for s, c in zip(shape, chunks))
def _create_dset(self, ds_name, shape, dtype, chunks=None, attrs=None,
data=None, replace=True):
"""
Initialize dataset
Parameters
----------
ds_name : str
Dataset name
shape : tuple
Dataset shape
dtype : str
Dataset numpy dtype
chunks : tuple
Dataset chunk size
attrs : dict
Dataset attributes
data : ndarray
Dataset data array
replace : bool
If a dataset with the same name already exists, it will be replaced.
"""
ds = None
if self.writable:
if ds_name in self.datasets and replace:
del self.h5[ds_name]
elif ds_name in self.datasets:
old_shape, old_dtype, _ = self.get_dset_properties(ds_name)
if old_shape != shape or old_dtype != dtype:
e = ('Trying to create dataset "{}", but already exists '
'with mismatched shape and dtype. New shape/dtype '
'is {}/{}, previous shape/dtype is {}/{}'
.format(ds_name, shape, dtype, old_shape, old_dtype))
logger.error(e)
raise HandlerRuntimeError(e)
if ds_name not in self.datasets:
chunks = self._check_chunks(chunks, data=data)
try:
ds = self.h5.create_dataset(ds_name, shape=shape,
dtype=dtype, chunks=chunks)
except Exception as e:
msg = ('Could not create dataset "{}" in file!'
.format(ds_name))
logger.error(msg)
raise IOError(msg) from e
if attrs is not None:
self._create_ds_attrs(ds, ds_name, attrs)
if data is not None:
ds[...] = data
@staticmethod
def _create_ds_attrs(ds, ds_name, attrs):
"""Create dataset attributes.
Parameters
----------
ds : h5py.Dataset
Dataset object to write attributes to.
ds_name : str
Dataset name for logging / debugging
attrs : dict | None
Dataset attributes to write (None if no attributes to write).
"""
if attrs is not None:
for key, value in attrs.items():
try:
ds.attrs[key] = value
except Exception as e:
msg = ('Could not save dataset "{}" attribute "{}" '
'to value: {}'.format(ds_name, key, value))
logger.error(msg)
raise IOError(msg) from e
def _check_dset_shape(self, dset_name, dset_data):
"""
Check to ensure that dataset array is of the proper shape
Parameters
----------
dset_name : str
Dataset name being written to disk.
dset_data : ndarray
Dataset data array
"""
dset_shape = dset_data.shape
if len(dset_shape) == 1:
possible_shapes = {}
try:
possible_shapes["spatial"] = (len(self.meta),)
except ResourceKeyError:
pass
try:
possible_shapes["temporal"] = (len(self.time_index),)
except ResourceKeyError:
pass
if not possible_shapes:
msg = ("Please load either 'meta' or 'time_index' before "
"loading a 1D dataset.")
logger.error(msg)
raise HandlerRuntimeError(msg)
if dset_shape not in possible_shapes.values():
possible_shapes_str = " or ".join(["{} {}".format(k, v)
for k, v
in possible_shapes.items()])
msg = ('1D dataset "{}" with shape {} is not of '
'the proper {} shape!'
.format(dset_name, dset_shape, possible_shapes_str))
logger.error(msg)
raise HandlerValueError(msg)
else:
shape = self.shape
if shape:
if dset_shape != shape:
msg = ('2D dataset "{}" with shape {} is not of the '
'proper spatiotemporal shape: {}'
.format(dset_name, dset_shape, shape))
logger.error(msg)
raise HandlerValueError(msg)
else:
msg = ("'meta' and 'time_index' have not been loaded")
logger.error(msg)
raise HandlerRuntimeError(msg)
def _add_dset(self, dset_name, data, dtype, chunks=None, attrs=None):
"""
Write dataset to disk. Dataset is created in the .h5 file and data is
scaled if needed.
Parameters
----------
dset_name : str
Name of dataset to be added to h5 file.
data : ndarray
Data to be added to h5 file.
dtype : str
Intended dataset datatype after scaling.
chunks : tuple
Chunk size for the dataset on disk.
attrs : dict
Attributes to be set. May include 'scale_factor'.
"""
self._check_dset_shape(dset_name, data)
data = self._check_data_dtype(dset_name, data, dtype, attrs=attrs)
self._create_dset(dset_name, data.shape, dtype,
chunks=chunks, attrs=attrs, data=data)
def update_dset(self, dset, dset_array, dset_slice=None):
"""
Check to see if dset needs to be updated on disk.
If so, write dset_array to disk.
Parameters
----------
dset : str
dataset to update
dset_array : ndarray
dataset array
dset_slice : tuple
Slice of dataset to update; if None, update all
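Examples
--------
A minimal sketch updating the 'dset1' dataset written in the class
examples above (values are illustrative):

>>> new_vals = np.ones((8760, 100)) * 43.43
>>> with Outputs('test.h5', mode='a') as f:
>>>     f.update_dset('dset1', new_vals)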
"""
if dset_slice is None:
dset_slice = (slice(None, None, None), )
keys = (dset, ) + dset_slice
arr = self.__getitem__(keys)
if not np.array_equal(arr, dset_array):
self._set_ds_array(dset, dset_array, dset_slice)
def write_dataset(self, dset_name, data, dtype, chunks=None, attrs=None):
"""
Write dataset to disk. Dataset is created in the .h5 file and data is
scaled if needed.
Parameters
----------
dset_name : str
Name of dataset to be added to h5 file.
data : ndarray
Data to be added to h5 file.
dtype : str
Intended dataset datatype after scaling.
chunks : tuple
Chunk size for the dataset on disk.
attrs : dict
Attributes to be set. May include 'scale_factor'.
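Examples
--------
A minimal sketch (assumes 'meta' and 'time_index' have already been
written to the file; names and values are illustrative):

>>> with Outputs('test.h5', mode='a') as f:
>>>     f.write_dataset('dset2', np.ones((8760, 100)) * 13.13,
>>>                     dtype=np.int32, attrs={'scale_factor': 100})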
"""
self._add_dset(dset_name, data, dtype, chunks=chunks, attrs=attrs)
@classmethod
def write_profiles(cls, h5_file, meta, time_index, dset_name, profiles,
dtype, attrs=None, SAM_configs=None, chunks=(None, 100),
unscale=True, mode='w-', str_decode=True, group=None):
"""
Write profiles to disk
Parameters
----------
h5_file : str
Path to .h5 resource file
meta : pandas.Dataframe
Locational meta data
time_index : pandas.DatetimeIndex
Temporal timesteps
dset_name : str
Name of the target dataset (should identify the profiles).
profiles : ndarray
output result timeseries profiles
dtype : str
Intended dataset datatype after scaling.
attrs : dict, optional
Attributes to be set. May include 'scale_factor', by default None
SAM_configs : dict, optional
Dictionary of SAM configuration JSONs used to compute the profiles,
by default None
chunks : tuple, optional
Chunk size for the profiles dataset,
by default (None, 100)
unscale : bool, optional
Boolean flag to automatically unscale variables on extraction,
by default True
mode : str, optional
Mode to instantiate h5py.File instance, by default 'w-'
str_decode : bool, optional
Boolean flag to decode the bytestring meta data into normal
strings. Setting this to False will speed up the meta data read,
by default True
group : str, optional
Group within .h5 resource file to open, by default None
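Examples
--------
A minimal sketch (meta and time_index as in the class examples above;
the file name, dataset name, and values are illustrative):

>>> profiles = np.ones((8760, 100)) * 42.42
>>> Outputs.write_profiles('profiles.h5', meta, time_index, 'cf_profile',
>>>                        profiles, np.int32,
>>>                        attrs={'scale_factor': 1000})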
"""
logger.info("Saving profiles ({}) to {}".format(dset_name, h5_file))
if profiles.shape != (len(time_index), len(meta)):
raise HandlerValueError("Profile dimensions does not match"
"'time_index' and 'meta'")
ts = time.time()
kwargs = {"unscale": unscale, "mode": mode, "str_decode": str_decode,
"group": group}
with cls(h5_file, **kwargs) as f:
# Save time index
f['time_index'] = time_index
logger.debug("\t- 'time_index' saved to disc")
# Save meta
f['meta'] = meta
logger.debug("\t- 'meta' saved to disc")
# Add SAM configurations as attributes to meta
if SAM_configs is not None:
f.set_configs(SAM_configs)
logger.debug("\t- SAM configurations saved as attributes "
"on 'meta'")
# Write dset to disk
f._add_dset(dset_name, profiles, dtype,
chunks=chunks, attrs=attrs)
logger.debug("\t- '{}' saved to disc".format(dset_name))
tt = (time.time() - ts) / 60
logger.info('{} is complete'.format(h5_file))
logger.debug('\t- Saving to disc took {:.4f} minutes'
.format(tt))
@classmethod
def write_means(cls, h5_file, meta, dset_name, means, dtype, attrs=None,
SAM_configs=None, chunks=None, unscale=True, mode='w-',
str_decode=True, group=None):
"""
Write means array to disk
Parameters
----------
h5_file : str
Path to .h5 resource file
meta : pandas.Dataframe
Locational meta data
dset_name : str
Name of the target dataset (should identify the means).
means : ndarray
output means array.
dtype : str
Intended dataset datatype after scaling.
attrs : dict, optional
Attributes to be set. May include 'scale_factor', by default None
SAM_configs : dict, optional
Dictionary of SAM configuration JSONs used to compute cf means,
by default None
chunks : tuple, optional
Chunk size for capacity factor means dataset, by default None
unscale : bool, optional
Boolean flag to automatically unscale variables on extraction,
by default True
mode : str, optional
Mode to instantiate h5py.File instance, by default 'w-'
str_decode : bool, optional
Boolean flag to decode the bytestring meta data into normal
strings. Setting this to False will speed up the meta data read,
by default True
group : str, optional
Group within .h5 resource file to open, by default None
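Examples
--------
A minimal sketch (meta as in the class examples above; the file name,
dataset name, and values are illustrative):

>>> means = np.ones(100) * 0.42
>>> Outputs.write_means('means.h5', meta, 'cf_mean', means, np.int32,
>>>                     attrs={'scale_factor': 1000})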
"""
logger.info("Saving means ({}) to {}".format(dset_name, h5_file))
if len(means) != len(meta):
msg = 'Number of means does not match meta'
raise HandlerValueError(msg)
ts = time.time()
kwargs = {"unscale": unscale, "mode": mode, "str_decode": str_decode,
"group": group}
with cls(h5_file, **kwargs) as f:
# Save meta
f['meta'] = meta
logger.debug("\t- 'meta' saved to disc")
# Add SAM configurations as attributes to meta
if SAM_configs is not None:
f.set_configs(SAM_configs)
logger.debug("\t- SAM configurations saved as attributes "
"on 'meta'")
# Write dset to disk
f._add_dset(dset_name, means, dtype,
chunks=chunks, attrs=attrs)
logger.debug("\t- '{}' saved to disc".format(dset_name))
tt = (time.time() - ts) / 60
logger.info('{} is complete'.format(h5_file))
logger.debug('\t- Saving to disc took {:.4f} minutes'
.format(tt))
@classmethod
def add_dataset(cls, h5_file, dset_name, dset_data, dtype, attrs=None,
chunks=None, unscale=True, mode='a', str_decode=True,
group=None):
"""
Add dataset to h5_file
Parameters
----------
h5_file : str
Path to .h5 resource file
dset_name : str
Name of dataset to be added to h5 file
dset_data : ndarray
Data to be added to h5 file
dtype : str
Intended dataset datatype after scaling.
attrs : dict, optional
Attributes to be set. May include 'scale_factor', by default None
chunks : tuple, optional
Chunk size for the dataset on disk, by default None
unscale : bool, optional
Boolean flag to automatically unscale variables on extraction,
by default True
mode : str, optional
Mode to instantiate h5py.File instance, by default 'a'
str_decode : bool, optional
Boolean flag to decode the bytestring meta data into normal
strings. Setting this to False will speed up the meta data read,
by default True
group : str, optional
Group within .h5 resource file to open, by default None
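Examples
--------
A minimal sketch adding a 1D (per-site) dataset to the file from the
class examples above (names and values are illustrative):

>>> Outputs.add_dataset(h5_file='test.h5', dset_name='dset2',
>>>                     dset_data=np.ones(100) * 0.5,
>>>                     attrs={'scale_factor': 1000}, dtype=np.int32)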
"""
logger.info("Adding {} to {}".format(dset_name, h5_file))
ts = time.time()
kwargs = {"unscale": unscale, "mode": mode, "str_decode": str_decode,
"group": group}
with cls(h5_file, **kwargs) as f:
f._add_dset(dset_name, dset_data, dtype,
chunks=chunks, attrs=attrs)
tt = (time.time() - ts) / 60
logger.info('{} added'.format(dset_name))
logger.debug('\t- Saving to disc took {:.4f} minutes'
.format(tt))
@classmethod
def init_h5(cls, h5_file, dsets, shapes, attrs, chunks, dtypes,
meta, time_index=None, configs=None, unscale=True, mode='w',
str_decode=True, group=None, run_attrs=None):
"""Init a full output file with the final intended shape without data.
Parameters
----------
h5_file : str
Full h5 output filepath.
dsets : list
List of strings of dataset names to initialize (does not include
meta or time_index).
shapes : dict
Dictionary of dataset shapes (keys correspond to dsets).
attrs : dict
Dictionary of dataset attributes (keys correspond to dsets).
chunks : dict
Dictionary of chunk tuples (keys correspond to dsets).
dtypes : dict
dictionary of numpy datatypes (keys correspond to dsets).
meta : pd.DataFrame
Full meta data.
time_index : pd.DatetimeIndex | None
Full pandas datetime index. None implies that only 1D results
(no site profiles) are being written.
configs : dict | None
Optional input configs to set as attr on meta.
unscale : bool
Boolean flag to automatically unscale variables on extraction
mode : str
Mode to instantiate h5py.File instance
str_decode : bool
Boolean flag to decode the bytestring meta data into normal
strings. Setting this to False will speed up the meta data read.
group : str
Group within .h5 resource file to open
run_attrs : dict | NoneType
Runtime attributes (args, kwargs) to add as global (file)
attributes
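Examples
--------
A minimal sketch initializing one empty spatiotemporal dataset (all
names, shapes, chunks, and attributes are illustrative; meta and
time_index as in the class examples above):

>>> dsets = ['cf_profile']
>>> shapes = {'cf_profile': (8760, 100)}
>>> attrs = {'cf_profile': {'scale_factor': 1000, 'units': 'unitless'}}
>>> chunks = {'cf_profile': (None, 100)}
>>> dtypes = {'cf_profile': 'int32'}
>>> Outputs.init_h5('init.h5', dsets, shapes, attrs, chunks, dtypes,
>>>                 meta, time_index=time_index)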
"""
logger.debug("Initializing output file: {}".format(h5_file))
kwargs = {"unscale": unscale, "mode": mode, "str_decode": str_decode,
"group": group}
with cls(h5_file, **kwargs) as f:
if run_attrs is not None:
f.run_attrs = run_attrs
f['meta'] = meta
if time_index is not None:
f['time_index'] = time_index
for dset in dsets:
if dset not in ('meta', 'time_index'):
# initialize each dset to disk
f._create_dset(dset, shapes[dset], dtypes[dset],
chunks=chunks[dset], attrs=attrs[dset])
if configs is not None:
f.set_configs(configs)
logger.debug("\t- Configurations saved as attributes "
"on 'meta'")
logger.debug('Output file has been initialized.')