Source code for rex.multi_year_resource

# -*- coding: utf-8 -*-
"""
Classes to handle multiple years of resource data

Data split by time in chunks of less than a year can be opened by the
MultiTimeResource class, but not by these classes.
"""
import pandas as pd
import numpy as np
import os

from rex.multi_file_resource import MULTI_FILE_CLASS_MAP, MultiFileResource
from rex.multi_time_resource import MultiTimeH5, MultiTimeResource
from rex.renewable_resource import (NSRDB, SolarResource, WindResource,
                                    WaveResource)
from rex.resource import Resource
from rex.utilities.parse_keys import parse_slice
from rex.utilities.utilities import parse_year



[docs]
class MultiYearH5(MultiTimeH5):
    """
    Class to handle multiple years of h5 Resources
    """

    def __init__(self, h5_path, years=None, res_cls=Resource, hsds=False,
                 hsds_kwargs=None, **res_cls_kwargs):
        """
        Parameters
        ----------
        h5_path : str
            Unix shell style pattern path with * wildcards to multi-file
            resource file sets. Files must have the same coordinates
            but can have different datasets or time indexes.
        years : list, optional
            List of integer years to access, by default None
        res_cls : obj
            Resource class to use to open and access resource data
        hsds : bool
            Boolean flag to use h5pyd to handle .h5 'files' hosted on AWS
            behind HSDS
        hsds_kwargs : dict, optional
            Dictionary of optional kwargs for h5pyd, e.g., bucket, username,
            password, by default None
        **res_cls_kwargs
            Additional keyword arguments forwarded to the resource handlers
            created in _map_file_instances.
        """
        self.h5_path = h5_path
        self._file_paths = self._get_file_paths(h5_path, hsds=hsds,
                                                hsds_kwargs=hsds_kwargs)
        self._file_paths, self._years = self._get_years(self._file_paths,
                                                        years)
        res_cls_kwargs.update({'hsds': hsds})
        self._h5_map = self._map_file_instances(self._file_paths,
                                                self._years,
                                                res_cls=res_cls,
                                                **res_cls_kwargs)

        # The file map is sorted by year, so read both the years and the
        # file paths back from it. This keeps self.files and self.years in
        # the same order even when the input paths were not in year order.
        self._years = self._h5_map['year'].values.tolist()
        self._file_paths = self._h5_map['fp'].values.tolist()

        self._datasets = None
        self._shape = None
        self._time_index = None
        self._i = 0

    def __repr__(self):
        """Summarize the handler: class name, source path, number of years"""
        template = "{} for {}:\n Contains data for {} years"
        n_years = len(self)

        return template.format(self.__class__.__name__, self.h5_path, n_years)

    def __len__(self):
        """Number of distinct years (a year split over files counts once)"""
        distinct_years = set(self.years)

        return len(distinct_years)

    def __getitem__(self, year):
        """
        Get the resource handler mapped to a year

        Parameters
        ----------
        year : int | str
            Year of interest, strings are converted with int()

        Returns
        -------
        h5 : obj
            Handler stored in the first file map row matching the year

        Raises
        ------
        ValueError
            If the year is not in the file map
        """
        target = int(year) if isinstance(year, str) else year
        map_years = self._h5_map['year']

        if target not in map_years.values:
            raise ValueError('{} is invalid, must be one of: {}'
                             .format(target, self.years))

        # A year can occupy several rows; they all hold the same handler
        row = np.where(map_years == target)[0][0]

        return self._h5_map.at[row, 'h5']

    def __iter__(self):
        """
        Start a new iteration over self.years

        The instance is its own iterator and keeps its position in self._i.
        That position is only reset by __next__ once the years run out, so
        it is also reset here: a loop that was left early (break, return,
        exception) must not make the next loop resume part-way through.

        Returns
        -------
        self
        """
        self._i = 0

        return self

    def __next__(self):
        """
        Return the year at the current position and advance the cursor

        Raises
        ------
        StopIteration
            When every entry of self.years has been returned; the cursor is
            rewound to 0 before raising
        """
        if self._i < len(self.years):
            current = self.years[self._i]
            self._i += 1

            return current

        self._i = 0
        raise StopIteration

    def __contains__(self, year):
        """Check if year is one of the entries of self.years"""
        available = self.years

        return year in available

    @property
    def years(self):
        """
        Years handled by this instance

        After __init__ this is the 'year' column of the internal file map,
        which is sorted by year: one entry per mapped file in ascending
        order, so a year repeats when several files share it.

        Returns
        -------
        list
            List of integer years
        """
        return self._years

    @property
    def files(self):
        """
        File paths handled by this instance

        NOTE(review): intended to be ordered the same way as self.years.
        This returns the path list stored by __init__, while self.years is
        read from the year-sorted file map -- confirm the two stay aligned.

        Returns
        -------
        list
            List of file paths
        """
        return self._file_paths

    @property
    def time_index(self):
        """
        Multi-year datetime index

        Built on first access by appending the time_index of every distinct
        handler in the file map, in map order, and cached afterwards.

        Returns
        -------
        pandas.DatetimeIndex
        """
        if self._time_index is None:
            time_index = None
            for h5 in self._h5_map['h5'].unique():
                if time_index is None:
                    time_index = h5.time_index
                else:
                    time_index = time_index.append(h5.time_index)

            # Cache only the finished index. If reading one handler raises,
            # nothing has been stored, so the next access rebuilds it
            # instead of returning a truncated index.
            self._time_index = time_index

        return self._time_index

    @staticmethod
    def _map_file_instances(file_paths, years, res_cls=Resource,
                            **res_cls_kwargs):
        """
        Open all .h5 files and map the open resource instances to the
        associated file paths and years

        Parameters
        ----------
        file_paths : list
            List of filepaths for this handler to handle.
        years : list
            List of years, one per entry of file_paths.
        res_cls : obj
            Resource class used to open a file when every year has exactly
            one file. When a year has several files, the handler class is
            looked up in MULTI_FILE_CLASS_MAP by res_cls (falling back to
            MultiFileResource) and receives all files of that year.
        **res_cls_kwargs
            Keyword arguments passed to the handler class. 'hsds' is
            dropped for multi-file handlers.

        Returns
        -------
        h5_map : pd.DataFrame
            DataFrame sorted by year mapping file paths to open resource
            instances and datasets per file (columns: fp, year, h5, and
            dsets)
        """

        h5_map = pd.DataFrame({'fp': file_paths, 'year': years, 'h5': None})
        h5_map = h5_map.sort_values('year').reset_index(drop=True)

        if len(h5_map['year'].unique()) < len(h5_map):
            # no multi file res on hsds; pop() rather than del so a direct
            # call without an 'hsds' kwarg does not raise KeyError
            res_cls_kwargs.pop('hsds', None)
            handle = MULTI_FILE_CLASS_MAP.get(res_cls, MultiFileResource)
            for _, subdf in h5_map.groupby('year'):
                fps = subdf['fp'].values.tolist()
                h5 = handle(fps, **res_cls_kwargs)
                # every row of the year shares the same handler instance
                for i in subdf.index:
                    h5_map.at[i, 'h5'] = h5

        else:
            # index was reset above, so enumerate positions are row labels
            for i, f_path in enumerate(h5_map['fp']):
                h5_map.at[i, 'h5'] = res_cls(f_path, **res_cls_kwargs)

        h5_map['dsets'] = [h5.dsets for h5 in h5_map['h5'].values]

        return h5_map

    @staticmethod
    def _get_years(file_paths, years):
        """Reduce file path list to requested years and/or return list of
        years corresponding to file paths

        Parameters
        ----------
        file_paths : list
            List of filepaths for this handler to handle.
        years : list | None
            List of years of interest. Should be a subset of the years
            parsed from the file names. Can also be None for all years
            found by the h5_path input. Repeated entries are treated as a
            single request for that year.

        Returns
        -------
        file_paths : list
            List of filepaths for this handler to handle. When years are
            requested, this holds every file of each requested year,
            grouped by ascending year.
        years : list
            List of integer years corresponding to the file_paths list
            (one entry per file path)

        Raises
        ------
        RuntimeError
            If any requested year is not found in the file names
        """

        fp_years = [int(parse_year(os.path.basename(fp), option='raise'))
                    for fp in file_paths]

        if years is None:
            return file_paths, fp_years

        years = sorted(int(y) for y in years)
        if any(y not in fp_years for y in years):
            raise RuntimeError('Requested years "{}" not all found in '
                               'file years "{}"'.format(years, fp_years))

        # Keep every file of a requested year, not just the first match, so
        # that years split across several files are not silently truncated.
        # The returned years are rebuilt alongside so that both lists have
        # one entry per file.
        filtered_fps = []
        filtered_years = []
        for target_year in sorted(set(years)):
            for fp_year, fp in zip(fp_years, file_paths):
                if fp_year == target_year:
                    filtered_fps.append(fp)
                    filtered_years.append(fp_year)

        return filtered_fps, filtered_years

    @staticmethod
    def _check_for_years(time_slice):
        """
        Determine whether a temporal slice requests whole years, i.e. is a
        year given as a string or a list/tuple whose first entry is a string

        Parameters
        ----------
        time_slice : list | slice | int | str
            Temporal slice to extract

        Returns
        -------
        check : bool
            True if temporal slice is a year (str) or list of years (strs),
            else False
        """
        # Only the first entry of a list/tuple is inspected
        if isinstance(time_slice, (list, tuple)):
            probe = time_slice[0]
        else:
            probe = time_slice

        return isinstance(probe, str)


[docs]
    def year_index(self, year):
        """
        Subset the multi-year time_index to a single year

        Parameters
        ----------
        year : int
            Year to extract time_index for

        Returns
        -------
        time_index : pandas.DatetimeIndex
            Entries of self.time_index whose year equals the given year
        """
        full_index = self.time_index
        in_year = full_index.year == year

        return full_index[in_year]


    def _get_ds(self, ds_name, ds_slice):
        """
        Extract data from given dataset, dispatching the request to the
        per-year handlers

        Parameters
        ----------
        ds_name : str
            Variable dataset to be extracted
        ds_slice : int | list | slice
            tuple describing slice of dataset array to extract

        Returns
        -------
        out : ndarray
            ndarray of variable timeseries data
            If unscale, returned in native units else in scaled units
        """
        ds_slice = parse_slice(ds_slice)
        time_slice = ds_slice[0]
        other_axes = ds_slice[1:]

        # Case 1: year(s) requested as strings -> all time steps of each
        # requested year, stacked along axis 0
        if self._check_for_years(time_slice):
            if isinstance(time_slice, str):
                requested = [time_slice]
            else:
                requested = time_slice

            whole_year = (slice(None), ) + other_axes
            chunks = [self[int(year)]._get_ds(ds_name, whole_year)
                      for year in requested]

            return np.concatenate(chunks, axis=0)

        # Case 2: single integer position in the multi-year time_index ->
        # translate to the position within that time step's year
        if isinstance(time_slice, (int, np.integer)):
            time_step = self.time_index[time_slice]
            year = time_step.year
            position = np.where(time_step == self.year_index(year))[0][0]

            return self[year]._get_ds(ds_name, (position, ) + other_axes)

        # Case 3: any other selection -> split the selected time steps by
        # year, read each year separately and stack along axis 0
        selected = self.time_index[time_slice]
        selected_years = selected.year
        chunks = []
        for year in selected_years.unique():
            wanted = selected[selected_years == year]
            mask = self.year_index(year).isin(wanted)
            year_pos = self._check_time_slice(np.where(mask)[0])
            chunks.append(self[year]._get_ds(ds_name,
                                             (year_pos, ) + other_axes))

        return np.concatenate(chunks, axis=0)


[docs]
    def close(self):
        """
        Close all resource handlers in the file map

        When a year is split over several files, every row of that year
        holds the same handler instance. Only the distinct handlers are
        visited so each one is closed exactly once.
        """
        for f in self._h5_map['h5'].unique():
            f.close()





[docs]
class MultiYearResource(MultiTimeResource):
    """
    Class to handle multiple years of resource data stored across multiple
    .h5 files. This also works if each year is split into multiple files each
    containing different datasets (e.g. for Sup3rCC and hi-res WTK+NSRDB). Data
    split by time in chunks of less than a year can be opened by the
    MultiTimeResource class, but not by this class.

    Note that files across years must have the same meta data, and files within
    the same year must have the same meta and time_index.

    Examples
    --------

    Extracting the resource's Datetime Index

    >>> path = '$TESTDATADIR/nsrdb/ri_100_nsrdb_*.h5'
    >>> with MultiYearResource(path) as res:
    >>>     ti = res.time_index
    >>>
    >>> ti
    DatetimeIndex(['2012-01-01 00:00:00', '2012-01-01 00:30:00',
                   '2012-01-01 01:00:00', '2012-01-01 01:30:00',
                   '2012-01-01 02:00:00', '2012-01-01 02:30:00',
                   '2012-01-01 03:00:00', '2012-01-01 03:30:00',
                   '2012-01-01 04:00:00', '2012-01-01 04:30:00',
                   ...
                   '2013-12-31 19:00:00', '2013-12-31 19:30:00',
                   '2013-12-31 20:00:00', '2013-12-31 20:30:00',
                   '2013-12-31 21:00:00', '2013-12-31 21:30:00',
                   '2013-12-31 22:00:00', '2013-12-31 22:30:00',
                   '2013-12-31 23:00:00', '2013-12-31 23:30:00'],
                  dtype='datetime64[ns]', length=35088, freq=None)

    NOTE: time_index covers data from 2012 and 2013

    >>> with MultiYearResource(path) as res:
    >>>     print(res.h5_files)

    ['/Users/mrossol/Git_Repos/rex/tests/data/nsrdb/ri_100_nsrdb_2012.h5',
     '/Users/mrossol/Git_Repos/rex/tests/data/nsrdb/ri_100_nsrdb_2013.h5']

    Data slicing works the same as with "Resource" except axis 0 now covers
    2012 and 2013

    >>> with MultiYearResource(path) as res:
    >>>     temperature = res['air_temperature']
    >>>
    >>> temperature
    [[ 4.  5.  5. ...  4.  3.  4.]
     [ 4.  4.  5. ...  4.  3.  4.]
     [ 4.  4.  5. ...  4.  3.  4.]
     ...
     [-1. -1.  0. ... -2. -3. -2.]
     [-1. -1.  0. ... -2. -3. -2.]
     [-1. -1.  0. ... -2. -3. -2.]]
    >>> temperature.shape
    (35088, 100)

    >>> with MultiYearResource(path) as res:
    >>>     temperature = res['air_temperature', ::100] # every 100th timestep
    >>>
    >>> temperature
    [[ 4.  5.  5. ...  4.  3.  4.]
     [ 1.  1.  2. ...  0.  0.  1.]
     [-2. -1. -1. ... -2. -4. -2.]
     ...
     [-3. -2. -2. ... -3. -4. -3.]
     [ 0.  0.  1. ...  0. -1.  0.]
     [ 3.  3.  3. ...  2.  2.  3.]]
    >>> temperature.shape
    (351, 100)

    You can also request a specific year of data using a string representation
    of the year of interest
    NOTE: you can also request a list of years using strings

    >>> with MultiYearResource(path) as res:
    >>>     temperature = res['air_temperature', '2012'] # all of 2012
    >>>
    >>> temperature
    [[4. 5. 5. ... 4. 3. 4.]
     [4. 4. 5. ... 4. 3. 4.]
     [4. 4. 5. ... 4. 3. 4.]
     ...
     [1. 1. 2. ... 0. 0. 0.]
     [1. 1. 2. ... 0. 0. 1.]
     [1. 1. 2. ... 0. 0. 1.]]
    >>> temperature.shape
    (17520, 100)
    """

    def __init__(self, h5_path, years=None, unscale=True, str_decode=True,
                 res_cls=Resource, hsds=False, hsds_kwargs=None):
        """
        Parameters
        ----------
        h5_path : str
            Unix shell style pattern path with * wildcards to multi-file
            resource file sets. Files must have the same coordinates
            but can have different datasets or time indexes.
        years : list, optional
            List of years to access, by default None
        unscale : bool
            Boolean flag to automatically unscale variables on extraction
        str_decode : bool
            Boolean flag to decode the bytestring meta data into normal
            strings. Setting this to False will speed up the meta data read.
        res_cls : obj
            Resource handler to use to open individual .h5 files
        hsds : bool, optional
            Boolean flag to use h5pyd to handle .h5 'files' hosted on AWS
            behind HSDS, by default False
        hsds_kwargs : dict, optional
            Dictionary of optional kwargs for h5pyd, e.g., bucket, username,
            password, by default None
        """
        self.h5_path = h5_path
        self._time_index = None

        # One MultiYearH5 opens every matching file and routes requests to
        # the handler of the relevant year
        self._h5 = MultiYearH5(self.h5_path,
                               years=years,
                               res_cls=res_cls,
                               unscale=unscale,
                               str_decode=str_decode,
                               hsds=hsds,
                               hsds_kwargs=hsds_kwargs)

        self.h5_files = self._h5.h5_files
        self.h5_file = self.h5_files[0]
        self._i = 0

    @property
    def years(self):
        """
        Available years, as reported by the underlying multi-year handler

        Returns
        -------
        list
            List of years present in the .h5 files
        """
        handler = self.h5

        return handler.years




[docs]
class MultiYearSolarResource(MultiYearResource):
    """
    Class to handle multiple years of solar resource data stored across
    multiple .h5 files

    Thin specialization of MultiYearResource that opens the individual files
    with SolarResource.
    """

    def __init__(self, h5_path, years=None, unscale=True, str_decode=True,
                 hsds=False, hsds_kwargs=None):
        """
        Parameters
        ----------
        h5_path : str
            Unix shell style pattern path with * wildcards to multi-file
            resource file sets. Files must have the same coordinates
            but can have different datasets or time indexes.
        years : list, optional
            List of years to access, by default None
        unscale : bool
            Boolean flag to automatically unscale variables on extraction
        str_decode : bool
            Boolean flag to decode the bytestring meta data into normal
            strings. Setting this to False will speed up the meta data read.
        hsds : bool, optional
            Boolean flag to use h5pyd to handle .h5 'files' hosted on AWS
            behind HSDS, by default False
        hsds_kwargs : dict, optional
            Dictionary of optional kwargs for h5pyd, e.g., bucket, username,
            password, by default None
        """
        # Must derive from MultiYearResource: without the base class this
        # call resolves to object.__init__, which rejects the arguments.
        super().__init__(h5_path, years=years, unscale=unscale, hsds=hsds,
                         hsds_kwargs=hsds_kwargs, str_decode=str_decode,
                         res_cls=SolarResource)




[docs]
class MultiYearNSRDB(MultiYearResource):
    """
    Multi-year handler for NSRDB data stored across multiple .h5 files.
    Individual files are opened with the NSRDB resource class.
    """

    def __init__(self, h5_path, years=None, unscale=True, str_decode=True,
                 hsds=False, hsds_kwargs=None):
        """
        Parameters
        ----------
        h5_path : str
            Unix shell style pattern path with * wildcards to multi-file
            resource file sets. Files must have the same coordinates
            but can have different datasets or time indexes.
        years : list, optional
            List of years to access, by default None
        unscale : bool
            Boolean flag to automatically unscale variables on extraction
        str_decode : bool
            Boolean flag to decode the bytestring meta data into normal
            strings. Setting this to False will speed up the meta data read.
        hsds : bool, optional
            Boolean flag to use h5pyd to handle .h5 'files' hosted on AWS
            behind HSDS, by default False
        hsds_kwargs : dict, optional
            Dictionary of optional kwargs for h5pyd, e.g., bucket, username,
            password, by default None
        """
        super().__init__(h5_path,
                         years=years,
                         unscale=unscale,
                         str_decode=str_decode,
                         res_cls=NSRDB,
                         hsds=hsds,
                         hsds_kwargs=hsds_kwargs)




[docs]
class MultiYearWindResource(MultiYearResource):
    """
    Multi-year handler for wind resource data stored across multiple .h5
    files. Individual files are opened with the WindResource class.
    """

    def __init__(self, h5_path, years=None, unscale=True, str_decode=True,
                 hsds=False, hsds_kwargs=None):
        """
        Parameters
        ----------
        h5_path : str
            Unix shell style pattern path with * wildcards to multi-file
            resource file sets. Files must have the same coordinates
            but can have different datasets or time indexes.
        years : list, optional
            List of years to access, by default None
        unscale : bool
            Boolean flag to automatically unscale variables on extraction
        str_decode : bool
            Boolean flag to decode the bytestring meta data into normal
            strings. Setting this to False will speed up the meta data read.
        hsds : bool, optional
            Boolean flag to use h5pyd to handle .h5 'files' hosted on AWS
            behind HSDS, by default False
        hsds_kwargs : dict, optional
            Dictionary of optional kwargs for h5pyd, e.g., bucket, username,
            password, by default None
        """
        super().__init__(h5_path,
                         years=years,
                         unscale=unscale,
                         str_decode=str_decode,
                         res_cls=WindResource,
                         hsds=hsds,
                         hsds_kwargs=hsds_kwargs)




[docs]
class MultiYearWaveResource(MultiYearResource):
    """
    Multi-year handler for wave resource data stored across multiple .h5
    files. Individual files are opened with the WaveResource class.
    """

    def __init__(self, h5_path, years=None, unscale=True, str_decode=True,
                 hsds=False, hsds_kwargs=None):
        """
        Parameters
        ----------
        h5_path : str
            Unix shell style pattern path with * wildcards to multi-file
            resource file sets. Files must have the same coordinates
            but can have different datasets or time indexes.
        years : list, optional
            List of years to access, by default None
        unscale : bool
            Boolean flag to automatically unscale variables on extraction
        str_decode : bool
            Boolean flag to decode the bytestring meta data into normal
            strings. Setting this to False will speed up the meta data read.
        hsds : bool, optional
            Boolean flag to use h5pyd to handle .h5 'files' hosted on AWS
            behind HSDS, by default False
        hsds_kwargs : dict, optional
            Dictionary of optional kwargs for h5pyd, e.g., bucket, username,
            password, by default None
        """
        super().__init__(h5_path,
                         years=years,
                         unscale=unscale,
                         str_decode=str_decode,
                         res_cls=WaveResource,
                         hsds=hsds,
                         hsds_kwargs=hsds_kwargs)