# Source code for marmot.plottingmodules.plotutils.timeseries_modifiers

"""Collection of functions which modify or infer information from timeseries.

@author: Daniel Levie
"""

import datetime as dt
import logging
from typing import List, Tuple, Union

import numpy as np
import pandas as pd

# Module-level logger. The dotted "plotter." prefix places it beneath the
# "plotter" logger in the logging hierarchy, so it inherits that logger's
# handlers and level.
logger = logging.getLogger("plotter." + __name__)


def _slice_timestamp_range(
    df: pd.DataFrame, start_date: str, end_date: str
) -> pd.DataFrame:
    """Return the rows of one dataframe that fall between start_date and end_date."""
    index = df.index
    if isinstance(index, pd.DatetimeIndex):
        # Label-based slicing on a DatetimeIndex includes both end points.
        return df.loc[start_date:end_date]
    if isinstance(index, pd.MultiIndex) and "timestamp" in index.names:
        # drop_level=False keeps the timestamp level in the result.
        return df.xs(slice(start_date, end_date), level="timestamp", drop_level=False)
    raise ValueError(
        "'df.index' must be of type pd.DatetimeIndex or "
        "type pd.MultiIndex with level 'timestamp'"
    )


def set_timestamp_date_range(
    dfs: Union[pd.DataFrame, List[pd.DataFrame], Tuple[pd.DataFrame, ...]],
    start_date: str,
    end_date: str,
) -> Union[pd.DataFrame, Tuple[pd.DataFrame, ...]]:
    """Sets the timestamp date range based on start_date and end_date strings

    Takes either a single df or a list (or tuple) of dfs as input.
    The index must be a pd.DatetimeIndex or a multiindex with level timestamp.
    The input dataframes are not modified; sliced selections are returned.

    Args:
        dfs (Union[pd.DataFrame, List[pd.DataFrame]]): df(s) to set date range
            for. A tuple of dataframes is handled the same way as a list.
        start_date (str): start date
        end_date (str): end date

    Raises:
        ValueError: If df.index is not of type pd.DatetimeIndex or
                        type pd.MultiIndex with level timestamp.

    Returns:
        pd.DataFrame or Tuple[pd.DataFrame]: adjusted dataframes. A single
            dataframe is returned for a single dataframe input, a tuple
            (in input order) for a list or tuple input.
    """
    logger.info("Plotting specific date range: %s to %s", start_date, end_date)

    if isinstance(dfs, (list, tuple)):
        return tuple(_slice_timestamp_range(df, start_date, end_date) for df in dfs)
    return _slice_timestamp_range(dfs, start_date, end_date)


def get_sub_hour_interval_count(df: pd.DataFrame) -> int:
    """Detects the interval spacing of timeseries data.

    Used to adjust sums of certain variables for sub-hourly data.
    The spacing is inferred from the first two unique timestamps only.

    Args:
        df (pd.DataFrame): pandas dataframe with timestamp in index.

    Returns:
        int: Number of intervals per 60 minutes, rounded to the nearest
            whole number and never less than 1. Returns 1 when the spacing
            is one hour or longer, or when fewer than two unique timestamps
            are present and no spacing can be inferred.
    """
    timestamps = df.index.get_level_values("timestamp").unique()
    if len(timestamps) < 2:
        # A spacing cannot be measured from fewer than two timestamps.
        return 1

    time_delta = timestamps[1] - timestamps[0]
    minutes_per_interval = time_delta / np.timedelta64(1, "m")
    # Finds intervals in 60 minute period
    intervals_per_hour = 60 / minutes_per_interval
    # int(round(...)) honours the declared int return type; without it the
    # division above would leak a float such as 12.0 to callers.
    # If intervals are greater than 1 hour, returns 1
    return max(1, int(round(intervals_per_hour)))


def adjust_for_leapday(self, df: pd.DataFrame) -> pd.DataFrame:
    """Shifts dataframe ahead by one day.

    Use if a non-leap year time series is modeled with a leap year time index.

    Modeled year must be included in the scenario parent directory name.
    The shift is applied only when ``self.processed_hdf5_folder`` contains
    neither "2008" nor "2012" and the first timestamp in the dataframe is
    later than 2024-02-28 00:00. In every other case the dataframe is
    returned unchanged.

    Args:
        df (pd.DataFrame): Dataframe to process. Its index must be, or
            contain a level named, "timestamp". The index is replaced
            in place when a shift is applied.

    Returns:
        pd.DataFrame: Same dataframe, with time index shifted.
    """
    folder = self.processed_hdf5_folder
    # Folders naming 2008 or 2012 are never shifted.
    # NOTE(review): presumably because those model years are themselves
    # leap years - confirm.
    if "2008" in folder or "2012" in folder:
        return df

    timestamps = df.index.get_level_values("timestamp")
    # An empty frame has no first timestamp to compare; nothing to shift.
    if len(timestamps) == 0:
        return df
    if not timestamps[0] > dt.datetime(2024, 2, 28, 0, 0):
        return df

    if isinstance(df.index, pd.MultiIndex):
        level_position = df.index.names.index("timestamp")
        shifted_level = df.index.levels[level_position].shift(1, freq="D")
        df.index = df.index.set_levels(shifted_level, level="timestamp")
    else:
        # A flat index has no set_levels(); shift the index itself.
        df.index = df.index.shift(1, freq="D")
    return df


def sort_duration(df: pd.DataFrame, col: str) -> pd.DataFrame:
    """Converts a dataframe time series into a duration curve.

    Rows are ordered by ``col`` in descending order. The index is then
    reset to a fresh integer index, which turns every index level into a
    regular column, and the resulting "timestamp" column is dropped.

    Args:
        df (pd.DataFrame): pandas multiindex dataframe. The index must
            contain a level named "timestamp".
        col (str): Column name by which to sort.

    Raises:
        KeyError: If no "timestamp" column exists after the index is reset.

    Returns:
        pd.DataFrame: Dataframe with values sorted from largest to smallest.
    """
    ranked = df.sort_values(by=col, ascending=False)
    # reset_index() moves all index levels into columns; only the timestamp
    # is discarded, so any other levels remain available as columns.
    return ranked.reset_index().drop(columns=["timestamp"])