# Source code for flasc.analysis.total_uplift_power_ratio

"""Module for computing the total uplift in energy production."""

# This is a work in progress as we try to synthesize ideas from the
# table based methods and energy ratios back into one thing,
# some ideas we're incorporating:

# Conversion from polars to pandas
# Constructing tables (but now including tables of ratios)
# Keeping track of frequencies in matching sized tables

import warnings

import numpy as np
import polars as pl

import flasc.utilities.energy_ratio_utilities as util
from flasc.analysis.analysis_input import AnalysisInput
from flasc.data_processing.dataframe_manipulations import df_reduce_precision
from flasc.logging_manager import LoggingManager

# Module-level logger used for the warnings emitted by this module.
logger_manager = LoggingManager()  # Instantiate LoggingManager
logger = logger_manager.logger  # Obtain the reusable logger


# Internal version: returns the uplift results dict and a polars dataframe of bin weights
def _total_uplift_power_ratio_single(
    df_,
    df_names,
    ref_cols,
    test_cols,
    wd_cols,
    ws_cols,
    wd_step=2.0,
    wd_min=0.0,
    wd_max=360.0,
    ws_step=1.0,
    ws_min=0.0,
    ws_max=50.0,
    bin_cols_in=["wd_bin", "ws_bin"],
    weight_by="min",  # min, sum
    df_freq_pl=None,
    wd_bin_overlap_radius=0.0,
    uplift_pairs=[],
    uplift_names=[],
    remove_all_nulls=False,
):
    """Compute the total change in energy production between two sets of turbines.

    The list-valued defaults in the signature are only read, never mutated.

    Args:
        df_ (pl.DataFrame): A dataframe containing the data to use in the calculation.
        df_names (list): A list of names to give to the dataframes.
        ref_cols (list[str]): A list of columns to use as the reference turbines
        test_cols (list[str]): A list of columns to use as the test turbines
        wd_cols (list[str]): A list of columns to derive the wind directions from
        ws_cols (list[str]): A list of columns to derive the wind speeds from
        wd_step (float): The width of the wind direction bins.
        wd_min (float): The minimum wind direction to use.
        wd_max (float): The maximum wind direction to use.
        ws_step (float): The width of the wind speed bins.
        ws_min (float): The minimum wind speed to use.
        ws_max (float): The maximum wind speed to use.
        bin_cols_in (list[str]): A list of column names
            to use for the wind speed and wind direction bins.
        weight_by (str): How to weight the energy ratio, options are 'min', or 'sum'.  'min' means
            the minimum count across the dataframes
            is used to weight the energy ratio.   'sum' means the sum of the counts
            across the dataframes is used to weight the energy ratio.   Defaults to 'min'.
        df_freq_pl (pl.Dataframe): Polars dataframe of pre-provided per bin weights
        wd_bin_overlap_radius (float): The distance
            in degrees one wd bin overlaps into the next, must be
            less or equal to half the value of wd_step
        uplift_pairs: (list[tuple]): List of pairs of df_names to compute uplifts for. Each element
            of the list should be a tuple (or list) of length 2, where the first element will be the
            base case in the uplift calculation and the second element will be the test case in the
            uplift calculation. If None (or empty), no uplifts are computed.
        uplift_names: (list[str]): Names for the uplift results, following the order of the
            pairs specified in uplift_pairs. If None, each name defaults to
            "uplift_<test df_name>/<base df_name>".
        remove_all_nulls: (bool): Construct reference and test by strictly requiring all data to be
            available. If False, a minimum one data point from
            ref_cols, test_cols, wd_cols, and ws_cols
            must be available to compute the bin. Defaults to False.

    Returns:
        A tuple (dict, pl.DataFrame): containing the results of the computation
            and the frequency table.  The dictionary contains the uplift results indexed by the
            uplift_names; each entry holds the keys "energy_uplift_ctr" and
            "energy_uplift_ctr_pc" plus lower/upper bound keys that are always None here.
            The dataframe contains the weights for each wind direction and wind speed bin.

    Raises:
        RuntimeError: If no rows remain after the null filter is applied.
    """
    # Honor the documented "None" convention instead of failing inside zip().
    if uplift_pairs is None:
        uplift_pairs = []
    if uplift_names is None:
        uplift_names = [f"uplift_{pair[1]}/{pair[0]}" for pair in uplift_pairs]

    # Get the number of dataframes
    num_df = len(df_names)

    bin_cols_without_df_name = [c for c in bin_cols_in if c != "df_name"]

    # Filter df_ to remove null values
    null_filter = util.filter_all_nulls if remove_all_nulls else util.filter_any_nulls
    df_ = null_filter(df_, ref_cols, test_cols, ws_cols, wd_cols)
    if len(df_) == 0:
        raise RuntimeError("After removing nulls, no data remains for computation.")

    # Apply binning to dataframe and aggregate bins
    df_ = util.bin_and_group_dataframe(
        df_,
        ref_cols,
        test_cols,
        wd_cols,
        ws_cols,
        wd_step,
        wd_min,
        wd_max,
        ws_step,
        ws_min,
        ws_max,
        wd_bin_overlap_radius,
        remove_all_nulls,
        bin_cols_without_df_name,
        num_df,
    )

    # Determine the weighting of the ws/wd bins
    df_, df_freq_pl = util.add_bin_weights(df_, df_freq_pl, bin_cols_without_df_name, weight_by)

    # If total uplift requested, compute at this point
    total_uplift_result = {}
    for uplift_pair, uplift_name in zip(uplift_pairs, uplift_names):
        df_total = (
            df_.filter(pl.col("df_name").is_in(uplift_pair))
            .with_columns(
                power_ratio=pl.col("pow_test") / pl.col("pow_ref"),
                weighted_pow_ref=pl.col("pow_ref") * pl.col("count"),
            )
            # Count-weighted mean of pow_ref over the rows that share a bin
            .with_columns(total_count_per_bin=pl.col("count").sum().over(bin_cols_without_df_name))
            .with_columns(
                weighted_pow_ref=pl.col("weighted_pow_ref") / pl.col("total_count_per_bin")
            )
            .with_columns(
                weighted_pow_ref=pl.col("weighted_pow_ref").sum().over(bin_cols_without_df_name)
            )
            # One row per bin, one power-ratio column per df_name
            .pivot(
                values=["power_ratio"],
                on="df_name",
                index=bin_cols_without_df_name + ["weight", "weighted_pow_ref"],
                aggregate_function="first",
            )
            # Renorm the weight
            .with_columns(pl.col("weight") / pl.col("weight").sum())
            .with_columns(delta_power_ratio=pl.col(uplift_pair[1]) - pl.col(uplift_pair[0]))
            .with_columns(
                delta_aep=pl.col("weight")
                * pl.col("delta_power_ratio")
                * pl.col("weighted_pow_ref"),
                base_aep=pl.col("weight") * pl.col(uplift_pair[0]) * pl.col("weighted_pow_ref"),
            )
            .sum()
        )

        delta_energy = df_total.select("delta_aep").item()
        base_energy = df_total.select("base_aep").item()

        # 8760 scales the weighted mean power change; presumably hours per year -- confirm units
        delta_aep = 8760 * delta_energy

        if base_energy == 0:
            # Plain float division would raise ZeroDivisionError here; report an
            # undefined percentage instead, in line with the NaN handling below.
            warnings.warn("Base case energy is 0; percent uplift is undefined (NaN).")
            percent_delta_aep = float("nan")
        else:
            percent_delta_aep = 100 * (delta_energy / base_energy)

        if np.isnan(delta_aep):
            if ws_min < 5.0:
                warnings.warn(
                    "NaNs detected in power ratios. This can result from "
                    "the reference power being 0, which can occur when wind speed is "
                    "very low. Try setting ws_min keyword argument to remove 0 power "
                    "wind speeds."
                )
            else:
                warnings.warn("NaNs detected in power ratios.")

        total_uplift_result[uplift_name] = {
            "energy_uplift_ctr": delta_aep,
            "energy_uplift_lb": None,
            "energy_uplift_ub": None,
            "energy_uplift_ctr_pc": percent_delta_aep,
            "energy_uplift_lb_pc": None,
            "energy_uplift_ub_pc": None,
        }

    return total_uplift_result, df_freq_pl



# Bootstrap function wraps the _total_uplift_power_ratio_single function
def _total_uplift_power_ratio_bootstrap(
    a_in,
    ref_cols,
    test_cols,
    wd_cols,
    ws_cols,
    wd_step=2.0,
    wd_min=0.0,
    wd_max=360.0,
    ws_step=1.0,
    ws_min=0.0,
    ws_max=50.0,
    bin_cols_in=["wd_bin", "ws_bin"],
    weight_by="min",  # min, sum
    df_freq_pl=None,
    wd_bin_overlap_radius=0.0,
    uplift_pairs=[],
    uplift_names=[],
    N=1,
    percentiles=[5.0, 95.0],
    remove_all_nulls=False,
):
    """Compute the total change in energy between two sets of turbines with bootstrapping.

    The single-sample routine is run N times. The first run uses the
    un-resampled energy table and provides the central estimate; the remaining
    runs use resampled tables. The lower and upper bounds are quantiles taken
    over all N results.

    Args:
        a_in (AnalysisInput): An AnalysisInput object
            containing the data to use in the calculation.
        ref_cols (list[str]): A list of columns to use as the reference turbines
        test_cols (list[str]): A list of columns to use as the test turbines
        wd_cols (list[str]): A list of columns to derive the wind directions from
        ws_cols (list[str]): A list of columns to derive the wind speeds from
        wd_step (float): The width of the wind direction bins.
        wd_min (float): The minimum wind direction to use.
        wd_max (float): The maximum wind direction to use.
        ws_step (float): The width of the wind speed bins.
        ws_min (float): The minimum wind speed to use.
        ws_max (float): The maximum wind speed to use.
        bin_cols_in (list[str]): A list of column names to use for
            the wind speed and wind direction bins.
        weight_by (str): How to weight the energy ratio, options are 'min', or 'sum'.  'min' means
            the minimum count across the dataframes
            is used to weight the energy ratio. 'sum' means the sum of the counts
            across the dataframes is used to weight the energy ratio.
        df_freq_pl (pl.Dataframe): Polars dataframe of pre-provided per bin weights
        wd_bin_overlap_radius (float): The distance
            in degrees one wd bin overlaps into the next, must be
            less or equal to half the value of wd_step
        uplift_pairs: (list[tuple]): List of pairs of df_names to compute uplifts for. Each element
            of the list should be a tuple (or list) of length 2, where the first element will be the
            base case in the uplift calculation and the second element will be the test case in the
            uplift calculation. If None (or empty), no uplifts are computed.
        uplift_names: (list[str]): Names for the uplift results, following the order of the
            pairs specified in uplift_pairs. If None, each name defaults to
            "uplift_<test df_name>/<base df_name>".
        N (int): The number of bootstrap samples to use. Must be at least 1.
        percentiles: (list or None): Two-element list of the lower and upper percentiles to
            use for the uplift bounds. If None, defaults to [5, 95].
        remove_all_nulls: (bool): Construct reference and test by strictly requiring all data to be
                available. If False, a minimum one data point
                from ref_cols, test_cols, wd_cols, and ws_cols
                must be available to compute the bin. Defaults to False.

    Returns:
        A tuple (dict, pl.DataFrame): The dictionary is indexed by the uplift_names
            and holds the central, lower-bound and upper-bound uplift, both absolute
            ("energy_uplift_*") and in percent ("energy_uplift_*_pc"). The dataframe is
            the frequency table returned by the first (un-resampled) run.

    Raises:
        ValueError: If N is less than 1.
    """
    if N < 1:
        raise ValueError("N must be a positive integer.")

    # Apply the documented defaults instead of failing on None.
    if percentiles is None:
        percentiles = [5.0, 95.0]
    if uplift_pairs is None:
        uplift_pairs = []
    if uplift_names is None:
        uplift_names = [f"uplift_{pair[1]}/{pair[0]}" for pair in uplift_pairs]

    # Run the single-sample function N times; i == 0 keeps the original table
    uplift_single_outs = [
        _total_uplift_power_ratio_single(
            a_in.resample_energy_table(perform_resample=(i != 0)),
            a_in.df_names,
            ref_cols,
            test_cols,
            wd_cols,
            ws_cols,
            wd_step,
            wd_min,
            wd_max,
            ws_step,
            ws_min,
            ws_max,
            bin_cols_in,
            weight_by,
            df_freq_pl,
            wd_bin_overlap_radius,
            uplift_pairs,
            uplift_names,
            remove_all_nulls,
        )
        for i in range(N)
    ]

    # First output contains the original table; use that df_freq_pl
    df_freq_pl = uplift_single_outs[0][1]

    lower_q = percentiles[0] / 100
    upper_q = percentiles[1] / 100

    # Add in the statistics
    total_uplift_result = {}

    for uplift_name in uplift_names:
        delta_aeps = np.array(
            [out[0][uplift_name]["energy_uplift_ctr"] for out in uplift_single_outs],
            dtype=float,
        )
        percent_delta_aeps = np.array(
            [out[0][uplift_name]["energy_uplift_ctr_pc"] for out in uplift_single_outs],
            dtype=float,
        )

        # Index 0 is the un-resampled run, i.e. the central estimate
        total_uplift_result[uplift_name] = {
            "energy_uplift_ctr": delta_aeps[0],
            "energy_uplift_lb": np.quantile(delta_aeps, lower_q),
            "energy_uplift_ub": np.quantile(delta_aeps, upper_q),
            "energy_uplift_ctr_pc": percent_delta_aeps[0],
            "energy_uplift_lb_pc": np.quantile(percent_delta_aeps, lower_q),
            "energy_uplift_ub_pc": np.quantile(percent_delta_aeps, upper_q),
        }

    return total_uplift_result, df_freq_pl




# Public entry point: validates inputs, then runs the single or bootstrap routine
def total_uplift_power_ratio(
    a_in: AnalysisInput,
    ref_turbines=None,
    test_turbines=None,
    wd_turbines=None,
    ws_turbines=None,
    use_predefined_ref=False,
    use_predefined_wd=False,
    use_predefined_ws=False,
    wd_step=2.0,
    wd_min=0.0,
    wd_max=360.0,
    ws_step=1.0,
    ws_min=0.0,
    ws_max=50.0,
    bin_cols_in=["wd_bin", "ws_bin"],
    weight_by="min",  # min or sum
    df_freq=None,
    wd_bin_overlap_radius=0.0,
    uplift_pairs=None,
    uplift_names=None,
    N=1,
    percentiles=None,
    remove_all_nulls=False,
) -> dict:  # dict output for now, may change later
    """Compute the total uplift in energy production, optionally with bootstrapping.

    Args:
        a_in (AnalysisInput): An AnalysisInput object
            containing the data to use in the calculation.
        ref_turbines (list[int]): A list of turbine numbers to use as the reference.
        test_turbines (list[int]): A list of turbine numbers to use as the test.
        ws_turbines (list[int]): A list of turbine numbers to use for the wind speeds
        wd_turbines (list[int]): A list of turbine numbers to use for the wind directions
        use_predefined_ref (bool): If True, use the pow_ref column of df_ as the reference power.
        use_predefined_ws (bool): If True, use the ws column of df_ as the wind speed.
        use_predefined_wd (bool): If True, use the wd column of df_ as the wind direction.
        wd_step (float): The width of the wind direction bins.
        wd_min (float): The minimum wind direction to use.
        wd_max (float): The maximum wind direction to use.
        ws_step (float): The width of the wind speed bins.
        ws_min (float): The minimum wind speed to use.
        ws_max (float): The maximum wind speed to use.
        bin_cols_in (list[str]): A list of column names to
            use for the wind speed and wind direction bins.
        weight_by (str): How to weight the energy ratio, options are 'min' or 'sum'.  'min' means
            the minimum count across the dataframes is used to weight the energy ratio.
            'sum' means the sum of the counts
            across the dataframes is used to weight the energy ratio.
        df_freq (pd.Dataframe): A dataframe which specifies the
            frequency of the ws/wd bin combinations.  Provides
            a method to use an explicit or long-term weighting of bins.  Dataframe should include
            columns ws, wd and freq_val.  ws and wd should
            correspond to the bin centers resulting from
            the choices of the ws/wd_min / _max / _step.
            In the case that df_freq has extra bins that aren't included
            in those given by ws/wd min, max, step, they will
            be ignored in the energy ratio calculation.
            Any bins given by ws/wd min, max, step not present in
            df_freq will be assigned a frequency of zero.
            Defaults to None.
        wd_bin_overlap_radius (float): The distance in degrees
            one wd bin overlaps into the next, must be
            less or equal to half the value of wd_step
        uplift_pairs: (list[tuple]): List of pairs of df_names to compute uplifts for. Each element
            of the list should be a tuple (or list) of length 2, where the first element will be the
            base case in the uplift calculation and the second element will be the test case in the
            uplift calculation. A single pair of two strings is also accepted.
            If None, no uplifts are computed.
        uplift_names: (list[str]): Names for the uplift results, following the order of the
            pairs specified in uplift_pairs. If None, each name defaults to
            "uplift_<test df_name>/<base df_name>".
        N (int): The number of bootstrap samples to use.
        percentiles: (list or None): percentiles to use when returning energy ratio bounds.
            If specified as None with N > 1 (bootstrapping), defaults to [5, 95].
        remove_all_nulls: (bool): Construct reference and test by strictly requiring all data to be
                available. If False, a minimum one data point from ref_cols, test_cols, wd_cols,
                and ws_cols
                must be available to compute the bin. Defaults to False.

    Returns:
        dict: The uplift results indexed by the uplift names. Each entry is a
        dictionary with the keys "energy_uplift_ctr", "energy_uplift_lb",
        "energy_uplift_ub" and their percent counterparts suffixed with "_pc".

    Raises:
        ValueError: If an element of uplift_pairs does not have length 2, if
            uplift_names and uplift_pairs differ in length, or if percentiles is
            not a two-element sequence when bootstrapping.
    """
    # Get the polars dataframe from within the a_in
    df_ = a_in.get_df()

    # Check that inputs are valid
    util.check_compute_analysis_inputs(
        df_,
        ref_turbines,
        test_turbines,
        wd_turbines,
        ws_turbines,
        use_predefined_ref,
        use_predefined_wd,
        use_predefined_ws,
        wd_step,
        wd_min,
        wd_max,
        ws_step,
        ws_min,
        ws_max,
        bin_cols_in,
        weight_by,
        df_freq,
        wd_bin_overlap_radius,
        uplift_pairs,
        uplift_names,
        False,
        N,
        percentiles,
        remove_all_nulls,
    )

    # Set up the column names for the reference and test power
    if not use_predefined_ref:
        ref_cols = [f"pow_{i:03d}" for i in ref_turbines]
    else:
        ref_cols = ["pow_ref"]

    if not use_predefined_ws:
        ws_cols = [f"ws_{i:03d}" for i in ws_turbines]
    else:
        ws_cols = ["ws"]

    if not use_predefined_wd:
        wd_cols = [f"wd_{i:03d}" for i in wd_turbines]
    else:
        wd_cols = ["wd"]

    # Confirm uplift pairs provided correctly
    if uplift_pairs is None:
        uplift_pairs = []
    elif len(uplift_pairs) == 2 and isinstance(uplift_pairs[0], str):
        # Single pair provided, not in list of lists.  The length is checked
        # first so that an empty list does not fail on uplift_pairs[0].
        uplift_pairs = [uplift_pairs]
    else:
        for up in uplift_pairs:
            if len(up) != 2:
                raise ValueError("uplift_pairs should be a list of tuples of length 2.")
    if uplift_names is not None:
        if len(uplift_names) != len(uplift_pairs):
            raise ValueError("Length of uplift_names should match length of uplift_pairs")
    else:
        uplift_names = ["uplift_" + up[1] + "/" + up[0] for up in uplift_pairs]

    # Convert the numbered arrays to appropriate column names
    test_cols = [f"pow_{i:03d}" for i in test_turbines]

    # If df_freq is provided, prepare a polars table of weights
    if df_freq is not None:
        # NOTE: df_freq is not checked here against the bin centers implied by
        # the ws/wd min, max and step arguments.

        # Convert to polars dataframe
        df_freq_pl = pl.from_pandas(df_reduce_precision(df_freq, allow_convert_to_integer=False))

        # Rename the columns
        df_freq_pl = df_freq_pl.rename({"ws": "ws_bin", "wd": "wd_bin", "freq_val": "weight"})

    else:
        df_freq_pl = None

    # If N=1, don't use bootstrapping
    if N == 1:
        if percentiles is not None:
            # Logger.warn is a deprecated alias that newer Python versions removed
            logger.warning("percentiles can only be used with bootstrapping (N > 1).")
        # Compute the total uplift
        total_uplift_result, df_freq_pl = _total_uplift_power_ratio_single(
            df_,
            a_in.df_names,
            ref_cols,
            test_cols,
            wd_cols,
            ws_cols,
            wd_step,
            wd_min,
            wd_max,
            ws_step,
            ws_min,
            ws_max,
            bin_cols_in,
            weight_by,
            df_freq_pl,
            wd_bin_overlap_radius,
            uplift_pairs,
            uplift_names,
            remove_all_nulls,
        )
    else:
        if percentiles is None:
            percentiles = [5, 95]
        elif not hasattr(percentiles, "__len__") or len(percentiles) != 2:
            raise ValueError(
                "percentiles should be a two element list of the "
                "upper and lower desired percentiles."
            )

        total_uplift_result, df_freq_pl = _total_uplift_power_ratio_bootstrap(
            a_in,
            ref_cols,
            test_cols,
            wd_cols,
            ws_cols,
            wd_step,
            wd_min,
            wd_max,
            ws_step,
            ws_min,
            ws_max,
            bin_cols_in,
            weight_by,
            df_freq_pl,
            wd_bin_overlap_radius,
            uplift_pairs,
            uplift_names,
            N,
            percentiles,
            # Forward the null handling choice so it also applies when bootstrapping
            remove_all_nulls,
        )

    # Do we want some kind of more complex return object? Or are we OK
    # returning just the total_uplift_result dictionary?
    return total_uplift_result



# For backwards compatibility, include a function compute_total_uplift that
# simply wraps the total_uplift_power_ratio function and adds a deprecation
# warning
def compute_total_uplift(*args, **kwargs):
    """Deprecated function for computing the total uplift in energy production.

    Emits a DeprecationWarning and forwards all positional and keyword
    arguments unchanged to total_uplift_power_ratio, returning its result.
    """
    warnings.warn(
        "compute_total_uplift is deprecated, please use total_uplift_power_ratio instead.",
        DeprecationWarning,
        # Attribute the warning to the caller's line rather than to this wrapper
        stacklevel=2,
    )
    return total_uplift_power_ratio(*args, **kwargs)