# -*- coding: utf-8 -*-
"""
Compute and plot summary data
"""
import logging
import os
import numpy as np
import pandas as pd
import plotly.express as px
import plotting as mplt
from rex import Resource
from rex.utilities import SpawnProcessPool, parse_table
from reV.utilities import SupplyCurveField, ResourceMetaField
logger = logging.getLogger(__name__)
class SummarizeH5:
"""
reV Summary data for QA/QC
"""
def __init__(self, h5_file, group=None):
"""
Parameters
----------
h5_file : str
Path to .h5 file to summarize data from
group : str, optional
Group within h5_file to summarize datasets for, by default None
"""
logger.info("QAQC Summarize initializing on: {}".format(h5_file))
self._h5_file = h5_file
self._group = group
def __repr__(self):
msg = "{} for {}".format(self.__class__.__name__, self.h5_file)
return msg
@property
def h5_file(self):
"""
.h5 file path
Returns
-------
str
"""
return self._h5_file
@staticmethod
def _compute_sites_summary(h5_file, ds_name, sites=None, group=None):
"""
Compute summary stats for given sites of given dataset
Parameters
----------
h5_file : str
Path to .h5 file to summarize data from
ds_name : str
Dataset name of interest
sites : list | slice, optional
sites of interest, by default None
group : str, optional
Group within h5_file to summarize datasets for, by default None
Returns
-------
sites_summary : pandas.DataFrame
Summary stats for given sites / dataset
"""
if sites is None:
sites = slice(None)
with Resource(h5_file, group=group) as f:
sites_meta = f["meta", sites]
sites_data = f[ds_name, :, sites]
sites_summary = pd.DataFrame(sites_data, columns=sites_meta.index)
sites_summary = sites_summary.describe().T.drop(columns=["count"])
sites_summary["sum"] = sites_data.sum(axis=0)
return sites_summary
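    # The statistics above come from pandas.DataFrame.describe applied over
    # the time axis. A small self-contained sketch of the same reduction on
    # synthetic data (array shape and names are illustrative only):
    #
    #     >>> import numpy as np
    #     >>> import pandas as pd
    #     >>> data = np.arange(12).reshape(4, 3)  # 4 time steps, 3 sites
    #     >>> stats = pd.DataFrame(data).describe().T.drop(columns=["count"])
    #     >>> stats["sum"] = data.sum(axis=0)
    #     >>> list(stats.columns)
    #     ['mean', 'std', 'min', '25%', '50%', '75%', 'max', 'sum']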
@staticmethod
def _compute_ds_summary(h5_file, ds_name, group=None):
"""
Compute summary statistics for given dataset (assumed to be a vector)
Parameters
----------
h5_file : str
Path to .h5 file to summarize data from
ds_name : str
Dataset name of interest
group : str, optional
Group within h5_file to summarize datasets for, by default None
Returns
-------
ds_summary : pandas.DataFrame
Summary statistics for dataset
"""
with Resource(h5_file, group=group) as f:
ds_data = f[ds_name, :]
ds_summary = pd.DataFrame(ds_data, columns=[ds_name])
ds_summary = ds_summary.describe().drop(["count"])
ds_summary.at["sum", ds_name] = ds_data.sum()
return ds_summary
    def summarize_dset(
self, ds_name, process_size=None, max_workers=None, out_path=None
):
"""
        Compute dataset summary. If the dataset is 2D, compute temporal
        statistics for each site
Parameters
----------
ds_name : str
Dataset name of interest
process_size : int, optional
Number of sites to process at a time, by default None
max_workers : int, optional
Number of workers to use in parallel, if 1 run in serial,
if None use all available cores, by default None
        out_path : str, optional
            File path to save summary to, by default None
        Returns
        -------
        summary : pandas.DataFrame
            Summary statistics for the dataset
"""
with Resource(self.h5_file, group=self._group) as f:
ds_shape, _, ds_chunks = f.get_dset_properties(ds_name)
if len(ds_shape) > 1:
sites = np.arange(ds_shape[1])
if max_workers != 1:
if process_size is None and ds_chunks is not None:
process_size = ds_chunks[1]
if process_size is None:
process_size = ds_shape[-1]
sites = np.array_split(
sites, int(np.ceil(len(sites) / process_size))
)
loggers = [__name__, "reV"]
with SpawnProcessPool(
max_workers=max_workers, loggers=loggers
) as ex:
futures = []
for site_slice in sites:
futures.append(
ex.submit(
self._compute_sites_summary,
self.h5_file,
ds_name,
sites=site_slice,
group=self._group,
)
)
summary = [future.result() for future in futures]
summary = pd.concat(summary)
elif process_size is None:
summary = self._compute_sites_summary(self.h5_file,
ds_name,
sites=sites,
group=self._group)
else:
sites = np.array_split(
sites, int(np.ceil(len(sites) / process_size)))
summary = []
for site_slice in sites:
summary.append(self._compute_sites_summary(
self.h5_file, ds_name,
sites=site_slice,
group=self._group))
summary = pd.concat(summary)
summary.index.name = ResourceMetaField.GID
else:
summary = self._compute_ds_summary(
self.h5_file, ds_name, group=self._group
)
if out_path is not None:
summary.to_csv(out_path)
return summary
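    # Usage sketch (the file path and dataset name below are hypothetical):
    # for a 2D dataset this returns one row of temporal statistics per site,
    # indexed by gid; for a 1D dataset it returns a single column of stats.
    #
    #     >>> s = SummarizeH5("./gen_output.h5")
    #     >>> cf_stats = s.summarize_dset("cf_profile", process_size=1000,
    #     ...                             max_workers=1,
    #     ...                             out_path="./cf_profile_summary.csv")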
    def summarize_means(self, out_path=None):
"""
Add means datasets to meta data
Parameters
----------
out_path : str, optional
            Path to .csv file to save updated meta data to, by default None
Returns
-------
meta : pandas.DataFrame
Meta data with means datasets added
"""
with Resource(self.h5_file, group=self._group) as f:
meta = f.meta
if ResourceMetaField.GID not in meta:
if meta.index.name != ResourceMetaField.GID:
meta.index.name = ResourceMetaField.GID
meta = meta.reset_index()
for ds_name in f.datasets:
shape, dtype, _ = f.get_dset_properties(ds_name)
if len(shape) == 1 and np.issubdtype(dtype, np.number):
meta[ds_name] = f[ds_name]
if out_path is not None:
meta.to_csv(out_path, index=False)
return meta
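    # Usage sketch (hypothetical path): every 1D numeric dataset in the file
    # (e.g. annual means) is appended as a column of the meta table.
    #
    #     >>> meta = SummarizeH5("./gen_output.h5").summarize_means()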
    @classmethod
def run(
cls,
h5_file,
out_dir,
group=None,
dsets=None,
process_size=None,
max_workers=None,
):
"""
Summarize all datasets in h5_file and dump to out_dir
Parameters
----------
h5_file : str
Path to .h5 file to summarize data from
out_dir : str
Directory to dump summary .csv files to
group : str, optional
Group within h5_file to summarize datasets for, by default None
dsets : str | list, optional
Datasets to summarize, by default None
process_size : int, optional
Number of sites to process at a time, by default None
max_workers : int, optional
Number of workers to use when summarizing 2D datasets,
by default None
"""
if not os.path.exists(out_dir):
os.makedirs(out_dir, exist_ok=True)
if dsets is None:
with Resource(h5_file, group=group) as f:
dsets = [
dset
for dset in f.datasets
if dset not in ["meta", "time_index"]
]
elif isinstance(dsets, str):
dsets = [dsets]
        summary = cls(h5_file, group=group)
for ds_name in dsets:
out_path = os.path.join(out_dir, "{}_summary.csv".format(ds_name))
summary.summarize_dset(
ds_name,
process_size=process_size,
max_workers=max_workers,
out_path=out_path,
)
out_path = os.path.basename(h5_file).replace(".h5", "_summary.csv")
out_path = os.path.join(out_dir, out_path)
summary.summarize_means(out_path=out_path)
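    # End-to-end usage sketch (paths and dataset names are hypothetical):
    #
    #     >>> SummarizeH5.run("./gen_output.h5", "./qaqc",
    #     ...                 dsets=["cf_mean", "lcoe_fcr"], max_workers=4)
    #
    # This writes cf_mean_summary.csv and lcoe_fcr_summary.csv plus
    # gen_output_summary.csv (meta with site means) to ./qaqc.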
class SummarizeSupplyCurve:
"""
Summarize Supply Curve table
"""
def __init__(self, sc_table):
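        """
        Parameters
        ----------
        sc_table : str | pandas.DataFrame
            Supply Curve table or path to supply curve table .csv
        """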
self._sc_table = self._parse_summary(sc_table)
def __repr__(self):
msg = "{}".format(self.__class__.__name__)
return msg
@property
def sc_table(self):
"""
Supply Curve table
Returns
-------
pd.DataFrame
"""
return self._sc_table
@staticmethod
def _parse_summary(summary):
"""
Extract summary statistics
Parameters
----------
summary : str | pd.DataFrame
Path to .csv or .json or DataFrame to parse
Returns
-------
summary : pandas.DataFrame
DataFrame of summary statistics
"""
try:
summary = parse_table(summary)
except ValueError as ex:
logger.error(ex)
raise
return summary
    def supply_curve_summary(self, columns=None, out_path=None):
"""
Summarize Supply Curve Table
Parameters
----------
columns : str | list, optional
Column(s) to summarize, if None summarize all numeric columns,
by default None
out_path : str, optional
Path to .csv to save summary to, by default None
Returns
-------
sc_summary : pandas.DataFrame
Summary statistics (mean, stdev, median, min, max, sum) for
Supply Curve table columns
"""
sc_table = self.sc_table
if columns is not None:
if isinstance(columns, str):
columns = [columns]
sc_table = sc_table[columns]
sc_table = sc_table.select_dtypes(include=np.number)
sc_summary = []
sc_stat = sc_table.mean(axis=0)
sc_stat.name = "mean"
sc_summary.append(sc_stat)
sc_stat = sc_table.std(axis=0)
sc_stat.name = "stdev"
sc_summary.append(sc_stat)
sc_stat = sc_table.median(axis=0)
sc_stat.name = "median"
sc_summary.append(sc_stat)
sc_stat = sc_table.min(axis=0)
sc_stat.name = "min"
sc_summary.append(sc_stat)
sc_stat = sc_table.max(axis=0)
sc_stat.name = "max"
sc_summary.append(sc_stat)
sc_stat = sc_table.sum(axis=0)
sc_stat.name = "sum"
sc_summary.append(sc_stat)
sc_summary = pd.concat(sc_summary, axis=1).T
if out_path is not None:
sc_summary.to_csv(out_path)
return sc_summary
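    # Usage sketch ("./sc_table.csv" and the column names are placeholders):
    #
    #     >>> scs = SummarizeSupplyCurve("./sc_table.csv")
    #     >>> stats = scs.supply_curve_summary(columns=["mean_lcoe", "capacity"])
    #     >>> stats.index.tolist()
    #     ['mean', 'stdev', 'median', 'min', 'max', 'sum']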
    @classmethod
def run(cls, sc_table, out_dir, columns=None):
"""
Summarize Supply Curve Table and save to disk
Parameters
----------
        sc_table : str
            Path to .csv file containing Supply Curve table
out_dir : str
Directory to dump summary .csv files to
columns : str | list, optional
Column(s) to summarize, if None summarize all numeric columns,
by default None
"""
if not os.path.exists(out_dir):
os.makedirs(out_dir, exist_ok=True)
summary = cls(sc_table)
out_path = os.path.basename(sc_table).replace(".csv", "_summary.csv")
out_path = os.path.join(out_dir, out_path)
summary.supply_curve_summary(columns=columns, out_path=out_path)
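    # Usage sketch (hypothetical paths): writes "sc_table_summary.csv" to the
    # output directory.
    #
    #     >>> SummarizeSupplyCurve.run("./sc_table.csv", "./qaqc")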
class PlotBase:
"""
QA/QC Plotting base class
"""
def __init__(self, data):
"""
Parameters
----------
data : str | pandas.DataFrame | ndarray
data to plot or file containing data to plot
"""
self._data = data
def __repr__(self):
msg = "{}".format(self.__class__.__name__)
return msg
@property
def data(self):
"""
Data to plot
Returns
-------
pandas.DataFrame | ndarray
"""
return self._data
@staticmethod
def _save_plotly(fig, out_path):
"""
Save plotly figure to disk
Parameters
----------
fig : plotly.Figure
Plotly Figure object
out_path : str
File path to save plot to, can be a .html or static image
"""
if out_path.endswith(".html"):
fig.write_html(out_path)
else:
fig.write_image(out_path)
@staticmethod
def _check_value(df, values, scatter=True):
"""
Check DataFrame for needed columns
Parameters
----------
df : pandas.DataFrame
DataFrame to check
values : str | list
Column(s) to plot
scatter : bool, optional
Flag to check for latitude and longitude columns, by default True
"""
if isinstance(values, str):
values = [values]
if scatter:
values += [SupplyCurveField.LATITUDE, SupplyCurveField.LONGITUDE]
for value in values:
if value not in df:
msg = "{} is not a valid column in summary table:\n{}".format(
value, df
)
logger.error(msg)
raise ValueError(msg)
class SummaryPlots(PlotBase):
"""
Plot summary data for QA/QC
"""
def __init__(self, summary):
"""
Parameters
----------
summary : str | pandas.DataFrame
Summary DataFrame or path to summary .csv
"""
self._data = SummarizeSupplyCurve._parse_summary(summary)
@property
def summary(self):
"""
Summary table
Returns
-------
pandas.DataFrame
"""
return self._data
@property
def columns(self):
"""
Available columns in summary table
Returns
-------
list
"""
return list(self.summary.columns)
    def scatter_plot(self, value, cmap="viridis", out_path=None, **kwargs):
"""
Plot scatter plot of value versus longitude and latitude using
pandas.plot.scatter
Parameters
----------
value : str
Column name to plot as color
cmap : str, optional
Matplotlib colormap name, by default 'viridis'
out_path : str, optional
File path to save plot to, by default None
kwargs : dict
Additional kwargs for plotting.dataframes.df_scatter
"""
self._check_value(self.summary, value)
mplt.df_scatter(self.summary, x=SupplyCurveField.LONGITUDE,
y=SupplyCurveField.LATITUDE, c=value, colormap=cmap,
filename=out_path, **kwargs)
    def scatter_plotly(self, value, cmap="Viridis", out_path=None, **kwargs):
"""
Plot scatter plot of value versus longitude and latitude using
plotly
Parameters
----------
value : str
Column name to plot as color
cmap : str | px.color, optional
Continuous color scale to use, by default 'Viridis'
out_path : str, optional
File path to save plot to, can be a .html or static image,
by default None
kwargs : dict
Additional kwargs for plotly.express.scatter
"""
self._check_value(self.summary, value)
fig = px.scatter(self.summary, x=SupplyCurveField.LONGITUDE,
y=SupplyCurveField.LATITUDE, color=value,
color_continuous_scale=cmap, **kwargs)
fig.update_layout(font=dict(family="Arial", size=18, color="black"))
if out_path is not None:
self._save_plotly(fig, out_path)
fig.show()
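    # Usage sketch (path and column name are hypothetical; the summary table
    # must contain the SupplyCurveField latitude/longitude columns):
    #
    #     >>> plots = SummaryPlots("./gen_summary.csv")
    #     >>> plots.scatter_plotly("mean_cf", cmap="Plasma",
    #     ...                      out_path="./mean_cf.html")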
def _extract_sc_data(self, lcoe=SupplyCurveField.MEAN_LCOE):
"""
Extract supply curve data
Parameters
----------
lcoe : str, optional
LCOE value to use for supply curve,
by default :obj:`SupplyCurveField.MEAN_LCOE`
Returns
-------
sc_df : pandas.DataFrame
Supply curve data
"""
values = [SupplyCurveField.CAPACITY_AC_MW, lcoe]
self._check_value(self.summary, values, scatter=False)
sc_df = self.summary[values].sort_values(lcoe)
sc_df['cumulative_capacity'] = (
sc_df[SupplyCurveField.CAPACITY_AC_MW].cumsum()
)
return sc_df
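    # Sketch of the cumulative-capacity construction on toy data (the column
    # names stand in for the SupplyCurveField values used above):
    #
    #     >>> import pandas as pd
    #     >>> df = pd.DataFrame({"capacity": [10, 20, 5],
    #     ...                    "lcoe": [30.0, 20.0, 25.0]})
    #     >>> sc = df.sort_values("lcoe")
    #     >>> sc["cumulative_capacity"] = sc["capacity"].cumsum()
    #     >>> sc["cumulative_capacity"].tolist()
    #     [20, 25, 35]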
    def dist_plot(self, value, out_path=None, **kwargs):
"""
Plot distribution plot of value using seaborn.distplot
Parameters
----------
value : str
Column name to plot
out_path : str, optional
File path to save plot to, by default None
kwargs : dict
Additional kwargs for plotting.dataframes.dist_plot
"""
self._check_value(self.summary, value, scatter=False)
series = self.summary[value]
mplt.dist_plot(series, filename=out_path, **kwargs)
    def dist_plotly(self, value, out_path=None, **kwargs):
"""
Plot histogram of value using plotly
Parameters
----------
value : str
Column name to plot
out_path : str, optional
File path to save plot to, by default None
kwargs : dict
Additional kwargs for plotly.express.histogram
"""
self._check_value(self.summary, value, scatter=False)
        fig = px.histogram(self.summary, x=value, **kwargs)
        if out_path is not None:
            self._save_plotly(fig, out_path)
fig.show()
    @classmethod
def scatter(
cls,
summary_csv,
out_dir,
value,
plot_type="plotly",
cmap="viridis",
**kwargs,
):
"""
Create scatter plot for given value in summary table and save to
out_dir
Parameters
----------
summary_csv : str
Path to .csv file containing summary table
out_dir : str
Output directory to save plots to
value : str
Column name to plot as color
plot_type : str, optional
            Type of plot to create, either 'plot' or 'plotly', by default 'plotly'
cmap : str, optional
Colormap name, by default 'viridis'
kwargs : dict
Additional plotting kwargs
"""
splt = cls(summary_csv)
if plot_type == "plot":
out_path = os.path.basename(summary_csv).replace(".csv", ".png")
out_path = os.path.join(out_dir, out_path)
splt.scatter_plot(
value, cmap=cmap.lower(), out_path=out_path, **kwargs
)
elif plot_type == "plotly":
out_path = os.path.basename(summary_csv).replace(".csv", ".html")
out_path = os.path.join(out_dir, out_path)
splt.scatter_plotly(
value, cmap=cmap.capitalize(), out_path=out_path, **kwargs
)
else:
msg = (
"plot_type must be 'plot' or 'plotly' but {} was given".format(
plot_type
)
)
logger.error(msg)
raise ValueError(msg)
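    # Usage sketch (paths and column name are hypothetical):
    #
    #     >>> SummaryPlots.scatter("./gen_summary.csv", "./qaqc", "mean_cf",
    #     ...                      plot_type="plotly", cmap="Viridis")
    #
    # With plot_type="plotly" this saves gen_summary.html to ./qaqc.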
    @classmethod
def scatter_all(
cls, summary_csv, out_dir, plot_type="plotly", cmap="viridis", **kwargs
):
"""
Create scatter plot for all summary stats in summary table and save to
out_dir
Parameters
----------
summary_csv : str
Path to .csv file containing summary table
out_dir : str
Output directory to save plots to
plot_type : str, optional
            Type of plot to create, either 'plot' or 'plotly', by default 'plotly'
cmap : str, optional
Colormap name, by default 'viridis'
kwargs : dict
Additional plotting kwargs
"""
splt = cls(summary_csv)
splt._data = splt.summary.select_dtypes(include=np.number)
datasets = [
c for c in splt.summary.columns if not c.startswith(("lat", "lon"))
]
for value in datasets:
if plot_type == "plot":
out_path = "_{}.png".format(value)
out_path = os.path.basename(summary_csv).replace(
".csv", out_path
)
out_path = os.path.join(out_dir, out_path)
splt.scatter_plot(
value, cmap=cmap.lower(), out_path=out_path, **kwargs
)
elif plot_type == "plotly":
out_path = "_{}.html".format(value)
out_path = os.path.basename(summary_csv).replace(
".csv", out_path
)
out_path = os.path.join(out_dir, out_path)
splt.scatter_plotly(
value, cmap=cmap.capitalize(), out_path=out_path, **kwargs
)
else:
msg = ("plot_type must be 'plot' or 'plotly' but {} was given"
.format(plot_type))
logger.error(msg)
raise ValueError(msg)
class SupplyCurvePlot(PlotBase):
"""
Plot supply curve data for QA/QC
"""
def __init__(self, sc_table):
"""
Parameters
----------
sc_table : str | pandas.DataFrame
Supply curve table or path to supply curve .csv
"""
self._data = SummarizeSupplyCurve._parse_summary(sc_table)
@property
def sc_table(self):
"""
Supply curve table
Returns
-------
pandas.DataFrame
"""
return self._data
@property
def columns(self):
"""
Available columns in supply curve table
Returns
-------
list
"""
return list(self.sc_table.columns)
def _extract_sc_data(self, lcoe=SupplyCurveField.MEAN_LCOE):
"""
Extract supply curve data
Parameters
----------
lcoe : str, optional
LCOE value to use for supply curve,
by default :obj:`SupplyCurveField.MEAN_LCOE`
Returns
-------
sc_df : pandas.DataFrame
Supply curve data
"""
values = [SupplyCurveField.CAPACITY_AC_MW, lcoe]
self._check_value(self.sc_table, values, scatter=False)
sc_df = self.sc_table[values].sort_values(lcoe)
sc_df['cumulative_capacity'] = (
sc_df[SupplyCurveField.CAPACITY_AC_MW].cumsum()
)
return sc_df
    def supply_curve_plot(self, lcoe=SupplyCurveField.MEAN_LCOE, out_path=None,
**kwargs):
"""
        Plot supply curve (cumulative capacity vs lcoe) using
        plotting.dataframes.df_scatter
Parameters
----------
lcoe : str, optional
LCOE value to plot, by default :obj:`SupplyCurveField.MEAN_LCOE`
out_path : str, optional
File path to save plot to, by default None
kwargs : dict
Additional kwargs for plotting.dataframes.df_scatter
"""
sc_df = self._extract_sc_data(lcoe=lcoe)
mplt.df_scatter(
sc_df, x="cumulative_capacity", y=lcoe, filename=out_path, **kwargs
)
    def supply_curve_plotly(self, lcoe=SupplyCurveField.MEAN_LCOE,
out_path=None, **kwargs):
"""
Plot supply curve (cumulative capacity vs lcoe) using plotly
Parameters
----------
lcoe : str, optional
LCOE value to plot, by default SupplyCurveField.MEAN_LCOE
out_path : str, optional
File path to save plot to, can be a .html or static image,
by default None
kwargs : dict
Additional kwargs for plotly.express.scatter
"""
sc_df = self._extract_sc_data(lcoe=lcoe)
fig = px.scatter(sc_df, x="cumulative_capacity", y=lcoe, **kwargs)
fig.update_layout(font=dict(family="Arial", size=18, color="black"))
if out_path is not None:
self._save_plotly(fig, out_path)
fig.show()
    @classmethod
def plot(cls, sc_table, out_dir, plot_type='plotly',
lcoe=SupplyCurveField.MEAN_LCOE, **kwargs):
"""
Create supply curve plot from supply curve table using lcoe value
and save to out_dir
Parameters
----------
sc_table : str
Path to .csv file containing Supply Curve table
out_dir : str
Output directory to save plots to
plot_type : str, optional
            Type of plot to create, either 'plot' or 'plotly', by default 'plotly'
lcoe : str, optional
LCOE value to plot, by default :obj:`SupplyCurveField.MEAN_LCOE`
kwargs : dict
Additional plotting kwargs
"""
splt = cls(sc_table)
if plot_type == "plot":
out_path = os.path.basename(sc_table).replace(".csv", ".png")
out_path = os.path.join(out_dir, out_path)
splt.supply_curve_plot(lcoe=lcoe, out_path=out_path, **kwargs)
elif plot_type == "plotly":
out_path = os.path.basename(sc_table).replace(".csv", ".html")
out_path = os.path.join(out_dir, out_path)
splt.supply_curve_plotly(lcoe=lcoe, out_path=out_path, **kwargs)
else:
msg = (
"plot_type must be 'plot' or 'plotly' but {} was given".format(
plot_type
)
)
logger.error(msg)
raise ValueError(msg)
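    # Usage sketch (hypothetical path; the LCOE column defaults to
    # SupplyCurveField.MEAN_LCOE):
    #
    #     >>> SupplyCurvePlot.plot("./sc_table.csv", "./qaqc", plot_type="plot")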
class ExclusionsMask(PlotBase):
"""
Plot Exclusions mask as a heat map data for QA/QC
"""
def __init__(self, excl_mask):
"""
Parameters
----------
excl_mask : str | ndarray
Exclusions mask or path to .npy file containing final mask
"""
self._data = self._parse_mask(excl_mask)
@property
def mask(self):
"""
Final Exclusions mask
Returns
-------
ndarray
"""
return self._data
@staticmethod
def _parse_mask(excl_mask):
"""
Load exclusions mask if needed
Parameters
----------
excl_mask : str | ndarray
Exclusions mask or path to .npy file containing final mask
Returns
-------
excl_mask : ndarray
[n, m] array of final exclusion values
"""
if isinstance(excl_mask, str):
excl_mask = np.load(excl_mask)
elif not isinstance(excl_mask, np.ndarray):
raise ValueError("excl_mask must be a .npy file or an ndarray")
return excl_mask
    def exclusions_plot(
self, cmap="Viridis", plot_step=100, out_path=None, **kwargs
):
"""
Plot exclusions mask as a seaborn heatmap
Parameters
----------
cmap : str | px.color, optional
Continuous color scale to use, by default 'Viridis'
plot_step : int
Step between points to plot
out_path : str, optional
File path to save plot to, can be a .html or static image,
by default None
kwargs : dict
Additional kwargs for plotting.colormaps.heatmap_plot
"""
mplt.heatmap_plot(
self.mask[::plot_step, ::plot_step],
cmap=cmap,
filename=out_path,
**kwargs,
)
    def exclusions_plotly(
self, cmap="Viridis", plot_step=100, out_path=None, **kwargs
):
"""
Plot exclusions mask as a plotly heatmap
Parameters
----------
cmap : str | px.color, optional
Continuous color scale to use, by default 'Viridis'
plot_step : int
Step between points to plot
out_path : str, optional
File path to save plot to, can be a .html or static image,
by default None
kwargs : dict
Additional kwargs for plotly.express.imshow
"""
fig = px.imshow(
self.mask[::plot_step, ::plot_step],
color_continuous_scale=cmap,
**kwargs,
)
fig.update_layout(font=dict(family="Arial", size=18, color="black"))
if out_path is not None:
SummaryPlots._save_plotly(fig, out_path)
fig.show()
    @classmethod
def plot(
cls,
mask,
out_dir,
plot_type="plotly",
cmap="Viridis",
plot_step=100,
**kwargs,
):
"""
Plot exclusions mask and save to out_dir
Parameters
----------
mask : ndarray
ndarray of final exclusions mask
out_dir : str
Output directory to save plots to
plot_type : str, optional
            Type of plot to create, either 'plot' or 'plotly', by default 'plotly'
        cmap : str, optional
            Colormap name, by default 'Viridis'
plot_step : int
Step between points to plot
kwargs : dict
Additional plotting kwargs
"""
excl_mask = cls(mask)
if plot_type == "plot":
out_path = "exclusions_mask.png"
out_path = os.path.join(out_dir, out_path)
excl_mask.exclusions_plot(
cmap=cmap.lower(),
plot_step=plot_step,
out_path=out_path,
**kwargs,
)
elif plot_type == "plotly":
out_path = "exclusions_mask.html"
out_path = os.path.join(out_dir, out_path)
excl_mask.exclusions_plotly(
cmap=cmap.capitalize(),
plot_step=plot_step,
out_path=out_path,
**kwargs,
)
else:
msg = (
"plot_type must be 'plot' or 'plotly' but {} was given".format(
plot_type
)
)
logger.error(msg)
raise ValueError(msg)
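# Usage sketch (hypothetical .npy path): plot every 100th pixel of the final
# exclusions mask as an interactive heatmap and save it to ./qaqc.
#
#     >>> import numpy as np
#     >>> mask = np.load("./excl_mask.npy")
#     >>> ExclusionsMask.plot(mask, "./qaqc", plot_type="plotly", plot_step=100)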