Source code for reVX.utilities.reeds_cols


# -*- coding: utf-8 -*-
"""
reVX ReEDS column addition utilities
"""
import os
import json
import pandas as pd
from warnings import warn

from rex import Resource
from reVX.utilities.region_classifier import RegionClassifier
from reVX.utilities.utilities import load_fips_to_state_map
from reVX.version import __version__


UTILITY_DIR = os.path.dirname(os.path.realpath(__file__))
CONFIG_DIR = os.path.join(UTILITY_DIR, "config")
COUNTY_GDF_FP = ("https://www2.census.gov/geo/tiger/TIGER2021/COUNTY/"
                 "tl_2021_us_county.zip")


def add_county_info(data_frame, regions=COUNTY_GDF_FP):
    """Add county info to a Pandas DataFrame with coordinates.

    The input DataFrame must have latitude and longitude columns.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        A pandas data frame with latitude and longitude coordinates.
    regions : str | GeoDataFrame
        Path to regions shapefile containing labeled geometries or a
        pre-loaded GeoDataFrame.

    Returns
    -------
    pandas.DataFrame
        A pandas data frame with all initial input data plus three new
        columns: "cnty_fips", "state", and "county". "cnty_fips" is a
        five-digit county code, while "state" and "county" are the
        state and county names, respectively.
    """
    data_frame = data_frame.drop(columns=["cnty_fips", "county"],
                                 errors="ignore")
    data_frame = _classify(data_frame, "GEOID", regions)
    data_frame = _classify(data_frame, "NAME", regions)
    data_frame = data_frame.rename(columns={"GEOID": "cnty_fips",
                                            "NAME": "county"})
    cmap = load_fips_to_state_map()
    data_frame["state"] = data_frame["cnty_fips"].apply(
        lambda code: cmap[code[:2]])
    return data_frame
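
# Example (illustrative sketch, not part of the reVX source): running
# ``add_county_info`` on a minimal DataFrame. The coordinates below are
# hypothetical, and the default ``regions`` value downloads county
# geometries from census.gov, so a local county shapefile path may be
# substituted if network access is unavailable:
#
#     df = pd.DataFrame({"latitude": [39.7392, 35.0844],
#                        "longitude": [-104.9903, -106.6504]})
#     df = add_county_info(df)
#     print(df[["cnty_fips", "state", "county"]])
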
def _classify(data_frame, col, regions=COUNTY_GDF_FP):
    """Classify a single county column for the input DataFrame"""
    classifier = RegionClassifier(data_frame, regions, col)
    data_frame = classifier.classify(force=True)
    return data_frame.drop(columns="geometry", errors="ignore")


def _lowercase_alpha_only(in_str):
    """Convert a string to lowercase alphabetic values only (a-z)"""
    return ''.join(filter(str.isalpha, in_str.casefold()))
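
# Example (illustrative, based only on the code above):
# ``_lowercase_alpha_only`` casefolds the input and keeps alphabetic
# characters only, so state names compare equal regardless of case,
# spacing, or punctuation:
#
#     _lowercase_alpha_only("New York")   # -> "newyork"
#     _lowercase_alpha_only("NEW-YORK")   # -> "newyork"
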
def add_nrel_regions(data_frame):
    """Add NREL Regions info to a Pandas DataFrame with coordinates.

    The input DataFrame must have a "state" column containing the
    state name for each row.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        A pandas data frame with a "state" column.

    Returns
    -------
    pandas.DataFrame
        A pandas data frame with an extra "nrel_region" column.
    """
    if "state" not in data_frame:
        raise KeyError("Input DataFrame missing required column 'state'")

    with open(os.path.join(CONFIG_DIR, "nrel_regions.json")) as fh:
        nrel_regions = json.load(fh)

    regions = {_lowercase_alpha_only(key): val
               for key, val in nrel_regions.items()}

    states = data_frame["state"].apply(_lowercase_alpha_only)
    data_frame["nrel_region"] = states.map(regions)
    return data_frame
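
# Example (illustrative sketch, not part of the reVX source): the "state"
# values below are hypothetical; ``add_nrel_regions`` matches them against
# the names in "nrel_regions.json" after normalizing case and punctuation:
#
#     df = pd.DataFrame({"state": ["Colorado", "new york"]})
#     df = add_nrel_regions(df)
#     print(df["nrel_region"])
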
def add_extra_data(data_frame, extra_data, merge_col="sc_point_gid"):
    """Add extra data to a Pandas DataFrame from a list of input files.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        A pandas data frame with initial data. Must have the
        `merge_col` column if extracting data from HDF5 files.
    extra_data : list of dicts
        A list of dictionaries, where each dictionary contains two
        keys. The first key is "source", and its value must either be
        a dictionary of `field: value` pairs or a path to the extra
        data being extracted. The latter must be a path pointing to an
        HDF5 or JSON file (i.e. it must end in ".h5" or ".json"). The
        second key is "dsets", and it points to a list of dataset
        names to extract from `source`. For JSON and dictionary data
        extraction, the values of the datasets must either be scalars
        or must match the length of the input `data_frame`. For HDF5
        data, the datasets must be 1D datasets, and they will be
        merged with the input `data_frame` on `merge_col` (the column
        must be present in the HDF5 file meta).
    merge_col : str, optional
        Name of the column used to merge the data in the input
        `data_frame` with the data in the HDF5 file. Note that this
        column must be present in both the `data_frame` and the HDF5
        file meta. By default, ``"sc_point_gid"``.

    Returns
    -------
    pandas.DataFrame
        A pandas data frame with extra data added from input files.
    """
    for data_info in extra_data:
        source = data_info["source"]
        dsets = data_info["dsets"]
        if isinstance(source, dict):
            source_data = source
        elif str(source).endswith(".json"):
            with open(source, "r") as fh:
                source_data = json.load(fh)
        elif str(source).endswith(".h5"):
            with Resource(source) as res:
                source_data = res.meta[[merge_col]].copy()
                for dset in dsets:
                    source_data[dset] = res[dset]

            source_data = pd.merge(data_frame[[merge_col]], source_data,
                                   on=merge_col)
            source_data = {dset: source_data[dset].values
                           for dset in dsets}
        else:
            msg = ("File format not currently supported for file: {}"
                   .format(source))
            warn(msg)
            continue

        for dset in dsets:
            data_frame[dset] = source_data[dset]

    return data_frame
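
# Example (illustrative sketch, not part of the reVX source): mixing a
# plain-dictionary source with an HDF5 source. "./gen.h5" and
# "capacity_factor" are hypothetical names; the HDF5 file's meta must
# contain the "sc_point_gid" column for the merge to work:
#
#     df = pd.DataFrame({"sc_point_gid": [0, 1, 2]})
#     extra = [
#         {"source": {"scenario": "reference"},  # scalar, broadcast to rows
#          "dsets": ["scenario"]},
#         {"source": "./gen.h5",                 # merged on "sc_point_gid"
#          "dsets": ["capacity_factor"]},
#     ]
#     df = add_extra_data(df, extra, merge_col="sc_point_gid")
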
def add_reeds_columns(supply_curve_fpath, out_fp=None,
                      capacity_col="capacity", extra_data=None,
                      merge_col="sc_point_gid",
                      filter_out_zero_capacity=True, rename_mapping=None,
                      regions=COUNTY_GDF_FP):
    """Add columns to supply curve required by ReEDS.

    This method will add columns like "cnty_fips", "state", "county",
    "nrel_region", "eos_mult", and "reg_mult". This method also allows
    you to add extra columns from HDF5 or JSON files.

    Parameters
    ----------
    supply_curve_fpath : str
        Path to input supply curve. Should have standard reV supply
        curve output columns (e.g. latitude, longitude, capacity,
        sc_point_gid, etc.). If running from CLI, this can be a list
        of supply curve paths.
    out_fp : str, optional
        Path to output file for supply curve with new columns. If
        ``None``, the supply curve will be overwritten (i.e. the data
        will be written to `supply_curve_fpath`). If running from CLI,
        this can be a list of output paths (the length *must* match
        the length of `supply_curve_fpath`). By default, ``None``.
    capacity_col : str, optional
        Name of the capacity column. This is used to filter out sites
        with zero capacity, if that option is selected.
        By default, ``"capacity"``.
    extra_data : list of dicts, optional
        A list of dictionaries, where each dictionary contains two
        keys. The first key is "source", and its value must either be
        a dictionary of `field: value` pairs or a path to the extra
        data being extracted. The latter must be a path pointing to an
        HDF5 or JSON file (i.e. it must end in ".h5" or ".json"). The
        second key is "dsets", and it points to a list of dataset
        names to extract from `source`. For JSON and dictionary data
        extraction, the values of the datasets must either be scalars
        or must match the length of the input `data_frame`. For HDF5
        data, the datasets must be 1D datasets, and they will be
        merged with the input `data_frame` on `merge_col` (the column
        must be present in the HDF5 file meta). By default, ``None``.
    merge_col : str, optional
        Name of the column used to merge the data in the input supply
        curve with the data in the HDF5 file if `extra_data` is
        specified. Note that this column must be present in both the
        input supply curve and the HDF5 file meta.
        By default, ``"sc_point_gid"``.
    filter_out_zero_capacity : bool, optional
        Flag to filter out sites with zero capacity.
        By default, ``True``.
    rename_mapping : dict, optional
        Optional mapping of old column names to new column names. This
        mapping will be used to rename the columns in the supply curve
        towards the end of the procedure (after all extra columns
        except ``eos_mult`` and ``reg_mult`` have been added).
        By default, ``None`` (no renaming).
    regions : str, optional
        Path to a regions shapefile containing county geometries
        labeled with county FIPS values. The default value pulls the
        data from ``www2.census.gov``.

    Returns
    -------
    out_fp : str
        Path to the output file.
    """
    sc = pd.read_csv(supply_curve_fpath)
    sc = add_county_info(sc, regions)
    sc = add_nrel_regions(sc)
    if extra_data:
        sc = add_extra_data(sc, extra_data, merge_col=merge_col)

    if filter_out_zero_capacity and capacity_col in sc:
        sc = sc[sc[capacity_col] > 0]

    rename_mapping = rename_mapping or {}
    sc = sc.rename(columns=rename_mapping)
    for col in ["eos_mult", "reg_mult"]:
        if col not in sc:
            sc[col] = 1

    sc = sc.reset_index(drop=True)
    out_fp = out_fp or supply_curve_fpath
    sc.to_csv(out_fp, index=False)
    return out_fp
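
# Example (illustrative sketch, not part of the reVX source): a typical
# end-to-end call. "./sc.csv" and "./sc_reeds.csv" are hypothetical paths,
# and the ``rename_mapping`` entry is shown only to demonstrate renaming
# (zero-capacity filtering happens before the rename, so the default
# ``capacity_col`` still applies):
#
#     out = add_reeds_columns("./sc.csv",
#                             out_fp="./sc_reeds.csv",
#                             rename_mapping={"capacity": "capacity_mw"},
#                             filter_out_zero_capacity=True)
#     print(out)  # "./sc_reeds.csv"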