Source code for fied.analysis.boiler_inventory_comparison



import pandas as pd
import numpy as np
import pathlib
import json
from urllib.request import urlopen
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio



[docs]
def fix_fips(fips):
    """
    Correct float formatting for county FIPS.

    Parameters
    ----------
    fips : str
        County FIPS in original formatting.

    Returns
    -------
    fips : str
        County FIPS, corrected and re-formatted
    """

    try:
        fips = fips.replace('.0', '')

    except AttributeError:
        fips = str(fips)
        fips = fips.replace('.0', '')

    len_missing = 5 - len(fips)

    if len_missing > 0:
        fips = '0'*len_missing+fips

    else:
        pass

    return fips



[docs]
def convert_units(x, unit):
    """
    Convert energy units from MMBtu to MJ and
    power units from MMBtu/hr to MW.

    Parameters
    ----------
    x : float
        Value to be converted

    unit : {'power', 'energy'}
        Type of unit to convert.

    Returns
    -------
    y : float
        Converted unit, or None
    """

    units = {
        'power': 0.29307107,  # convert to MW
        'energy': 1054.35  # convert to MJ
        }
    
    try:
        y = x * units[unit]

    except TypeError:
        y = np.nan

    return y




[docs]
def get_boiler_data(boiler_url):
    """
    Download Northwestern boiler inventory and return it as a dataframe.

    Parameters
    ----------
    boiler_url : str
        URL for Northwestern boiler inventory


    Returns
    -------
    bdb : pandas.DataFrame
        Boiler inventory as a pandas DataFrame

    """

    dtypes = {
        'state': str,
        'county': str,
        'company_name': str,
        'site_name': str,
        'naics_code': float,
        'zip_code': str,
        'eis_unit_id': float,
        'naics_sub': int,
        'fuel_type': str,
        'data_source': str,
        'REPORTING_YEAR': float,
        'UNIT_TYPE': str,
        'TIER': str,
        'FUEL_COM': float,
        'FUEL_UNIT': str,
        'Op Hours Per Year': object
        }
    
    converters = {
        'fips': lambda x: (fix_fips(x)),
        }
    
    col_rename = {
        'ENERGY_COM_MMBtu': 'energyMJ',
        'fips': 'countyFIPS',
        'naics_code': 'naicsCode',
        'eis_unit_id': 'eisUnitID',
        'fuel_type': 'fuelType'
        }

    # use_cols = [0, 1, 2, 3, 4, 7, 9, 10, 13, 15, 16, 17, 18]

    bdb = pd.read_csv(boiler_url, dtype=dtypes, sep=',',
                      converters=converters, na_values=['', 'na'])
    
    for c in ['cap_mmbtuhr', 'ENERGY_COM_MMBtu', 'ENERGY_MMBtu_hr']:

        if c =='ENERGY_COM_MMBtu':
            bdb.loc[:, c] = bdb[c].apply(lambda x: convert_units(x, 'energy'))

        else:
            bdb.loc[:, c] = bdb[c].apply(lambda x: convert_units(x, 'power'))

    bdb.rename(columns=col_rename, inplace=True)
    
    bdb.loc[:, 'designCapacity'] = np.nan
    bdb.designCapacity.update(bdb.cap_mmbtuhr)
    bdb.designCapacity.update(bdb.ENERGY_MMBtu_hr)

    return bdb



[docs]
def get_fied_boiler(fied_path):

    """
    Load FIED and return information only on boilers. 

    Parameters
    ----------
    fied_path : str


    Returns
    -------
    fied_boiler : pandas.DataFrame
    """

    fied_boiler = pd.read_csv(fied_path, 
                              converters={'fips': lambda x: (fix_fips(x))},
                              low_memory=False,
                              index_col=[0])
    fied_boiler = fied_boiler.query("unitTypeStd == 'boiler'").copy(deep=True)
    fied_boiler.loc[:, 'countyFIPS'] = fied_boiler.countyFIPS.apply(lambda x: fix_fips(x))

    # Drop territories
    fied_boiler = fied_boiler.where(~fied_boiler.stateCode.isin(['VI', 'PR'])).dropna(how='all')
    
    naics_sub = pd.DataFrame(fied_boiler.naicsCode.dropna().unique())
    naics_sub.columns = ['naicsCode']

    naics_sub.loc[:, 'naics_sub'] = naics_sub.naicsCode.apply(
        lambda x: int(str(x)[0:3])
        )
    
    fied_boiler = pd.merge(
        fied_boiler, naics_sub, on='naicsCode',
        how='left'
        )

    return fied_boiler




[docs]
def compare_boilers_aggregate(bdb, fied_boiler, compare_type=None):
    """
    Compare either county or NAICS (3-digit) sums of 
    design capacity and energy estimates.
    
    Parameters
    ----------
    bdb : pandas.DataFrame

    fied_boiler : pandas.DataFrame

    type : str, {'county', 'naics'}
        Aggregate data sets at either county- or naics-level.


    Returns
    -------
    comparison : pandas.DataFrame
    """

    grouping_column = {'county': 'countyFIPS', 'naics': 'naics_sub'}

    try:
        comparison = pd.concat(
            [bdb.groupby(grouping_column[compare_type]).designCapacity.sum(),
            fied_boiler.groupby(grouping_column[compare_type]).designCapacity.sum(),
            bdb.groupby(grouping_column[compare_type]).energyMJ.sum(),
            fied_boiler.groupby(grouping_column[compare_type])[['energyMJ', 'energyMJq2']].sum().sum(axis=1)],
            axis=1
            )

    except KeyError:
        print("Must specify compare_type")

    else:
        comparison.columns = ['designCapacity_bdb', 'designCapacity_fied', 'energyMJ_bdb', 'energyMJ_fied']

        return comparison



[docs]
def compare_boilers_units(bdb, fied_boiler):
    """
    Compare boiler inventory and FIED boilers on an individual unit level. 
    Includes only 
    
    Parameters
    ----------
    bdb : pandas.DataFrame

    fied_boiler : pandas.DataFrame

    Returns
    -------
    units : pandas.DataFrame

    units_summary : pandas.DataFrame

    """

    fied_edit = fied_boiler.copy(deep=True)

    fied_edit.fuelType.replace({
        'diesel': 'oil products', 'naturalGas': 'natural gas', 
        'resFuelOil': 'oil products', 'lpgHGL': 'other fuels',
        'gasoline': 'oil products', 'other': 'other fuels',
        'coke': 'other fuels'
        }, inplace=True)

    units = pd.merge(
        bdb.dropna(subset=['eisUnitID'])[['eisUnitID', 'energyMJ', 'designCapacity', 'fuelType', 'naics_sub',
             'state']], 
        fied_edit.dropna(subset=['eisUnitID'])[['eisUnitID', 'energyMJ', 'energyMJq2', 'designCapacity',
                     'fuelType', 'stateCode', 'naicsCode']], 
        on='eisUnitID', how='inner',
        suffixes=['_bdb', '_fied']
        )

    units.loc[:, 'capCompare']  = (units.designCapacity_fied.subtract(
        units.designCapacity_bdb, fill_value=0
        )).divide(units.designCapacity_bdb, fill_value=0)
 
    units.loc[:, 'energyCompare'] = (units[units.energyMJ_fied>0].energyMJ_fied.subtract(
        units[units.energyMJ_fied>0].energyMJ_bdb, fill_value=0
        )).divide(units[units.energyMJ_fied>0].energyMJ_bdb, fill_value=0)
    
    units.loc[:, 'energyCompare'] = (units[units.energyMJq2>0].energyMJq2.subtract(
        units[units.energyMJq2>0].energyMJ_bdb, fill_value=0
        )).divide(units[units.energyMJq2>0].energyMJ_bdb, fill_value=0)


    units.loc[:, 'fuelCompare'] = units.fuelType_fied == units.fuelType_bdb

    # There can be multiple combustion units reported under a single EIS unit ID.
    # dups = units[units.eisUnitID.duplicated(keep=False)]

    units_summary = pd.DataFrame(units[['capCompare', 'energyCompare']].describe())
    units_summary.loc['% matching', 'fuelCompare'] = units.fuelCompare.sum() / units.fuelCompare.count()
    
    # for s in ['bdb', 'fied']:

    #     for t in ['cap', 'energy']:

    #         units_summary.loc['% /reported_{s}', f'{t}Compare'] = 


    return units, units_summary




[docs]
def plot_scatter_comparison(comparison, compare_type=None, write_fig=True):
    """
    Creates two subplots that compare county sums of design capacity (MW)
    and energy (MJ) for the boiler inventory and the foundational dataset.
    
    Parameters
    ----------
    comparison : pandas.DataFrame

    write_fig : Bool, default=True

    compare_type : str; {'county', 'naics'}

    Returns
    -------
    None

    """

    plot_data = comparison.copy(deep=True)
    plot_data.fillna(0, inplace=True)
    plot_data.reset_index(inplace=True)

    fig = make_subplots(rows=1, cols=2, 
                        subplot_titles=('Design Capacity (MW)', 'Energy (MJ)'))

    fig.add_trace(
        go.Scatter(
            x=plot_data.designCapacity_bdb,
            y=plot_data.designCapacity_fied,
            mode='markers'
            ),
        row=1, col=1
        )

    fig.add_trace(
        go.Scatter(
            x=plot_data.energyMJ_bdb,
            y=plot_data.energyMJ_fied,
            mode='markers'
            ),
        row=1, col=2
        )

    # Hardcoded coordinates for diagonal lines.
    line_xy = {
        'county': {
            'capacity_x': [0.001, 1E6],
            'capacity_y': [0.001, 1E6],
            'energy_x': [100, 1E12],
            'energy_y': [100, 1E12],
            },
        'naics': {
            'capacity_x': [10, 1E6],
            'capacity_y': [10, 1E6],
            'energy_x': [1E4, 1E12],
            'energy_y': [1E4, 1E12]
            }
        }

    fig.add_trace(
        go.Scatter(
            x=line_xy[compare_type]['capacity_x'],
            y=line_xy[compare_type]['capacity_y'],
            mode='lines', line=dict(color='black', width=2, dash='dash')
            ),
        row=1, col=1
        )
    
    fig.add_trace(
        go.Scatter(
            x=line_xy[compare_type]['energy_x'],
            y=line_xy[compare_type]['energy_y'],
            mode='lines', line=dict(color='black', width=2, dash='dash')
            ),
        row=1, col=2
        )
    
    for n in [1, 2]:
        fig.update_xaxes(
            title_text='NW Boiler Inventory', type='log', row=1, col=n,
            showexponent='all',
            exponentformat='power',
            showgrid=True
            )
        fig.update_yaxes(
            title_text='Foundational Data', type='log', row=1, col=n,
            showexponent='all',
            exponentformat='power',
            showgrid=True
            )

    fig.update_layout(
        height=800, width=1000,
        showlegend=False,
        template='presentation'
        )
    
    if write_fig is True:
            pio.write_image(
            fig,
            file=f'./analysis/figures/boiler_compare_scatter_{compare_type}.svg'
            )

    else:
        fig.show()

    return None


if __name__ == '__main__':

    boiler_url = 'https://raw.githubusercontent.com/carriescho/Electrification-of-Boilers/master/total_boiler_inventory.csv'
    fied_path = pathlib.Path('foundational_industry_data_2017.csv.gz')

    bdb = get_boiler_data(boiler_url)
    fied_boiler = get_fied_boiler(fied_path)

    agg_comparison = {}

    # Note that FIED is for 2017; the boiler inventory uses multiple reporting years.
    # This is more of an issue for energy than design capacity, as it's expected that
    # energy use is more likely to change by year than design capacity.
    for t in ['county', 'naics']:

    #     comparison = compare_boilers_aggregate(bdb, fied_boiler, compare_type=t)
    #     plot_scatter_comparison(comparison, compare_type=t, write_fig=True)

        agg_comparison[t] = compare_boilers_aggregate(bdb, fied_boiler, compare_type=t)

    units, units_summary = compare_boilers_units(bdb, fied_boiler)