import pandas as pd
import numpy as np
import pathlib
import json
from urllib.request import urlopen
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
[docs]
def fix_fips(fips):
"""
Correct float formatting for county FIPS.
Parameters
----------
fips : str
County FIPS in original formatting.
Returns
-------
fips : str
County FIPS, corrected and re-formatted
"""
try:
fips = fips.replace('.0', '')
except AttributeError:
fips = str(fips)
fips = fips.replace('.0', '')
len_missing = 5 - len(fips)
if len_missing > 0:
fips = '0'*len_missing+fips
else:
pass
return fips
[docs]
def convert_units(x, unit):
"""
Convert energy units from MMBtu to MJ and
power units from MMBtu/hr to MW.
Parameters
----------
x : float
Value to be converted
unit : {'power', 'energy'}
Type of unit to convert.
Returns
-------
y : float
Converted unit, or None
"""
units = {
'power': 0.29307107, # convert to MW
'energy': 1054.35 # convert to MJ
}
try:
y = x * units[unit]
except TypeError:
y = np.nan
return y
[docs]
def get_boiler_data(boiler_url):
"""
Download Northwestern boiler inventory and return it as a dataframe.
Parameters
----------
boiler_url : str
URL for Northwestern boiler inventory
Returns
-------
bdb : pandas.DataFrame
Boiler inventory as a pandas DataFrame
"""
dtypes = {
'state': str,
'county': str,
'company_name': str,
'site_name': str,
'naics_code': float,
'zip_code': str,
'eis_unit_id': float,
'naics_sub': int,
'fuel_type': str,
'data_source': str,
'REPORTING_YEAR': float,
'UNIT_TYPE': str,
'TIER': str,
'FUEL_COM': float,
'FUEL_UNIT': str,
'Op Hours Per Year': object
}
converters = {
'fips': lambda x: (fix_fips(x)),
}
col_rename = {
'ENERGY_COM_MMBtu': 'energyMJ',
'fips': 'countyFIPS',
'naics_code': 'naicsCode',
'eis_unit_id': 'eisUnitID',
'fuel_type': 'fuelType'
}
# use_cols = [0, 1, 2, 3, 4, 7, 9, 10, 13, 15, 16, 17, 18]
bdb = pd.read_csv(boiler_url, dtype=dtypes, sep=',',
converters=converters, na_values=['', 'na'])
for c in ['cap_mmbtuhr', 'ENERGY_COM_MMBtu', 'ENERGY_MMBtu_hr']:
if c =='ENERGY_COM_MMBtu':
bdb.loc[:, c] = bdb[c].apply(lambda x: convert_units(x, 'energy'))
else:
bdb.loc[:, c] = bdb[c].apply(lambda x: convert_units(x, 'power'))
bdb.rename(columns=col_rename, inplace=True)
bdb.loc[:, 'designCapacity'] = np.nan
bdb.designCapacity.update(bdb.cap_mmbtuhr)
bdb.designCapacity.update(bdb.ENERGY_MMBtu_hr)
return bdb
[docs]
def get_fied_boiler(fied_path):
"""
Load FIED and return information only on boilers.
Parameters
----------
fied_path : str
Returns
-------
fied_boiler : pandas.DataFrame
"""
fied_boiler = pd.read_csv(fied_path,
converters={'fips': lambda x: (fix_fips(x))},
low_memory=False,
index_col=[0])
fied_boiler = fied_boiler.query("unitTypeStd == 'boiler'").copy(deep=True)
fied_boiler.loc[:, 'countyFIPS'] = fied_boiler.countyFIPS.apply(lambda x: fix_fips(x))
# Drop territories
fied_boiler = fied_boiler.where(~fied_boiler.stateCode.isin(['VI', 'PR'])).dropna(how='all')
naics_sub = pd.DataFrame(fied_boiler.naicsCode.dropna().unique())
naics_sub.columns = ['naicsCode']
naics_sub.loc[:, 'naics_sub'] = naics_sub.naicsCode.apply(
lambda x: int(str(x)[0:3])
)
fied_boiler = pd.merge(
fied_boiler, naics_sub, on='naicsCode',
how='left'
)
return fied_boiler
[docs]
def compare_boilers_aggregate(bdb, fied_boiler, compare_type=None):
"""
Compare either county or NAICS (3-digit) sums of
design capacity and energy estimates.
Parameters
----------
bdb : pandas.DataFrame
fied_boiler : pandas.DataFrame
type : str, {'county', 'naics'}
Aggregate data sets at either county- or naics-level.
Returns
-------
comparison : pandas.DataFrame
"""
grouping_column = {'county': 'countyFIPS', 'naics': 'naics_sub'}
try:
comparison = pd.concat(
[bdb.groupby(grouping_column[compare_type]).designCapacity.sum(),
fied_boiler.groupby(grouping_column[compare_type]).designCapacity.sum(),
bdb.groupby(grouping_column[compare_type]).energyMJ.sum(),
fied_boiler.groupby(grouping_column[compare_type])[['energyMJ', 'energyMJq2']].sum().sum(axis=1)],
axis=1
)
except KeyError:
print("Must specify compare_type")
else:
comparison.columns = ['designCapacity_bdb', 'designCapacity_fied', 'energyMJ_bdb', 'energyMJ_fied']
return comparison
[docs]
def compare_boilers_units(bdb, fied_boiler):
"""
Compare boiler inventory and FIED boilers on an individual unit level.
Includes only
Parameters
----------
bdb : pandas.DataFrame
fied_boiler : pandas.DataFrame
Returns
-------
units : pandas.DataFrame
units_summary : pandas.DataFrame
"""
fied_edit = fied_boiler.copy(deep=True)
fied_edit.fuelType.replace({
'diesel': 'oil products', 'naturalGas': 'natural gas',
'resFuelOil': 'oil products', 'lpgHGL': 'other fuels',
'gasoline': 'oil products', 'other': 'other fuels',
'coke': 'other fuels'
}, inplace=True)
units = pd.merge(
bdb.dropna(subset=['eisUnitID'])[['eisUnitID', 'energyMJ', 'designCapacity', 'fuelType', 'naics_sub',
'state']],
fied_edit.dropna(subset=['eisUnitID'])[['eisUnitID', 'energyMJ', 'energyMJq2', 'designCapacity',
'fuelType', 'stateCode', 'naicsCode']],
on='eisUnitID', how='inner',
suffixes=['_bdb', '_fied']
)
units.loc[:, 'capCompare'] = (units.designCapacity_fied.subtract(
units.designCapacity_bdb, fill_value=0
)).divide(units.designCapacity_bdb, fill_value=0)
units.loc[:, 'energyCompare'] = (units[units.energyMJ_fied>0].energyMJ_fied.subtract(
units[units.energyMJ_fied>0].energyMJ_bdb, fill_value=0
)).divide(units[units.energyMJ_fied>0].energyMJ_bdb, fill_value=0)
units.loc[:, 'energyCompare'] = (units[units.energyMJq2>0].energyMJq2.subtract(
units[units.energyMJq2>0].energyMJ_bdb, fill_value=0
)).divide(units[units.energyMJq2>0].energyMJ_bdb, fill_value=0)
units.loc[:, 'fuelCompare'] = units.fuelType_fied == units.fuelType_bdb
# There can be multiple combustion units reported under a single EIS unit ID.
# dups = units[units.eisUnitID.duplicated(keep=False)]
units_summary = pd.DataFrame(units[['capCompare', 'energyCompare']].describe())
units_summary.loc['% matching', 'fuelCompare'] = units.fuelCompare.sum() / units.fuelCompare.count()
# for s in ['bdb', 'fied']:
# for t in ['cap', 'energy']:
# units_summary.loc['% /reported_{s}', f'{t}Compare'] =
return units, units_summary
[docs]
def plot_scatter_comparison(comparison, compare_type=None, write_fig=True):
"""
Creates two subplots that compare county sums of design capacity (MW)
and energy (MJ) for the boiler inventory and the foundational dataset.
Parameters
----------
comparison : pandas.DataFrame
write_fig : Bool, default=True
compare_type : str; {'county', 'naics'}
Returns
-------
None
"""
plot_data = comparison.copy(deep=True)
plot_data.fillna(0, inplace=True)
plot_data.reset_index(inplace=True)
fig = make_subplots(rows=1, cols=2,
subplot_titles=('Design Capacity (MW)', 'Energy (MJ)'))
fig.add_trace(
go.Scatter(
x=plot_data.designCapacity_bdb,
y=plot_data.designCapacity_fied,
mode='markers'
),
row=1, col=1
)
fig.add_trace(
go.Scatter(
x=plot_data.energyMJ_bdb,
y=plot_data.energyMJ_fied,
mode='markers'
),
row=1, col=2
)
# Hardcoded coordinates for diagonal lines.
line_xy = {
'county': {
'capacity_x': [0.001, 1E6],
'capacity_y': [0.001, 1E6],
'energy_x': [100, 1E12],
'energy_y': [100, 1E12],
},
'naics': {
'capacity_x': [10, 1E6],
'capacity_y': [10, 1E6],
'energy_x': [1E4, 1E12],
'energy_y': [1E4, 1E12]
}
}
fig.add_trace(
go.Scatter(
x=line_xy[compare_type]['capacity_x'],
y=line_xy[compare_type]['capacity_y'],
mode='lines', line=dict(color='black', width=2, dash='dash')
),
row=1, col=1
)
fig.add_trace(
go.Scatter(
x=line_xy[compare_type]['energy_x'],
y=line_xy[compare_type]['energy_y'],
mode='lines', line=dict(color='black', width=2, dash='dash')
),
row=1, col=2
)
for n in [1, 2]:
fig.update_xaxes(
title_text='NW Boiler Inventory', type='log', row=1, col=n,
showexponent='all',
exponentformat='power',
showgrid=True
)
fig.update_yaxes(
title_text='Foundational Data', type='log', row=1, col=n,
showexponent='all',
exponentformat='power',
showgrid=True
)
fig.update_layout(
height=800, width=1000,
showlegend=False,
template='presentation'
)
if write_fig is True:
pio.write_image(
fig,
file=f'./analysis/figures/boiler_compare_scatter_{compare_type}.svg'
)
else:
fig.show()
return None
if __name__ == '__main__':
boiler_url = 'https://raw.githubusercontent.com/carriescho/Electrification-of-Boilers/master/total_boiler_inventory.csv'
fied_path = pathlib.Path('foundational_industry_data_2017.csv.gz')
bdb = get_boiler_data(boiler_url)
fied_boiler = get_fied_boiler(fied_path)
agg_comparison = {}
# Note that FIED is for 2017; the boiler inventory uses multiple reporting years.
# This is more of an issue for energy than design capacity, as it's expected that
# energy use is more likely to change by year than design capacity.
for t in ['county', 'naics']:
# comparison = compare_boilers_aggregate(bdb, fied_boiler, compare_type=t)
# plot_scatter_comparison(comparison, compare_type=t, write_fig=True)
agg_comparison[t] = compare_boilers_aggregate(bdb, fied_boiler, compare_type=t)
units, units_summary = compare_boilers_units(bdb, fied_boiler)