import logging
import os
import json
import zipfile
import pyarrow
import pathlib
import requests
import textwrap
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots
from io import BytesIO
logging.basicConfig(level=logging.INFO)
[docs]
class FIED_analysis:
def __init__(self, year, file_path=None, pio_engine=None, df=None, fig_format='png'):
if file_path is None:
self._fied = df
else:
try:
self._fied = pd.read_parquet(
file_path
)
except (pyarrow.ArrowIOError, pyarrow.ArrowInvalid):
self._fied = pd.read_csv(
file_path, low_memory=False, index_col=0
)
self._fied = self.make_consistent_naics_column(self._fied, n=2)
self._year = year
self._fig_path = pathlib.Path('./analysis/figures')
self._pio_engine = pio_engine
if self._fig_path.exists():
pass
else:
pathlib.Path.mkdir(self._fig_path)
self._fig_format = fig_format
[docs]
def get_cbp_data(self):
"""
Get establishment counts from Census County Business
Patterns for reporting year.
Returns
-------
cbp_data : pandas.DataFrame
DataFrame of CBP establishment counts by NAICS
code for U.S. for specified year.
"""
cbp_data_url = \
f'https://www2.census.gov/programs-surveys/cbp/datasets/{self._year}/cbp{str(self._year)[-2:]}us.zip'
r = requests.get(cbp_data_url)
with zipfile.ZipFile(BytesIO(r.content)) as zf:
with zf.open(zf.namelist()[0]) as f:
cbp_data = pd.read_csv(
f, sep=',',
usecols=['lfo', 'naics', 'est']
)
cbp_data = cbp_data.query('lfo == "-"').copy(deep=True)
cbp_data.replace(
{'-': '', '/':''}, inplace=True, regex=True
)
cbp_data.loc[:, 'n_naics'] = \
cbp_data.naics.apply(lambda x: len(str(x)))
return cbp_data
[docs]
def create_core_analysis(self, **kwargs):
"""
Create summary figures and table.
"""
summary_table_all = self.summary_unit_table()
summary_table_eisghgrp = self.summary_unit_table(eis_or_ghgrp_only=True)
self.summary_unit_bar(summary_table_all, write_fig=kwargs['write_fig'])
self.plot_rel_bar_missing(write_fig=kwargs['write_fig'])
self.plot_stacked_bar_missing(write_fig=kwargs['write_fig'])
for ds in ['ghgrp', 'nei']:
self.plot_stacked_bar_missing(data_subset=ds, write_fig=kwargs['write_fig'])
self.plot_facility_count(write_fig=kwargs['write_fig'])
self.plot_best_characterized()
for u in self._fied.unitTypeStd.unique():
try:
u.title()
except AttributeError:
continue
else:
for m in ['energy', 'power']:
self.plot_unit_bubble_map(u, m, write_fig=kwargs['write_fig'])
for v in ['count', 'energy', 'capacity']:
for n in [None, 2, 3]:
self.plot_ut_by_naics(n, v, write_fig=kwargs['write_fig'])
for ds in ['mecs', 'seds']:
self.plot_eia_comparison_maps(dataset=ds, year=2017, write_fig=kwargs['write_fig'])
return
[docs]
def summary_unit_table(self, eis_or_ghgrp_only=False):
"""
Creates a table that summarizes by industrial sector
(i.e., 2-digit NAICS) various aspects of the
dataset. Saves table to analysis directory and returns table.
Parameters
----------
final_data : pandas.DataFrame or parquet
Returns
-------
summary_table : pandas.DataFrame
"""
# Multiple units can share the same eisUnitID.
# (see evidence in unitDescription field)
table_data = self._fied.copy(deep=True)
table_data = self.id_sectors(table_data)
if eis_or_ghgrp_only:
table_data = table_data[
(table_data.eisFacilityID.notnull())|(table_data.ghgrpID.notnull())
].copy(deep=True)
fname = './analysis/summary_unit_table_eisghgrp_only.csv'
else:
fname = './analysis/summary_unit_table_eisghgrp_only.csv'
desc = ['count', 'sum', 'mean', 'std', 'min', 'median', 'max']
_unit_count = table_data.groupby(
['sector', 'unitTypeStd']
).registryID.count()
_unit_count.name = 'Count of Units'
_capacity_summ = table_data.query('designCapacityUOM == "MW"').groupby(
['sector', 'unitTypeStd']
).designCapacity.agg(desc)
_capacity_summ.columns = ['Capacity_MW_' + c for c in _capacity_summ.columns]
# Count of facilities with unit information
_fac_count = table_data.query('unitTypeStd.notnull()', engine='python').groupby(
['sector']
).registryID.unique().apply(lambda x: len(x))
_fac_count.name = 'Count of Facilities'
# Only use non-zero value
_energy_summ_ = pd.concat(
[table_data[table_data[c] > 0] for c in ['energyMJ', 'energyMJq0', 'energyMJq2', 'energyMJq3']],
axis=0, ignore_index=True
)
def agg_energy(df, type):
e_agg = df.copy(deep=True)
e_agg.energyMJ.update(e_agg[f'energyMJ{type[1]}'])
e_agg = e_agg.groupby(['sector', 'unitTypeStd']).energyMJ.agg(type[0])
e_agg.name = f'EnergyMJ_{type[0]}'
return e_agg
_energy_summ_agg = pd.concat(
[agg_energy(_energy_summ_, t) for t in [
['sum', 'q2'], ['mean', 'q2'], ['std', 'q2'], ['min', 'q0'],
['median', 'q2'], ['max', 'q3']]],
axis=1
)
_throughput_summ = pd.concat(
[
table_data.groupby(
['sector', 'unitTypeStd']
).throughputTonneQ2.sum(),
table_data.groupby(
['sector', 'unitTypeStd']
).throughputTonneQ2.mean(),
table_data.groupby(
['sector', 'unitTypeStd']
).throughputTonneQ2.agg('std'),
table_data.groupby(
['sector', 'unitTypeStd']
).throughputTonneQ0.min(),
table_data.groupby(
['sector', 'unitTypeStd']
).throughputTonneQ2.median(),
table_data.groupby(
['sector', 'unitTypeStd']
).throughputTonneQ3.max()
],
axis=1
)
_throughput_summ.columns = [f'ThroughputTonnes_{c}' for c in ['sum', 'mean', 'std', 'min', 'median', 'max']]
summary_table = pd.concat(
[_unit_count, _capacity_summ, _energy_summ_agg, _throughput_summ],
axis=1,
)
summary_table = pd.DataFrame(_fac_count).join(summary_table)
summary_table.to_csv('./analysis/summary_unit_table.csv')
return summary_table
[docs]
def plot_facility_count(self, write_fig=True):
""""
Plots the count of facilities from foundational data
and the count of establishments from the corresponding
year of Census Business Patterns (CBP) data.
Parameters
----------
write_fig : bool; default=True
Write resulting figure to analysis figures
directory.
"""
ind_map = {
'11': 'Agriculture',
'21': 'Mining',
'23': 'Construction',
'31': 'Manufacturing',
'32': 'Manufacturing',
'33': 'Manufacturing',
}
cbp_data = self.get_cbp_data()
# Some foundational data reported at 4-digit NAICS level.
# So, will be aggregating counts at 4-digit level.
# Note that CBP doesn't track ag establishments for NAICS 111 - 112
# (essentially all farming, animal production, and aquaculture)
cbp_data.loc[:, 'ind'] = cbp_data.naics.apply(
lambda x: (x[0:2] in ['11', '21', '23', '31', '32', '33']) & (len(x) == 4)
)
# Total count of industrial establishments
total_est = cbp_data.where(
cbp_data.naics.isin(ind_map.keys())
).dropna().est.sum()
ind_cbp_data = pd.DataFrame(cbp_data.query("ind == True"))
ind_cbp_data.loc[:, 'naics'] = ind_cbp_data.naics.astype(int)
fied_count = self._fied.copy(deep=True)
fied_count.loc[:, 'n4'] = fied_count.naicsCode.apply(
lambda x: int(str(int(x))[0:4])
)
fied_count = pd.concat(
[fied_count.groupby('n4').registryID.apply(lambda x: len(x.unique())),
ind_cbp_data.set_index('naics').est],
axis=1,
ignore_index=False
).reset_index()
index_str = fied_count['index'].astype(str)
fied_count.loc[:, 'index'] = index_str
fied_count.loc[:, 'sector'] = fied_count['index'].apply(
lambda x: x[0:2]
).map(ind_map)
fied_cumsum = pd.merge(
fied_count[['index', 'sector']],
fied_count.groupby(['sector'])[['registryID', 'est']].cumsum(),
left_index=True, right_index=True
)
fig = px.line(
fied_cumsum.melt(
id_vars=['index', 'sector']),
y="value", x="index", color='variable',
facet_col='sector', facet_col_wrap=2,
markers=True,
labels={
'index': 'NAICS Code',
'value': 'Cumulative Count of Establishments<br>or Registry IDs',
'variable': 'Data Source'
}
)
for n in range(0,8):
if n < 4:
fig.data[n].name = 'Foundational Data'
else:
fig.data[n].name = 'County Business Patterns'
fig.for_each_annotation(
lambda a: a.update(text=a.text.split("=")[-1].capitalize())
)
fig.update_yaxes(matches=None, showticklabels=True,
titlefont=dict(size=15), automargin=True)
fig.update_xaxes(matches=None, showticklabels=True,
titlefont=dict(size=15), automargin=True)
fig.update_layout(
autosize=True,
template='presentation',
height=800,
width=1200,
legend=dict(orientation="h", x=0.25, title=""),
font=dict(size=16)
)
if write_fig:
pio.write_image(
fig,
file=self._fig_path / f'counts_{self._year}',
format=self._fig_format,
engine=self._pio_engine
)
else:
fig.show()
[docs]
@staticmethod
def plot_difference_nei(nei, data):
"""
Plot difference between max and min energy or
throughput quanitites for units when there are
multiple emissions per unit.
Parameters
----------
nei : pandas.DataFrame
Unformatted NEI, prior to estimating
quartile values for throughput and energy.
data : str; 'energy' or 'throughput'
"""
selection = {
'energy': {
'column': 'energy_MJ',
'title': 'Energy'
},
'throughput': {
'column': 'throughput_TON',
'title': 'Throughput'
}
}
duplic = nei[(nei[selection[data]['columns']] > 0) &
(nei.eis_process_id.duplicated(keep=False) == True)]
duplic = duplic.groupby(
['eis_process_id']).agg(
perc_diff=(
selection[data]['column'],
lambda x: ((x.max()-x.min())/x.mean())*100
)
).reset_index()
plt.rcParams['figure.dpi'] = 300
plt.rcParams['savefig.dpi'] = 300
plt.rcParams['font.sans-serif'] = "Arial"
sns.histplot(data=duplic, x="perc_diff") # sns.kdeplot
plt.xlabel('Percentage difference')
plt.ylabel('Units')
plt.title(selection[data]['title'])
return None
[docs]
def id_sectors(self, df):
"""
Make a new sector column for NAICS 2-digit
Returns
-------
df : pandas.DataFrame
FIED with 2-digit NAICS code column
"""
if 'n2' in df.columns:
pass
else:
df = self.make_consistent_naics_column(df, n=2)
sectors = pd.DataFrame(
[['Agriculture', 11], ['Construction', 21], ['Mining', 23],
['Manufacturing', 31],
['Manufacturing', 32], ['Manufacturing', 33]],
columns=['sector', 'n2']
)
df = pd.merge(df, sectors, on='n2', how='left')
return df
[docs]
def summary_unit_bar(self, summary_table, write_fig=True):
"""
Make stacked bar chart showing units by Sector and
total number of facilities reporting units
Paramters
---------
summary_table : pandas.DataFrame
Output of `summary_unit_table` method.
write_fig : bool; default=True
Write figure to analysis figures directory
"""
plot_data = summary_table.reset_index()
len_unit_types = len(plot_data.unitTypeStd.unique())
fig = make_subplots(specs=[[{"secondary_y": True}]])
plot_data.sort_values(by=['Count of Units'], ascending=False)
fig_units = px.bar(
plot_data,
x='sector',
y='Count of Units',
labels={
'value': 'Facility Count',
'variable': 'Facilities'
},
template='simple_white',
color='unitTypeStd',
color_discrete_sequence=px.colors.qualitative.Alphabet
# color_discrete_sequence=px.colors.sample_colorscale(
# "plasma", [n/(len_unit_types-1) for n in range(len_unit_types)]
# )
)
for d in range(0, len(fig_units.data)):
fig.add_trace(fig_units.data[d], secondary_y=False)
fac_counts = plot_data.drop_duplicates(['sector', 'Count of Facilities'])
fig.update_layout(barmode='stack')
fig.add_trace(
go.Scatter(
mode='markers',
x=fac_counts.sector,
y=fac_counts['Count of Facilities'],
marker=dict(size=26, color='LightSkyBlue'),
showlegend=True,
name='Facilities'
), secondary_y=True
)
fig.update_yaxes(automargin=True, title='Count of Units',
secondary_y=False,
range=[0, 70000],
tickmode='linear',
tick0=0,
dtick=7000
)
fig.update_yaxes(automargin=True, title='Count of Facilities',
secondary_y=True, range=[0, 20000],
tickmode='linear',
tick0=0,
dtick=2000)
fig.update_xaxes(automargin=True, title='Sector')
fig.update_layout(
template='presentation',
legend=dict(title_text='Unit Type', font=dict(size=16), yanchor='top', y=0.99),
width=1200,
height=800
)
fname = self._fig_path / f'summary_figure_{self._year}'
if write_fig:
pio.write_image(
fig,
file=fname,
format=self._fig_format,
engine=self._pio_engine
)
else:
fig.show()
return None
[docs]
def set_mecs_data(self, year=2018):
""""
MECS format is not machine-friendly. This is a manual input
of MECS combustion energy estimates from MECS Table 3.2:
https://www.eia.gov/consumption/manufacturing/about.php
Returns
-------
mecs : dict
Dictionary of combustion energy estimates from MECS (converted from TBtu to MJ), on
a national and census region basis.
"""
mecs = {
2018: {
'nation': (14859 - 2591) * 1.055E9,
'northeast': (1130 - 250) * 1.055E9 ,
'midwest': (3907 - 834) * 1.055E9,
'south': (7897 - 1143) * 1.055E9,
'west': (1925 - 365) * 1.055E9
}
}
return mecs[year]
[docs]
def get_eia_seds(self, year=2017):
"""
Get EIA State Energy Data System (SEDS) data
Parameters
----------
year : int; default=2017
Year of SEDS data to return.
Returns
-------
seds : pandas.DataFrame
"""
seds_url = 'https://www.eia.gov/opendata/bulk/SEDS.zip'
try:
r = requests.get(seds_url)
r.raise_for_status()
except r.exceptions.HTTPError as e:
logging.ERROR(e)
else:
with zipfile.ZipFile(BytesIO(r.content)) as zf:
with zf.open(zf.namelist()[0]) as f:
seds = pd.read_json(f, lines=True)
# eia_data = pd.DataFrame.from_dict(f)
seds.dropna(subset=['series_id'], inplace=True)
seds.dropna(axis=1, thresh=1, inplace=True)
# Natural gas consumed by the industrial sector (including supplemental gaseous fuels)
# All petroleum products consumed by the industrial sector
# Coal consumed by the industrial sector
# Wood and waste consumed in the industrial sector
series = ['NGICB', 'PAICB' , 'CLICB', 'WWICB']
seds = seds[seds.series_id.apply(
lambda x: any([s in x for s in series])
)].copy(deep=True)
seds.reset_index(drop=True, inplace=True)
final_seds = pd.DataFrame()
for i in seds.index:
data = pd.DataFrame(
seds.loc[i, 'data'],
columns=['year', 'BillionBtu']
)
data = pd.concat(
[pd.DataFrame(np.tile(seds.loc[i, seds.columns[0:-1]].values.reshape(13, 1), len(data))).T,
data], axis=1)
final_seds = final_seds.append(data)
for i, v in enumerate(seds.columns[0:-1]):
final_seds.rename(columns={i:v}, inplace=True)
final_seds.year.update(final_seds.year.astype(int))
final_seds = final_seds.query("year==@year").copy(deep=True)
final_seds.loc[:, 'MJ'] = final_seds.BillionBtu * 1.055E6
final_seds.drop(['start', 'end', 'BillionBtu', 'last_updated'],
axis=1, inplace=True)
final_seds = final_seds[final_seds.geography != 'USA'].copy(deep=True)
def split_columns(**kwargs):
final_seds = kwargs['data']
df = pd.DataFrame(
final_seds[kwargs['data_column']].unique(),
columns=[kwargs['data_column']]
)
df.loc[:, kwargs['new_column']] = df[kwargs['data_column']].apply(
lambda x: x.split(f'{kwargs["split_char"]}')[1]
)
final_seds = pd.merge(final_seds, df, on=kwargs['data_column'], how='left')
return final_seds
final_seds = split_columns(
data=final_seds,
data_column= 'geography',
new_column='state',
split_char='-'
)
# final_seds = split_columns(
# data = final_seds,
# data_column = 'series_id',
# new_column='data_id',
# split_char= '.'
# )
# states = pd.DataFrame(final_seds.geography.unique(), columns=['geography'])
# states.loc[:, 'state'] = states.geography.apply(
# lambda x: x.split('-')[1]
# )
# final_seds = pd.merge(final_seds, states, on='geography', how='left')
final_seds = final_seds.groupby('state', as_index=False).MJ.sum()
return final_seds
[docs]
def get_state_region(self):
""" Download state to region file"""
url = 'https://raw.githubusercontent.com/cphalpert/census-regions/master/us%20census%20bureau%20regions%20and%20divisions.csv'
s_r = pd.read_csv(url, encoding='latin_1')
return s_r
[docs]
def plot_eia_comparison_maps(self, dataset, year=2017, write_fig=True):
"""
Plots a relative comparison of FIED vs. EIA on a geographic
basis (combustion energy only).
For MECS, this is census region; for SEDS, states.
Parameters
----------
dataset : str; {'mecs', 'seds'}
EIA dataset to compare FIED against.
year : int; default=2017
write_fig : bool; default=True
Write figure to analysis figures directory
Returns
-------
"""
shared_plot_args = dict(
color_continuous_scale= [
[0, 'rgb(43,131,186)'],
[0.25, 'rgb(171,221,164)'],
[0.5, 'rgb(255,255,191)'],
[0.75, 'rgb(253,174,97)'],
[1, 'rgb(215,25,28)']
],
color_continuous_midpoint=1,
locationmode="USA-states",
scope='usa',
locations = 'stateCode',
color = 'fied_relative',
labels={'fied_relative': f'FIED relative to EIA {dataset.upper()}'}
)
plot_data = self._fied.copy(deep=True)
# with requests.get('https://public.opendatasoft.com/api/explore/v2.1/catalog/datasets/us-state-boundaries/exports/geojson?lang=en&timezone=America%2FNew_York') as r:
# states = r.json()
# for s in range(0, len(states['features'])):
# states['features'][s]['id'] = states['features'][s]['properties']['stusab']
if dataset == 'mecs':
mecs = self.set_mecs_data()
s_r = self.get_state_region()[['State Code', 'Region']]
s_r.Region.update(s_r.Region.apply(lambda x: x.lower()))
plot_data = pd.merge(
plot_data, s_r, left_on='stateCode', right_on='State Code',
how='inner'
)
# Need to remove non-manufacturing facilities
plot_data_n = pd.Series(plot_data.naicsCode.unique())
plot_data_n = pd.concat(
[plot_data_n, plot_data_n.apply(lambda x: int(str(x)[0]))],
axis=1
)
plot_data_n.columns = ['naicsCode', 'n1']
plot_data_n = plot_data_n[plot_data_n.n1==3]
plot_data = pd.merge(plot_data, plot_data_n, how='inner',
on='naicsCode')
plot_data = pd.concat([
plot_data[
(plot_data.energyMJ==0) | (plot_data.energyMJ.isnull())
].groupby(
['Region', 'stateCode']
).energyMJq2.sum(),
plot_data.groupby(
['Region', 'stateCode']
).energyMJ.sum()
], axis=1
)
plot_data.loc[:, 'totalMJ'] = plot_data.sum(axis=1)
plot_data.loc[:, 'mecsMJ'] = plot_data.index.get_level_values(0).map(mecs)
plot_data.loc[:, 'fied_relative'] = plot_data.totalMJ.sum(level=0).divide(plot_data.mecsMJ)
plot_data.reset_index(inplace=True)
elif dataset == 'seds':
seds = self.get_eia_seds(year=2017)
seds.rename(columns={'MJ': 'sedsMJ'}, inplace=True)
plot_data = pd.concat([
plot_data[
(plot_data.energyMJ==0) | (plot_data.energyMJ.isnull())
].groupby(['stateCode']).energyMJq2.sum(),
plot_data.groupby(['stateCode']).energyMJ.sum()
], axis=1
)
plot_data.loc[:, 'totalMJ'] = plot_data.sum(axis=1)
plot_data = plot_data.join(seds.set_index('state'))
plot_data.loc[:, 'fied_relative'] = plot_data.totalMJ.divide(plot_data.sedsMJ)
plot_data.reset_index(inplace=True)
plot_data.dropna(inplace=True) # Drop Territories
rel_max = plot_data.fied_relative.max()
if rel_max > np.around(rel_max):
rel_max = np.around(rel_max)+1
else:
rel_max = np.around(rel_max)
shared_plot_args['range_color'] =(0, rel_max)
shared_plot_args['data_frame'] = plot_data
plot_data.to_csv(f'{dataset}_comparison_data.csv')
fig = px.choropleth(**shared_plot_args)
# plot_data,
# color_continuous_scale= [
# [0, 'rgb(43,131,186)'],
# [0.25, 'rgb(171,221,164)'],
# [0.5, 'rgb(255,255,191)'],
# [0.75, 'rgb(253,174,97)'],
# [1, 'rgb(215,25,28)']
# ],
# range_color=(0, rel_max),
# color_continuous_midpoint=1,
# locationmode="USA-states",
# scope='usa',
# locations = 'stateCode',
# color = 'fied_relative',
# labels={'fied_relative': 'FIED Relative to EIA MECS'}
# )
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
if write_fig:
fname = self._fig_path / f'{dataset}_comaprison_map_{self._year}'
pio.write_image(
fig,
file=fname,
format=self._fig_format,
engine=self._pio_engine
)
else:
fig.show()
[docs]
def plot_eia_comparison(self, eia_mecs=1.568E13, eia_mer=2.316E13):
"""
Plot bar chart depicting FIED energy estimates relative to EIA estimates.
Parameters
----------
eia_mecs : float; default=1.568E13
Combustion energy use (in MJ) from EIA Manufacturing Consumption survey.
Default represents 2018 value.
eia_mer : float; default=2.316E13
Combustion energy use (in MJ) for industry from EIA Monthly Energy Review (MER).
Default represents 2017 value.
"""
[docs]
def plot_unit_bubble_map(self, unit_type, measure, max_size=66, write_fig=True):
"""
Plot locations of a single standard unit type by
either energy (MJ) or design capacity.
Parameters
----------
unit_type : str
Standard unit type: 'other combustion', 'kiln', 'dryer',
'boiler', 'heater', 'turbine', 'oven', 'engine', 'furnace',
'thermal oxidizer', 'incinerator', 'other', 'generator',
'flare', 'stove', 'compressor', 'pump',
'building heat', 'distillation'
measure : str; {'energy', 'power', 'ghgs'}
Either 'energy' (results in MJ), 'power' (results in MW), or
'ghgs' (results in MTCO2e)
max_size : int
Max size of bubbles on map
write_fig : bool; default=True
Write figure to analysis figures directory
"""
if unit_type:
plot_data = pd.DataFrame(self._fied.query("unitTypeStd == @unit_type"))
file_name = f'bubble_map_{measure}-{unit_type}_{self._year}'
else:
plot_data = self._fied.copy(deep=True)
file_name = f'bubble_map_{measure}_{self._year}'
# plot_args = {
# 'lat': 'latitude',
# 'lon': 'longitude',
# 'scope': 'usa',
# 'color': 'unitTypeStd',
# 'title': f'Location of {unit_type.title()} Units Reporting {measure.title()} Data',
# 'size_max': max_size,
# 'color_discrete_sequence': px.colors.qualitative.Safe
# }
if measure == 'energy':
plot_data.loc[:, 'plotMJ'] = plot_data.query('energyMJ>0 & (energyMJq2.isnull() | energyMJq2==0)').energyMJ
plot_data.loc[:, 'plotMJ'] = plot_data.query('(energyMJ==0 | energyMJ.isnull()) & energyMJq2>0').energyMJq2
plot_data.dropna(subset=['plotMJ'], axis=0, inplace=True)
# plot_args['size'] = plot_data.plotMJ.to_list()
markersize = plot_data.plotMJ.to_list()
# plot_args['labels'] = {
# 'plotMJ': 'Unit Energy Use (MJ)'
# }
ids = [f'{x} MJ' for x in plot_data.plotMJ.values]
leg_title = 'Unit Energy Use (MJ)'
plot_color='plotMJ'
elif measure == 'power':
plot_data = plot_data.query(
"designCapacityUOM=='MW' & designCapacity>0"
)
# plot_args['size'] = plot_data.designCapacity.to_list()
markersize = plot_data.designCapacity.to_list()
# plot_args['labels'] = {
# 'designCapacity': 'Unit Capacity (MW)'
# }
# plot_args['title'] = f'Location of {unit_type.title()} Units Reporting Capacity Data'
ids = [f'{x} MW' for x in plot_data.designCapacity.values]
leg_title = 'Unit Capacity (MW)'
plot_color='designCapacity'
elif measure == 'ghgs':
plot_data.loc[:, 'plotGHG'] = plot_data.query('ghgsTonneCO2e>0 & (ghgsTonneCO2eQ2.isnull() | ghgsTonneCO2eQ2==0)').ghgsTonneCO2e
plot_data.loc[:, 'plotGHG'] = plot_data.query('(ghgsTonneCO2e==0 | ghgsTonneCO2e.isnull()) & ghgsTonneCO2eQ2>0').ghgsTonneCO2eQ2
plot_data.dropna(subset=['plotGHG'], axis=0, inplace=True)
plot_color = 'plotGHG'
# plot_args['size'] = plot_data.plotGHG.to_list()
markersize = plot_data.plotGHG.to_list()
# plot_args['size'] = plot_data.plotGHG.to_list()
# plot_args['labels'] = {
# 'plotGHG': 'Unit GHG Emissions (tonne CO2e)'
# }
# plot_args['title'] = f'Location of {unit_type.title()} Units Reporting GHG Data'
ids = [f'{x} tonne CO2e' for x in plot_data.plotGHG.values]
leg_title = 'Unit GHG Emissions (tonne CO2e)'
fig = go.Figure(data = go.Scattergeo(
locationmode='USA-states',
lat=plot_data['latitude'],
lon=plot_data['longitude'],
mode='markers',
ids=ids,
marker=dict(
size=markersize,
sizeref=max(markersize)/max_size**2,
sizemode='area',
autocolorscale=False,
color=plot_data[plot_color],
colorscale='agsunset',
colorbar=dict(
title=leg_title,
tickfont=dict(size=18)
)
)
))
fig.update_layout(
title=f'Location of {unit_type.title()} Units Reporting {measure.title()} Data',
geo=dict(
scope='usa',
projection_type='albers usa',
showland = True,
landcolor = "rgb(255, 255, 255)",
subunitcolor = "rgb(150, 150, 150)",
subunitwidth = 0.5
)
)
# fig = px.scatter_geo(plot_data, **plot_args)
# fig.update_geos(bgcolor='#FFFFFF')
# fig.update_layout(showlegend=True)
# fig.update_yaxes(automargin=True)
# fig.update_xaxes(automargin=True)
if write_fig:
pio.write_image(
fig,
file=self._fig_path / file_name,
format=self._fig_format,
engine=self._pio_engine
)
else:
fig.show()
return None
[docs]
def plot_rel_bar_missing(self, write_fig=True):
"""
Creates a simple stacked bar showing relative amount (percentage)
of GHGRP and NEI facilities with and without unit-level data.
Parameters
----------
write_fig : bool; default=True
Write figure to analysis figures directory
"""
plot_data = {}
label = "Percentage of Facilities with Unit-Level Characterization"
for l in ['ghgrp', 'eisFacility']:
plot_data[l] = self._fied.query(f"{l}ID.notnull()", engine="python").copy()
plot_data[l] = pd.concat(
[pd.Series(len(
plot_data[l].query("unitTypeStd.isnull()", engine="python").registryID.unique()
)),
pd.Series(len(
plot_data[l].query("unitTypeStd.notnull()", engine="python").registryID.unique()
))],
axis=1,
ignore_index=False
)
plot_data[l].columns = ['Without Unit Characterization',
'With Unit Characterization']
plot_data[l] = plot_data[l].divide(
plot_data[l].sum(axis=1), axis=0
)
plot_data[l].loc[:, 'Data Source'] = l
plot_data = pd.concat(
[plot_data[l] for l in plot_data.keys()],
axis=0, ignore_index=True
)
plot_data.drop(['Without Unit Characterization'], axis=1, inplace=True)
plot_data.replace({'eisFacility': 'NEI', 'ghgrp': 'GHGRP'}, inplace=True)
fig = px.bar(
plot_data,
x='Data Source',
y=['With Unit Characterization'],
labels={
'value': label,
'variable': 'Facilities'
},
template='plotly_white',
color_discrete_map={
'With Unit Characterization': "#756bb1"
})
fig.update_yaxes(automargin=True, range=[0, 1])
fig.update_xaxes(automargin=True, type='category')
fig.update_layout(
yaxis_tickformat=".0%",
template='presentation',
height=800,
width=600,
legend=dict(orientation="h", x=0.25, title=""),
font=dict(size=16)
)
if write_fig:
pio.write_image(
fig,
file=self._fig_path / f'rel_bar_unit_char_{self._year}',
format=self._fig_format,
engine=self._pio_engine
)
else:
fig.show()
[docs]
def plot_stacked_bar_missing(self, naics_level=2, data_subset=None, write_fig=True):
""""
Creates stacked bar showing counts of facilities
with and without unit-level data.
Parameters
----------
naics_level : int; default=2
Specific level of NAICS aggregation to display
data. NAICS is a range of integers from 2 to 6.
data_subset : str; {None, 'ghgrp', 'nei'}
Plot subset of data, either facilities that
are GHGRP or NEI reporters
rel_total : bool; default=False
Plot the relative total of NEI and GHGRP facilities that
have unit-level characterization. Renders naics_level and data_subset
obsolete.
write_fig : bool; default=True
Write figure to analysis figures directory
"""
plot_data = self._fied.copy(deep=True)
label = "Facility Count"
if data_subset == 'ghgrp':
plot_data = plot_data.query(
"ghgrpID.notnull()", engine="python"
).copy()
label = label + " (GHGRP Reporters Only)"
elif data_subset == 'nei':
plot_data = plot_data.query(
"eisFacilityID.notnull()", engine="python"
).copy()
label = label + " (NEI Reporters Only)"
else:
pass
plot_data = pd.concat(
[plot_data.query("unitTypeStd.isnull()", engine="python").groupby(
f'n{naics_level}'
).apply(lambda x: np.size(x.registryID.unique())),
plot_data.query("unitTypeStd.notnull()", engine="python").groupby(
f'n{naics_level}'
).apply(lambda x: np.size(x.registryID.unique()))],
axis=1,
ignore_index=False
)
if naics_level == 2:
plot_data.loc['Manufacturing', :] = \
plot_data.loc[31:, :].sum()
plot_data.reset_index(inplace=True)
plot_data.columns = ['NAICS Code', 'Without Unit Characterization',
'With Unit Characterization']
plot_data['NAICS Code'].replace({11: 'Agriculture', 21: 'Mining', 23: 'Construction'},
inplace=True)
plot_data = plot_data[[type(x)!=int for x in plot_data['NAICS Code']]]
else:
plot_data.reset_index(inplace=True)
plot_data.columns = ['NAICS Code', 'Without Unit Characterization',
'With Unit Characterization']
plot_data.dropna(subset=['NAICS Code'], inplace=True)
plot_data.loc[:, 'NAICS Code'] = plot_data['NAICS Code'].astype(str)
fig = px.bar(plot_data,
x='NAICS Code',
y=['Without Unit Characterization',
'With Unit Characterization'],
labels={
'value': label,
'variable': 'Facilities'
},
template='plotly_white',
color_discrete_map={
'Without Unit Characterization': "#bcbddc",
'With Unit Characterization': "#756bb1"
})
fig.update_yaxes(automargin=True)
fig.update_xaxes(automargin=True, type='category')
fig.update_layout(
template='presentation',
height=800,
width=1200,
legend=dict(orientation="h", x=0.25, title=""),
font=dict(size=16)
)
if write_fig:
pio.write_image(
fig,
file=self._fig_path / f'stacked_bar_NAICS{naics_level}_{data_subset}_{self._year}',
format=self._fig_format,
engine=self._pio_engine
)
else:
fig.show()
return None
[docs]
def plot_ut_by_naics(self, naics_level=None, variable='count', write_fig=True):
"""
Creates a table that summarizes by industrial sector
(i.e., 2-digit NAICS) various aspects of the dataset
Parameters
----------
naics_level : int; default=None
Specified NAICS level (None or 2 - 6)
variable : str; {'count', 'energy', 'capacity'}
write_fig : bool; default=True
Write figure to analysis figures directory
"""
formatting = {
'x': 'unitTypeStd',
'template': 'presentation',
'labels': {
'unitTypeStd': 'Standardized Unit Type'
},
}
plot_data = self._fied.copy(deep=True)
if not naics_level:
grouper = 'unitTypeStd'
else:
grouper = ['unitTypeStd', f'n{naics_level}']
if naics_level == 2:
pass
else:
plot_data = self.make_consistent_naics_column(
plot_data, naics_level
)
plot_data.loc[:, f'n{naics_level}'] = \
plot_data[f'n{naics_level}'].astype(str)
len_naics = len(plot_data[f'n{naics_level}'].unique())
formatting['labels'][f'n{naics_level}'] = 'NAICS Code'
formatting['color'] = f'n{naics_level}'
# formatting['color_discrete_sequence'] = px.colors.sequential.Plasma_r
formatting['color_discrete_sequence'] = px.colors.sample_colorscale(
"plasma_r", [n/(len_naics-1) for n in range(len_naics)]
)
if variable == 'count':
plot_data = plot_data.query(
"unitTypeStd.notnull()", engine="python"
).groupby(grouper).registryID.count()
formatting['labels']['registryID'] = 'Unit Count'
formatting['y'] = 'registryID'
elif variable == 'energy':
plot_data = plot_data.query(
"unitTypeStd.notnull()", engine="python"
).groupby(grouper)['energyMJ', 'energyMJq0'].sum().sum(axis=1)
plot_data.name = 'energy'
formatting['y'] = 'energy'
formatting['labels']['energy'] = 'Energy (MJ)'
elif variable == 'capacity':
plot_data = plot_data.query(
"unitTypeStd.notnull() & designCapacityUOM=='MW'", engine="python"
).groupby(grouper).designCapacity.sum()
formatting['y'] = 'designCapacity'
formatting['labels']['designCapacity'] = 'Unit Design Capacity (MW)'
else:
logging.error('No variable specified')
raise ValueError
if type(plot_data) == pd.core.series.Series:
plot_data = pd.DataFrame(plot_data).reset_index()
else:
plot_data.reset_index(inplace=True)
plot_data.loc[:, 'unitTypeStd'] = [x.capitalize() for x in plot_data.unitTypeStd]
plot_data = plot_data.sort_values(by=grouper, ascending=True)
fig = px.bar(plot_data, **formatting)
if variable == 'count':
pass
else:
fig.update_layout(
yaxis=dict(
showexponent='all',
exponentformat='power'
)
)
fig.update_yaxes(automargin=True)
fig.update_xaxes(automargin=True)
if write_fig:
pio.write_image(
fig,
file=self._fig_path / f'unittype_NAICS{naics_level}_{variable}_{self._year}',
format=self._fig_format,
engine=self._pio_engine
)
else:
fig.show()
return None
[docs]
def make_consistent_naics_column(self, final_data, n):
"""
Creates a column of consisently aggregated NAICS codes
(i.e., same number of digits) when a column of
NAICS codes contains different levels of aggregation.
Will only include
Parameters
----------
final_data : pandas.DataFrame or parquet
n : int; 2 to 6
Specified NAICS level
Returns
-------
analysis_data : pandas.DataFrame
Returns original DataFrame with
new column named f'n{n}'.
"""
df_naics = pd.DataFrame(final_data['naicsCode'].drop_duplicates(),
dtype=int)
df_naics_consistent = df_naics[
df_naics.naicsCode.apply(lambda x: len(str(x)) >= n)
]
if len(df_naics_consistent) < len(df_naics):
logging.info(f"Caution: not all original NAICS codes are at this aggregation")
else:
pass
df_naics_consistent.loc[:, f'n{n}'] = df_naics_consistent.naicsCode.apply(
lambda x: int(str(x)[0:n])
)
analysis_data = final_data.copy(deep=True)
analysis_data = pd.merge(
analysis_data, df_naics_consistent, on='naicsCode'
)
return analysis_data
[docs]
def plot_best_characterized(self):
"""
Identify which NAICS codes have the highest portion of facilities that
have some unit-level characterization associated with them.
"""
plot_data = self._fied.copy(deep=True)
# find most aggregated NAICS (smallest number of digits)
smallest_n = plot_data.naicsCode.unique()
smallest_n = min([len(str(int(x))) for x in smallest_n])
plot_data = self.make_consistent_naics_column(plot_data, smallest_n)
plot_data = plot_data.groupby('stateCode')
plot_data = pd.concat([plot_data.apply(lambda x: pd.concat(
[x.query("unitTypeStd.isnull()", engine="python").groupby(
f'n{smallest_n}'
).apply(lambda y: np.size(y.registryID.unique())),
x.query("unitTypeStd.notnull()", engine="python").groupby(
f'n{smallest_n}'
).apply(lambda y: np.size(y.registryID.unique()))],
axis=1,
ignore_index=False
))], axis=0)
plot_data.columns = ['unchar', 'char']
plot_data.loc[:, 'total'] = plot_data.sum(axis=1)
plot_data.loc[:, 'fraction_char'] = plot_data.char.divide(
plot_data.total
)
plot_data.fillna(0, inplace=True)
# # State sorted mean characterized
# state_char = pd.concat(
# [plot_data.fraction_char.mean(level=0),
# plot_data.total.sum(level=0)], axis=1
# ).sort_values(by='fraction_char', ascending=False)
# # NAICS sorted mean characterized
# naics_char = pd.concat(
# [plot_data.fraction_char.mean(level=1),
# plot_data.total.sum(level=1)],
# axis=1
# ).sort_values(by='fraction_char', ascending=False)
# Level 0 is state aggregation; level 1 is NAICS aggregation
for k in [0, 1]:
d = pd.concat(
[plot_data.fraction_char.mean(level=k),
plot_data.total.sum(level=k)], axis=1
).sort_values(by='fraction_char', ascending=False)
if k == 1:
d.index = d.index.astype('str')
d = d[d.fraction_char > 0]
fig, ax = plt.subplots(figsize=(6, 12), tight_layout=False)
sns.scatterplot(
data=d,
y=d.index,
x="fraction_char",
size="total",
sizes=(10, 800),
alpha=.5, ax=ax
)
if k == 1:
[l.set_visible(False) for (i, l) in enumerate(ax.yaxis.get_ticklabels()) if i % 6 != 0]
ax.set_ylabel('NAICS Code')
else:
ax.set_ylabel('State')
ax.set_xlabel("Fraction of Facilities with Unit Characterization")
plt.legend(
title='Total Number of Facilities',
loc='best',
frameon=False)
plt.show()
# def unit_capacity_nonintensive(self):
# """
# Parameters
# ----------
# df : pandas.DataFrame
# Final foundational dataset
# naics : str; {'all', 'non-mfg', 'intensive', 'other-mfg'} or list of ints; default='all'
# Group output by NAICS code group (str) or by list of integer(s). Integers must
# be at 4-digit NAICS level.
# Returns
# -------
# naics_plot :
# """
# # plot_data = id_sectors(final_data)
# # Make subplots, one each for non-mfg,
# # intenstive, and other-mfg
# fig = make_subplots(
# rows=2, cols=2,
# specs=[
# [{}, {}],
# [{"colspan": 2}, None]
# ],
# print_grid=False)
[docs]
def summary_table_intensive(self):
"""
Summary description of unit coverage for intensive
and non-intensive industries.
"""
df_naics = pd.DataFrame(
self._fied.naicsCode.drop_duplicates()
)
df_naics.loc[:, 'n4'] = df_naics.naicsCode.apply(lambda x: int(str(x)[0:4]))
plot_data = dict(
all=pd.merge(self._fied, df_naics, on='naicsCode')
)
# NAICS subsectors of Paper Manufacturing, Petroleum and Coal Products,
# Chemical Manufacturing, Glass and Glass Product Manufacturing, Cement and
# Concrete, Lime and Gypsum, Iron & Steel, Alumina and Aluminum,
intensive_naics = [3221, 3241, 3251, 3253, 3272, 3273, 3274, 3311, 3313]
plot_data['non-mfg'] = \
plot_data['all'][~plot_data['all'].n4.between(3000, 4000)]
plot_data['intensive'] = \
plot_data['all'][plot_data['all'].n4.isin(intensive_naics)]
plot_data['other-mfg'] = \
plot_data['all'][
~plot_data['all'].n4.isin(intensive_naics) & \
plot_data['all'].n4.between(3000, 4000)]
for k in plot_data.keys():
if k == 'all':
pass
else:
plot_data['all'].loc[plot_data[k].index, 'grouping'] = k
summ_table = pd.DataFrame(index=['intensive', 'other-mfg', 'non-mfg'])
ind_grp = plot_data['all'].groupby('grouping')
summ_table = pd.concat([
ind_grp.apply(lambda x: len(x.registryID.unique())),
ind_grp.apply(
lambda x: len(x[x.unitTypeStd.notnull()].registryID.unique())
),
ind_grp.apply(lambda x: len(x[
(x.unitTypeStd.notnull()) &
(x.designCapacity.notnull()) &
(x.designCapacityUOM == 'MW')
].registryID.unique())
),
ind_grp.apply(
lambda x: x[x.designCapacityUOM == 'MW'].designCapacity.sum()
),
ind_grp.apply(
lambda x: x[x.designCapacityUOM == 'MW'].designCapacity.median()
)], axis=1, ignore_index=False
)
summ_table.update(
summ_table.loc[:, [1, 2]].divide(summ_table.loc[:, 0], axis=0)
)
summ_table.columns = ['Count of Facilities', 'Facilities with Unit Type',
'Facilities with Unit Type & Capacity',
'Total Capacity (MW)', 'Median Capacity (MW)']
summ_table = summ_table.style.format({
'Facilities with Unit Type': '{:.0%}',
'Facilities with Unit Type & Capacity': '{:.0%}',
'Total Capacity (MW)': '{:.2f}'}
)
if __name__ == '__main__':
year = 2017
filepath = os.path.abspath('foundational_industry_data_2017.csv.gz')
fa = FIED_analysis(year=year, file_path=filepath, fig_format='svg')
fa.create_core_analysis(write_fig=False)