Source code for fied.analysis.analysis_figures


import logging
import os
import json
import zipfile
import pyarrow
import pathlib
import requests
import textwrap
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots
from io import BytesIO


logging.basicConfig(level=logging.INFO)


[docs]
class FIED_analysis:

    def __init__(self, year, file_path=None, pio_engine=None, df=None, fig_format='png'):

        if file_path is None:
            self._fied = df

        else:
            try:
                self._fied = pd.read_parquet(
                    file_path
                    )

            except (pyarrow.ArrowIOError, pyarrow.ArrowInvalid):
                self._fied = pd.read_csv(
                    file_path, low_memory=False, index_col=0
                    )

        self._fied = self.make_consistent_naics_column(self._fied, n=2)
        
        self._year = year

        self._fig_path = pathlib.Path('./analysis/figures')

        self._pio_engine = pio_engine

        if self._fig_path.exists():
            pass
    
        else:
            pathlib.Path.mkdir(self._fig_path)

        self._fig_format = fig_format


[docs]
    def get_cbp_data(self):
        """
        Get establishment counts from Census County Business
        Patterns for reporting year.
        
        Returns
        -------
        cbp_data : pandas.DataFrame
            DataFrame of CBP establishment counts by NAICS
            code for U.S. for specified year.

        """

        cbp_data_url = \
            f'https://www2.census.gov/programs-surveys/cbp/datasets/{self._year}/cbp{str(self._year)[-2:]}us.zip'
        
        r = requests.get(cbp_data_url)

        with zipfile.ZipFile(BytesIO(r.content)) as zf:
            with zf.open(zf.namelist()[0]) as f:
                cbp_data = pd.read_csv(
                    f, sep=',', 
                    usecols=['lfo', 'naics', 'est']
                    )
                
        cbp_data = cbp_data.query('lfo == "-"').copy(deep=True)

        cbp_data.replace(
            {'-': '', '/':''}, inplace=True, regex=True
            )
    
        cbp_data.loc[:, 'n_naics'] = \
            cbp_data.naics.apply(lambda x: len(str(x)))
    
        return cbp_data



[docs]
    def create_core_analysis(self, **kwargs):
        """
        Create summary figures and table.
        """

        summary_table_all = self.summary_unit_table()

        summary_table_eisghgrp = self.summary_unit_table(eis_or_ghgrp_only=True)

        self.summary_unit_bar(summary_table_all, write_fig=kwargs['write_fig'])

        self.plot_rel_bar_missing(write_fig=kwargs['write_fig'])

        self.plot_stacked_bar_missing(write_fig=kwargs['write_fig'])

        for ds in ['ghgrp', 'nei']:
            self.plot_stacked_bar_missing(data_subset=ds, write_fig=kwargs['write_fig'])

        self.plot_facility_count(write_fig=kwargs['write_fig'])

        self.plot_best_characterized()

        for u in self._fied.unitTypeStd.unique():
            try:
                u.title()
            except AttributeError:
                continue
            else:
                for m in ['energy', 'power']:
                    self.plot_unit_bubble_map(u, m, write_fig=kwargs['write_fig'])

        for v in ['count', 'energy', 'capacity']:
            for n in [None, 2, 3]:
                self.plot_ut_by_naics(n, v, write_fig=kwargs['write_fig'])

        for ds in ['mecs', 'seds']:
            self.plot_eia_comparison_maps(dataset=ds, year=2017, write_fig=kwargs['write_fig'])

        return

    


[docs]
    def summary_unit_table(self, eis_or_ghgrp_only=False):
        """
        Creates a table that summarizes by industrial sector
        (i.e., 2-digit NAICS) various aspects of the 
        dataset. Saves table to analysis directory and returns table.

        Parameters
        ----------
        final_data : pandas.DataFrame or parquet

        Returns
        -------
        summary_table : pandas.DataFrame

        """

        # Multiple units can share the same eisUnitID.
        # (see evidence in unitDescription field)
        table_data = self._fied.copy(deep=True)

        table_data = self.id_sectors(table_data)

        if eis_or_ghgrp_only:
            table_data = table_data[
                (table_data.eisFacilityID.notnull())|(table_data.ghgrpID.notnull())
                ].copy(deep=True)
            
            fname = './analysis/summary_unit_table_eisghgrp_only.csv'
        else:
            fname = './analysis/summary_unit_table_eisghgrp_only.csv'

        desc = ['count', 'sum', 'mean', 'std', 'min', 'median', 'max']

        _unit_count = table_data.groupby(
            ['sector', 'unitTypeStd']
            ).registryID.count()

        _unit_count.name = 'Count of Units'

        _capacity_summ = table_data.query('designCapacityUOM == "MW"').groupby(
                ['sector', 'unitTypeStd']
                ).designCapacity.agg(desc)
        
        _capacity_summ.columns = ['Capacity_MW_' + c for c in _capacity_summ.columns]

        # Count of facilities with unit information
        _fac_count = table_data.query('unitTypeStd.notnull()', engine='python').groupby(
            ['sector']
            ).registryID.unique().apply(lambda x: len(x))

        _fac_count.name = 'Count of Facilities'

        # Only use non-zero value
        _energy_summ_ = pd.concat(
            [table_data[table_data[c] > 0] for c in ['energyMJ', 'energyMJq0', 'energyMJq2', 'energyMJq3']],
            axis=0, ignore_index=True
            )
        
        def agg_energy(df, type):
            e_agg = df.copy(deep=True)
            e_agg.energyMJ.update(e_agg[f'energyMJ{type[1]}'])
            e_agg = e_agg.groupby(['sector', 'unitTypeStd']).energyMJ.agg(type[0])
            e_agg.name = f'EnergyMJ_{type[0]}'

            return e_agg
        
        _energy_summ_agg = pd.concat(
            [agg_energy(_energy_summ_, t) for t in [
                ['sum', 'q2'], ['mean', 'q2'], ['std', 'q2'], ['min', 'q0'], 
                ['median', 'q2'], ['max', 'q3']]],
            axis=1
            )

        _throughput_summ = pd.concat(
            [
                table_data.groupby(
                    ['sector', 'unitTypeStd']
                    ).throughputTonneQ2.sum(),
                table_data.groupby(
                    ['sector', 'unitTypeStd']
                    ).throughputTonneQ2.mean(),
                table_data.groupby(
                    ['sector', 'unitTypeStd']
                    ).throughputTonneQ2.agg('std'),
                table_data.groupby(
                    ['sector', 'unitTypeStd']
                    ).throughputTonneQ0.min(),
                table_data.groupby(
                    ['sector', 'unitTypeStd']
                    ).throughputTonneQ2.median(),
                table_data.groupby(
                    ['sector', 'unitTypeStd']
                    ).throughputTonneQ3.max()
                ],
            axis=1
            )
        
        _throughput_summ.columns = [f'ThroughputTonnes_{c}' for c in ['sum', 'mean', 'std', 'min', 'median', 'max']]

        summary_table = pd.concat(
            [_unit_count, _capacity_summ, _energy_summ_agg, _throughput_summ],
            axis=1,
            )

        summary_table = pd.DataFrame(_fac_count).join(summary_table)

        summary_table.to_csv('./analysis/summary_unit_table.csv')

        return summary_table



[docs]
    def plot_facility_count(self, write_fig=True):
        """"
        Plots the count of facilities from foundational data
        and the count of establishments from the corresponding
        year of Census Business Patterns (CBP) data.
        
        Parameters
        ----------
        write_fig : bool; default=True
            Write resulting figure to analysis figures
            directory.
        
        """
        ind_map = {
            '11': 'Agriculture', 
            '21': 'Mining',
            '23': 'Construction',
            '31': 'Manufacturing',
            '32': 'Manufacturing',
            '33': 'Manufacturing',
            }

        cbp_data = self.get_cbp_data()

        # Some foundational data reported at 4-digit NAICS level.
        # So, will be aggregating counts at 4-digit level.
        # Note that CBP doesn't track ag establishments for NAICS 111 - 112
        # (essentially all farming, animal production, and aquaculture)
        cbp_data.loc[:, 'ind'] = cbp_data.naics.apply(
            lambda x: (x[0:2] in ['11', '21', '23', '31', '32', '33']) & (len(x) == 4)
            )

        # Total count of industrial establishments
        total_est = cbp_data.where(
            cbp_data.naics.isin(ind_map.keys())
            ).dropna().est.sum()
        
        ind_cbp_data = pd.DataFrame(cbp_data.query("ind == True"))
        ind_cbp_data.loc[:, 'naics'] = ind_cbp_data.naics.astype(int)

        fied_count = self._fied.copy(deep=True)
        fied_count.loc[:, 'n4'] = fied_count.naicsCode.apply(
            lambda x: int(str(int(x))[0:4])
            )

        fied_count = pd.concat(
            [fied_count.groupby('n4').registryID.apply(lambda x: len(x.unique())),
            ind_cbp_data.set_index('naics').est],
            axis=1,
            ignore_index=False
            ).reset_index()

        index_str = fied_count['index'].astype(str)
        fied_count.loc[:, 'index'] = index_str
        fied_count.loc[:, 'sector'] = fied_count['index'].apply(
            lambda x: x[0:2]
            ).map(ind_map)
        
        fied_cumsum = pd.merge(
            fied_count[['index', 'sector']],
            fied_count.groupby(['sector'])[['registryID', 'est']].cumsum(),
            left_index=True, right_index=True
            )

        fig = px.line(
            fied_cumsum.melt(
                id_vars=['index', 'sector']), 
            y="value", x="index", color='variable',
            facet_col='sector', facet_col_wrap=2,
            markers=True,
            labels={
                'index': 'NAICS Code',
                'value': 'Cumulative Count of Establishments<br>or Registry IDs',
                'variable': 'Data Source'
                }
            )

        for n in range(0,8):
            if n < 4:
                fig.data[n].name = 'Foundational Data'

            else:
                fig.data[n].name = 'County Business Patterns'
        
        fig.for_each_annotation(
            lambda a: a.update(text=a.text.split("=")[-1].capitalize())
            )
        
        fig.update_yaxes(matches=None, showticklabels=True,
                         titlefont=dict(size=15), automargin=True)
        fig.update_xaxes(matches=None, showticklabels=True,
                         titlefont=dict(size=15), automargin=True)

        fig.update_layout(
            autosize=True,
            template='presentation',
            height=800,
            width=1200,
            legend=dict(orientation="h", x=0.25, title=""),
            font=dict(size=16)
            )
    
        if write_fig:
            pio.write_image(
                fig,
                file=self._fig_path / f'counts_{self._year}',
                format=self._fig_format,
                engine=self._pio_engine
                )

        else:
            fig.show()


 

[docs]
    @staticmethod
    def plot_difference_nei(nei, data):
        """
        Plot difference between max and min energy or
        throughput quanitites for units when there are
        multiple emissions per unit.

        Parameters
        ----------
        nei : pandas.DataFrame
            Unformatted NEI, prior to estimating
            quartile values for throughput and energy.

        data : str; 'energy' or 'throughput'

        """

        selection = {
            'energy': {
                'column': 'energy_MJ',
                'title': 'Energy'
                },
            'throughput': {
                'column': 'throughput_TON',
                'title': 'Throughput'
                }
            }

        duplic = nei[(nei[selection[data]['columns']] > 0) &
                        (nei.eis_process_id.duplicated(keep=False) == True)]

        duplic = duplic.groupby(
            ['eis_process_id']).agg(
                perc_diff=(
                    selection[data]['column'],
                    lambda x: ((x.max()-x.min())/x.mean())*100
                    )
                ).reset_index()

        plt.rcParams['figure.dpi'] = 300
        plt.rcParams['savefig.dpi'] = 300
        plt.rcParams['font.sans-serif'] = "Arial" 

        sns.histplot(data=duplic, x="perc_diff")  # sns.kdeplot
        plt.xlabel('Percentage difference')
        plt.ylabel('Units')
        plt.title(selection[data]['title'])

        return None




[docs]
    def id_sectors(self, df):
        """
        Make a new sector column for NAICS 2-digit

        Returns
        -------
        df : pandas.DataFrame
            FIED with 2-digit NAICS code column

        """

        if 'n2' in df.columns:
            pass

        else:
            df = self.make_consistent_naics_column(df, n=2)

        sectors = pd.DataFrame(
            [['Agriculture', 11], ['Construction', 21], ['Mining', 23], 
            ['Manufacturing', 31], 
            ['Manufacturing', 32], ['Manufacturing', 33]],
            columns=['sector', 'n2']
            )

        df = pd.merge(df, sectors, on='n2', how='left')

        return df




[docs]
    def summary_unit_bar(self, summary_table, write_fig=True):
        """
        Make stacked bar chart showing units by Sector and
        total number of facilities reporting units

        Paramters
        ---------
        summary_table : pandas.DataFrame
            Output of `summary_unit_table` method.

        write_fig : bool; default=True
            Write figure to analysis figures directory

        """

        plot_data = summary_table.reset_index()

        len_unit_types = len(plot_data.unitTypeStd.unique())

        fig = make_subplots(specs=[[{"secondary_y": True}]])

        plot_data.sort_values(by=['Count of Units'], ascending=False)

        fig_units = px.bar(
            plot_data,
            x='sector',
            y='Count of Units',
            labels={
                'value': 'Facility Count',
                'variable': 'Facilities'
                },
            template='simple_white',
            color='unitTypeStd',
            color_discrete_sequence=px.colors.qualitative.Alphabet
            # color_discrete_sequence=px.colors.sample_colorscale(
            #     "plasma", [n/(len_unit_types-1) for n in range(len_unit_types)]
            #     )
            )

        for d in range(0, len(fig_units.data)):
            fig.add_trace(fig_units.data[d], secondary_y=False)

        fac_counts = plot_data.drop_duplicates(['sector', 'Count of Facilities'])

        fig.update_layout(barmode='stack')
        fig.add_trace(
            go.Scatter(
                mode='markers',
                x=fac_counts.sector,
                y=fac_counts['Count of Facilities'],
                marker=dict(size=26, color='LightSkyBlue'),
                showlegend=True,
                name='Facilities'
            ), secondary_y=True
            )
        
        fig.update_yaxes(automargin=True, title='Count of Units',
                        secondary_y=False,
                        range=[0, 70000],
                        tickmode='linear',
                        tick0=0,
                        dtick=7000
                        )

        fig.update_yaxes(automargin=True, title='Count of Facilities',
                        secondary_y=True, range=[0, 20000],
                        tickmode='linear',
                        tick0=0,
                        dtick=2000)

        fig.update_xaxes(automargin=True, title='Sector')
        fig.update_layout(
            template='presentation',
            legend=dict(title_text='Unit Type', font=dict(size=16), yanchor='top', y=0.99),
            width=1200,
            height=800
            )

        fname = self._fig_path / f'summary_figure_{self._year}'

        if write_fig:
            pio.write_image(
                fig,
                file=fname, 
                format=self._fig_format,
                engine=self._pio_engine
                )

        else:
            fig.show()

        return None

    

[docs]
    def set_mecs_data(self, year=2018):
        """"
        MECS format is not machine-friendly. This is a manual input
        of MECS combustion energy estimates from MECS Table 3.2:
        https://www.eia.gov/consumption/manufacturing/about.php

        Returns
        -------
        mecs : dict
            Dictionary of combustion energy estimates from MECS (converted from TBtu to MJ), on
            a national and census region basis.
        """

        mecs = {
            2018: {
                'nation': (14859 - 2591) * 1.055E9,
                'northeast': (1130 - 250) * 1.055E9 ,
                'midwest': (3907 - 834) * 1.055E9,
                'south': (7897 - 1143) * 1.055E9,
                'west': (1925 - 365) * 1.055E9
                }
            }
        
        return mecs[year]

    

[docs]
    def get_eia_seds(self, year=2017):
        """
        Get EIA State Energy Data System (SEDS) data

        Parameters
        ----------
        year : int; default=2017
            Year of SEDS data to return.

        Returns
        -------
        seds : pandas.DataFrame
            
        
        """
        seds_url = 'https://www.eia.gov/opendata/bulk/SEDS.zip'

        try:
            r = requests.get(seds_url)
            r.raise_for_status()

        except r.exceptions.HTTPError as e:
            logging.ERROR(e)

        else:
            with zipfile.ZipFile(BytesIO(r.content)) as zf:
                with zf.open(zf.namelist()[0]) as f:
                    seds = pd.read_json(f, lines=True)
                    # eia_data = pd.DataFrame.from_dict(f)


        seds.dropna(subset=['series_id'], inplace=True)
        seds.dropna(axis=1, thresh=1, inplace=True)

        # Natural gas consumed by the industrial sector (including supplemental gaseous fuels)
        # All petroleum products consumed by the industrial sector
        # Coal consumed by the industrial sector
        # Wood and waste consumed in the industrial sector
        series = ['NGICB', 'PAICB' , 'CLICB', 'WWICB']

        seds = seds[seds.series_id.apply(
            lambda x: any([s in x for s in series])
            )].copy(deep=True)
            
        seds.reset_index(drop=True, inplace=True)

        final_seds = pd.DataFrame()

        for i in seds.index:

            data = pd.DataFrame(
                seds.loc[i, 'data'], 
                columns=['year', 'BillionBtu']
                )
            
            data = pd.concat(
                [pd.DataFrame(np.tile(seds.loc[i, seds.columns[0:-1]].values.reshape(13, 1), len(data))).T,
                 data], axis=1)
            
            final_seds = final_seds.append(data)

        for i, v in enumerate(seds.columns[0:-1]):
            final_seds.rename(columns={i:v}, inplace=True)

        final_seds.year.update(final_seds.year.astype(int))

        final_seds = final_seds.query("year==@year").copy(deep=True)
        
        final_seds.loc[:, 'MJ'] = final_seds.BillionBtu * 1.055E6
        final_seds.drop(['start', 'end', 'BillionBtu', 'last_updated'], 
                        axis=1, inplace=True)
        
        final_seds = final_seds[final_seds.geography != 'USA'].copy(deep=True)
        
        def split_columns(**kwargs):

            final_seds = kwargs['data']

            df = pd.DataFrame(
                final_seds[kwargs['data_column']].unique(),
                columns=[kwargs['data_column']]
                )

            df.loc[:, kwargs['new_column']] = df[kwargs['data_column']].apply(
                lambda x: x.split(f'{kwargs["split_char"]}')[1]
                )
            
            final_seds = pd.merge(final_seds, df, on=kwargs['data_column'], how='left')

            return final_seds

        final_seds = split_columns(
            data=final_seds,
            data_column= 'geography', 
            new_column='state',
            split_char='-'
            )

        # final_seds = split_columns(
        #     data = final_seds,
        #     data_column = 'series_id', 
        #     new_column='data_id',
        #     split_char= '.'
        #     )

        # states = pd.DataFrame(final_seds.geography.unique(), columns=['geography'])
        # states.loc[:, 'state'] = states.geography.apply(
        #     lambda x: x.split('-')[1]
        #     )
        
        # final_seds = pd.merge(final_seds, states, on='geography', how='left')

        final_seds = final_seds.groupby('state', as_index=False).MJ.sum()

        return final_seds




[docs]
    def get_state_region(self):
        """ Download state to region file"""

        url = 'https://raw.githubusercontent.com/cphalpert/census-regions/master/us%20census%20bureau%20regions%20and%20divisions.csv'
    
        s_r = pd.read_csv(url, encoding='latin_1')

        return s_r

    

[docs]
    def plot_eia_comparison_maps(self, dataset, year=2017, write_fig=True):
        """
        Plots a relative comparison of FIED vs. EIA on a geographic 
        basis (combustion energy only). 
        For MECS, this is census region; for SEDS, states.

        Parameters
        ----------
        dataset : str; {'mecs', 'seds'}
            EIA dataset to compare FIED against.

        year : int; default=2017

        write_fig : bool; default=True
            Write figure to analysis figures directory

        
        Returns
        -------
        
        """
        shared_plot_args = dict(
            color_continuous_scale= [
                [0, 'rgb(43,131,186)'],
                [0.25, 'rgb(171,221,164)'],
                [0.5, 'rgb(255,255,191)'],
                [0.75, 'rgb(253,174,97)'],
                [1, 'rgb(215,25,28)']
                ],
            color_continuous_midpoint=1,
            locationmode="USA-states",
            scope='usa',
            locations = 'stateCode',
            color = 'fied_relative',
            labels={'fied_relative': f'FIED relative to EIA {dataset.upper()}'}
            )

        plot_data = self._fied.copy(deep=True)

        # with requests.get('https://public.opendatasoft.com/api/explore/v2.1/catalog/datasets/us-state-boundaries/exports/geojson?lang=en&timezone=America%2FNew_York') as r:
        #     states = r.json()

        # for s in range(0, len(states['features'])):
        #     states['features'][s]['id'] =  states['features'][s]['properties']['stusab']

        if dataset == 'mecs':
            
            mecs = self.set_mecs_data()

            s_r = self.get_state_region()[['State Code', 'Region']]
            s_r.Region.update(s_r.Region.apply(lambda x: x.lower()))

            plot_data = pd.merge(
                plot_data, s_r, left_on='stateCode', right_on='State Code',
                how='inner'
                )
            
            # Need to remove non-manufacturing facilities
            plot_data_n = pd.Series(plot_data.naicsCode.unique())
            plot_data_n = pd.concat(
                [plot_data_n, plot_data_n.apply(lambda x: int(str(x)[0]))],
                axis=1
                )
            plot_data_n.columns = ['naicsCode', 'n1']
            plot_data_n = plot_data_n[plot_data_n.n1==3]

            plot_data = pd.merge(plot_data, plot_data_n, how='inner',
                                 on='naicsCode')
            
            plot_data = pd.concat([
                plot_data[
                    (plot_data.energyMJ==0) | (plot_data.energyMJ.isnull())
                    ].groupby(
                        ['Region', 'stateCode']
                        ).energyMJq2.sum(),
                plot_data.groupby(
                        ['Region', 'stateCode']
                    ).energyMJ.sum()
                ], axis=1
                )

            plot_data.loc[:, 'totalMJ'] = plot_data.sum(axis=1)

            plot_data.loc[:, 'mecsMJ'] = plot_data.index.get_level_values(0).map(mecs)

            plot_data.loc[:, 'fied_relative'] =  plot_data.totalMJ.sum(level=0).divide(plot_data.mecsMJ)

            plot_data.reset_index(inplace=True)

        elif dataset == 'seds':

            seds = self.get_eia_seds(year=2017)
            seds.rename(columns={'MJ': 'sedsMJ'}, inplace=True)

            plot_data = pd.concat([
                plot_data[
                    (plot_data.energyMJ==0) | (plot_data.energyMJ.isnull())
                    ].groupby(['stateCode']).energyMJq2.sum(),
                plot_data.groupby(['stateCode']).energyMJ.sum()
                ], axis=1
                )
            
            plot_data.loc[:, 'totalMJ'] = plot_data.sum(axis=1)

            plot_data = plot_data.join(seds.set_index('state'))

            plot_data.loc[:, 'fied_relative'] =  plot_data.totalMJ.divide(plot_data.sedsMJ)
            
            plot_data.reset_index(inplace=True)
    
            plot_data.dropna(inplace=True)  # Drop Territories

        rel_max = plot_data.fied_relative.max()
        
        if rel_max > np.around(rel_max):
            rel_max = np.around(rel_max)+1

        else:   
            rel_max = np.around(rel_max)

        shared_plot_args['range_color'] =(0, rel_max)
        shared_plot_args['data_frame'] = plot_data

        plot_data.to_csv(f'{dataset}_comparison_data.csv')

        fig = px.choropleth(**shared_plot_args)
            # plot_data,
            # color_continuous_scale= [
            #     [0, 'rgb(43,131,186)'],
            #     [0.25, 'rgb(171,221,164)'],
            #     [0.5, 'rgb(255,255,191)'],
            #     [0.75, 'rgb(253,174,97)'],
            #     [1, 'rgb(215,25,28)']
            #     ],
            # range_color=(0, rel_max),
            # color_continuous_midpoint=1,
            # locationmode="USA-states",
            # scope='usa',
            # locations = 'stateCode',
            # color = 'fied_relative',
            #     labels={'fied_relative': 'FIED Relative to EIA MECS'}
            # )
        fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})

        if write_fig:
            fname = self._fig_path / f'{dataset}_comaprison_map_{self._year}'
            pio.write_image(
                fig,
                file=fname, 
                format=self._fig_format,
                engine=self._pio_engine
                )

        else:
            fig.show()

            
    

[docs]
    def plot_eia_comparison(self, eia_mecs=1.568E13, eia_mer=2.316E13):
        """
        Plot bar chart depicting FIED energy estimates relative to EIA estimates.    


        Parameters
        ----------
        eia_mecs : float; default=1.568E13
            Combustion energy use (in MJ) from EIA Manufacturing Consumption survey. 
            Default represents 2018 value.

        eia_mer : float; default=2.316E13
            Combustion energy use (in MJ) for industry from EIA Monthly Energy Review (MER).
            Default represents 2017 value. 
        
        """





[docs]
    def plot_unit_bubble_map(self, unit_type, measure, max_size=66, write_fig=True):
        """
        Plot locations of a single standard unit type by
        either energy (MJ) or design capacity.

        Parameters
        ----------
        unit_type : str
            Standard unit type: 'other combustion', 'kiln', 'dryer', 
            'boiler', 'heater', 'turbine', 'oven', 'engine', 'furnace', 
            'thermal oxidizer', 'incinerator', 'other', 'generator', 
            'flare', 'stove', 'compressor', 'pump',
            'building heat', 'distillation'

        measure : str; {'energy', 'power', 'ghgs'}
            Either 'energy' (results in MJ), 'power' (results in MW), or
            'ghgs' (results in MTCO2e)

        max_size : int
            Max size of bubbles on map

        write_fig : bool; default=True
            Write figure to analysis figures directory
        """

        if unit_type:

            plot_data = pd.DataFrame(self._fied.query("unitTypeStd == @unit_type"))
            file_name = f'bubble_map_{measure}-{unit_type}_{self._year}'

        else:
            plot_data = self._fied.copy(deep=True)
            file_name = f'bubble_map_{measure}_{self._year}'

        # plot_args = {
        #     'lat': 'latitude',
        #     'lon': 'longitude',
        #     'scope': 'usa',
        #     'color': 'unitTypeStd',
        #     'title': f'Location of {unit_type.title()} Units Reporting {measure.title()} Data',
        #     'size_max': max_size,
        #     'color_discrete_sequence': px.colors.qualitative.Safe
        #     }
        

        if measure == 'energy':
            plot_data.loc[:, 'plotMJ'] = plot_data.query('energyMJ>0 & (energyMJq2.isnull() | energyMJq2==0)').energyMJ
            plot_data.loc[:, 'plotMJ'] = plot_data.query('(energyMJ==0 | energyMJ.isnull()) & energyMJq2>0').energyMJq2
            plot_data.dropna(subset=['plotMJ'], axis=0, inplace=True)

            # plot_args['size'] = plot_data.plotMJ.to_list()
            markersize = plot_data.plotMJ.to_list()
            # plot_args['labels'] = {
            #     'plotMJ': 'Unit Energy Use (MJ)'
            #     }
            ids = [f'{x} MJ' for x in plot_data.plotMJ.values]
            leg_title = 'Unit Energy Use (MJ)'
            plot_color='plotMJ'

        elif measure == 'power':
            plot_data = plot_data.query(
                "designCapacityUOM=='MW' & designCapacity>0"
                )
            # plot_args['size'] = plot_data.designCapacity.to_list()
            markersize = plot_data.designCapacity.to_list()
            # plot_args['labels'] = {
            #     'designCapacity': 'Unit Capacity (MW)'
            #     }
            # plot_args['title'] = f'Location of {unit_type.title()} Units Reporting Capacity Data'
            ids = [f'{x} MW' for x in plot_data.designCapacity.values]
            leg_title = 'Unit Capacity (MW)'
            plot_color='designCapacity'

        
        elif measure == 'ghgs':
            plot_data.loc[:, 'plotGHG'] = plot_data.query('ghgsTonneCO2e>0 & (ghgsTonneCO2eQ2.isnull() | ghgsTonneCO2eQ2==0)').ghgsTonneCO2e
            plot_data.loc[:, 'plotGHG'] = plot_data.query('(ghgsTonneCO2e==0 | ghgsTonneCO2e.isnull()) & ghgsTonneCO2eQ2>0').ghgsTonneCO2eQ2
            plot_data.dropna(subset=['plotGHG'], axis=0, inplace=True)
            plot_color = 'plotGHG'

            # plot_args['size'] = plot_data.plotGHG.to_list()
            markersize = plot_data.plotGHG.to_list()

            # plot_args['size'] = plot_data.plotGHG.to_list()
            # plot_args['labels'] = {
            #     'plotGHG': 'Unit GHG Emissions (tonne CO2e)'
            #     }
            # plot_args['title'] = f'Location of {unit_type.title()} Units Reporting GHG Data'
            ids = [f'{x} tonne CO2e' for x in plot_data.plotGHG.values]
            leg_title = 'Unit GHG Emissions (tonne CO2e)'

        fig = go.Figure(data = go.Scattergeo(
            locationmode='USA-states', 
            lat=plot_data['latitude'],
            lon=plot_data['longitude'],
            mode='markers',
            ids=ids,
            marker=dict(
                size=markersize,
                sizeref=max(markersize)/max_size**2,
                sizemode='area',
                autocolorscale=False,
                color=plot_data[plot_color],
                colorscale='agsunset',
                colorbar=dict(
                    title=leg_title,
                    tickfont=dict(size=18)
                    )
                )
            ))
        fig.update_layout(
            title=f'Location of {unit_type.title()} Units Reporting {measure.title()} Data',
            geo=dict(
                scope='usa',
                projection_type='albers usa',
                showland = True,
                landcolor = "rgb(255, 255, 255)",
                subunitcolor = "rgb(150, 150, 150)",
                subunitwidth = 0.5
                )
            )

        # fig = px.scatter_geo(plot_data, **plot_args)
        # fig.update_geos(bgcolor='#FFFFFF')
        # fig.update_layout(showlegend=True)
        # fig.update_yaxes(automargin=True)
        # fig.update_xaxes(automargin=True)

        if write_fig:
            pio.write_image(
                fig,
                file=self._fig_path / file_name,
                format=self._fig_format,
                engine=self._pio_engine
                )

        else:
            fig.show()

        return None

    

[docs]
    def plot_rel_bar_missing(self, write_fig=True):
        """
        Creates a simple stacked bar showing relative amount (percentage)
        of GHGRP and NEI facilities with and without unit-level data.

        Parameters
        ----------
        write_fig : bool; default=True
            Write figure to analysis figures directory
        
        """

        plot_data = {}

        label = "Percentage of Facilities with Unit-Level Characterization"

        for l in ['ghgrp', 'eisFacility']:
            plot_data[l] = self._fied.query(f"{l}ID.notnull()", engine="python").copy()
            
            plot_data[l] = pd.concat(
                [pd.Series(len(
                    plot_data[l].query("unitTypeStd.isnull()", engine="python").registryID.unique()
                    )),
                pd.Series(len(
                    plot_data[l].query("unitTypeStd.notnull()", engine="python").registryID.unique()
                    ))], 
                axis=1,
                ignore_index=False
                )
            
            plot_data[l].columns = ['Without Unit Characterization', 
                                   'With Unit Characterization']
            
            plot_data[l] = plot_data[l].divide(
                plot_data[l].sum(axis=1), axis=0
                )
            
            plot_data[l].loc[:, 'Data Source'] = l

        plot_data = pd.concat(
            [plot_data[l] for l in plot_data.keys()],
            axis=0, ignore_index=True
            )
        
        plot_data.drop(['Without Unit Characterization'], axis=1, inplace=True)
        
        plot_data.replace({'eisFacility': 'NEI', 'ghgrp': 'GHGRP'}, inplace=True)

        fig = px.bar(
            plot_data,
            x='Data Source',
            y=['With Unit Characterization'],
            labels={
                'value': label,
                'variable': 'Facilities'
                },
            template='plotly_white',
            color_discrete_map={
                'With Unit Characterization': "#756bb1"
                })

        fig.update_yaxes(automargin=True, range=[0, 1])
        fig.update_xaxes(automargin=True, type='category')
        fig.update_layout(
            yaxis_tickformat=".0%",
            template='presentation',
            height=800,
            width=600,
            legend=dict(orientation="h", x=0.25, title=""),
            font=dict(size=16)
            )

        if write_fig:
            pio.write_image(
                fig,
                file=self._fig_path / f'rel_bar_unit_char_{self._year}',
                format=self._fig_format,
                engine=self._pio_engine
                )

        else:
            fig.show()




[docs]
    def plot_stacked_bar_missing(self, naics_level=2, data_subset=None, write_fig=True):
        """"
        Creates stacked bar showing counts of facilities
        with and without unit-level data.

        Parameters
        ----------
        naics_level : int; default=2
            Specific level of NAICS aggregation to display
            data. NAICS is a range of integers from 2 to 6.
            
        data_subset : str; {None, 'ghgrp', 'nei'}
            Plot subset of data, either facilities that 
            are GHGRP or NEI reporters

        rel_total : bool; default=False
            Plot the relative total of NEI and GHGRP facilities that
            have unit-level characterization. Renders naics_level and data_subset 
            obsolete. 

        write_fig : bool; default=True
            Write figure to analysis figures directory

        """

        plot_data = self._fied.copy(deep=True)
        
        label = "Facility Count"
        
        if data_subset == 'ghgrp':
            plot_data = plot_data.query(
                "ghgrpID.notnull()", engine="python"
                ).copy()
            
            label = label + " (GHGRP Reporters Only)"

        elif data_subset == 'nei':
            plot_data = plot_data.query(
                "eisFacilityID.notnull()", engine="python"
                ).copy()
            
            label = label + " (NEI Reporters Only)"

        else:
            pass

        plot_data = pd.concat(
            [plot_data.query("unitTypeStd.isnull()", engine="python").groupby(
                f'n{naics_level}'
                ).apply(lambda x: np.size(x.registryID.unique())),
            plot_data.query("unitTypeStd.notnull()", engine="python").groupby(
                f'n{naics_level}'
                ).apply(lambda x: np.size(x.registryID.unique()))], 
            axis=1,
            ignore_index=False
            )
        
        if naics_level == 2:
            plot_data.loc['Manufacturing', :] = \
                plot_data.loc[31:, :].sum()

            plot_data.reset_index(inplace=True)

            plot_data.columns = ['NAICS Code', 'Without Unit Characterization', 
                                 'With Unit Characterization']
        
            plot_data['NAICS Code'].replace({11: 'Agriculture', 21: 'Mining', 23: 'Construction'},
                              inplace=True)

            plot_data = plot_data[[type(x)!=int for x in plot_data['NAICS Code']]]

        else:
            plot_data.reset_index(inplace=True)

            plot_data.columns = ['NAICS Code', 'Without Unit Characterization', 
                                'With Unit Characterization']

            plot_data.dropna(subset=['NAICS Code'], inplace=True)

            plot_data.loc[:, 'NAICS Code'] = plot_data['NAICS Code'].astype(str)

        fig = px.bar(plot_data,
                    x='NAICS Code',
                    y=['Without Unit Characterization',
                        'With Unit Characterization'],
                    labels={
                        'value': label,
                        'variable': 'Facilities'
                        },
                    template='plotly_white',
                    color_discrete_map={
                        'Without Unit Characterization': "#bcbddc",
                        'With Unit Characterization': "#756bb1"
                        })

        fig.update_yaxes(automargin=True)
        fig.update_xaxes(automargin=True, type='category')
        fig.update_layout(
            template='presentation',
            height=800,
            width=1200,
            legend=dict(orientation="h", x=0.25, title=""),
            font=dict(size=16)
            )

        if write_fig:
            pio.write_image(
                fig,
                file=self._fig_path / f'stacked_bar_NAICS{naics_level}_{data_subset}_{self._year}',
                format=self._fig_format,
                engine=self._pio_engine
                )

        else:
            fig.show()

        return None



[docs]
    def plot_ut_by_naics(self, naics_level=None, variable='count', write_fig=True):
        """
        Creates a table that summarizes by industrial sector
        (i.e., 2-digit NAICS) various aspects of the dataset

        Parameters
        ----------
        naics_level : int; default=None
            Specified NAICS level (None or 2 - 6)

        variable : str; {'count', 'energy', 'capacity'}

        write_fig : bool; default=True
            Write figure to analysis figures directory

        """

        formatting = {
            'x': 'unitTypeStd',
            'template': 'presentation',
            'labels': {
                'unitTypeStd': 'Standardized Unit Type'
                },
            }
        
        plot_data = self._fied.copy(deep=True) 

        if not naics_level:

            grouper = 'unitTypeStd'

        else:

            grouper = ['unitTypeStd', f'n{naics_level}']

            if naics_level == 2:
                pass

            else:
                plot_data = self.make_consistent_naics_column(
                    plot_data, naics_level
                    )

            plot_data.loc[:, f'n{naics_level}'] = \
                plot_data[f'n{naics_level}'].astype(str)

            len_naics = len(plot_data[f'n{naics_level}'].unique())

            formatting['labels'][f'n{naics_level}'] = 'NAICS Code'
            formatting['color'] = f'n{naics_level}'
            # formatting['color_discrete_sequence'] = px.colors.sequential.Plasma_r
            formatting['color_discrete_sequence'] = px.colors.sample_colorscale(
                "plasma_r", [n/(len_naics-1) for n in range(len_naics)]
                )

        if variable == 'count':
            plot_data = plot_data.query(
                "unitTypeStd.notnull()", engine="python"
                ).groupby(grouper).registryID.count()

            formatting['labels']['registryID'] = 'Unit Count'
            formatting['y'] = 'registryID'

        elif variable == 'energy':
            plot_data = plot_data.query(
                "unitTypeStd.notnull()", engine="python"
                ).groupby(grouper)['energyMJ', 'energyMJq0'].sum().sum(axis=1)

            plot_data.name = 'energy'
            formatting['y'] = 'energy' 
            formatting['labels']['energy'] = 'Energy (MJ)'

        elif variable == 'capacity':
            plot_data = plot_data.query(
                "unitTypeStd.notnull() & designCapacityUOM=='MW'", engine="python"
                ).groupby(grouper).designCapacity.sum()

            formatting['y'] = 'designCapacity'
            formatting['labels']['designCapacity'] = 'Unit Design Capacity (MW)'

        else:
            logging.error('No variable specified')
            raise ValueError

        if type(plot_data) == pd.core.series.Series:
            plot_data = pd.DataFrame(plot_data).reset_index()

        else:
            plot_data.reset_index(inplace=True)

        plot_data.loc[:, 'unitTypeStd'] = [x.capitalize() for x in plot_data.unitTypeStd]
        plot_data = plot_data.sort_values(by=grouper, ascending=True)

        fig = px.bar(plot_data, **formatting)

        if variable == 'count':
            pass

        else:
            fig.update_layout(
                yaxis=dict(
                    showexponent='all',
                    exponentformat='power'
                    )
                )

        fig.update_yaxes(automargin=True)
        fig.update_xaxes(automargin=True)

        if write_fig:
            pio.write_image(
                fig,
                file=self._fig_path / f'unittype_NAICS{naics_level}_{variable}_{self._year}',
                format=self._fig_format,
                engine=self._pio_engine
                )

        else:
            fig.show()

        return None



[docs]
    def make_consistent_naics_column(self, final_data, n):
        """
        Creates a column of consisently aggregated NAICS codes
        (i.e., same number of digits) when a column of 
        NAICS codes contains different levels of aggregation.
        Will only include 

        Parameters
        ----------
        final_data : pandas.DataFrame or parquet

        n : int; 2 to 6
            Specified NAICS level

        Returns
        -------
        analysis_data : pandas.DataFrame
            Returns original DataFrame with
            new column named f'n{n}'.
        """

        df_naics = pd.DataFrame(final_data['naicsCode'].drop_duplicates(),
                                dtype=int)

        df_naics_consistent = df_naics[
            df_naics.naicsCode.apply(lambda x: len(str(x)) >= n)
            ]

        if len(df_naics_consistent) < len(df_naics):
            logging.info(f"Caution: not all original NAICS codes are at this aggregation")

        else:
            pass

        df_naics_consistent.loc[:, f'n{n}'] = df_naics_consistent.naicsCode.apply(
            lambda x: int(str(x)[0:n])
            )

        analysis_data = final_data.copy(deep=True)
        analysis_data = pd.merge(
            analysis_data, df_naics_consistent, on='naicsCode'
            )

        return analysis_data


    

[docs]
    def plot_best_characterized(self):
        """
        Identify which NAICS codes have the highest portion of facilities that
        have some unit-level characterization associated with them.

        """

        plot_data = self._fied.copy(deep=True)

        # find most aggregated NAICS (smallest number of digits)
        smallest_n = plot_data.naicsCode.unique()

        smallest_n = min([len(str(int(x))) for x in smallest_n])

        plot_data = self.make_consistent_naics_column(plot_data, smallest_n)

        plot_data = plot_data.groupby('stateCode') 

        plot_data = pd.concat([plot_data.apply(lambda x: pd.concat(
            [x.query("unitTypeStd.isnull()", engine="python").groupby(
                f'n{smallest_n}'
                ).apply(lambda y: np.size(y.registryID.unique())),
            x.query("unitTypeStd.notnull()", engine="python").groupby(
                f'n{smallest_n}'
                ).apply(lambda y: np.size(y.registryID.unique()))], 
            axis=1,
            ignore_index=False
            ))], axis=0)
        
        
        plot_data.columns = ['unchar', 'char']
        plot_data.loc[:, 'total'] = plot_data.sum(axis=1)
        plot_data.loc[:, 'fraction_char'] = plot_data.char.divide(
            plot_data.total
            )
        
        plot_data.fillna(0, inplace=True)
        
        # # State sorted mean characterized
        # state_char = pd.concat(
        #     [plot_data.fraction_char.mean(level=0),
        #      plot_data.total.sum(level=0)], axis=1
        #     ).sort_values(by='fraction_char', ascending=False)

        # # NAICS sorted mean characterized
        # naics_char = pd.concat(
        #     [plot_data.fraction_char.mean(level=1),
        #      plot_data.total.sum(level=1)],
        #      axis=1
        #     ).sort_values(by='fraction_char', ascending=False)
    
        # Level 0 is state aggregation; level 1 is NAICS aggregation
        for k in [0, 1]:

            d = pd.concat(
                [plot_data.fraction_char.mean(level=k),
                plot_data.total.sum(level=k)], axis=1
                ).sort_values(by='fraction_char', ascending=False)
            
            if k == 1:

                d.index = d.index.astype('str')

                d = d[d.fraction_char > 0]

            fig, ax = plt.subplots(figsize=(6, 12), tight_layout=False)
            
            sns.scatterplot(
                data=d,
                y=d.index,
                x="fraction_char",
                size="total",
                sizes=(10, 800),
                alpha=.5, ax=ax
                )
            
            if k == 1:
                [l.set_visible(False) for (i, l) in enumerate(ax.yaxis.get_ticklabels()) if i % 6 != 0]
                ax.set_ylabel('NAICS Code')
                
            else:
                ax.set_ylabel('State')
           
            ax.set_xlabel("Fraction of Facilities with Unit Characterization")
            plt.legend(
                title='Total Number of Facilities', 
                loc='best',
                frameon=False)

            plt.show()

            

    # def unit_capacity_nonintensive(self):
    #     """
        
    #     Parameters
    #     ----------

    #     df : pandas.DataFrame
    #         Final foundational dataset

    #     naics : str; {'all', 'non-mfg', 'intensive', 'other-mfg'} or list of ints; default='all'
    #         Group output by NAICS code group (str) or by list of integer(s). Integers must
    #         be at 4-digit NAICS level.

    #     Returns
    #     -------

    #     naics_plot : 
        
    #     """

    #     # plot_data = id_sectors(final_data)

    #     # Make subplots, one each for non-mfg,
    #     # intenstive, and other-mfg
    #     fig = make_subplots(
    #         rows=2, cols=2,
    #         specs=[
    #             [{}, {}],
    #             [{"colspan": 2}, None]
    #             ],
    #         print_grid=False)
        


[docs]
    def summary_table_intensive(self):
        """
        Summary description of unit coverage for intensive 
        and non-intensive industries.
        
        """


        df_naics = pd.DataFrame(
            self._fied.naicsCode.drop_duplicates()
            )

        df_naics.loc[:, 'n4'] = df_naics.naicsCode.apply(lambda x: int(str(x)[0:4]))

        plot_data = dict(
            all=pd.merge(self._fied, df_naics, on='naicsCode')
            )

        # NAICS subsectors of Paper Manufacturing, Petroleum and Coal Products,
        # Chemical Manufacturing, Glass and Glass Product Manufacturing, Cement and
        # Concrete, Lime and Gypsum, Iron & Steel, Alumina and Aluminum,
        intensive_naics = [3221, 3241, 3251, 3253, 3272, 3273, 3274, 3311, 3313]

        plot_data['non-mfg'] = \
            plot_data['all'][~plot_data['all'].n4.between(3000, 4000)]

        plot_data['intensive'] = \
            plot_data['all'][plot_data['all'].n4.isin(intensive_naics)]

        plot_data['other-mfg'] = \
            plot_data['all'][
                ~plot_data['all'].n4.isin(intensive_naics) & \
                plot_data['all'].n4.between(3000, 4000)]

        for k in plot_data.keys():
            if k == 'all':
                pass
            else:
                plot_data['all'].loc[plot_data[k].index, 'grouping'] = k      

        summ_table = pd.DataFrame(index=['intensive', 'other-mfg', 'non-mfg'])

        ind_grp = plot_data['all'].groupby('grouping')

        summ_table = pd.concat([
            ind_grp.apply(lambda x: len(x.registryID.unique())),
            ind_grp.apply(
                lambda x: len(x[x.unitTypeStd.notnull()].registryID.unique())
                ),
            ind_grp.apply(lambda x: len(x[
                (x.unitTypeStd.notnull()) &
                (x.designCapacity.notnull()) &
                (x.designCapacityUOM == 'MW')
                ].registryID.unique())
                ),
            ind_grp.apply(
                lambda x: x[x.designCapacityUOM == 'MW'].designCapacity.sum()
                ),
            ind_grp.apply(
                lambda x: x[x.designCapacityUOM == 'MW'].designCapacity.median()
                )], axis=1, ignore_index=False
            )

        summ_table.update(
            summ_table.loc[:, [1, 2]].divide(summ_table.loc[:, 0], axis=0)
            )

        summ_table.columns = ['Count of Facilities', 'Facilities with Unit Type',
                            'Facilities with Unit Type & Capacity',
                            'Total Capacity (MW)', 'Median Capacity (MW)']

        summ_table = summ_table.style.format({
            'Facilities with Unit Type': '{:.0%}',
            'Facilities with Unit Type & Capacity': '{:.0%}',
            'Total Capacity (MW)': '{:.2f}'}
            )


        
if __name__ == '__main__':
        
    year = 2017
    filepath = os.path.abspath('foundational_industry_data_2017.csv.gz')
    fa = FIED_analysis(year=year, file_path=filepath, fig_format='svg')

    fa.create_core_analysis(write_fig=False)