Source code for fied.ghgrp.get_GHGRP_data
# -*- coding: utf-8 -*-
import logging
import sys
import time

import pandas as pd
import requests
from requests.adapters import HTTPAdapter, Retry
logging.basicConfig(level=logging.INFO)
def requests_retry_session(retries=3, backoff_factor=0.7,
                           status_forcelist=(500, 502, 504), session=None):
    """
    Create a requests Session that automatically retries failed requests.

    Parameters
    ----------
    retries : int
        Number of retries to allow.
    backoff_factor : float
        Backoff factor to apply between retry attempts.
    status_forcelist : tuple of int
        HTTP status codes that force a retry.
    session : requests.Session, optional
        Existing session to mount the retry adapter on; a new session
        is created if None.

    Returns
    -------
    session : requests.Session
        Session configured with retry behavior.
    """
session = session or requests.Session()
retry = Retry(
total=retries,
read=retries,
connect=retries,
backoff_factor=backoff_factor,
status_forcelist=status_forcelist,
)
adapter = HTTPAdapter(max_retries=retry)
session.mount('http://', adapter)
session.mount('https://', adapter)
return session
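
# A minimal usage sketch (the endpoint below is illustrative): the
# returned session is a drop-in replacement for requests.Session, so
# failed GET requests are retried with exponential backoff.
#
#     session = requests_retry_session(retries=5)
#     r = session.get('https://enviro.epa.gov/enviro/efservice/...')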
def get_count(table_url):
    """
    Get the number of rows for a specified GHGRP table (via URL)
    from EPA Envirofacts.

    Parameters
    ----------
    table_url : str
        URL for the Envirofacts API table.

    Returns
    -------
    row_count : int
        Count of table rows.
    """
table_url = f'{table_url}/count/json'
t0 = time.time()
try:
        r = requests_retry_session().get(table_url)
        logging.info(f'Status code: {r.status_code}')
except requests.exceptions.RequestException as e:
        logging.error(f'{e}\nTable URL: {table_url}')
sys.exit(1)
try:
row_count = r.json()[0]['TOTALQUERYRESULTS']
except (IndexError, requests.exceptions.JSONDecodeError) as e:
        logging.error(f'Check API response: {e}\n{r.status_code}')
sys.exit(1)
t1 = time.time()
logging.info(f"That took {t1 - t0} seconds")
return row_count
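
# For reference (inferred from the parsing above), the count endpoint
# returns JSON shaped like [{"TOTALQUERYRESULTS": 12345}], from which
# get_count() extracts the integer row count.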
def get_records(table_url, start_end):
    """
    Get specified rows for a specified GHGRP table (via URL)
    from EPA Envirofacts.

    Parameters
    ----------
    table_url : str
        URL for the Envirofacts API table.
    start_end : list of int
        Starting and ending rows to retrieve (e.g., [0, 1000]).

    Returns
    -------
    records_df : pandas.DataFrame
        DataFrame of records from the Envirofacts API.
    """
table_url = f'{table_url}/rows/{start_end[0]}:{start_end[1]}/json'
t0 = time.time()
try:
        r_records = requests_retry_session().get(table_url)
        logging.info(f'Status code: {r_records.status_code}')
except requests.exceptions.RequestException as e:
logging.error(f'{e}\n{table_url}')
sys.exit(1)
try:
json_data = pd.DataFrame(r_records.json())
except requests.exceptions.JSONDecodeError as e:
logging.error(f'{e}\nTable URL: {table_url}\n{r_records.content}')
sys.exit(1)
t1 = time.time()
logging.info(f"That took {t1 - t0} seconds")
return json_data
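
# A minimal usage sketch (the table name and year are illustrative),
# fetching the first 1000 rows of a facilities table:
#
#     url = ('https://enviro.epa.gov/enviro/efservice/'
#            'RLPS_GHG_EMITTER_FACILITIES/YEAR/2019')
#     df = get_records(url, [0, 1000])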
def get_GHGRP_records(reporting_year, table, rows=None, api_row_max=1000):
    """
    Return GHGRP data using the EPA RESTful API, based on the specified
    reporting year and table. Tables of interest are
    C_FUEL_LEVEL_INFORMATION, D_FUEL_LEVEL_INFORMATION,
    c_configuration_level_info, and V_GHG_EMITTER_FACILITIES.

    Parameters
    ----------
    reporting_year : int
        Reporting year of GHGRP data.
    table : str
        Name of the GHGRP Envirofacts table to retrieve records from.
    rows : int, optional
        Number of table rows to retrieve, beginning at row 0. If None,
        all rows are retrieved.
    api_row_max : int, default 1000
        Maximum number of table rows to request at a time. The
        Envirofacts API for the GHGRP appears to be overwhelmed by
        requests for more than 1000 rows.

    Returns
    -------
    ghgrp : pandas.DataFrame
        DataFrame of GHGRP Envirofacts data.
    """
    if table.startswith('V_GHG_EMITTER_'):
        # EPA renamed these tables from V_GHG_* to RLPS_GHG_*.
        if table == 'V_GHG_EMITTER_FACILITIES':
            table = 'RLPS_GHG_EMITTER_FACILITIES'
        elif table == 'V_GHG_EMITTER_SUBPART':
            table = 'RLPS_GHG_EMITTER_SUBPART'
        table_url = f'https://enviro.epa.gov/enviro/efservice/{table}/YEAR/{reporting_year}'
    else:
        table_url = f'https://enviro.epa.gov/enviro/efservice/{table}/REPORTING_YEAR/{reporting_year}'
    ghgrp = pd.DataFrame()

    if rows is None:
        nrecords = get_count(table_url)

        # The API doesn't seem to be able to handle calls for more than
        # api_row_max rows at a time, so page through the table.
        if nrecords > api_row_max:
            rrange = range(0, nrecords, api_row_max)

            for n in range(len(rrange) - 1):
                json_data = get_records(table_url, [rrange[n], rrange[n + 1]])
                ghgrp = pd.concat([ghgrp, json_data], ignore_index=True)

            records_last = get_records(table_url, [rrange[-1], nrecords])
            ghgrp = pd.concat([ghgrp, records_last], ignore_index=True)

        else:
            json_data = get_records(table_url, [0, nrecords])
            ghgrp = pd.concat([ghgrp, json_data], ignore_index=True)
    else:
        if rows > api_row_max:
            rrange = range(0, rows, api_row_max)

            for n in range(len(rrange) - 1):
                json_data = get_records(table_url, [rrange[n], rrange[n + 1]])
                ghgrp = pd.concat([ghgrp, json_data], ignore_index=True)

            records_last = get_records(table_url, [rrange[-1], rows])
            ghgrp = pd.concat([ghgrp, records_last], ignore_index=True)

        else:
            json_data = get_records(table_url, [0, rows])
            ghgrp = pd.concat([ghgrp, json_data], ignore_index=True)

    ghgrp.drop_duplicates(inplace=True)
    ghgrp.columns = [c.upper() for c in ghgrp.columns]

    return ghgrp
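

if __name__ == '__main__':
    # A minimal usage sketch, assuming network access to the Envirofacts
    # API; the table name and reporting year are illustrative.
    facilities = get_GHGRP_records(2019, 'V_GHG_EMITTER_FACILITIES')
    logging.info(f'Retrieved {len(facilities)} facility records')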