Source code for fied.ghgrp.get_GHGRP_data

# -*- coding: utf-8 -*-

import pandas as pd
import requests
import sys
import time
import logging
from requests.adapters import HTTPAdapter, Retry

logging.basicConfig(level=logging.INFO)


def requests_retry_session(retries=3, backoff_factor=0.7,
                           status_forcelist=(500, 502, 504), session=None):
    """
    Create a requests session that automatically retries failed calls.

    Parameters
    ----------
    retries : int
        Number of retries to allow.

    backoff_factor : float
        Backoff factor to apply between attempts.

    status_forcelist : tuple of int
        HTTP status codes to force a retry on.

    session : requests.Session or None
        Existing session to configure; a new one is created if None.

    Returns
    -------
    session : requests.Session
        Session with retry-enabled HTTP and HTTPS adapters mounted.
    """
    session = session or requests.Session()
    retry = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
        )
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)

    return session
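
# Illustrative usage sketch (not part of the original module): the retry
# session behaves like a plain requests.Session, so any Envirofacts endpoint
# can be queried through it. The table name and year below are arbitrary
# example values.
if __name__ == '__main__':
    demo_session = requests_retry_session(retries=5, backoff_factor=1.0)
    demo_resp = demo_session.get(
        'https://enviro.epa.gov/enviro/efservice/'
        'RLPS_GHG_EMITTER_FACILITIES/YEAR/2020/count/json'
        )
    logging.info(f'Retry-session demo returned HTTP {demo_resp.status_code}')
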
def get_count(table_url):
    """
    Get the number of rows for a specified GHGRP table (via url)
    from EPA Envirofacts.

    Parameters
    ----------
    table_url : str
        URL for Envirofacts API

    Returns
    -------
    row_count : int
        Count of table rows.
    """
    table_url = f'{table_url}/count/json'

    t0 = time.time()

    try:
        r = requests_retry_session().get(table_url)
        logging.info(f'{r.status_code}')

    except requests.exceptions.RequestException as e:
        logging.error(f'{e}\nTable url: {table_url}')
        sys.exit(1)

    try:
        row_count = r.json()[0]['TOTALQUERYRESULTS']

    except (IndexError, requests.exceptions.JSONDecodeError) as e:
        logging.error(f'Check API response: {e}\n{r.status_code}')
        sys.exit(1)

    t1 = time.time()
    logging.info(f"That took {t1 - t0} seconds")

    return row_count
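
# Illustrative usage sketch (table and year are arbitrary examples): build a
# table URL the same way get_GHGRP_records() does, then check how many rows
# are available before paging through them.
if __name__ == '__main__':
    demo_url = ('https://enviro.epa.gov/enviro/efservice/'
                'C_FUEL_LEVEL_INFORMATION/REPORTING_YEAR/2020')
    demo_count = get_count(demo_url)
    logging.info(f'C_FUEL_LEVEL_INFORMATION rows for 2020: {demo_count}')
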
def get_records(table_url, start_end):
    """
    Get specified rows for a specified GHGRP table (via url)
    from EPA Envirofacts.

    Parameters
    ----------
    table_url : str
        URL for Envirofacts API

    start_end : list of integers
        List indicating the starting and ending row to get
        (e.g., [0, 1000])

    Returns
    -------
    records_df : pandas.DataFrame
        DataFrame of records from Envirofacts API.
    """
    table_url = f'{table_url}/rows/{start_end[0]}:{start_end[1]}/json'

    t0 = time.time()

    try:
        r_records = requests_retry_session().get(table_url)
        logging.info(f'{r_records.status_code}')

    except requests.exceptions.RequestException as e:
        logging.error(f'{e}\n{table_url}')
        sys.exit(1)

    try:
        json_data = pd.DataFrame(r_records.json())

    except requests.exceptions.JSONDecodeError as e:
        logging.error(f'{e}\nTable URL: {table_url}\n{r_records.content}')
        sys.exit(1)

    t1 = time.time()
    logging.info(f"That took {t1 - t0} seconds")

    return json_data
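
# Illustrative usage sketch (arbitrary table and range): fetch the first 100
# rows of a table. start_end follows the API's rows/{first}:{last} convention
# used in get_records() above.
if __name__ == '__main__':
    demo_url = ('https://enviro.epa.gov/enviro/efservice/'
                'C_FUEL_LEVEL_INFORMATION/REPORTING_YEAR/2020')
    demo_page = get_records(demo_url, [0, 100])
    logging.info(f'Retrieved {len(demo_page)} records')
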
def get_GHGRP_records(reporting_year, table, rows=None, api_row_max=1000):
    """
    Return GHGRP data using EPA RESTful API based on specified reporting
    year and table. Tables of interest are C_FUEL_LEVEL_INFORMATION,
    D_FUEL_LEVEL_INFORMATION, c_configuration_level_info, and
    V_GHG_EMITTER_FACILITIES. Optional argument to specify number of
    table rows.

    Parameters
    ----------
    reporting_year : int
        Reporting year of GHGRP data

    table : str
        Name of GHGRP Envirofacts table to retrieve records from

    rows : int; default=None
        Number of table rows to retrieve, beginning at row 0.

    api_row_max : int; default=1000
        Maximum number of table rows to return at a time. Envirofacts
        API for the GHGRP seems to be overwhelmed by > 1000 rows.

    Returns
    -------
    ghgrp : pandas.DataFrame
        DataFrame of GHGRP Envirofacts data.
    """
    if table.startswith('V_GHG_EMITTER_'):
        # EPA changed their table names
        if table == 'V_GHG_EMITTER_FACILITIES':
            table = 'RLPS_GHG_EMITTER_FACILITIES'

        elif table == 'V_GHG_EMITTER_SUBPART':
            table = 'RLPS_GHG_EMITTER_SUBPART'

        table_url = \
            f'https://enviro.epa.gov/enviro/efservice/{table}/YEAR/{reporting_year}'

    else:
        table_url = \
            f'https://enviro.epa.gov/enviro/efservice/{table}/REPORTING_YEAR/{reporting_year}'

    frames = []

    if rows is None:
        nrecords = get_count(table_url)

        # API doesn't seem to be able to handle calls for more than
        # 1000 rows at a time.
        if nrecords > api_row_max:
            rrange = range(0, nrecords, api_row_max)

            for n in range(len(rrange) - 1):
                frames.append(
                    get_records(table_url, [rrange[n], rrange[n + 1]])
                    )

            frames.append(get_records(table_url, [rrange[-1], nrecords]))

        else:
            frames.append(get_records(table_url, [0, nrecords]))

    else:
        if rows > api_row_max:
            rrange = range(0, rows, api_row_max)

            for n in range(len(rrange) - 1):
                frames.append(
                    get_records(table_url, [rrange[n], rrange[n + 1]])
                    )

            frames.append(get_records(table_url, [rrange[-1], rows]))

        else:
            frames.append(get_records(table_url, [0, rows]))

    ghgrp = pd.concat(frames, ignore_index=True)
    ghgrp.drop_duplicates(inplace=True)
    ghgrp.columns = [c.upper() for c in ghgrp.columns]

    return ghgrp
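
# Illustrative end-to-end sketch (the reporting year is an arbitrary
# example): retrieve a full facility table for one year. Paging in
# api_row_max chunks is handled internally by get_GHGRP_records().
if __name__ == '__main__':
    demo_facilities = get_GHGRP_records(2020, 'V_GHG_EMITTER_FACILITIES')
    logging.info(
        f'{len(demo_facilities)} facility records; '
        f'columns: {list(demo_facilities.columns)}'
        )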