# Source code for fied.geocoder.geopandas_tools


import geopandas as gpd
import pandas as pd
import logging

# Configure the root logger when this module is imported so that INFO-level
# progress messages (such as the ones logged by FiedGIS.merge_geom) are
# shown by default.
logging.basicConfig(level=logging.INFO)


class FiedGIS:
    """
    Match facility coordinates to geographic identifiers.

    Downloads boundary files (Census TIGER/Line shapefiles or the USGS
    hydrography geodatabase), spatially joins facility latitude/longitude
    points to them, and merges the resulting identifiers back onto the
    facility data.

    Attributes
    ----------
    _statefips : dict
        Maps the ``STUSAB`` column of the Census ``state.txt`` reference
        file to its ``STATE`` column (both read as strings).
    """

    # Column names used by each supported ``data_source``. Keeping them in
    # one place guarantees that merge_geom and merge_coordinates_geom agree
    # on which columns hold the facility ID and the coordinates.
    _SOURCE_COLUMNS = {
        'fied': {
            'state': 'stateCode',
            'fac_id': 'registryID',
            'lat': 'latitude',
            'lon': 'longitude',
        },
        'ghgrp': {
            'state': 'STATE',
            'fac_id': 'FACILITY_ID',
            'lat': 'LATITUDE',
            'lon': 'LONGITUDE',
        },
    }

    def __init__(self):
        """Download the Census state reference table and build the
        state-abbreviation lookup (requires network access)."""
        state_table = pd.read_csv(
            'https://www2.census.gov/geo/docs/reference/state.txt',
            sep='|',
            # Read codes as strings so leading zeros are preserved.
            dtype={'STATE': str, 'STUSAB': str}
        )

        self._statefips = dict(state_table[['STUSAB', 'STATE']].values)

    @staticmethod
    def _source_columns(data_source):
        """Return the column-name mapping for `data_source`.

        Raises
        ------
        ValueError
            If `data_source` is not one of the supported sources.
        """
        try:
            return FiedGIS._SOURCE_COLUMNS[data_source]
        except (KeyError, TypeError):
            raise ValueError(
                f"Unsupported data_source {data_source!r}; expected one of "
                f"{sorted(FiedGIS._SOURCE_COLUMNS)}"
            ) from None

    @staticmethod
    def _shapefile_url(year, state_fips, ftype):
        """Build the download URL for the requested boundary file.

        Raises
        ------
        ValueError
            If `ftype` is unsupported, or if `year` (or `state_fips` for
            'BG') is needed to build the URL but was not given.
        """
        if ftype == 'HUC':
            # The hydrography geodatabase is a single national release; it
            # depends on neither year nor state.
            return 'https://prd-tnm.s3.amazonaws.com/StagedProducts/Hydrography/NHDPlusHR/National/GDB/NHDPlus_H_National_Release_1_GDB.zip'

        if ftype not in ('BG', 'CD', 'COUNTY'):
            raise ValueError(
                f"Unsupported ftype {ftype!r}; expected one of "
                "'BG', 'CD', 'COUNTY', 'HUC'"
            )

        if year is None:
            raise ValueError(f"year is required for ftype {ftype!r}")

        if ftype == 'BG':
            if state_fips is None:
                raise ValueError("state_fips is required for ftype 'BG'")
            return f'https://www2.census.gov/geo/tiger/TIGER{year}/BG/tl_{year}_{state_fips}_bg.zip'

        if ftype == 'CD':
            # NOTE(review): the file name is fixed to 'cd115', so this only
            # resolves for TIGER vintages that publish that file -- confirm
            # which years are intended.
            return f'https://www2.census.gov/geo/tiger/TIGER{year}/CD/tl_{year}_us_cd115.zip'

        return f'https://www2.census.gov/geo/tiger/TIGER{year}/COUNTY/tl_{year}_us_county.zip'

    @staticmethod
    def get_shapefile(year=None, state_fips=None, ftype=None):
        """
        Download a Census TIGER/Line shapefile (block groups, congressional
        districts, or counties) or the USGS hydrography geodatabase and
        read it with geopandas.

        Parameters
        ----------
        year : int
            Year of shapefile. Required for every `ftype` except 'HUC'.

        state_fips : str or None
            State FIPS of shapefile. Only used (and required) for 'BG'.

        ftype : str, {'BG', 'CD', 'COUNTY', 'HUC'}
            Type of file to return. 'BG' == census block groups;
            'CD' == congressional districts; 'COUNTY' == counties;
            'HUC' == hydrologic unit code.

        Returns
        -------
        gf : geopandas.GeoDataFrame
            Contents of the downloaded file as returned by
            `geopandas.read_file`.

        Raises
        ------
        ValueError
            If `ftype` is not supported or a required argument is missing.
        """
        url = FiedGIS._shapefile_url(year, state_fips, ftype)

        return gpd.read_file(url)

    @staticmethod
    def merge_coordinates_geom(fied_state, gf, ftype=None, data_source='fied'):
        """
        First creates POINT geometry from facility coordinates. Then
        locates the points within the geometries of `gf`. Finally, returns
        the facility rows with the matched geographic identifier attached.

        Parameters
        ----------
        fied_state : pandas.DataFrame
            Facility data containing the coordinate columns of
            `data_source` ('latitude'/'longitude' for 'fied',
            'LATITUDE'/'LONGITUDE' for 'ghgrp').

        gf : geopandas.GeoDataFrame
            Boundary geometries with the identifier column for `ftype`
            (e.g. as returned by `get_shapefile`).

        ftype : str, {'BG', 'CD', 'COUNTY', 'HUC'}
            Type of identifier to match. 'BG' == census block groups;
            'CD' == congressional districts; 'COUNTY' == county FIPS;
            'HUC' == hydrologic unit code.

        data_source : str, {'fied', 'ghgrp'}
            Source of data with missing geographic identifiers; selects
            the coordinate column names.

        Returns
        -------
        matched_geo : pandas.DataFrame
            Input rows without their coordinate columns, plus the matched
            identifier renamed to 'geoID' (BG), 'COUNTY_FIPS' (COUNTY),
            'legislativeDistrictNumber' (CD) or 'HUC' (HUC). Points that
            fall within no geometry keep their row (left join).

        Raises
        ------
        ValueError
            If `data_source` is not supported.
        KeyError
            If `ftype` is not supported or an expected column is missing.
        """
        # Points are labelled with this CRS; it is assumed to match the
        # CRS of `gf`, otherwise the spatial join is not meaningful.
        crs = "EPSG:4269"

        # Identifier column in `gf` and the name it gets in the output.
        # NOTE(review): 'HUC' has an empty identifier column name, so
        # selecting it from `gf` below will presumably raise KeyError
        # unless `gf` has a column named ''. HUC matching looks unfinished.
        col_fix = {
            'HUC': {
                'geocolumn': '',
                're_column': 'HUC'
            },
            'BG': {
                'geocolumn': 'GEOID',
                're_column': 'geoID'
            },
            'COUNTY': {
                'geocolumn': 'GEOID',
                're_column': 'COUNTY_FIPS'
            },
            'CD': {
                'geocolumn': 'GEOID',
                're_column': 'legislativeDistrictNumber'
            }
        }

        # Validate the arguments before doing any geometry work.
        source_cols = FiedGIS._source_columns(data_source)
        geo_col = col_fix[ftype]['geocolumn']
        out_col = col_fix[ftype]['re_column']
        lat_col = source_cols['lat']
        lon_col = source_cols['lon']

        points = gpd.points_from_xy(
            fied_state[lon_col], fied_state[lat_col], crs=crs
        )

        facilities = gpd.GeoDataFrame(fied_state, crs=crs, geometry=points)

        matched_geo = gpd.sjoin(
            facilities,
            gf[[geo_col, 'geometry']],
            how='left',
            predicate='within'
        )

        matched_geo = matched_geo.rename(columns={geo_col: out_col})

        # Only the identifier is wanted downstream; drop the join helpers
        # and the coordinates that were used to build the points.
        matched_geo = matched_geo.drop(
            columns=['geometry', lat_col, lon_col, 'index_right']
        )

        return matched_geo

    def merge_geom(self, df, year=None, ftypes=('BG', 'CD'),
                   data_source='fied'):
        """
        Pulls together methods for creating Geopandas DataFrames from
        geographic information files and merges geographic identifiers
        with the foundational data set.

        Parameters
        ----------
        df : pandas.DataFrame
            DataFrame with missing geographic data. Must contain the
            state, facility ID and coordinate columns of `data_source`.
            It is not modified.

        year : int
            Year of foundational energy data; used to select the
            boundary files.

        ftypes : sequence of str; default=('BG', 'CD')
            Types of missing geo data to fill in.

        data_source : str, {'fied', 'ghgrp'}
            Source of missing geographic data. Used to specify columns
            in dataframe to use.

        Returns
        -------
        new_fied : pandas.DataFrame
            Copy of `df` with new columns for the matched geographic
            identifiers. Any existing 'legislativeDistrictNumber' column
            is removed before the new identifiers are merged in.

        Raises
        ------
        ValueError
            If `data_source` is not supported.
        """
        source_cols = FiedGIS._source_columns(data_source)
        state_col = source_cols['state']
        fac_id = source_cols['fac_id']
        data_cols = [fac_id, source_cols['lat'], source_cols['lon']]

        state_frames = []

        for state in df[state_col].unique():

            try:
                state_fips = self._statefips[state]

            except KeyError:
                # Not a known state abbreviation: nothing to match.
                continue

            # One row per distinct facility/coordinate combination.
            df_state = df.loc[df[state_col] == state, data_cols]
            df_state = df_state.drop_duplicates()

            geo_data_state = pd.DataFrame()

            # NOTE(review): if 'HUC' is in ftypes it is processed here for
            # every state as well as once more below -- confirm intent.
            for t in ftypes:

                logging.info('Finding %s for %s', t, state)

                gf = FiedGIS.get_shapefile(
                    year=year, state_fips=state_fips, ftype=t
                )

                matched = FiedGIS.merge_coordinates_geom(
                    fied_state=df_state, gf=gf, ftype=t,
                    data_source=data_source
                )

                # Identifier types sit side by side, keyed by facility ID.
                geo_data_state = pd.concat(
                    [geo_data_state, matched.set_index(fac_id)], axis=1
                )

            state_frames.append(geo_data_state)

        # DataFrame.append was removed in pandas 2.0; stack the per-state
        # results with a single concat (which needs at least one frame).
        geo_data = pd.concat(state_frames) if state_frames else pd.DataFrame()

        if 'HUC' in ftypes:

            gf = FiedGIS.get_shapefile(year=year, ftype='HUC')

            hucs = FiedGIS.merge_coordinates_geom(df, gf, ftype='HUC')

            geo_data = pd.merge(
                geo_data, hucs, left_index=True, right_index=True,
                how='left'
            )

        # Drop on a copy so the caller's DataFrame is left untouched.
        # NOTE(review): the column is removed even when 'CD' was not
        # requested -- confirm that is intended.
        df = df.drop(columns=['legislativeDistrictNumber'], errors='ignore')

        if geo_data.empty:
            # No identifiers were matched, so there is nothing to merge.
            return df

        df = pd.merge(
            df, geo_data, left_on=fac_id, right_index=True, how='left'
        )

        return df