Source code for fied.geocoder.geopandas_tools


import geopandas as gpd
import pandas as pd
import logging

logging.basicConfig(level=logging.INFO)



[docs]
class FiedGIS:
    
    def __init__(self):

        self._statefips = pd.read_csv(
            'https://www2.census.gov/geo/docs/reference/state.txt',
            sep='|', dtype={'STATE': str, 'STUSAB': str}
            )
        
        self._statefips = dict(
            self._statefips[['STUSAB', 'STATE']].values
            )


[docs]
    @staticmethod
    def get_shapefile(year=None, state_fips=None, ftype=None):
        """
        Get Census block group TIGER/Line shapefile for specified year 
        and state FIPS code, or get USGS HUC geodatabase. 

        Parameters
        ----------
        year : int
            Year of shapefile

        state_fips : str or None
            State FIPS of shapefile. Not necessary for congressional
            district.

        ftype : str, {'BG', 'CD', 'HUC'}
            Type of file to return. 'BG' == census block groups; 
            'CD' == congressional districts; 'HUC' == hydrolic unit code.

        Returns
        -------
        gf : geopandas.DataFrame
            gf
        
        """

        if ftype == 'BG':

            _url = f'https://www2.census.gov/geo/tiger/TIGER{year}/BG/tl_{year}_{state_fips}_bg.zip'

        elif ftype == 'CD':
            _url = f'https://www2.census.gov/geo/tiger/TIGER{year}/CD/tl_{year}_us_cd115.zip'

        elif ftype == 'COUNTY':
            _url = f'https://www2.census.gov/geo/tiger/TIGER{year}/COUNTY/tl_{year}_us_county.zip'

        elif ftype == 'HUC': 
            _url = 'https://prd-tnm.s3.amazonaws.com/StagedProducts/Hydrography/NHDPlusHR/National/GDB/NHDPlus_H_National_Release_1_GDB.zip'
        
        gf = gpd.read_file(_url)

        return gf

    

[docs]
    @staticmethod
    def merge_coordinates_geom(fied_state, gf, ftype=None, data_source='fied'):
        """"
        First creates POINT geometry from facility coordinates. Then 
        locates the points within specific geographic identifier type. Finally,
        merges geographic identifier with facility DataFrame.

        Parameters
        ----------
        fied_state : pandas.DataFrame
            DataFrame of foundational data by state

        gf : geopandas
            Shapefile containing Census tracts

        ftype : str, {'BG', 'CD', 'COUNTY', 'HUC'}
            Type of file to return. 'BG' == census block groups; 
            'CD' == congressional districts; 'COUNTY' == county FIPS; 'HUC' == hydrolic unit code.

        data_source : str, {'fied', 'ghgrp'}
            Source of data with missing geographic identifiers.

        Returns
        -------
        matched_geo : pandas.DataFrame
            Geographic identifiers matched to facility coordinates.
        
        """

        crs = "EPSG:4269"

        col_fix = {
            'HUC': {
                'geocolumn': '',
                're_column': 'HUC'
                },
            'BG': {
                'geocolumn': 'GEOID',
                're_column': 'geoID'
                },
            'COUNTY': {
                'geocolumn': 'GEOID',
                're_column': 'COUNTY_FIPS'
                },
            'CD': {
                'geocolumn': 'GEOID',
                're_column': 'legislativeDistrictNumber'
                }
            }
        
        try:
            geometry = gpd.points_from_xy(
                fied_state.longitude, fied_state.latitude, 
                crs=crs
                )
            
        except AttributeError:
            geometry = gpd.points_from_xy(
                fied_state.LONGITUDE, fied_state.LATITUDE, 
                crs=crs
                )

        gdf = gpd.GeoDataFrame(fied_state, crs=crs, geometry=geometry)

        matched_geo= gpd.sjoin(
            gdf, gf[[col_fix[ftype]['geocolumn'], 'geometry']], how='left', 
            predicate='within'
            )
        
        matched_geo.rename(
            columns={col_fix[ftype]['geocolumn']: col_fix[ftype]['re_column']},
            inplace=True
            )
        
        if data_source == 'fied':
            drop_cols =  ['geometry', 'latitude', 'longitude', 'index_right']

        elif data_source == 'ghgrp':
            drop_cols =  ['geometry', 'LATITUDE', 'LONGITUDE', 'index_right']

        matched_geo.drop(drop_cols, axis=1, inplace=True)

        
        return matched_geo



[docs]
    def merge_geom(self, df, year=None, ftypes=['BG', 'CD'], data_source='fied'):
        """
        Pulls together methods for creating Geopandas DataFrames from 
        geographic information files and merges geographic identifiers
        with the foundational data set.

        Parameters
        ----------
        df: pandas.DataFrame
            DataFrame with missing geographic data.

        year : int
            Year of foundational energy data.

        ftype : list; default=['BG', 'CD']
            Type of missing geo data to fill in.

        data_source : str, {'fied', 'ghgrp'}
            Source of missing geographic data. Used to specify
            columns in dataframe to use.

        Returns
        -------
        new_fied : pandas.DataFrame
            New foundational dataset with new columns 
            for geographic identifiers.

        """

        geo_data = pd.DataFrame()

        if data_source == 'fied':
            state_col = 'stateCode'
            data_cols = ['registryID', 'latitude', 'longitude']
            fac_id = 'registryID'

        elif data_source == 'ghgrp':
            state_col = 'STATE'
            data_cols = ['FACILITY_ID', 'LATITUDE', 'LONGITUDE']
            fac_id = 'FACILITY_ID'


        for state in df[state_col].unique():

            geo_data_state = pd.DataFrame()
    
            try:
                state_fips = self._statefips[state]

            except KeyError:
                continue

            df_state = pd.DataFrame(
                df.query(f"{state_col}==@state")[data_cols]
                )
            
            df_state.drop_duplicates(inplace=True)

            for t in ftypes:

                logging.info(f'Finding {t} for {state}')

                gf = FiedGIS.get_shapefile(
                    year=year, state_fips=state_fips,
                    ftype=t
                    )
                
                matched = FiedGIS.merge_coordinates_geom(
                    fied_state=df_state,
                    gf=gf,
                    ftype=t,
                    data_source=data_source
                    )

                geo_data_state = pd.concat(
                    [geo_data_state, matched.set_index(fac_id)], axis=1
                    )
            
            geo_data = geo_data.append(geo_data_state)

        if 'HUC' in ftypes:
            gf = FiedGIS.get_shapefile(year=year, ftype='HUC')
            hucs = FiedGIS.merge_coordinates_geom(df, gf, ftype='HUC')
            geo_data = pd.merge(
                geo_data, hucs,
                left_index=True, right_index=True, how='left'
                )

        else:
            pass

        if 'legislativeDistrictNumber' in df.columns:
            df.drop(['legislativeDistrictNumber'], axis=1, inplace=True)

        else:
            pass

        df = pd.merge(
            df, geo_data, left_on=fac_id, 
            right_index=True, how='left'
            )
        
        return df