Source code for fied.tools.naics_matcher


import pandas as pd


[docs] def naics_matcher(naics_column, naics_vintage=2017): """ Method for matching aggregated NAICS codes (i.e., <6 digit) with 6-digit NAICS codes. Parameters ---------- naics_column : pandas.Series Series of NAICS codes to match to 6-digit NAICS. naics_vintage : int; 2007, 2012, 2017, or 2022 Year of NAICS codes Returns ------- ncmatch : pandas.DataFrame Original NAICS matched to 6-digit versions. """ if naics_vintage < 2017: naics_url = f'https://www.census.gov/naics/{naics_vintage}NAICS/6-digit_{naics_vintage}_Codes.xls' else: naics_url = f'https://www.census.gov/naics/{naics_vintage}NAICS/6-digit_{naics_vintage}_Codes.xlsx' all_naics = pd.read_excel(naics_url, usecols=[0, 1], engine='openpyxl') all_naics.dropna(how='all', axis=1, inplace=True) all_naics.dropna(how='all', axis=0, inplace=True) all_naics[f'{naics_vintage} NAICS Code'] = \ all_naics[f'{naics_vintage} NAICS Code'].astype(int) nctest = pd.DataFrame(all_naics[f'{naics_vintage} NAICS Code'].values, columns=['n6']) for n in range(3, 6): nctest[f'n{n}'] = nctest.n6.apply( lambda x: int(str(x)[0:n]) ) # Drop duplicates naics_column = naics_column.drop_duplicates() # Match only < 6-digit NAICS if any([len(str(x)) == 6 for x in naics_column]): naics_column = naics_column.where( naics_column.apply(lambda x: len(str(int(x))) < 6) ).dropna() else: pass # Int format if naics_column.dtype != 'int32': for i, v in naics_column.iteritems(): try: naics_column.loc[i] = int(v) except ValueError: naics_column.drop(i, inplace=True, axis=0) else: pass # Loop not converting to int naics_column = naics_column.astype(int) ncmatch = pd.concat( [pd.merge(nctest[column], pd.DataFrame(naics_column), left_on=column, right_on=naics_column.name, how='left') for column in ['n6', 'n5', 'n4', 'n3']], axis=1 ) ncmatch = ncmatch[naics_column.name].dropna(how='all').join(nctest['n6']) nclen = len(ncmatch[naics_column.name].columns) if nclen > 1: ncog = pd.concat( [ncmatch.iloc[:, n].dropna().astype(int) for n in range(0, nclen)], axis=0, ignore_index=False ) ncmatch = pd.DataFrame(ncog).join(ncmatch['n6']) ncmatch.reset_index(inplace=True, drop=True) else: ncmatch = pd.concat( [ncmatch[naics_column.name].apply(lambda x: int(x.dropna()), axis=1), ncmatch['n6']], axis=1 ) return ncmatch