Source code for census.query

"""
query.py
=================================================
Functions for Requesting Data from the Census API
"""

#  Copyright (c) 2022. Harvard University
#
#  Developed by Harvard T.H. Chan School of Public Health
#  (HSPH) and Research Software Engineering,
#  Faculty of Arts and Sciences, Research Computing (FAS RC)
#  Author: Ben Sabath (https://github.com/mbsabath)                                                                                                                                          medicaid.rst
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#         http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#

import logging
import os

import pandas as pd
import requests

from .census_info import get_endpoint, get_varlist
from .exceptions import CensusException

# Code for downloading the census data

SUPPORTED_GEOMETRIES = ["county", "state", "zcta", "block group", "tract"]

LOG = logging.getLogger(__name__)


[docs]def get_census_data(year: int, variables: list, geography: str, dataset: str, sum_file: str = None, key: str = None, state: str = None, county: str = None): """ :param year: Year of data that we are querying :param variables: list of strings containing the census variable names to request :param geography: Geographic resolution we're querying at (zcta, county, state) :param dataset: The census data set you want (dec, acs1, acs5, pums) :param sum_file: For the 2000 census, sf1 or sf3 :param key: Your census API key. We recommend not passing it here and instead either setting the "CENSUS_API_KEY" environmental variable or using the `set_api_key` function. :param state: 2 digit FIPS code of the state you want to limit the query to (i.e. "06" for CA) :param county: 3 digit FIPS code of the county you want to include. Requires state to be specified :return: a pandas DataFrame """ if dataset == "acs": dataset = "acs5" if dataset == "census": dataset = "dec" if year == 2000 and dataset == "dec" and sum_file is None: sum_file = _choose_sum_file(variables) endpoint = get_endpoint(year, dataset, sum_file) geography = api_geography(geography) if key is None: if "CENSUS_API_KEY" in os.environ.keys(): key = os.environ["CENSUS_API_KEY"] if type(variables) is str: variables = [variables] if dataset in ["acs1", "acs5"]: _clean_acs_vars(variables) options = dict() options['get'] = _prep_vars(variables) options['for'] = geography if state is not None: options['in'] = 'state:' + state if county is not None: options['in'] += '+county:' + county if key is not None: options['key'] = os.environ['CENSUS_API_KEY'] num_tries = 0 while num_tries < 5: try: response = requests.get(endpoint, params=options) response.raise_for_status() break except: LOG.warning("Query Failed, re-trying") num_tries += 1 if num_tries >= 5: LOG.critical("Unable to complete query %s after %s tries", endpoint, num_tries) raise CensusException("Unable to complete query " + endpoint + " after " + str(num_tries) + " tries") out = response.json() out = pd.DataFrame(out[1:], columns=out[0]) # handle conversion of variables to numeric for var in variables: try: out[var] = out[var].apply(pd.to_numeric) except ValueError: LOG.error("Unable to convert variable " + var + " to Numeric array") out['year'] = year return out
def _clean_acs_vars(variables: list): """ Ensure that the estimate value is specified for a list of ACS variables :param variables: list of strings containing the variable names to request :return: list of strings containing ACS vars, postpended with an "E" (where not previously postended) """ # If vars is not a string, assume it is a list of strings for i in range(len(variables)): if variables[i][-1] != "E": variables[i] += "E" def _prep_vars(variables: list): """ Convert from a list to a comma separated string :param variables: list of vars :return: comma separated string """ # if vars is not a string, assume it's a list of multiple strings out = "" for i in range(len(variables) - 1): out += variables[i] out += "," out += variables[-1] return out
[docs]def api_geography(geo: str): """ go from function shorthand to the input the census api needs :param geo: shorthand for a given geography type :return: corrected geography name """ geo = geo.lower() if geo not in SUPPORTED_GEOMETRIES: raise CensusException("Input Geometry not supported") if geo == "zcta": return "zip code tabulation area" else: return geo
def _choose_sum_file(variables: list): """ Internal function, not exported If we're querying the 2000 US census, and we're not sure which summary file to use, we need to check to see if the variables listed are available in sf1. If they are, we use sf1, if not we use sf3. """ varlist = get_varlist(2000, "dec", "sf1") sf1 = True for var in variables: sf1 = sf1 and (var in varlist) if sf1: return "sf1" else: return "sf3"