Source code for nsaph_gis.downloader

"""
Utilities to download shapefiles from US Census website
"""

#  Copyright (c) 2022. Harvard University
#
#  Developed by Research Software Engineering,
#  Faculty of Arts and Sciences, Research Computing (FAS RC)
#  Author: Mikhail Polykovsky
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#         http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#

import os
import zipfile
from enum import Enum
from typing import Tuple
from urllib import request
import ssl
import certifi


from tqdm import tqdm



[docs]class CensusShapeCollection(Enum): genz = 'genz' tiger = 'tiger'
[docs]class GISDownloader: """ Geographic Downloader downloads shape files for given dates from https://www.census.gov/ """ COUNTY_TEMPLATE = 'https://www2.census.gov/geo/tiger/GENZ{year}/shp/cb_{year}_us_county_500k.zip' ZCTA_GENZ_TEMPLATE = 'https://www2.census.gov/geo/tiger/GENZ{year}/shp/cb_{year}_us_zcta510_500k.zip' ZCTA_TIGER_URLs = { 2008: 'https://www2.census.gov/geo/tiger/TIGER2008/tl_2008_us_zcta500.zip', 2010: 'https://www2.census.gov/geo/tiger/TIGER2010/ZCTA5/2010/tl_2010_us_zcta510.zip', } for y in range(2012, 2020): ZCTA_TIGER_URLs[y] = f'https://www2.census.gov/geo/tiger/TIGER{y}/ZCTA5/tl_{y}_us_zcta510.zip' for y in range(2020, 2023): ZCTA_TIGER_URLs[y] = f'https://www2.census.gov/geo/tiger/TIGER{y}/ZCTA520/tl_{y}_us_zcta520.zip'
[docs] @classmethod def download_shapes(cls, source: CensusShapeCollection, year: int, output_dir: str = None, strict: bool = False) -> None: cls.download_zcta(CensusShapeCollection.genz, year, output_dir, strict) cls.download_zcta(CensusShapeCollection.tiger, year, output_dir, strict) cls.download_county(year, output_dir, strict)
[docs] @classmethod def download_zcta(cls, source: CensusShapeCollection, year: int, output_dir: str = None, strict: bool = False) -> None: if source == CensusShapeCollection.genz: zip_url, is_exact = cls._get_genz_zcta_url(year) else: zip_url, is_exact = cls._get_tiger_zcta_url(year) if strict and not is_exact: raise ValueError(f'There is no census data for year { year }.') cls._download_shape(zip_url, output_dir)
[docs] @classmethod def download_county(cls, year: int, output_dir: str = None, strict: bool = False) -> None: county_url, is_exact = cls._get_county_url(year) if strict and not is_exact: raise ValueError(f'There is no census data for year { year }.') cls._download_shape(county_url, output_dir)
@classmethod def _download_shape(cls, url: str, output_dir: str = None) -> None: if output_dir is None: output_dir = '.' shape_file = url.rsplit('/', 1)[1] dest = os.path.join(output_dir, shape_file) if not os.path.exists(dest): https_proxy = os.environ.get('HTTPS_PROXY') if https_proxy: proxy = request.ProxyHandler({'http': https_proxy, 'https': https_proxy}) opener = request.build_opener(proxy) request.install_opener(opener) with tqdm(desc=f'Downloading {url}') as bar: def report(blocknum, bs, size): bar.total = size bar.update(bs) ssl._create_default_https_context = ssl._create_unverified_context request.urlretrieve(url, dest, reporthook=report) with zipfile.ZipFile(dest, 'r') as zip_ref: zip_ref.extractall(output_dir) @classmethod def _get_county_url(cls, year: int) -> Tuple[str, bool]: """ Method returns url to county shape file for nearest existing year data """ if year > 2020: return cls._get_county_url(2020)[0], False if year in (2012, 2011) or year < 2010: return cls._get_county_url(2010)[0], False if year == 2010: return 'https://www2.census.gov/geo/tiger/GENZ2010/gz_2010_us_050_00_500k.zip', True if year == 2013: return 'https://www2.census.gov/geo/tiger/GENZ2013/cb_2013_us_county_500k.zip', True if 2014 <= year <= 2020: return cls.COUNTY_TEMPLATE.format(year=year), True @classmethod def _get_genz_zcta_url(cls, year: int) -> Tuple[str, bool]: """ Method returns url to zip shape file for nearest existing year data """ if year > 2020: return cls._get_genz_zcta_url(2020)[0], False if year in (2012, 2011) or year < 2010: return cls._get_genz_zcta_url(2010)[0], False if year == 2010: return 'https://www2.census.gov/geo/tiger/GENZ2010/gz_2010_us_860_00_500k.zip', True if year == 2013: return 'https://www2.census.gov/geo/tiger/GENZ2013/cb_2013_us_zcta510_500k.zip', True if 2014 <= year <= 2019: return cls.ZCTA_GENZ_TEMPLATE.format(year=year), True if year == 2020: return 'https://www2.census.gov/geo/tiger/GENZ2020/shp/cb_2020_us_zcta520_500k.zip', True @classmethod def _get_tiger_zcta_url(cls, year: int) -> Tuple[str, bool]: """ Method returns url to zip shape file for nearest existing year data """ if year in cls.ZCTA_TIGER_URLs: return cls.ZCTA_TIGER_URLs[year], True available_years = sorted( [key for key in cls.ZCTA_TIGER_URLs], reverse=True ) for y in available_years: if y <= year: return cls.ZCTA_TIGER_URLs[y], False return cls.ZCTA_TIGER_URLs[available_years[-1]], False