Source code for gridmet.registry

"""
A utility to create skeleton data model (database schema)
for gridMET data
"""

#  Copyright (c) 2021. Harvard University
#
#  Developed by Research Software Engineering,
#  Faculty of Arts and Sciences, Research Computing (FAS RC)
#  Author: Michael A Bouzinier
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#         http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#

import os
import sys
from pathlib import Path
import yaml

from nsaph import init_logging
from nsaph.pg_keywords import PG_NUMERIC_TYPE, PG_DATE_TYPE, PG_STR_TYPE

from gridmet.config import Geography, GridmetVariable


DATE_COLUMN = "observation_date"
EXTRACT_YEAR = f"(EXTRACT (YEAR FROM {DATE_COLUMN}))::INT"


[docs]class Registry: """ This class creates YAML data model for gridMET tables. """ COMMON_COLUMNS = f""" - year: source: type: "generated" code: "GENERATED ALWAYS AS (EXTRACT (YEAR FROM {DATE_COLUMN})) STORED" type: INT index: true doc: The year of the observation - month: source: type: "generated" code: "GENERATED ALWAYS AS (EXTRACT (MONTH FROM {DATE_COLUMN})) STORED" type: INT index: true doc: The year of the observation - day_of_the_year: source: type: "generated" code: "GENERATED ALWAYS AS (EXTRACT (DOY FROM {DATE_COLUMN})) STORED" type: INT index: true doc: The year of the observation """ COUNTY_COLUMNS = """ - fips5: source: type: "generated" code: "GENERATED ALWAYS AS (county::INT) STORED" type: INT doc: County FIPS code as an integer, value is equal to "county". - fips2: source: type: "generated" code: "GENERATED ALWAYS AS (substring(county, 1, 2)::INT) STORED" type: INT doc: FIPS code of the US State in which the county is located - fips3: source: type: "generated" code: "GENERATED ALWAYS AS (substring(county, 3, 3)::INT) STORED" type: INT doc: FIPS code of the county without state FIPS code - state: source: type: "generated" code: "GENERATED ALWAYS AS (public.fips2state(substring(county, 1, 2)::VARCHAR)) STORED" - state_iso: source: type: "generated" code: "GENERATED ALWAYS AS (public.fips2state_iso(substring(county, 1, 2))) STORED" """ ZCTA_COLUMNS = f""" - state: index: true source: type: "generated" code: "GENERATED ALWAYS AS (public.zip_to_state({EXTRACT_YEAR}, zcta::INT)) STORED" doc: | This column is for informational purposes only. The US State or territory Id associated with this ZCTA. Some ZCTAs span over more than one states or territories. - city: source: type: "generated" code: "GENERATED ALWAYS AS (public.zip_to_city({EXTRACT_YEAR}, zcta::INT)) STORED" doc: | This column is for informational purposes only. The name of the city preferred by the US Postal service for the ZIP code associated with this ZCTA. - county: source: type: "generated" code: "GENERATED ALWAYS AS (public.zip_to_fips5({EXTRACT_YEAR}, zcta::INT)) STORED" doc: | This column is for informational purposes only. The US County FIPS code, for the county having the largest intersection in terms of population with this ZCTA. """ def __init__(self, destination:str, dn: str = None): self.destination = destination self.name = dn init_logging()
[docs] def update(self): with open(self.destination, "wt") as f: f.write(self.create_yaml()) return
[docs] def create_yaml(self): domain = { self.name: { "schema": self.name, "index": "all", "description": "NSAPH data model for gridMET climate data", "header": True, "quoting": 3, "tables": { } } } for geography in Geography: for band in GridmetVariable: bnd = band.value geo = geography.value date_column = "observation_date" tname = "{}_{}".format(geo, bnd) columns = [ {bnd: { "type": PG_NUMERIC_TYPE }}, {date_column: { "type": PG_DATE_TYPE, "source": "date" }}, {geo: { "type": PG_STR_TYPE }} ] common_columns = yaml.safe_load(self.COMMON_COLUMNS) columns.extend(common_columns) if geography == Geography.county: county_columns = yaml.safe_load(self.COUNTY_COLUMNS) columns.extend(county_columns) elif geography == Geography.zcta: zcta_columns = yaml.safe_load(self.ZCTA_COLUMNS) columns.extend(zcta_columns) table = { "columns": columns, "primary_key": [ geo, date_column ], "indices": { "dt_geo_idx": { "columns": [date_column, geo] }, "ym_idx": { "columns": ["year", "month"] }, "y_geo_idx": { "columns": ["year", geo] } } } domain[self.name]["tables"][tname] = table return yaml.dump(domain)
[docs] @staticmethod def built_in_registry_path(): src = Path(__file__).parents[3] return os.path.join(src, "yml", "gridmet.yaml")
if __name__ == '__main__': if len(sys.argv) > 2: dname = sys.argv[2] else: dname = None Registry(sys.argv[1], dname).update()