# Source code for epa.registry

"""
Creates Registries (YAML schemas) for EPA data by introspecting
downloaded files
"""

#  Copyright (c) 2021. Harvard University
#
#  Developed by Research Software Engineering,
#  Faculty of Arts and Sciences, Research Computing (FAS RC)
#  Author: Michael A Bouzinier
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#         http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#

import os
from typing import Dict

import yaml

from nsaph import init_logging
from nsaph.loader.introspector import Introspector
from nsaph_utils.utils.context import Context, Argument, Cardinality

from epa import RECORD

# Identifier of the EPA domain. It is used as the top-level key and as the
# "schema" value of the generated domain dictionary, and as the default
# value of the "output" argument of EPAConfig.
DOMAIN_NAME = "epa"


class EPAConfig(Context):
    """
    Settings consumed by :class:`Registry`.

    Three single-valued arguments are declared:

    * ``output`` -- where the schema is written (defaults to ``DOMAIN_NAME``)
    * ``data`` -- the data file to introspect (required)
    * ``table`` -- the name of the table (required)
    """

    _output = Argument(
        "output",
        help="Output path for schema",
        type=str,
        default=DOMAIN_NAME,
        cardinality=Cardinality.single,
    )

    _data = Argument(
        "data",
        help="Path to a data file to introspect",
        type=str,
        required=True,
        default=None,
        cardinality=Cardinality.single,
    )

    _table = Argument(
        "table",
        help="Name of the table",
        type=str,
        required=True,
        default=None,
        cardinality=Cardinality.single,
    )

    def __init__(self, doc):
        """
        :param doc: text forwarded to the ``Context`` constructor
            (``Registry`` passes this module's docstring)
        """

        self.output = None
        ''' Output path for schema '''

        self.data = None
        ''' Path to a data file to introspect '''

        self.table = None
        ''' Name of the table '''

        # All attributes start as None; the base class is given this class
        # and the doc text, with include_default switched off.
        super().__init__(EPAConfig, doc, include_default=False)
class Registry:
    """
    Creates a YAML data model (registry) for an EPA table by
    introspecting a data file.

    The model is stored at the path given by ``context.output``. If a
    file already exists at that path, it is loaded and the description of
    the table is added to it; otherwise a new domain skeleton is created.
    The resulting model is written back to the same path.
    """

    def __init__(self, context: EPAConfig = None):
        """
        :param context: configuration to use. When omitted (or falsy),
            an ``EPAConfig`` is instantiated using this module's docstring.
        """

        init_logging()
        if not context:
            context = EPAConfig(__doc__).instantiate()
        self.context = context
        self.domain = None

    def update(self):
        """
        Loads (or creates) the domain model, adds the description of the
        table named ``context.table`` to it and writes the result to
        ``context.output``.

        An existing, non-empty registry file is expected to already
        contain the ``DOMAIN_NAME`` section with a ``tables`` mapping.
        """

        registry_path = self.context.output
        domain = None
        if os.path.isfile(registry_path):
            with open(registry_path, "rt") as f:
                domain = yaml.safe_load(f)
        # yaml.safe_load returns None for an empty document, so an
        # existing but empty file is treated like a missing one instead
        # of failing later when the domain is indexed.
        if domain is None:
            domain = self.create_domain_yaml()
        self.domain = domain
        self.create_table_yaml()
        with open(registry_path, "wt") as f:
            yaml.dump(self.domain, stream=f)

    @staticmethod
    def create_domain_yaml() -> Dict:
        """
        Creates the skeleton of the domain model, containing no tables.

        :return: a dictionary with a single key, ``DOMAIN_NAME``
        """

        return {
            DOMAIN_NAME: {
                "reference": "https://www.epa.gov/aqs",
                "schema": DOMAIN_NAME,
                "index": "selected",
                "header": True,
                "tables": {},
            }
        }

    def create_table_yaml(self):
        """
        Introspects the file at ``context.data`` and stores the resulting
        table description in ``self.domain`` under the name
        ``context.table``, replacing any entry with the same name.

        ``self.domain`` must be set (see :meth:`update`) before this
        method is called.
        """

        introspector = Introspector(self.context.data)
        introspector.introspect()
        table = {
            "columns": introspector.get_columns(),
            "primary_key": [RECORD],
        }
        self.domain[DOMAIN_NAME]["tables"][self.context.table] = table
# Script entry point: build a Registry with the default configuration
# and update the registry file.
if __name__ == '__main__':
    registry = Registry()
    registry.update()