Source code for nsaph.data_model.inserter

#  Copyright (c) 2021. Harvard University
#
#  Developed by Research Software Engineering,
#  Faculty of Arts and Sciences, Research Computing (FAS RC)
#  Author: Michael A Bouzinier
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#         http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#

import os.path
import sys
import threading
from contextlib import contextmanager
from datetime import timedelta, datetime
import logging
from collections import OrderedDict
from typing import Optional, List
from pyresourcepool.pyresourcepool import ResourcePool
from timeit import default_timer as timer

from sortedcontainers import SortedDict
from psycopg2.extras import execute_values

import nsaph.dictionary.element
from nsaph.data_model.domain import Domain
from nsaph.data_model.utils import split, DataReader, regex
from nsaph.fips import fips_dict
from nsaph.pg_keywords import PG_SERIAL_TYPE, PG_SM_SERIAL_TYPE
from nsaph.util.executors import BlockingThreadPoolExecutor
from nsaph_utils.utils.io_utils import sizeof_fmt, SpecialValues

EMPTY_LIST_SIZE = sys.getsizeof([])


def compute(how, row):
    try:
        value = eval(how["eval"])
    except:
        value = None
    return value

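# Illustration (not part of the module) of how compute() evaluates a column
# whose source type is "compute". The spec and row below are hypothetical;
# _Table.prepare() turns the "code" template into an expression over the raw
# row, which compute() evaluates, returning None if evaluation fails:
#
#     how = {"eval": "int(row[3]) * 100"}   # built from {"code": "int({1}) * 100"}
#     row = ["25", "031", "2017", "42"]
#     compute(how, row)                     # -> 4200
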
class Inserter:
    def __init__(self, domain, root_table_name, reader: DataReader,
                 connections, page_size=1000):
        self.tables: List[Inserter._Table] = []
        self.page_size = page_size
        self.ready = False
        self.reader = reader
        self.domain = domain
        self.write_lock = threading.Lock()
        self.read_lock = threading.Lock()
        table = domain.find(root_table_name)
        self.prepare(root_table_name, table)
        if isinstance(connections, list):
            has_triggers = any([table.audit for table in self.tables])
            if has_triggers and len(connections) > 1:
                logging.warning("One of the tables uses triggers, "
                                "only one connection can be used")
                self.connections = ResourcePool(connections[0:1])
                self.capacity = 2
            else:
                self.connections = ResourcePool(connections)
                self.capacity = len(connections)
        else:
            self.connections = ResourcePool([connections])
            self.capacity = 1
        self.timings = dict()
        self.timestamps = dict()
        self.current_row = 0
        self.pushed_rows = 0
        self.volume = 0
        self.last_logged_row = 0
        self.in_wait_state = False

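    # Ingestion flow (summary of the methods below): the main thread reads
    # batches of up to page_size rows from the DataReader under read_lock,
    # while push() writes each batch with psycopg2's execute_values using a
    # connection taken from the ResourcePool. With more than one connection,
    # batches are handed to a BlockingThreadPoolExecutor so reading and
    # writing overlap; pushed_rows is updated under write_lock.
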
    def prepare(self, name, table):
        self.tables.append(self.Table(name, table))
        if "children" in table:
            for child_table in table["children"]:
                child_table_def = table["children"][child_table]
                if child_table_def.get("hard_linked"):
                    self.tables.append(
                        self.Table(child_table, child_table_def)
                    )
        self.ready = True
        for table in self.tables:
            logging.info(table.insert)

    def read_batch(self):
        batch = self.Batch()
        with self.read_lock:
            for row in self.reader.rows():
                self.current_row += 1
                self.volume += sys.getsizeof(row) - EMPTY_LIST_SIZE
                is_valid = True
                for table in self.tables:
                    records = table.read(row)
                    if records is None:
                        is_valid = False
                        logging.warning("Illegal row #{:d}: {}"
                                        .format(self.current_row, str(row)))
                        break
                    batch.add(table.name, records)
                if not is_valid:
                    continue
                batch.inc()
                if batch.rows >= self.page_size:
                    break
        return batch

    def import_file(self, limit=None, log_step=1000000) -> int:
        logging.info(
            "Autocommit is: {}. Page size = {:d}. Writer threads: {:d}. "
            "Logging progress every {:d} records"
            .format(self.get_autocommit(), self.page_size,
                    self.capacity, log_step)
        )
        self.reset_timer()
        self.stamp_time("start")
        if self.capacity > 1:
            max_tasks = self.capacity * 2 + 1
            with BlockingThreadPoolExecutor(max_queue_size=max_tasks,
                                            max_workers=self.capacity + 1,
                                            timeout=14400) as executor:
                self.in_wait_state = False
                l: int = self._loop(executor, limit, log_step)
                self.in_wait_state = True
                logging.info("Main loop has finished. "
                             "Waiting for inserter threads to finish")
                executor.wait_for_completion()
        else:
            l = self._loop(None, limit, log_step)
        logging.info("Total records imported from {}: {:d}"
                     .format(self.reader.get_path(), l))
        return l

    def _loop(self, executor: Optional[BlockingThreadPoolExecutor],
              limit=None, log_step=1000000) -> int:
        l: int = 0
        l1 = l
        while self.ready:
            with self.timer("read"):
                batch = self.read_batch()
            if batch.is_empty():
                self.ready = False
            l += batch.rows
            if batch.size() > 0:
                if executor:
                    executor.submit(self.push, batch)
                else:
                    self.push(batch)
            if l - l1 >= log_step:
                self.log_progress()
                l1 = l
            if limit and l >= int(limit):
                break
        return l

    def push(self, batch):
        with self.connections.get_resource() as connection:
            for table in self.tables:
                records = batch[table]
                if len(records) < 1:
                    logging.error("Trying to execute an empty batch")
                    continue
                with connection.cursor() as cursor, self.timer("store"):
                    try:
                        if self.in_wait_state:
                            ts = str(datetime.now())
                            tid = threading.get_ident()
                            logging.info(
                                "{} - {:d}. Sending last Batch[{:d}] to the database."
                                .format(ts, tid, len(records))
                            )
                        execute_values(cursor, table.insert, records,
                                       page_size=len(records))
                        if self.in_wait_state:
                            ts = str(datetime.now())
                            tid = threading.get_ident()
                            logging.info("{} - {:d}. Last Batch has been executed."
                                         .format(ts, tid))
                    except Exception as x:
                        msg = str(x)
                        logging.error("Error {}; while executing: {} with {:d} records"
                                      .format(msg, table.insert, len(records)))
                        self.drilldown(connection, table.insert, records)
        with self.write_lock:
            self.pushed_rows += batch.rows

    @staticmethod
    def drilldown(connection, sql: str, records: list):
        if not connection.autocommit:
            connection.rollback()
        cursor = connection.cursor()
        if not records:
            raise Exception("Empty records array for " + sql)
        n = len(records[0])
        sql = sql.replace(
            "%s", "({})".format(','.join(["%s" for _ in range(n)]))
        )
        for record in records:
            try:
                cursor.execute(sql, record)
            except Exception as x:
                msg = str(x)
                s = ", ".join([str(v) for v in record])
                logging.error("Drill down: error {} while executing: {} ({})"
                              .format(msg, sql, s))
                raise x

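    # Illustration of the statement rewrite performed by drilldown() above
    # (hypothetical table and columns). The execute_values statement
    #     INSERT INTO us.mytable (a, b, c) VALUES %s
    # becomes a per-record statement
    #     INSERT INTO us.mytable (a, b, c) VALUES (%s,%s,%s)
    # so each record is executed individually and the failing one is logged.
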
    def get_autocommit(self):
        ac = [
            connection.autocommit for connection in self.connections._objects
        ]
        if all(ac):
            return "ON"
        if all([(not a) for a in ac]):
            return "OFF"
        return ", ".join(["ON" if a else "OFF" for a in ac])

    def log_progress(self):
        t0 = self.get_timestamp("start")
        t1 = self.get_timestamp("last_logged", t0)
        now = timer()
        rate1 = float(self.pushed_rows - self.last_logged_row) / (now - t1)
        rate = float(self.pushed_rows) / (now - t0)
        rt = self.get_cumulative_timing("read")
        st = self.get_cumulative_timing("store")
        t = rt + st
        rts = str(timedelta(seconds=rt))
        sts = str(timedelta(seconds=st))
        if self.capacity > 1:
            rtl = ["{:.2f}".format(t) for t in self.get_timings("read")]
            stl = ["{:.2f}".format(t) for t in self.get_timings("store")]
            rts = "{} = {}".format(" + ".join(rtl), rts)
            sts = "{} = {}".format(" + ".join(stl), sts)
        with self.write_lock:
            # Move the baseline for the "current rate" once more than
            # two minutes have passed since it was last updated
            if (now - t1) > 120:
                self.last_logged_row = self.pushed_rows
                self.stamp_time("last_logged")
        path = os.path.basename(self.reader.get_path())
        sz = sizeof_fmt(self.volume)
        logging.info(
            "Records imported from {}: {:,} => {:,}; rate: {:,.2f} rec/sec; "
            "read: {:d}% / store: {:d}%; size: {}"
            .format(path, self.current_row, self.pushed_rows, rate,
                    int(rt * 100 / t), int(st * 100 / t), sz)
        )
        logging.debug(
            "Current rate: {:,.2f} rec/sec, time read: {}; time store: {}"
            .format(rate1, rts, sts)
        )
        if self.reader.count is not None and (
            0 < self.reader.count < self.current_row
        ):
            logging.error("Continue ingesting over the file size")

    def Batch(self):
        return self._Batch(self)

    class _Batch:
        def __init__(self, parent):
            self.data = {
                table.name: [] for table in parent.tables
            }
            self.rows = 0

        def add(self, table: str, records: list):
            self.data[table].extend(records)

        def inc(self):
            self.rows += 1

        def size(self):
            return min([len(records) for records in self.data.values()])

        def is_empty(self) -> bool:
            return self.rows == 0 and all(
                [len(records) == 0 for records in self.data.values()]
            )

        def __getitem__(self, item):
            if isinstance(item, Inserter._Table):
                item = item.name
            return self.data[item]

    def Table(self, *args, **kwargs):
        return self._Table(self, *args, **kwargs)

    class _Table:
        def __init__(self, parent, name: str, table: dict):
            self.reader = parent.reader
            self.reader_path = os.path.basename(self.reader.get_path())
            self.name = parent.domain.fqn(name)
            self.mapping = None
            self.range_columns = None
            self.no_empty_str = None
            self.computes = None
            self.pk = None
            self.insert = None
            self.range = None
            self.arrays = dict()
            self.audit = None
            self.file_column = None
            if "invalid.records" in table:
                self.audit = table["invalid.records"]
            self.prepare(table)

        def prepare(self, table: dict):
            primary_key = table["primary_key"]
            columns = table["columns"]
            self.mapping = SortedDict()
            self.computes = SortedDict()
            self.no_empty_str = set()
            self.range_columns = OrderedDict()
            for c in columns:
                name, column = split(c)
                source = None
                source_index = None
                try:
                    if "source" in column:
                        if isinstance(column["source"], str):
                            source = column["source"]
                        elif isinstance(column["source"], int):
                            source_index = column["source"]
                        elif isinstance(column["source"], dict):
                            t = column["source"]["type"]
                            if t == "column":
                                source = column["source"]["column"]
                            elif t == "multi_column":
                                if not self.range:
                                    raise Exception(
                                        "Multi columns require range: " + name
                                    )
                                pattern = column["source"]["pattern"]
                                self.range_columns[name] = dict()
                                _, rng = self.range
                                for v in rng:
                                    source = pattern.format(v)
                                    self.range_columns[name][v] = \
                                        self.reader.columns.index(source)
                                continue
                            elif t == "compute":
                                self.computes[name] = column["source"]
                                continue
                            elif t == "range":
                                if self.range:
                                    raise Exception(
                                        "Only one range is supported column {}: {}"
                                        .format(name, str(column["source"]))
                                    )
                                if "values" not in column["source"]:
                                    raise Exception(
                                        "Range must specify values for column {}: {}"
                                        .format(name, str(column["source"]))
                                    )
                                values = column["source"]["values"]
                                self.range = (name, values)
                                continue
                            elif t == "generated":
                                continue
                            elif t == "file":
                                self.file_column = name
                                continue
                            else:
                                raise Exception(
                                    "Invalid source for column {}: {}"
                                    .format(name, str(column["source"]))
                                )
                        else:
                            raise Exception(
                                "Invalid source for column {}: {}"
                                .format(name, str(column["source"]))
                            )
                    elif "type" in column and column["type"].upper() in [
                        PG_SERIAL_TYPE, PG_SM_SERIAL_TYPE
                    ]:
                        continue
                    else:
                        for f in self.reader.columns:
                            if name.lower() == f.lower():
                                source = f
                                break
                    if not source and source_index is None:
                        raise Exception(
                            "Source was not found for column {}".format(name)
                        )
                    if Domain.is_array(column):
                        r = regex(source)
                        i0 = len(self.reader.columns)
                        i1 = 0
                        for i, clmn in enumerate(self.reader.columns):
                            if r.fullmatch(clmn):
                                self.mapping[i] = name
                                i0 = min(i0, i)
                                i1 = max(i1, i)
                        self.arrays[i0] = i1
                    else:
                        if source and not source_index:
                            source_index = self.reader.columns.index(source)
                        self.mapping[source_index] = name
                        if "type" in column and column["type"].lower()[:4] not in [
                            "varc", "char", "text"
                        ]:
                            self.no_empty_str.add(source_index)
                except Exception as x:
                    raise Exception(
                        "Invalid specification for column {}; error: {}"
                        .format(name, str(x))
                    ) from x
            inverse_mapping = {
                item[1]: item[0] for item in self.mapping.items()
            }
            for c in self.computes.values():
                parameters = [
                    inverse_mapping[p] for p in c.get("parameters", [])
                ] + [
                    self.reader.columns.index(p) for p in c.get("columns", [])
                ]
                arguments = ["empty"] + [
                    "row[{:d}]".format(p) for p in parameters
                ]
                code = c["code"].format(*arguments)
                c["eval"] = code
            self.pk = {i for i in self.mapping if self.mapping[i] in primary_key}
            cc = []
            for i in self.mapping:
                name = self.mapping[i]
                if name not in cc:
                    cc.append(name)
            cc.extend(self.computes.keys())
            if self.range:
                cc.append(self.range[0])
                cc.extend(self.range_columns.keys())
            if self.file_column:
                cc.append(self.file_column)
            column_list = ", ".join(cc)
            self.insert = "INSERT INTO {table} ({columns}) VALUES %s".format(
                table=self.name, columns=column_list
            )

        def read(self, row) -> Optional[list]:
            if self.range:
                return self.read_multi(row)
            record = []
            is_valid = self.map(row, record)
            if not is_valid:
                return None
            for c in self.computes:
                value = compute(self.computes[c], row)
                record.append(value)
            if self.file_column:
                record.append(self.reader_path)
            return [record]

        def map(self, row, record):
            array = None
            array_end = None
            for i in self.mapping:
                if i in self.arrays:
                    array = []
                    array_end = self.arrays[i]
                is_end = array_end == i
                try:
                    value = row[i]
                except:
                    msg = "Error for column #{:d} ({})".format(
                        i + 1, str(self.mapping[i])
                    )
                    logging.exception(
                        msg + "\nWhile processing row: " + str(row)
                    )
                    raise
                if SpecialValues.is_missing(value):
                    if i in self.pk and self.audit is None:
                        return False
                    else:
                        value = None
                elif isinstance(value, str) and not value.strip():
                    value = None
                if array is None:
                    record.append(value)
                else:
                    array.append(value)
                    if is_end:
                        record.append(array)
                        array = None
                        array_end = None
            return True

        def read_multi(self, row) -> Optional[list]:
            records = []
            range_column, values = self.range
            for v in values:
                record = []
                is_valid = self.map(row, record)
                if not is_valid:
                    return None
                for c in self.computes:
                    value = compute(self.computes[c], row)
                    record.append(value)
                record.append(v)
                for c in self.range_columns:
                    record.append(row[self.range_columns[c][v]])
                records.append(record)
            return records

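    # Illustration of the "range" / "multi_column" expansion handled by _Table
    # above (hypothetical column definitions, roughly in domain-YAML form):
    #
    #     month: {source: {type: range, values: [1, 2, 3]}}
    #     pm25:  {source: {type: multi_column, pattern: "pm25_{:d}"}}
    #
    # For such a table, read_multi() emits one record per range value; each
    # record ends with the range value followed by the value taken from the
    # matching source column (pm25_1, pm25_2, pm25_3).
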
    @contextmanager
    def timer(self, context: str):
        t0 = timer()
        yield
        t1 = timer()
        tid = threading.get_ident()
        if tid not in self.timings:
            self.timings[tid] = dict()
        self.timings[tid][context] = \
            self.timings[tid].get(context, 0) + (t1 - t0)

    def get_thread_timing(self, context: str) -> float:
        tid = threading.get_ident()
        if tid not in self.timings:
            return 0
        return self.timings[tid].get(context, 0)

    def get_timings(self, context: str) -> List[float]:
        return [self.timings[tid].get(context, 0) for tid in self.timings]

    def get_cumulative_timing(self, context: str):
        return sum(self.get_timings(context))

    def stamp_time(self, context: str):
        self.timestamps[context] = timer()

    def get_timestamp(self, context: str, default=0):
        return self.timestamps.get(context, default)

    def reset_timer(self):
        self.timestamps.clear()
        self.timings.clear()

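
# A minimal usage sketch (not part of the module). The Domain and DataReader
# constructor calls below are assumptions and may differ from the actual
# signatures; the database parameters, YAML file and table name are
# hypothetical. Inserter and import_file are used as defined above:
#
#     import psycopg2
#     from nsaph.data_model.domain import Domain
#     from nsaph.data_model.utils import DataReader
#     from nsaph.data_model.inserter import Inserter
#
#     domain = Domain("census.yaml", "census")          # assumed constructor
#     connection = psycopg2.connect("dbname=nsaph")     # or a list of connections
#     connection.autocommit = True
#     with DataReader("data/census_2010.csv") as reader:  # assumed constructor
#         inserter = Inserter(domain, "census", reader, [connection],
#                             page_size=1000)
#         inserter.import_file(log_step=100000)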