Source code for nsaph.dbt.create_test

"""
A utility to generate a test based on a sample table.

The tool introspects a table, given as an input
and generates a set of queries each of which tests that the data in
a certain column has not changed.

The queries are output into a file that can be executed
as a single SQL query producing a table with the following columns:

1. Name of the column being tested
2. What value is being tested not to change:
MD5 hash, number of distinct records, mean value or variance
3. Whether the value has changed (indicated by string `failed`) or remained
the same (indicated by string `passed`)

Individual queries are separated by a comment strings:

* `-- Test case end`
* `-- Test case start`

so a test runner can execute them individually if desired

"""
#  Copyright (c) 2021. Harvard University
#
#  Developed by Research Software Engineering,
#  Faculty of Arts and Sciences, Research Computing (FAS RC)
#  Author: Michael A Bouzinier
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#         http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#

from enum import Enum
from typing import List, Dict, Optional, Callable

from nsaph import init_logging
from nsaph.db import Connection
from nsaph.dbt.dbt_config import DBTConfig


[docs]class CType(Enum):
    categorical = "categorical"
    text = "text"
    integral = "integral"
    numeric = "numeric"
    date = 'date'


[docs]class Column:
    def __init__(self, name: str, ctype: CType, is_indexed: bool):
        self.name = name
        self.type = ctype
        self.is_indexed = is_indexed


[docs]class TableFingerprint:
    CATEGORICAL_THRESHOLD = 24
    def __init__(self, context: DBTConfig = None):
        if not context:
            context = DBTConfig(None, __doc__).instantiate()
        self.context = context
        if not self.context.table:
            raise ValueError("'--table' is a required option")
        init_logging(name="generate-test-" + self.context.table)
        if '.' in self.context.table:
            t = self.context.table.split('.')
            self.table = t[1]
            self.schema = t[0]
        else:
            self.table = self.context.table
            self.schema = "public"
        self.fqtn = f"{self.schema}.{self.table}"
        self.count_distinct = "SELECT COUNT(DISTINCT {c}) FROM " + self.fqtn
        self.columns: List[Column] = []
        self.test_cases: List[str] = []
        cnxn = Connection(self.context.db, self.context.connection)
        self.catalog = cnxn.parameters["database"]
        self.get_columns()

[docs]    def get_columns(self):
        i1 = """
                       coalesce(a.attname,                                                     
                                (('{' || pg_get_expr(                                          
                                            i.indexprs,                                        
                                            i.indrelid                                         
                                         )                                                     
                                      || '}')::text[]                                          
                                )[k.i]                                                         
                               ) AS index_column                                     
        """
        i2 = f"""
            CASE WHEN COLUMN_NAME IN (
                SELECT
                    {i1} 
                FROM pg_index i                                                                
                   CROSS JOIN LATERAL unnest(i.indkey) WITH ORDINALITY AS k(attnum, i)         
                   LEFT JOIN pg_attribute AS a                                                 
                      ON i.indrelid = a.attrelid AND k.attnum = a.attnum                       
                WHERE 
                    i.indrelid = '{self.fqtn}'::regclass
                    AND k.i = 1
            ) THEN true ELSE false END AS indexed
        """
        sql = f"""
            SELECT
                COLUMN_NAME,
                data_type,
                {i2}
            FROM 
                information_schema.columns
            WHERE 
                table_name = '{self.table}'
                AND table_schema = '{self.schema}'
                AND table_catalog = '{self.catalog}'
            ORDER BY 
                1        
        """
        with Connection(self.context.db, self.context.connection) as cnxn:
            with cnxn.cursor() as cursor:
                cursor.execute(sql)
                columns = [row for row in cursor]

            for c in columns:
                if c[1] in ['integer']:
                    t = CType.integral
                elif c[1] in ['numeric']:
                    t = CType.numeric
                elif c[1] in ['date']:
                    t = CType.date
                else:
                    t = CType.text
                dc = 1024*1024*1024
                if t != 'numeric':
                    with cnxn.cursor() as cursor:
                        cursor.execute(self.count_distinct.format(c=c[0]))
                        for row in cursor:
                            dc = row[0]
                if dc < self.CATEGORICAL_THRESHOLD and t == CType.text:
                    t = CType.categorical
                if t == CType.integral and dc > self.CATEGORICAL_THRESHOLD:
                    t = CType.numeric
                self.columns.append(Column(c[0], t, c[2]))
        return

[docs]    def get_categories(self) -> List[Column]:
        return [
            c for c in self.columns if c.is_indexed and c.type in
                    [CType.integral, CType.categorical]
        ]

[docs]    def generate_tests(self):
        for c in self.columns:
            self.test_column(c)

[docs]    def test_column(self, c: Column):
        if c.type in [CType.text, CType.categorical, CType.integral, CType.date]:
            s1 = self.count_distinct.format(c=c.name)
            test_case = self.test_exact(s1, c.name, "count distinct")
            self.test_cases.append(test_case)
        if c.type in [CType.text, CType.categorical]:
            s2 = f"SELECT MD5(string_agg({c.name}::varchar, '')) FROM {self.fqtn}"
            test_case = self.test_exact(s2, c.name, "MD5 value")
            self.test_cases.append(test_case)
        if c.type in [CType.numeric, CType.integral]:
            s3 = f"SELECT AVG({c.name}) FROM {self.fqtn}"
            test_case = self.test_approximate(s3, c.name, "Mean value")
            self.test_cases.append(test_case)
            s4 = f"SELECT VARIANCE({c.name}) FROM {self.fqtn}"
            test_case = self.test_approximate(s4, c.name, "Variance")
            self.test_cases.append(test_case)

[docs]    def test_case_sql(self, name: str, test: str, condition: str) -> str:
        return "SELECT \n" \
               + f"\t'{self.fqtn}.{name}' As table_column,\n" \
               + f"\t'{test}' As Testing,\n" \
               + "\tCASE \n" \
               + f"\t\tWHEN {condition} \n" \
               + "\t\tTHEN true ELSE false END AS passed\n"

[docs]    def test_exact(self, sql: str, name: str, test: str) -> str:
        v = None
        with Connection(self.context.db, self.context.connection) as cnxn:
            with cnxn.cursor() as cursor:
                cursor.execute(sql)
                for row in cursor:
                    v = row[0]
        if v is None:
            condition = f"({sql}) IS NULL"
        else:
            condition = f"({sql}) = '{str(v)}'"
        test_case = self.test_case_sql(name, test, condition)
        return test_case

[docs]    def test_approximate(self, sql: str, name: str, test: str) -> str:
        v = None
        with Connection(self.context.db, self.context.connection) as cnxn:
            with cnxn.cursor() as cursor:
                cursor.execute(sql)
                for row in cursor:
                    v = row[0]
        if v >= 0:
            v1 = 0.99 * float(v)
            v2 = 1.01 * float(v)
        else:
            v2 = 0.99 * float(v)
            v1 = 1.01 * float(v)
        condition = f"({sql}) BETWEEN {str(v1)} AND {str(v2)}"
        test_case = self.test_case_sql(name, test, condition)
        return test_case

[docs]    def union(self):
        return "UNION ALL\n".join(self.test_cases)

[docs]    def write_test_script(self):
        first = True
        with open(self.context.script[0], "wt") as script:
            for test_case in self.test_cases:
                if not first:
                    print("UNION ALL", file=script)
                first = False
                print("-- Test case start", file=script)
                print(test_case, file=script)
                print("-- Test case end", file=script)


if __name__ == '__main__':
    fingerprint = TableFingerprint()
    fingerprint.generate_tests()
    # print(fingerprint.union())
    fingerprint.write_test_script()