"""
Generic object for testing for data quality issues.
Tester class contains list of tests to run on data. Tests contain a variable name, a condition, and a severity
"""
# Copyright (c) 2021. Harvard University
#
# Developed by Research Software Engineering,
# Faculty of Arts and Sciences, Research Computing (FAS RC)
# Author: Ben Sabath
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
from enum import Enum

import numpy as np
import pandas as pd
import yaml


class Condition(Enum):
    """
    Conditions that a Test can check.
    """
    less_than = "lt"
    greater_than = "gt"
    data_type = "dtype"
    no_missing = "no_nan"
    count_missing = "count_nan"


class Severity(Enum):
    """
    Logging levels used when reporting test failures.
    """
    debug = logging.DEBUG
    info = logging.INFO
    warning = logging.WARNING
    error = logging.ERROR
    critical = logging.CRITICAL


class ExpectationError(Exception):
    """
    Error raised when the expected value supplied for a condition cannot be valid.
    """
    pass


class Test:
    """
    A single data quality test: a variable name, a condition, and a severity.
    """

    def __init__(self, variable, condition, severity, val=None, name=None, logger=None):
        self.variable = variable
        self.condition = condition
        self.val = val
        """
        Value to compare against; may be omitted for a ``no_missing`` check.
        """
        self.severity = severity
        self.name = name
        if not name:
            self.name = self.variable + "_" + self.condition.value
            if self.val:
                self.name += "_" + str(self.val)
        if logger:
            self.__logger = logger
        else:
            self.__logger = logging.getLogger(__name__ + ".Test." + self.name)
        self._validate_test()
        self.expectation = self._construct_expectation()

    def _validate_test(self):
        """
        Confirm that inputs define a valid test

        :return:
        """
        if self.condition == Condition.count_missing and self.val < 0:
            raise ExpectationError(self.name + ": Count Missing conditions must expect at least 0 missing rows")

    def _construct_expectation(self):
        """
        Phrase test expectation in words

        :return: str
        """
        out = ""
        out += self.name + ":" + self.severity.name + ": " + "For variable " + self.variable + ": "
        if self.condition == Condition.count_missing:
            if 0 < self.val < 1:
                out += "less than " + "%2.2f" % (self.val * 100) + "% missing"
            else:
                out += "less than " + str(self.val) + " values missing"
        elif self.condition == Condition.no_missing:
            out += "no missing values"
        elif self.condition == Condition.data_type:
            out += "all values are " + self.val
        elif self.condition == Condition.greater_than:
            out += "all values greater than " + str(self.val)
        elif self.condition == Condition.less_than:
            out += "all values less than " + str(self.val)
        return out

    def check(self, df: pd.DataFrame):
        """
        Check the variable of the input data frame to see if it meets the condition.

        :param df: Pandas data frame
        :return: boolean indicating whether the data passed the test
        """
        message = None
        result = None
        if self.condition == Condition.count_missing:
            count = sum(np.isnan(df[self.variable]))
            if 1 > self.val > 0:
                # assume the expectation is a percentage
                result = count / len(df.index) < self.val
                if not result:
                    message = self.expectation + ". " + "%2.2f" % (count / len(df.index) * 100) + \
                              "% missing values observed for " + self.variable
            else:
                result = count < self.val
                if not result:
                    message = self.expectation + ". " + str(count) + \
                              " missing values observed for " + self.variable
        elif self.condition == Condition.data_type:
            result = type(df.loc[0, self.variable]).__name__ == self.val
            if not result:
                message = self.expectation + ". Observed type: " + type(df.loc[0, self.variable]).__name__
        else:
            if self.condition == Condition.less_than:
                count = sum(df[self.variable] > self.val)
            elif self.condition == Condition.greater_than:
                count = sum(df[self.variable] < self.val)
            elif self.condition == Condition.no_missing:
                count = sum(np.isnan(df[self.variable]))
            result = count == 0
            if not result:
                message = self.expectation + ". check failed. " + "%2.2f" % (count / len(df.index) * 100) + \
                          "% of observations with invalid values."
        if message:
            self.__logger.log(self.severity.value, message)
        return result


class Tester:
    """
    A named collection of Tests to run against a data frame.
    """

    def __init__(self, name, yaml_file=None):
        self.name = name
        self._logger = logging.getLogger(__name__ + ".Tester." + self.name)
        self.tests = []
        if yaml_file:
            self.load_yaml(yaml_file)

    def add(self, t: Test):
        self.tests.append(t)

    def load_yaml(self, yaml_file):
        with open(yaml_file) as f:
            test_list = yaml.load(f, Loader=yaml.FullLoader)
        for item in test_list:
            item['condition'] = Condition[item['condition']]
            item['severity'] = Severity[item['severity']]
            item['logger'] = self._logger
            self.add(Test(**item))
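
    # A minimal sketch of the YAML layout that load_yaml expects: a list of
    # mappings whose keys match the Test constructor arguments, with
    # ``condition`` and ``severity`` given as enum member names. The variable
    # name and values shown here are illustrative, not part of this module.
    #
    #   - variable: pm25
    #     condition: no_missing
    #     severity: error
    #   - variable: pm25
    #     condition: less_than
    #     severity: warning
    #     val: 500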

    def check(self, df: pd.DataFrame):
        out = True
        num_tests = 0
        num_failures = 0
        for t in self.tests:
            num_tests += 1
            result = t.check(df)
            out = out and result
            if not result:
                num_failures += 1
        passes = num_tests - num_failures
        self._logger.info("All Tests Completed. Out of " + str(num_tests) + " tests: " +
                          str(passes) + " passed and " + str(num_failures) + " failed.")
        return out
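

# A minimal usage sketch, assuming a small made-up data frame; it runs only
# when this module is executed directly. The variable name "pm25" and the
# threshold below are illustrative assumptions, and load_yaml() could be used
# in place of add().
if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)
    # Example data with one missing value.
    example_df = pd.DataFrame({"pm25": [10.0, 35.5, np.nan, 12.1]})
    tester = Tester("example")
    # Expect no missing values in pm25; failures are logged as warnings.
    tester.add(Test("pm25", Condition.no_missing, Severity.warning))
    # Expect all pm25 values below 500; failures are logged as errors.
    tester.add(Test("pm25", Condition.less_than, Severity.error, val=500))
    passed = tester.check(example_df)
    print("All tests passed:", passed)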