Source code for research_client.datavalidator.validation

"""Validation tools for the datavalidator package."""
import json
import re
import html
from typing import Any, Callable, Union, Optional
from .types import RangeT, SetT, PolarT, PatternT, EnumT, XT, YT
from .exceptions import DataValidationError


[docs]class ValidationResult: """Data validation result. Each ValidationResult represents the result of a data validation attempt in a Validator. The ValidationResult can be queried for details on the data that was validated, whether it passed/failed validation, what the requirements for validation were, etc. A ValidationResult will evaluate to True if the validation has succeeded and to False if it has failed. """ success: bool """Whether the data has been succeessfully validated or not.""" type_: Any """The internal data type indicated for validation.""" typedesc: str """A user-directed description of the type of data being validated.""" constraint: Optional[Union[PatternT, RangeT, SetT, PolarT, PatternT, EnumT, bool]] """The constraint, if any, which was used to validate the data.""" data: Any """The data itself, possibly cast. This is the data post-casting if force- or softcasting were used, and can be used to automatically ensure typecasting or type-narrowing for data storage. For the raw data pre-casting use the `raw_data` attribute. """ rawdata: Any """The raw data, as it was passed to the validator. This is always the data as it was passed to the Validator, irrespective of whether forcecasting or softcasting were applied. """ casting: bool """Whether the data in `data` was cast or not. Note that this does not necessarily mean that casting was necessary, e.g. an integer that was passed to a Validator's `vint()` method will still have been cast to int() and set the casting attribute to True despite being of type int before. """
[docs] def __init__( self, success: bool, type_: Any, typedesc: str, constraint: Optional[ Union[PatternT, RangeT, SetT, PolarT, PatternT, EnumT, bool] ], data: Any, rawdata: Any, casting: bool, ): """Constructs a new ValidationResult. Args: success: Whether the validaton has succeeded or failed. type_: The built-in data type against which validation was carried out. typedesc: A short user-intelligible description of the data's type. constraint: A constraint data type appropriate to the type of the data, see also the `types` submodule. data: The data that was evaluated (after casting if the forcecasting and/or softcasting options were active.) rawdata: The raw, uncast data as it was passed to the validation method. casting: Whether casting was applied to `rawdata` to yield `data`. """ self.success = success self.type_ = type_ self.typedesc = typedesc self.constraint = constraint self.data = data self.rawdata = rawdata self.casting = casting
[docs] def tostring(self) -> str: """Returns string explanation of the data validation result.""" polarity = "is" if self.success else "is not" data = self.data if self.success else self.rawdata string = ( f"`{repr(data)}` {polarity} a valid {self.typedesc} of type {self.type_}." ) if not self.success: string += f" Must match constraint: `{repr(self.constraint)}`." return string
[docs] def tohtml(self) -> str: """Returns HTML formatted explanation of the data validation result.""" output: list[str] = [] output.append('<p class="dv-result">') data = self.data if self.success else self.rawdata data = html.escape(repr(data), True) output.append(f'<code class="dv-data">{data}</code>') if self.success: output.append('<span class="dv-success dv-successful">is</span>') else: output.append('<span class="dv-success dv-failed">is not</span>') output.append("a valid") typedesc = html.escape(self.typedesc, True) output.append(f'<em class="dv-typedesc">{typedesc}</em>') output.append("of type") type_ = html.escape(repr(self.type_), True) output.append(f'<code class="dv-type">{type_}</code>.') if not self.success: output.append("Must match constraint:") if isinstance(self.constraint, str): constraint = html.escape(self.constraint, True) else: constraint = html.escape(repr(self.constraint)) output.append(f'<code class="dv-constraint">{constraint}</code>') output.append("</p>") return " ".join(output)
[docs] def tojson(self) -> str: """Returns a JSON representation of the data validation result.""" data = self.data if not isinstance(self.data, (str, int, float, bool, type(None))): try: json.dumps(data) except TypeError: data = repr(self.data) return json.dumps( { "isvalid": self.success, "type": self.type_, "typedesc": self.typedesc, "constraint": self.constraint, "data": data, } )
def __str__(self) -> str: """Returns a string representation of the data validation result.""" return self.tostring() def __bool__(self) -> bool: """Returns True if the validation was successful, False otherwise.""" return self.success def __repr__(self) -> str: """Returns a python-style representation of the data validation result object.""" indent = " " string = f"{__class__.__name__}(\n{indent}{repr(self.success)},\n{indent}" string += f"{repr(self.type_)},\n{indent}{repr(self.typedesc)},\n" string += f"{indent}{repr(self.constraint)}," return string + f"\n{indent}{repr(self.data)}\n)"
[docs]class Validator: """Data validation interface. A Validator offers a convenient interface for validating a set of data points, of the same or different types. The Validator will store any failed validation results, can optionally force casting of the data to a specific type, can be evaluated for success in a boolean expression, and allows for the conditional raising of a DataValidationError exception if any validation attempts have failed. A single Validator should only be used once for a closed set of data, as reuse will add the results to the existing Validator and always evaluate False if it has previously had unsuccessful validation attempts (though under some circumstances, e.g. the successive building of datasets with late repairs, this may be desirable). """ results: list[ValidationResult] failed: list[ValidationResult] successful: list[ValidationResult] forcecast: bool ignorecase: bool
[docs] def __init__(self, forcecast: bool = False, ignorecase: bool = False): """Constructs a new Validator. Args: forcecast: Whether to force casting of the data arguments to the validation methods to the indicated type (e.g. str for .validatestring()). This will set the default behaviour for validation calls, but can be overwritten by passing the named argument forcecast=True or forcecast=False on individual method calls. For some methods, e.g. polars and enums, casting is done by passing the matched value rather than typecasting. ignorecase: Whether to ignore case in string comparisons. If true, strings will be compared in all uppercase, and regular expression matches will be passed the IGNORECASE flag. Can be overwritten on each validation call by passing ignorecase=True/False. Default value: False. """ self.results = [] self.failed = [] self.successful = [] self.forcecast = forcecast self.ignorecase = ignorecase
def __forcecast(self, overwrite: Optional[bool] = None) -> bool: if overwrite is not None: return overwrite return self.forcecast def __ignorecase(self, overwrite: Optional[bool] = None): if overwrite is not None: return overwrite return self.ignorecase def __casefolddict(self, dict_: dict[XT, YT]) -> dict[XT, YT]: return {self.__casefoldifstr(key): value for key, value in dict_.items()} def __casefoldifstr(self, x: XT) -> XT: if isinstance(x, str): return x.casefold() return x def __trycall( self, func: Callable[..., XT], *args: Any, **kwargs: dict[Any, Any] ) -> Optional[XT]: """Returns result of func() if possible, None if an Exception is raised.""" try: return func(*args, **kwargs) except Exception: return None def __condcast( self, type_: Callable[..., XT], data: YT, overwrite: Optional[bool] = None ) -> Optional[Union[XT, YT]]: """Conditionally casts data to a type. Attempts to cast `data` to `type_` if, taking into account `overwrite`, forcecasting applies. Returns `data` itself if forcecasting doesn't apply, and None if forcecasting applies but `data` cannot be cast to `type_`. """ if self.__forcecast(overwrite): return self.__trycall(type_, data) return data def __storeresult(self, result: ValidationResult): self.results.append(result) if not result: self.failed.append(result) else: self.successful.append(result)
[docs] def vstr( self, typedesc: str, constraint: PatternT, data: Any, forcecast: Optional[bool] = None, ignorecase: Optional[bool] = None, flags: Union[re.RegexFlag, int] = 0, ) -> ValidationResult: r"""Validates a string against a regular expression pattern. Args: typedesc: An end-user intelligible description of the desired data type, e.g. "User ID" or "postcode". constraint: A regular expression to match the string against. Important: Note that the regular expression will implicitly be enclosed by \A and \Z to match the beginning and end of the string. These thus need not be specified in the pattern provided. data: The data to be validated. forcecast: Optional argument to overwrite the validator's default setting for forced casting of data arguments. ignorecase: Optional argument to overwrite the validator's default setting for case sensitivity. flags: Additional regex flags to be passed to re.match(). """ cdata = data try: if not isinstance(data, str): cdata = str(data) flags = re.IGNORECASE | flags if self.__ignorecase(ignorecase) else flags cmp = bool( re.match(r"\A{pattern}\Z".format(pattern=constraint), cdata, flags=flags) ) except TypeError: cmp = False validation = ValidationResult( cmp, str, typedesc, constraint, self.__condcast(str, data, forcecast), data, self.__forcecast(forcecast), ) self.__storeresult(validation) return validation
[docs] def vint( self, typedesc: str, constraint: RangeT, data: Any, forcecast: Optional[bool] = None, ) -> ValidationResult: r"""Validates an integer against an inclusive range of integers or floats. Args: typedesc: An end-user intelligible description of the desired data type, e.g. "User ID" or "postcode". constraint: A two member tuple or list of integers where the first element represents the inclusive lower bound and the second member the inclusive upper bound of the permissible range of integer values. For example, (3, 5) would successfully validate the data inputs 3, 4, 5, but fail validation for 2 or 6. data: The data to be validated. forcecast: Optional argument to overwrite the validator's default setting for forced casting of data arguments. """ cdata = data try: if not isinstance(data, int): cdata = int(data) cmp = min(constraint) <= cdata <= max(constraint) except (TypeError, ValueError): cmp = False validation = ValidationResult( cmp, int, typedesc, constraint, self.__condcast(int, data, forcecast), data, self.__forcecast(forcecast), ) self.__storeresult(validation) return validation
[docs] def vfloat( self, typedesc: str, constraint: RangeT, data: Any, forcecast: Optional[bool] = None, ): r"""Validates a float against an inclusive range of integers or floats. Args: typedesc: An end-user intelligible description of the desired data type, e.g. "User ID" or "postcode". constraint: A two member tuple or list of integers where the first element represents the inclusive lower bound and the second member the inclusive upper bound of the permissible range of integer values. For example, (3, 5) would successfully validate the data inputs 3, 4, 5, but fail validation for 2 or 6. data: The data to be validated. forcecast: Optional argument to overwrite the objects default setting for forced casting of data arguments. """ cdata = data try: if not isinstance(data, float): cdata = float(data) cmp = min(constraint) <= cdata <= max(constraint) except TypeError: cmp = False validation = ValidationResult( cmp, float, typedesc, constraint, self.__condcast(float, data, forcecast), data, self.__forcecast(forcecast), ) self.__storeresult(validation) return validation
[docs] def vbool( self, typedesc: str, constraint: bool, data: Any, forcecast: Optional[bool] = None, ): """Validates a bool. Args: typedesc: An end-user intelligible description of the desired data type, e.g. "User ID" or "postcode". constraint: A boolean that must be matched, or None to just validate any bool. data: The data to be validated. forcecast: Optional argument to overwrite the objects default setting for forced casting of data arguments. """ cdata = data try: if not isinstance(data, bool): cdata = bool(data) cmp = True if constraint is not None: cmp = (cdata == constraint) except TypeError: cmp = False validation = ValidationResult( cmp, bool, typedesc, constraint, self.__condcast(bool, data, forcecast), data, self.__forcecast(forcecast), ) self.__storeresult(validation) return validation
[docs] def vpolar( self, typedesc: str, constraint: PolarT, data: Any, forcecast: Optional[bool] = None, ignorecase: Optional[bool] = None, ) -> ValidationResult: r"""Validates a string against two sets of polar terms. Checkes whether data is in either of two sets of polar opposition terms. If the forcecast option is active, membership in the first of the two sets results in casting to True, membership in the second set to False, and memebership in neither set to None. Validation is successful if data is contained in either of the two sets, and unsuccessful otherwise. Args: typedesc: An end-user intelligible description of the desired data type, e.g. "User ID" or "postcode". constraint: A two member tuple or list of containers of any type (must support membership testing with `in`). The first member is a container of acceptable truthy values, the second is a container of acceptable falsy values. Note that python built-in boolean types True and False are always validated as correct. data: The data to be validated. forcecast: Optional argument to overwrite the objects default setting for forced casting of data arguments. ignorecase: Optional argument to overwrite the validator's default setting for case sensitivity. """ cdata = data cval = None if isinstance(data, str) and self.__ignorecase(ignorecase): cdata = data.casefold() if cdata in set(map(self.__casefoldifstr, constraint[0])): cval = True elif cdata in set(map(self.__casefoldifstr, constraint[1])): cval = False else: if cdata in constraint[0]: cval = True elif cdata in constraint[1]: cval = False cmp = cval is not None validation = ValidationResult( cmp, PolarT, typedesc, constraint, cval if self.__forcecast(forcecast) else data, data, self.__forcecast(forcecast), ) self.__storeresult(validation) return validation
[docs] def venum( self, typedesc: str, constraint: EnumT, data: Any, forcecast: Optional[bool] = None, ignorecase: Optional[bool] = None, ) -> ValidationResult: """Validates a string against a key:value enumerable. Checks whether `data` is either contained in the keys or the values of the enumerable. If forcecasting is used, it casts to the *value* that was matched (not the key), or to None if no match was found. """ cdata = data cfound = False cval = None if isinstance(data, str) and self.__ignorecase(ignorecase): cdata = data.casefold() cconst = self.__casefolddict(constraint) if cdata in cconst: cfound = True cval = cconst[cdata] elif cdata in cconst.values(): cfound = True cval = cdata else: if cdata in constraint: cfound = True cval = constraint[cdata] elif cdata in constraint.values(): cfound = True cval = cdata validation = ValidationResult( cfound, EnumT, typedesc, constraint, cval if self.__forcecast(forcecast) else data, data, self.__forcecast(forcecast), ) self.__storeresult(validation) return validation
[docs] def raiseif(self): """Raises a DataValidationError iff at least one validation has failed.""" if bool(self.failed): raise DataValidationError(self.tostring(errorsonly=True), self)
[docs] def tohtml(self, errorsonly: bool = False): """Returns a paragraph-by-paragraph HTML representation of validation attempts. Args: errorsonly: Whether to include only the errors or all validation attempts. """ if errorsonly: return "\n".join( [f'<div class="dv-list">{e.tohtml()}</div>' for e in self.failed] ) return "\n".join([f'<p class="dv-list">{e.tohtml()}</p>' for e in self.results])
[docs] def tostring(self, errorsonly: bool = False): """Returns a line-by-line string representation of validation attempts. Args: errorsonly: Whether to include only the errors or all validation attempts. """ if errorsonly: return "\n".join([str(e) for e in self.failed]) return "\n".join([str(e) for e in self.results])
def __bool__(self): """Returns True if no validation errors have occured so far, False otherwise.""" return bool(self.failed) def __str__(self): """Returns a line-by-line string representation of all validation attempts.""" return self.tostring() def __repr__(self): """Returns a python-like representation of the (empty) validator object.""" return f"{__class__}(forcecast={self.forcecast})"