List of args to be parsed. Defaults to None, in which case
-sys.argv[1:] is used.
-
-
Returns
-
An instance of ArgumentParser populated with the provided args.
-
-
-Expand source code
-
-
def parse_args(args: Optional[Sequence] = None) -> argparse.Namespace:
- """Arguments parser for dynamicio cli.py.
-
- Args:
- args: List of args to be parsed. Defaults to None, in which case
- sys.argv[1:] is used.
-
- Returns:
- An instance of ArgumentParser populated with the provided args.
- """
- parser = argparse.ArgumentParser(prog="dynamicio", description="Generate dataset schemas")
- group = parser.add_mutually_exclusive_group(required=True)
- group.add_argument(
- "-b",
- "--batch",
- action="store_true",
- help="flag, used to generate multiple schemas provided a datasets directory.",
- )
- group.add_argument(
- "-s",
- "--single",
- action="store_true",
- help="flag, used to generate a schema provided a single dataset.",
- )
- parser.add_argument("-p", "--path", required=True, help="the path to the dataset/datasets-directory.", type=str)
- parser.add_argument("-o", "--output", required=True, help="the path to the schemas output directory.", type=str)
- return parser.parse_args(args)
-
-
-
-def run()
-
-
-
Entry point for the dynamicio cli.py.
-
-
-Expand source code
-
-
def run():
- """Entry point for the dynamicio cli.py."""
- args = parse_args()
- main(args)
-
-
-
-
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/docs/config.html b/docs/config.html
deleted file mode 100644
index 1978a61..0000000
--- a/docs/config.html
+++ /dev/null
@@ -1,1087 +0,0 @@
-
-
-
-
-
-
-dynamicio.config API documentation
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
Module dynamicio.config
-
-
-
Implements the IOConfig class, generating objects used as a configuration parameter for the instantiation ofsrc.utils.dynamicio.dataio.DynamicDataIO objects.
-
The IOConfig object, essentially parses a yaml file that contains a set of input sources that will be processed by a
-task, converting filtering and converting them into dictionaries.
-
For example, suppose an input.yaml file, containing:
"""Implements the `IOConfig` class, generating objects used as a configuration parameter for the instantiation of`src.utils.dynamicio.dataio.DynamicDataIO` objects.
-
-The `IOConfig` object, essentially parses a yaml file that contains a set of input sources that will be processed by a
-task, converting filtering and converting them into dictionaries.
-
-For example, suppose an `input.yaml` file, containing:
-
- READ_FROM_S3_CSV:
- LOCAL:
- type: "local"
- local:
- file_path: "[[ TEST_RESOURCES ]]/data/input/some_csv_to_read.csv"
- file_type: "csv"
- CLOUD:
- type: "s3"
- s3:
- bucket: "[[ MOCK_BUCKET ]]"
- file_path: "[[ MOCK_KEY ]]"
- file_type: "csv"
-
-would be loaded with:
-
- input_sources_config = IOConfig(
- "path_to/input.yaml",
- env_identifier="CLOUD",
- dynamic_vars=config_module
- )
-
-and:
-
- input_sources_config.config
-
-would return:
-
- {
- "READ_FROM_S3_CSV": {
- "LOCAL": {
- "type": "local",
- "local": {
- "file_path": f"{test_global_vars.TEST_RESOURCES}/data/input/some_csv_to_read.csv",
- "file_type": "csv",
- },
- },
- "CLOUD": {
- "type": "s3",
- "s3": {
- "bucket": "mock-bucket",
- "file_path": "mock-key",
- "file_type": "csv"
- }
- },
- }
- }
-"""
-__all__ = ["IOConfig", "SafeDynamicResourceLoader", "SafeDynamicSchemaLoader"]
-
-import re
-from types import ModuleType
-from typing import Any, List, Mapping
-
-import yaml
-from magic_logger import logger
-
-
-class SafeDynamicResourceLoader(yaml.SafeLoader):
- """Implements a dynamic yaml loader that parses yaml files and replaces strings that map to [[ DYNAMIC_VAR ]].
-
- Dynamic variables defined in a provided module object.
- """
-
- module = None
- dynamic_data_matcher = re.compile(r"(.*)(\[\[\s*(\S+)\s*]])(.*)")
-
- @classmethod
- def with_module(cls, module: ModuleType):
- """Creates a dynamic subclass of SafeDynamicLoader with the `data_module` attribute set to `module`.
-
- Args:
- module: A global vars module with all the dynamic values defined in it.
-
- Returns:
- type
- """
- return type(f"{cls.__name__}_{module.__name__}", (cls,), {"module": module})
-
- def dyn_str_constructor(self, node: yaml.nodes.ScalarNode) -> str:
- """Responsible for the switching of one or more "[[ DYNAMIC_VAR ]]" strings with the respective attributes value in a given module.
-
- Args:
- node: Parsed item whose dynamic values that map to the "[[ DYNAMIC_VAR ]]" convention
- are replaced with the respective attributes in te provided module.
-
- Returns:
- Constructed `str` or numerical.
- """
- value = node.value
-
- while result := self.dynamic_data_matcher.match(value):
- ref = result.group(3)
- replacement = getattr(self.module, ref)
-
- value = self.dynamic_data_matcher.sub(f"\\g<1>{replacement}\\g<4>", value)
-
- return value
-
-
-class SafeDynamicSchemaLoader(yaml.SafeLoader):
- """Implements a dynamic yaml loader that parses yaml files and replaces strings that map to [[ DYNAMIC_VAR ]].
-
- Dynamic variables defined in a provided module object.
- """
-
- module = None
- dynamic_data_matcher = re.compile(r"(.*)(\[\[\s*(\S+)\s*]])(.*)")
-
- @classmethod
- def with_module(cls, module: ModuleType):
- """Creates a dynamic subclass of SafeDynamicLoader with the `data_module` attribute set to `module`.
-
- Args:
- module: A global vars module with all the dynamic values defined in it.
-
- Returns:
- type
- """
- return type(f"{cls.__name__}_{module.__name__}", (cls,), {"module": module})
-
- def dyn_value_constructor(self, node: yaml.nodes.ScalarNode) -> Any:
- """Responsible for the switching of one or more "[[ DYNAMIC_VAR ]]" strings with the respective attributes value in a given module.
-
- Args:
- node: Parsed item whose dynamic values that map to the "[[ DYNAMIC_VAR ]]" convention
- are replaced with the respective attributes in te provided module.
-
- Returns:
- Constructed `str` or numerical.
- """
- value = node.value
-
- while result := self.dynamic_data_matcher.match(value):
- ref = result.group(3)
- replacement = getattr(self.module, ref)
-
- value = self.dynamic_data_matcher.sub(f"\\g<1>{replacement}\\g<4>", value)
-
- try:
- value = float(value)
- return value
- except ValueError:
- pass
-
- return value
-
-
-class IOConfig:
- """Generates an object that returns a sub-dictionary of the elements of that yaml file.
-
- The file serves as a config for setting up DynamicDataIO objects. Requires a resources yaml file,
- an ENVIRONMENT value {CLOUD or LOCAL} and a vars module.
-
- Example:
- input_sources_config = IOConfig(
- "path_to/input.yaml",
- env_identifier="CLOUD",
- dynamic_vars=config_module
- )
- """
-
- YAML_TAG = "tag:yaml.org,2002:str"
- SafeDynamicResourceLoader.add_constructor(YAML_TAG, SafeDynamicResourceLoader.dyn_str_constructor)
- SafeDynamicSchemaLoader.add_constructor(YAML_TAG, SafeDynamicSchemaLoader.dyn_value_constructor)
-
- def __init__(self, path_to_source_yaml: str, env_identifier: str, dynamic_vars: ModuleType):
- """Class constructor.
-
- Args:
- path_to_source_yaml: Absolute file path to yaml file containing source definitions
- env_identifier: "LOCAL" or "CLOUD".
- dynamic_vars: module containing values for dynamic values that the source yaml
- may reference.
- """
- self.path_to_source_yaml = path_to_source_yaml
- self.env_identifier = env_identifier
- self.dynamic_vars = dynamic_vars
- self.config = self._parse_sources_config()
-
- def _parse_sources_config(self) -> Mapping:
- """Parses the yaml input and return a dictionary.
-
- Returns:
- A dictionary with the list of all file paths pointing to various input sources as those
- are defined in their respective data/*.yaml files.
- """
- with open(self.path_to_source_yaml, "r") as stream: # pylint: disable=unspecified-encoding]
- logger.debug(f"Parsing {self.path_to_source_yaml}...")
- return yaml.load(stream, SafeDynamicResourceLoader.with_module(self.dynamic_vars))
-
- @property
- def sources(self) -> List[str]:
- """Class property for easy access to a list of sources.
-
- Returns:
- All top level names of the available resources for the used resources yaml config.
- """
- return list(self.config.keys())
-
- def get(self, source_key: str) -> Mapping:
- """A getter.
-
- Args:
- source_key: The name of the resource for which we want to create a config.
-
- Returns:
- A dictionary with the necessary fields for loading the data from a source.
-
- Example:
-
- Given:
-
- VOYAGE_DATA:
- LOCAL:
- type: "local"
- local:
- file_path: "[[ TEST_RESOURCES ]]/data/processed/voyage_data.parquet"
- file_type: "parquet"
- CLOUD:
- type: "kafka"
- KAFKA:
- KAFKA_SERVER: "[[ KAFKA_SERVER ]]"
- KAFKA_TOPIC: "[[ KAFKA_TOPIC ]]"
-
- If you do:
-
- input_sources_config = IOConfig(
- "path_to/input.yaml",
- env_identifier="CLOUD",
- dynamic_vars=globals
- )
- voyage_data_cloud_mapping = input_config.get(source_key="VOYAGE_DATA")
-
- then `voyage_data_cloud_mapping` is:
-
- "KAFKA": {
- "KAFKA_SERVER": "mock-kafka-server",
- "KAFKA_TOPIC": "mock-kafka-topic"
- }
- """
- source_config = self.config[source_key][self.env_identifier]
- if self.config[source_key].get("schema"):
- schema_definition = self._get_schema_definition(source_key)
- source_config["name"] = schema_definition["name"]
- source_config["schema"] = self._get_schema(schema_definition)
- source_config["validations"] = self._get_validations(schema_definition)
- source_config["metrics"] = self._get_metrics(schema_definition)
- return source_config
-
- def _get_schema_definition(self, source_key: str) -> Mapping:
- """Retrieves the schema definition from a resource definition.
-
- Returns:
- The schema definition provided for a resource definition.
- """
- schema_file_path = self.config[source_key].get("schema")["file_path"]
- with open(schema_file_path, "r") as stream: # pylint: disable=unspecified-encoding]
- logger.debug(f"Parsing schema: {schema_file_path}...")
- return yaml.load(stream, SafeDynamicSchemaLoader.with_module(self.dynamic_vars))
-
- @staticmethod
- def _get_schema(schema_definition: Mapping) -> Mapping:
- """Retrieve the schema from a schema definition.
-
- Args:
- schema_definition:
-
- Returns:
- The column types in the schema definition.
- """
- _schema = {}
- for column in schema_definition["columns"].keys():
- _schema[column] = schema_definition["columns"][column]["type"]
- return _schema
-
- @staticmethod
- def _get_validations(schema_definition: Mapping) -> Mapping:
- """Returns all validations for each column in a schema definition.
-
- Args:
- schema_definition: A dictionary with all columns in a dataset characterised by validations and metrics
-
- Returns:
- The validations applied to each column in the schema definition.
- """
- _validations = {}
- for column in schema_definition["columns"].keys():
- _validations[column] = schema_definition["columns"][column]["validations"]
- return _validations
-
- @staticmethod
- def _get_metrics(schema_definition):
- """Returns all metrics for each column in a schema definition.
-
- Args:
- schema_definition: A dictionary with all columns in a dataset characterised by validations and metrics
-
- Returns:
- The metrics applied to each column in the schema definition.
- """
- _metrics = {}
- for column in schema_definition["columns"].keys():
- _metrics[column] = schema_definition["columns"][column]["metrics"]
- return _metrics
Generates an object that returns a sub-dictionary of the elements of that yaml file.
-
The file serves as a config for setting up DynamicDataIO objects. Requires a resources yaml file,
-an ENVIRONMENT value {CLOUD or LOCAL} and a vars module.
Absolute file path to yaml file containing source definitions
-
env_identifier
-
"LOCAL" or "CLOUD".
-
dynamic_vars
-
module containing values for dynamic values that the source yaml
-may reference.
-
-
-
-Expand source code
-
-
class IOConfig:
- """Generates an object that returns a sub-dictionary of the elements of that yaml file.
-
- The file serves as a config for setting up DynamicDataIO objects. Requires a resources yaml file,
- an ENVIRONMENT value {CLOUD or LOCAL} and a vars module.
-
- Example:
- input_sources_config = IOConfig(
- "path_to/input.yaml",
- env_identifier="CLOUD",
- dynamic_vars=config_module
- )
- """
-
- YAML_TAG = "tag:yaml.org,2002:str"
- SafeDynamicResourceLoader.add_constructor(YAML_TAG, SafeDynamicResourceLoader.dyn_str_constructor)
- SafeDynamicSchemaLoader.add_constructor(YAML_TAG, SafeDynamicSchemaLoader.dyn_value_constructor)
-
- def __init__(self, path_to_source_yaml: str, env_identifier: str, dynamic_vars: ModuleType):
- """Class constructor.
-
- Args:
- path_to_source_yaml: Absolute file path to yaml file containing source definitions
- env_identifier: "LOCAL" or "CLOUD".
- dynamic_vars: module containing values for dynamic values that the source yaml
- may reference.
- """
- self.path_to_source_yaml = path_to_source_yaml
- self.env_identifier = env_identifier
- self.dynamic_vars = dynamic_vars
- self.config = self._parse_sources_config()
-
- def _parse_sources_config(self) -> Mapping:
- """Parses the yaml input and return a dictionary.
-
- Returns:
- A dictionary with the list of all file paths pointing to various input sources as those
- are defined in their respective data/*.yaml files.
- """
- with open(self.path_to_source_yaml, "r") as stream: # pylint: disable=unspecified-encoding]
- logger.debug(f"Parsing {self.path_to_source_yaml}...")
- return yaml.load(stream, SafeDynamicResourceLoader.with_module(self.dynamic_vars))
-
- @property
- def sources(self) -> List[str]:
- """Class property for easy access to a list of sources.
-
- Returns:
- All top level names of the available resources for the used resources yaml config.
- """
- return list(self.config.keys())
-
- def get(self, source_key: str) -> Mapping:
- """A getter.
-
- Args:
- source_key: The name of the resource for which we want to create a config.
-
- Returns:
- A dictionary with the necessary fields for loading the data from a source.
-
- Example:
-
- Given:
-
- VOYAGE_DATA:
- LOCAL:
- type: "local"
- local:
- file_path: "[[ TEST_RESOURCES ]]/data/processed/voyage_data.parquet"
- file_type: "parquet"
- CLOUD:
- type: "kafka"
- KAFKA:
- KAFKA_SERVER: "[[ KAFKA_SERVER ]]"
- KAFKA_TOPIC: "[[ KAFKA_TOPIC ]]"
-
- If you do:
-
- input_sources_config = IOConfig(
- "path_to/input.yaml",
- env_identifier="CLOUD",
- dynamic_vars=globals
- )
- voyage_data_cloud_mapping = input_config.get(source_key="VOYAGE_DATA")
-
- then `voyage_data_cloud_mapping` is:
-
- "KAFKA": {
- "KAFKA_SERVER": "mock-kafka-server",
- "KAFKA_TOPIC": "mock-kafka-topic"
- }
- """
- source_config = self.config[source_key][self.env_identifier]
- if self.config[source_key].get("schema"):
- schema_definition = self._get_schema_definition(source_key)
- source_config["name"] = schema_definition["name"]
- source_config["schema"] = self._get_schema(schema_definition)
- source_config["validations"] = self._get_validations(schema_definition)
- source_config["metrics"] = self._get_metrics(schema_definition)
- return source_config
-
- def _get_schema_definition(self, source_key: str) -> Mapping:
- """Retrieves the schema definition from a resource definition.
-
- Returns:
- The schema definition provided for a resource definition.
- """
- schema_file_path = self.config[source_key].get("schema")["file_path"]
- with open(schema_file_path, "r") as stream: # pylint: disable=unspecified-encoding]
- logger.debug(f"Parsing schema: {schema_file_path}...")
- return yaml.load(stream, SafeDynamicSchemaLoader.with_module(self.dynamic_vars))
-
- @staticmethod
- def _get_schema(schema_definition: Mapping) -> Mapping:
- """Retrieve the schema from a schema definition.
-
- Args:
- schema_definition:
-
- Returns:
- The column types in the schema definition.
- """
- _schema = {}
- for column in schema_definition["columns"].keys():
- _schema[column] = schema_definition["columns"][column]["type"]
- return _schema
-
- @staticmethod
- def _get_validations(schema_definition: Mapping) -> Mapping:
- """Returns all validations for each column in a schema definition.
-
- Args:
- schema_definition: A dictionary with all columns in a dataset characterised by validations and metrics
-
- Returns:
- The validations applied to each column in the schema definition.
- """
- _validations = {}
- for column in schema_definition["columns"].keys():
- _validations[column] = schema_definition["columns"][column]["validations"]
- return _validations
-
- @staticmethod
- def _get_metrics(schema_definition):
- """Returns all metrics for each column in a schema definition.
-
- Args:
- schema_definition: A dictionary with all columns in a dataset characterised by validations and metrics
-
- Returns:
- The metrics applied to each column in the schema definition.
- """
- _metrics = {}
- for column in schema_definition["columns"].keys():
- _metrics[column] = schema_definition["columns"][column]["metrics"]
- return _metrics
-
-
Class variables
-
-
var YAML_TAG
-
-
-
-
-
Instance variables
-
-
var sources : List[str]
-
-
Class property for easy access to a list of sources.
-
Returns
-
All top level names of the available resources for the used resources yaml config.
-
-
-Expand source code
-
-
@property
-def sources(self) -> List[str]:
- """Class property for easy access to a list of sources.
-
- Returns:
- All top level names of the available resources for the used resources yaml config.
- """
- return list(self.config.keys())
def get(self, source_key: str) -> Mapping:
- """A getter.
-
- Args:
- source_key: The name of the resource for which we want to create a config.
-
- Returns:
- A dictionary with the necessary fields for loading the data from a source.
-
- Example:
-
- Given:
-
- VOYAGE_DATA:
- LOCAL:
- type: "local"
- local:
- file_path: "[[ TEST_RESOURCES ]]/data/processed/voyage_data.parquet"
- file_type: "parquet"
- CLOUD:
- type: "kafka"
- KAFKA:
- KAFKA_SERVER: "[[ KAFKA_SERVER ]]"
- KAFKA_TOPIC: "[[ KAFKA_TOPIC ]]"
-
- If you do:
-
- input_sources_config = IOConfig(
- "path_to/input.yaml",
- env_identifier="CLOUD",
- dynamic_vars=globals
- )
- voyage_data_cloud_mapping = input_config.get(source_key="VOYAGE_DATA")
-
- then `voyage_data_cloud_mapping` is:
-
- "KAFKA": {
- "KAFKA_SERVER": "mock-kafka-server",
- "KAFKA_TOPIC": "mock-kafka-topic"
- }
- """
- source_config = self.config[source_key][self.env_identifier]
- if self.config[source_key].get("schema"):
- schema_definition = self._get_schema_definition(source_key)
- source_config["name"] = schema_definition["name"]
- source_config["schema"] = self._get_schema(schema_definition)
- source_config["validations"] = self._get_validations(schema_definition)
- source_config["metrics"] = self._get_metrics(schema_definition)
- return source_config
-
-
-
-
-
-class SafeDynamicResourceLoader
-(stream)
-
-
-
Implements a dynamic yaml loader that parses yaml files and replaces strings that map to [[ DYNAMIC_VAR ]].
-
Dynamic variables defined in a provided module object.
-
Initialize the scanner.
-
-
-Expand source code
-
-
class SafeDynamicResourceLoader(yaml.SafeLoader):
- """Implements a dynamic yaml loader that parses yaml files and replaces strings that map to [[ DYNAMIC_VAR ]].
-
- Dynamic variables defined in a provided module object.
- """
-
- module = None
- dynamic_data_matcher = re.compile(r"(.*)(\[\[\s*(\S+)\s*]])(.*)")
-
- @classmethod
- def with_module(cls, module: ModuleType):
- """Creates a dynamic subclass of SafeDynamicLoader with the `data_module` attribute set to `module`.
-
- Args:
- module: A global vars module with all the dynamic values defined in it.
-
- Returns:
- type
- """
- return type(f"{cls.__name__}_{module.__name__}", (cls,), {"module": module})
-
- def dyn_str_constructor(self, node: yaml.nodes.ScalarNode) -> str:
- """Responsible for the switching of one or more "[[ DYNAMIC_VAR ]]" strings with the respective attributes value in a given module.
-
- Args:
- node: Parsed item whose dynamic values that map to the "[[ DYNAMIC_VAR ]]" convention
- are replaced with the respective attributes in te provided module.
-
- Returns:
- Constructed `str` or numerical.
- """
- value = node.value
-
- while result := self.dynamic_data_matcher.match(value):
- ref = result.group(3)
- replacement = getattr(self.module, ref)
-
- value = self.dynamic_data_matcher.sub(f"\\g<1>{replacement}\\g<4>", value)
-
- return value
-
-
Ancestors
-
-
yaml.loader.SafeLoader
-
yaml.reader.Reader
-
yaml.scanner.Scanner
-
yaml.parser.Parser
-
yaml.composer.Composer
-
yaml.constructor.SafeConstructor
-
yaml.constructor.BaseConstructor
-
yaml.resolver.Resolver
-
yaml.resolver.BaseResolver
-
-
Class variables
-
-
var dynamic_data_matcher
-
-
-
-
var module
-
-
-
-
var yaml_constructors
-
-
-
-
-
Static methods
-
-
-def with_module(module: module)
-
-
-
Creates a dynamic subclass of SafeDynamicLoader with the data_module attribute set to module.
-
Args
-
-
module
-
A global vars module with all the dynamic values defined in it.
-
-
Returns
-
type
-
-
-Expand source code
-
-
@classmethod
-def with_module(cls, module: ModuleType):
- """Creates a dynamic subclass of SafeDynamicLoader with the `data_module` attribute set to `module`.
-
- Args:
- module: A global vars module with all the dynamic values defined in it.
-
- Returns:
- type
- """
- return type(f"{cls.__name__}_{module.__name__}", (cls,), {"module": module})
Responsible for the switching of one or more "[[ DYNAMIC_VAR ]]" strings with the respective attributes value in a given module.
-
Args
-
-
node
-
Parsed item whose dynamic values that map to the "[[ DYNAMIC_VAR ]]" convention
-are replaced with the respective attributes in te provided module.
-
-
Returns
-
Constructed str or numerical.
-
-
-Expand source code
-
-
def dyn_str_constructor(self, node: yaml.nodes.ScalarNode) -> str:
- """Responsible for the switching of one or more "[[ DYNAMIC_VAR ]]" strings with the respective attributes value in a given module.
-
- Args:
- node: Parsed item whose dynamic values that map to the "[[ DYNAMIC_VAR ]]" convention
- are replaced with the respective attributes in te provided module.
-
- Returns:
- Constructed `str` or numerical.
- """
- value = node.value
-
- while result := self.dynamic_data_matcher.match(value):
- ref = result.group(3)
- replacement = getattr(self.module, ref)
-
- value = self.dynamic_data_matcher.sub(f"\\g<1>{replacement}\\g<4>", value)
-
- return value
-
-
-
-
-
-class SafeDynamicSchemaLoader
-(stream)
-
-
-
Implements a dynamic yaml loader that parses yaml files and replaces strings that map to [[ DYNAMIC_VAR ]].
-
Dynamic variables defined in a provided module object.
-
Initialize the scanner.
-
-
-Expand source code
-
-
class SafeDynamicSchemaLoader(yaml.SafeLoader):
- """Implements a dynamic yaml loader that parses yaml files and replaces strings that map to [[ DYNAMIC_VAR ]].
-
- Dynamic variables defined in a provided module object.
- """
-
- module = None
- dynamic_data_matcher = re.compile(r"(.*)(\[\[\s*(\S+)\s*]])(.*)")
-
- @classmethod
- def with_module(cls, module: ModuleType):
- """Creates a dynamic subclass of SafeDynamicLoader with the `data_module` attribute set to `module`.
-
- Args:
- module: A global vars module with all the dynamic values defined in it.
-
- Returns:
- type
- """
- return type(f"{cls.__name__}_{module.__name__}", (cls,), {"module": module})
-
- def dyn_value_constructor(self, node: yaml.nodes.ScalarNode) -> Any:
- """Responsible for the switching of one or more "[[ DYNAMIC_VAR ]]" strings with the respective attributes value in a given module.
-
- Args:
- node: Parsed item whose dynamic values that map to the "[[ DYNAMIC_VAR ]]" convention
- are replaced with the respective attributes in te provided module.
-
- Returns:
- Constructed `str` or numerical.
- """
- value = node.value
-
- while result := self.dynamic_data_matcher.match(value):
- ref = result.group(3)
- replacement = getattr(self.module, ref)
-
- value = self.dynamic_data_matcher.sub(f"\\g<1>{replacement}\\g<4>", value)
-
- try:
- value = float(value)
- return value
- except ValueError:
- pass
-
- return value
-
-
Ancestors
-
-
yaml.loader.SafeLoader
-
yaml.reader.Reader
-
yaml.scanner.Scanner
-
yaml.parser.Parser
-
yaml.composer.Composer
-
yaml.constructor.SafeConstructor
-
yaml.constructor.BaseConstructor
-
yaml.resolver.Resolver
-
yaml.resolver.BaseResolver
-
-
Class variables
-
-
var dynamic_data_matcher
-
-
-
-
var module
-
-
-
-
var yaml_constructors
-
-
-
-
-
Static methods
-
-
-def with_module(module: module)
-
-
-
Creates a dynamic subclass of SafeDynamicLoader with the data_module attribute set to module.
-
Args
-
-
module
-
A global vars module with all the dynamic values defined in it.
-
-
Returns
-
type
-
-
-Expand source code
-
-
@classmethod
-def with_module(cls, module: ModuleType):
- """Creates a dynamic subclass of SafeDynamicLoader with the `data_module` attribute set to `module`.
-
- Args:
- module: A global vars module with all the dynamic values defined in it.
-
- Returns:
- type
- """
- return type(f"{cls.__name__}_{module.__name__}", (cls,), {"module": module})
-
-
-
-
Methods
-
-
-def dyn_value_constructor(self, node: yaml.nodes.ScalarNode) ‑> Any
-
-
-
Responsible for the switching of one or more "[[ DYNAMIC_VAR ]]" strings with the respective attributes value in a given module.
-
Args
-
-
node
-
Parsed item whose dynamic values that map to the "[[ DYNAMIC_VAR ]]" convention
-are replaced with the respective attributes in te provided module.
-
-
Returns
-
Constructed str or numerical.
-
-
-Expand source code
-
-
def dyn_value_constructor(self, node: yaml.nodes.ScalarNode) -> Any:
- """Responsible for the switching of one or more "[[ DYNAMIC_VAR ]]" strings with the respective attributes value in a given module.
-
- Args:
- node: Parsed item whose dynamic values that map to the "[[ DYNAMIC_VAR ]]" convention
- are replaced with the respective attributes in te provided module.
-
- Returns:
- Constructed `str` or numerical.
- """
- value = node.value
-
- while result := self.dynamic_data_matcher.match(value):
- ref = result.group(3)
- replacement = getattr(self.module, ref)
-
- value = self.dynamic_data_matcher.sub(f"\\g<1>{replacement}\\g<4>", value)
-
- try:
- value = float(value)
- return value
- except ValueError:
- pass
-
- return value
-
-
-
-
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/docs/core.html b/docs/core.html
deleted file mode 100644
index 2de5292..0000000
--- a/docs/core.html
+++ /dev/null
@@ -1,855 +0,0 @@
-
-
-
-
-
-
-dynamicio.core API documentation
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
Module dynamicio.core
-
-
-
Implements the DynamicDataIO class which provides functionality for data: loading; sinking, and; schema validation.
-
-
-Expand source code
-
-
"""Implements the DynamicDataIO class which provides functionality for data: loading; sinking, and; schema validation."""
-# pylint: disable=no-member
-__all__ = ["DynamicDataIO", "SCHEMA_FROM_FILE"]
-
-import asyncio
-import inspect
-import re
-from concurrent.futures import ThreadPoolExecutor
-from typing import Any, Mapping, MutableMapping, Optional
-
-import pandas as pd # type: ignore
-from magic_logger import logger
-
-from dynamicio import validations
-from dynamicio.errors import CASTING_WARNING_MSG, ColumnsDataTypeError, MissingSchemaDefinition, NOTICE_MSG, SchemaNotFoundError, SchemaValidationError
-from dynamicio.metrics import get_metric
-
-SCHEMA_FROM_FILE = {"schema": object()}
-
-pool = ThreadPoolExecutor()
-
-
-class DynamicDataIO:
- """Given a `src.utils.dynamicio.config.IOConfig` object, it generates an object with access to a series of methods for cloud I/O operations and data validations.
-
- Example:
- >>> input_sources_config = IOConfig(
- >>> "path_to/input.yaml",
- >>> os.getenv("ENVIRONMENT",default="LOCAL")
- >>> )
- >>>
- >>> class IO(WithS3File, WithLocal, DynamicDataIO):
- >>> schema = S
- >>>
- >>> my_dataset_local_mapping = input_config.get(source_key="MY_DATASET")
- >>> my_dataset_io = IO(my_dataset_local_mapping)
- >>> my_dataset_df = my_dataset_io.read()
- """
-
- schema: Mapping
-
- def __init__(
- self,
- source_config: Mapping,
- apply_schema_validations: bool = False,
- log_schema_metrics: bool = False,
- show_casting_warnings: bool = False,
- **options: MutableMapping[str, Any],
- ):
- """Class constructor.
-
- Args:
- source_config: Configuration to use when reading/writing data from/to a source
- apply_schema_validations: Applies schema validations on either read() or write()
- log_schema_metrics: Logs schema metrics on either read() or write()
- show_casting_warnings: Logs casting warnings on either read() or write() if set to True
- options: Any additional kwargs that may be used throughout the lifecycle of the object
- """
- if type(self) is DynamicDataIO: # pylint: disable=unidiomatic-typecheck
- raise TypeError("Abstract class DynamicDataIO cannot be used to instantiate an object...")
-
- self.sources_config = source_config
- self.name = self._transform_class_name_to_dataset_name(self.__class__.__name__)
- self.apply_schema_validations = apply_schema_validations
- self.log_schema_metrics = log_schema_metrics
- self.show_casting_warnings = show_casting_warnings
- self.options = self._get_options(options, source_config.get("options"))
- source_name = self.sources_config.get("type")
- if self.schema is SCHEMA_FROM_FILE:
- try:
- self.schema = self.sources_config["schema"]
- self.name = self.sources_config["name"].upper()
- self.schema_validations = self.sources_config["validations"]
- self.schema_metrics = self.sources_config["metrics"]
- except KeyError as _error:
- raise SchemaNotFoundError() from _error
-
- assert hasattr(self, f"_read_from_{source_name}") or hasattr(
- self, f"_write_to_{source_name}"
- ), f"No method '_read_from_{source_name}' or '_write_to_{source_name}'. Have you registered a mixin for {source_name}?"
-
- def __init_subclass__(cls):
- """Ensure that all subclasses have a `schema` attribute and a `validate` method.
-
- Raises:
- AssertionError: If either of the attributes is not implemented
- """
- if not inspect.getmodule(cls).__name__.startswith("dynamicio"):
- assert "schema" in cls.__dict__
-
- if cls.schema is None or (cls.schema is not SCHEMA_FROM_FILE and len(cls.schema) == 0):
- raise ValueError(f"schema for class {cls} cannot be None or empty...")
-
- async def async_read(self):
- """Allows the use of asyncio to concurrently read files in memory.
-
- Returns:
- A pandas dataframe or an iterable.
- """
- loop = asyncio.get_running_loop()
- return await loop.run_in_executor(pool, self.read)
-
- def read(self) -> pd.DataFrame:
- """Reads data source and returns a schema validated dataframe (by means of _apply_schema).
-
- Returns:
- A pandas dataframe or an iterable.
- """
- source_name = self.sources_config.get("type")
- df = getattr(self, f"_read_from_{source_name}")()
-
- df = self._apply_schema(df)
- if self.apply_schema_validations:
- self.validate_from_schema(df)
- if self.log_schema_metrics:
- self.log_metrics_from_schema(df)
-
- return df
-
- async def async_write(self, df: pd.DataFrame):
- """Allows the use of asyncio to concurrently write files out.
-
- Args:
- df: The data to be written
- """
- loop = asyncio.get_running_loop()
- return await loop.run_in_executor(pool, self.write, df)
-
- def write(self, df: pd.DataFrame):
- """Sink data to a given source based on the sources_config.
-
- Args:
- df: The data to be written
- """
- source_name = self.sources_config.get("type")
- if set(df.columns) != set(self.schema.keys()): # pylint: disable=E1101
- columns = [column for column in df.columns.to_list() if column in self.schema.keys()]
- df = df[columns]
-
- if self.apply_schema_validations:
- self.validate_from_schema(df)
- if self.log_schema_metrics:
- self.log_metrics_from_schema(df)
-
- getattr(self, f"_write_to_{source_name}")(self._apply_schema(df))
-
- def validate_from_schema(self, df: pd.DataFrame) -> "DynamicDataIO":
- """Validates a dataframe based on the validations present in its schema definition.
-
- All validations are checked and if any of them fails, a `SchemaValidationError` is raised.
-
- Args:
- df:
-
- Returns:
- self (to allow for method chaining).
-
- Raises:
- SchemaValidationError: if any of the validations failed. The `message` attribute of
- the exception object is a `List[str]`, where each element is the name of a
- validation that failed.
- """
- if not hasattr(self, "schema_validations"):
- raise MissingSchemaDefinition(self.__class__)
-
- failed_validations = {}
- for column in self.schema_validations.keys():
- for validation in self.schema_validations[column].keys():
- if self.schema_validations[column][validation]["apply"] is True:
- validation_result = getattr(validations, validation)(self.name, df, column, **self.schema_validations[column][validation]["options"])
- if not validation_result.valid:
- failed_validations[validation] = validation_result.message
-
- if len(failed_validations) > 0:
- raise SchemaValidationError(failed_validations)
-
- return self
-
- def log_metrics_from_schema(self, df: pd.DataFrame) -> "DynamicDataIO":
- """Calculates and logs metrics based on the metrics present in its schema definition.
-
- Args:
- df: A dataframe for which metrics are generated and logged
-
- Returns:
- self (to allow for method chaining).
- """
- if not hasattr(self, "schema_metrics"):
- raise MissingSchemaDefinition(self.__class__)
-
- for column in self.schema_metrics.keys():
- for metric in self.schema_metrics[column]:
- get_metric(metric)(self.name, df, column)() # type: ignore
-
- return self
-
- def _apply_schema(self, df: pd.DataFrame) -> pd.DataFrame:
- """Called by the `self.read()` and the `self._write_to_local()` methods.
-
- Contrasts a dataframe's read from a given source against the class's schema dictionary,
- checking that columns are the same (by means of _has_columns and _has_valid_dtypes). Then,
- check if the columns are fine, it further validates if the types of columns conform to the
- expected schema. Finally, if schema types are different, then it attempts to apply schema;
- if possible then the schema validation is successful.
-
- Args:
- df: A pandas dataframe.
-
- Returns:
- A schema validated dataframe.
- """
- if not self._has_valid_dtypes(df):
- raise ColumnsDataTypeError()
- return df
-
- @staticmethod
- def _transform_class_name_to_dataset_name(string_to_transform: str) -> str:
- """Called by the init function to fetch dataset names from class name.
-
- Used to create dataset name from class name, turns camel case into upper snake case.
- For example: 'ThisNameABC' -> 'THIS_NAME_ABC'.
- """
- words = re.findall(r"\d[A-Z]+|[A-Z]?[a-z\d]+|[A-Z]{2,}(?=[A-Z][a-z]|\d|\W|$)|\d+|[A-Z]{2,}|[A-Z]", string_to_transform)
- return "_".join(map(str.lower, words)).upper()
-
- def _has_valid_dtypes(self, df: pd.DataFrame) -> bool:
- """Checks if `df` has the expected dtypes defined in `schema`.
-
- Schema is a dictionary object where keys are column names and values are dtypes in string format as returned by e.g.
- `df[column].dtype.name`.
-
- This function issues `error` level logs describing the first column that caused the check to fail.
-
- It is assumed that `df` only has the columns defined in `schema`.
-
- Args:
- df:
-
- Returns:
- bool - `True` if `df` has the given dtypes, `False` otherwise
- """
- dtypes = df.dtypes
-
- for column_name, expected_dtype in self.schema.items():
- found_dtype = dtypes[column_name].name
- if found_dtype != expected_dtype:
- if self.show_casting_warnings:
- logger.info(f"Expected: '{expected_dtype}' dtype for {self.name}['{column_name}]', found '{found_dtype}'")
- try:
- if len(set(type(v) for v in df[column_name].values)) > 1: # pylint: disable=consider-using-set-comprehension
- logger.warning(CASTING_WARNING_MSG.format(column_name, expected_dtype, found_dtype)) # pylint: disable=logging-format-interpolation
- logger.info(NOTICE_MSG.format(column_name)) # pylint: disable=logging-format-interpolation
- df[column_name] = df[column_name].astype(self.schema[column_name])
- except (ValueError, TypeError):
- logger.error(f"ValueError: Tried casting column {self.name}['{column_name}]' to '{expected_dtype}' " f"from '{found_dtype}', but failed")
- return False
- return True
-
- @staticmethod
- def _get_options(options_from_code: MutableMapping[str, Any], options_from_resource_definition: Optional[Mapping[str, Any]]) -> MutableMapping[str, Any]:
- """Retrieves options either from code or from a resource-definition.
-
- Options are merged if they are provided by both sources, while in the case of conflicts, the options from the code
- take precedence.
-
- Args:
- options_from_code (Optional[Mapping])
- options_from_resource_definition (Optional[Mapping])
-
- Returns:
- [Optional[Mapping]]: options that are going to be used
- """
- if options_from_resource_definition:
- return {**options_from_resource_definition, **options_from_code}
- return options_from_code
Given a src.utils.dynamicio.config.IOConfig object, it generates an object with access to a series of methods for cloud I/O operations and data validations.
Configuration to use when reading/writing data from/to a source
-
apply_schema_validations
-
Applies schema validations on either read() or write()
-
log_schema_metrics
-
Logs schema metrics on either read() or write()
-
show_casting_warnings
-
Logs casting warnings on either read() or write() if set to True
-
options
-
Any additional kwargs that may be used throughout the lifecycle of the object
-
-
-
-Expand source code
-
-
class DynamicDataIO:
- """Given a `src.utils.dynamicio.config.IOConfig` object, it generates an object with access to a series of methods for cloud I/O operations and data validations.
-
- Example:
- >>> input_sources_config = IOConfig(
- >>> "path_to/input.yaml",
- >>> os.getenv("ENVIRONMENT",default="LOCAL")
- >>> )
- >>>
- >>> class IO(WithS3File, WithLocal, DynamicDataIO):
- >>> schema = S
- >>>
- >>> my_dataset_local_mapping = input_config.get(source_key="MY_DATASET")
- >>> my_dataset_io = IO(my_dataset_local_mapping)
- >>> my_dataset_df = my_dataset_io.read()
- """
-
- schema: Mapping
-
- def __init__(
- self,
- source_config: Mapping,
- apply_schema_validations: bool = False,
- log_schema_metrics: bool = False,
- show_casting_warnings: bool = False,
- **options: MutableMapping[str, Any],
- ):
- """Class constructor.
-
- Args:
- source_config: Configuration to use when reading/writing data from/to a source
- apply_schema_validations: Applies schema validations on either read() or write()
- log_schema_metrics: Logs schema metrics on either read() or write()
- show_casting_warnings: Logs casting warnings on either read() or write() if set to True
- options: Any additional kwargs that may be used throughout the lifecycle of the object
- """
- if type(self) is DynamicDataIO: # pylint: disable=unidiomatic-typecheck
- raise TypeError("Abstract class DynamicDataIO cannot be used to instantiate an object...")
-
- self.sources_config = source_config
- self.name = self._transform_class_name_to_dataset_name(self.__class__.__name__)
- self.apply_schema_validations = apply_schema_validations
- self.log_schema_metrics = log_schema_metrics
- self.show_casting_warnings = show_casting_warnings
- self.options = self._get_options(options, source_config.get("options"))
- source_name = self.sources_config.get("type")
- if self.schema is SCHEMA_FROM_FILE:
- try:
- self.schema = self.sources_config["schema"]
- self.name = self.sources_config["name"].upper()
- self.schema_validations = self.sources_config["validations"]
- self.schema_metrics = self.sources_config["metrics"]
- except KeyError as _error:
- raise SchemaNotFoundError() from _error
-
- assert hasattr(self, f"_read_from_{source_name}") or hasattr(
- self, f"_write_to_{source_name}"
- ), f"No method '_read_from_{source_name}' or '_write_to_{source_name}'. Have you registered a mixin for {source_name}?"
-
- def __init_subclass__(cls):
- """Ensure that all subclasses have a `schema` attribute and a `validate` method.
-
- Raises:
- AssertionError: If either of the attributes is not implemented
- """
- if not inspect.getmodule(cls).__name__.startswith("dynamicio"):
- assert "schema" in cls.__dict__
-
- if cls.schema is None or (cls.schema is not SCHEMA_FROM_FILE and len(cls.schema) == 0):
- raise ValueError(f"schema for class {cls} cannot be None or empty...")
-
- async def async_read(self):
- """Allows the use of asyncio to concurrently read files in memory.
-
- Returns:
- A pandas dataframe or an iterable.
- """
- loop = asyncio.get_running_loop()
- return await loop.run_in_executor(pool, self.read)
-
- def read(self) -> pd.DataFrame:
- """Reads data source and returns a schema validated dataframe (by means of _apply_schema).
-
- Returns:
- A pandas dataframe or an iterable.
- """
- source_name = self.sources_config.get("type")
- df = getattr(self, f"_read_from_{source_name}")()
-
- df = self._apply_schema(df)
- if self.apply_schema_validations:
- self.validate_from_schema(df)
- if self.log_schema_metrics:
- self.log_metrics_from_schema(df)
-
- return df
-
- async def async_write(self, df: pd.DataFrame):
- """Allows the use of asyncio to concurrently write files out.
-
- Args:
- df: The data to be written
- """
- loop = asyncio.get_running_loop()
- return await loop.run_in_executor(pool, self.write, df)
-
- def write(self, df: pd.DataFrame):
- """Sink data to a given source based on the sources_config.
-
- Args:
- df: The data to be written
- """
- source_name = self.sources_config.get("type")
- if set(df.columns) != set(self.schema.keys()): # pylint: disable=E1101
- columns = [column for column in df.columns.to_list() if column in self.schema.keys()]
- df = df[columns]
-
- if self.apply_schema_validations:
- self.validate_from_schema(df)
- if self.log_schema_metrics:
- self.log_metrics_from_schema(df)
-
- getattr(self, f"_write_to_{source_name}")(self._apply_schema(df))
-
- def validate_from_schema(self, df: pd.DataFrame) -> "DynamicDataIO":
- """Validates a dataframe based on the validations present in its schema definition.
-
- All validations are checked and if any of them fails, a `SchemaValidationError` is raised.
-
- Args:
- df:
-
- Returns:
- self (to allow for method chaining).
-
- Raises:
- SchemaValidationError: if any of the validations failed. The `message` attribute of
- the exception object is a `List[str]`, where each element is the name of a
- validation that failed.
- """
- if not hasattr(self, "schema_validations"):
- raise MissingSchemaDefinition(self.__class__)
-
- failed_validations = {}
- for column in self.schema_validations.keys():
- for validation in self.schema_validations[column].keys():
- if self.schema_validations[column][validation]["apply"] is True:
- validation_result = getattr(validations, validation)(self.name, df, column, **self.schema_validations[column][validation]["options"])
- if not validation_result.valid:
- failed_validations[validation] = validation_result.message
-
- if len(failed_validations) > 0:
- raise SchemaValidationError(failed_validations)
-
- return self
-
- def log_metrics_from_schema(self, df: pd.DataFrame) -> "DynamicDataIO":
- """Calculates and logs metrics based on the metrics present in its schema definition.
-
- Args:
- df: A dataframe for which metrics are generated and logged
-
- Returns:
- self (to allow for method chaining).
- """
- if not hasattr(self, "schema_metrics"):
- raise MissingSchemaDefinition(self.__class__)
-
- for column in self.schema_metrics.keys():
- for metric in self.schema_metrics[column]:
- get_metric(metric)(self.name, df, column)() # type: ignore
-
- return self
-
- def _apply_schema(self, df: pd.DataFrame) -> pd.DataFrame:
- """Called by the `self.read()` and the `self._write_to_local()` methods.
-
- Contrasts a dataframe's read from a given source against the class's schema dictionary,
- checking that columns are the same (by means of _has_columns and _has_valid_dtypes). Then,
- check if the columns are fine, it further validates if the types of columns conform to the
- expected schema. Finally, if schema types are different, then it attempts to apply schema;
- if possible then the schema validation is successful.
-
- Args:
- df: A pandas dataframe.
-
- Returns:
- A schema validated dataframe.
- """
- if not self._has_valid_dtypes(df):
- raise ColumnsDataTypeError()
- return df
-
- @staticmethod
- def _transform_class_name_to_dataset_name(string_to_transform: str) -> str:
- """Called by the init function to fetch dataset names from class name.
-
- Used to create dataset name from class name, turns camel case into upper snake case.
- For example: 'ThisNameABC' -> 'THIS_NAME_ABC'.
- """
- words = re.findall(r"\d[A-Z]+|[A-Z]?[a-z\d]+|[A-Z]{2,}(?=[A-Z][a-z]|\d|\W|$)|\d+|[A-Z]{2,}|[A-Z]", string_to_transform)
- return "_".join(map(str.lower, words)).upper()
-
- def _has_valid_dtypes(self, df: pd.DataFrame) -> bool:
- """Checks if `df` has the expected dtypes defined in `schema`.
-
- Schema is a dictionary object where keys are column names and values are dtypes in string format as returned by e.g.
- `df[column].dtype.name`.
-
- This function issues `error` level logs describing the first column that caused the check to fail.
-
- It is assumed that `df` only has the columns defined in `schema`.
-
- Args:
- df:
-
- Returns:
- bool - `True` if `df` has the given dtypes, `False` otherwise
- """
- dtypes = df.dtypes
-
- for column_name, expected_dtype in self.schema.items():
- found_dtype = dtypes[column_name].name
- if found_dtype != expected_dtype:
- if self.show_casting_warnings:
- logger.info(f"Expected: '{expected_dtype}' dtype for {self.name}['{column_name}]', found '{found_dtype}'")
- try:
- if len(set(type(v) for v in df[column_name].values)) > 1: # pylint: disable=consider-using-set-comprehension
- logger.warning(CASTING_WARNING_MSG.format(column_name, expected_dtype, found_dtype)) # pylint: disable=logging-format-interpolation
- logger.info(NOTICE_MSG.format(column_name)) # pylint: disable=logging-format-interpolation
- df[column_name] = df[column_name].astype(self.schema[column_name])
- except (ValueError, TypeError):
- logger.error(f"ValueError: Tried casting column {self.name}['{column_name}]' to '{expected_dtype}' " f"from '{found_dtype}', but failed")
- return False
- return True
-
- @staticmethod
- def _get_options(options_from_code: MutableMapping[str, Any], options_from_resource_definition: Optional[Mapping[str, Any]]) -> MutableMapping[str, Any]:
- """Retrieves options either from code or from a resource-definition.
-
- Options are merged if they are provided by both sources, while in the case of conflicts, the options from the code
- take precedence.
-
- Args:
- options_from_code (Optional[Mapping])
- options_from_resource_definition (Optional[Mapping])
-
- Returns:
- [Optional[Mapping]]: options that are going to be used
- """
- if options_from_resource_definition:
- return {**options_from_resource_definition, **options_from_code}
- return options_from_code
Allows the use of asyncio to concurrently read files in memory.
-
Returns
-
A pandas dataframe or an iterable.
-
-
-Expand source code
-
-
async def async_read(self):
- """Allows the use of asyncio to concurrently read files in memory.
-
- Returns:
- A pandas dataframe or an iterable.
- """
- loop = asyncio.get_running_loop()
- return await loop.run_in_executor(pool, self.read)
Allows the use of asyncio to concurrently write files out.
-
Args
-
-
df
-
The data to be written
-
-
-
-Expand source code
-
-
async def async_write(self, df: pd.DataFrame):
- """Allows the use of asyncio to concurrently write files out.
-
- Args:
- df: The data to be written
- """
- loop = asyncio.get_running_loop()
- return await loop.run_in_executor(pool, self.write, df)
Calculates and logs metrics based on the metrics present in its schema definition.
-
Args
-
-
df
-
A dataframe for which metrics are generated and logged
-
-
Returns
-
self (to allow for method chaining).
-
-
-Expand source code
-
-
def log_metrics_from_schema(self, df: pd.DataFrame) -> "DynamicDataIO":
- """Calculates and logs metrics based on the metrics present in its schema definition.
-
- Args:
- df: A dataframe for which metrics are generated and logged
-
- Returns:
- self (to allow for method chaining).
- """
- if not hasattr(self, "schema_metrics"):
- raise MissingSchemaDefinition(self.__class__)
-
- for column in self.schema_metrics.keys():
- for metric in self.schema_metrics[column]:
- get_metric(metric)(self.name, df, column)() # type: ignore
-
- return self
-
-
-
-def read(self) ‑> pandas.core.frame.DataFrame
-
-
-
Reads data source and returns a schema validated dataframe (by means of _apply_schema).
-
Returns
-
A pandas dataframe or an iterable.
-
-
-Expand source code
-
-
def read(self) -> pd.DataFrame:
- """Reads data source and returns a schema validated dataframe (by means of _apply_schema).
-
- Returns:
- A pandas dataframe or an iterable.
- """
- source_name = self.sources_config.get("type")
- df = getattr(self, f"_read_from_{source_name}")()
-
- df = self._apply_schema(df)
- if self.apply_schema_validations:
- self.validate_from_schema(df)
- if self.log_schema_metrics:
- self.log_metrics_from_schema(df)
-
- return df
Validates a dataframe based on the validations present in its schema definition.
-
All validations are checked and if any of them fails, a SchemaValidationError is raised.
-
Args
-
df:
-
Returns
-
self (to allow for method chaining).
-
Raises
-
-
SchemaValidationError
-
if any of the validations failed. The message attribute of
-the exception object is a List[str], where each element is the name of a
-validation that failed.
-
-
-
-Expand source code
-
-
def validate_from_schema(self, df: pd.DataFrame) -> "DynamicDataIO":
- """Validates a dataframe based on the validations present in its schema definition.
-
- All validations are checked and if any of them fails, a `SchemaValidationError` is raised.
-
- Args:
- df:
-
- Returns:
- self (to allow for method chaining).
-
- Raises:
- SchemaValidationError: if any of the validations failed. The `message` attribute of
- the exception object is a `List[str]`, where each element is the name of a
- validation that failed.
- """
- if not hasattr(self, "schema_validations"):
- raise MissingSchemaDefinition(self.__class__)
-
- failed_validations = {}
- for column in self.schema_validations.keys():
- for validation in self.schema_validations[column].keys():
- if self.schema_validations[column][validation]["apply"] is True:
- validation_result = getattr(validations, validation)(self.name, df, column, **self.schema_validations[column][validation]["options"])
- if not validation_result.valid:
- failed_validations[validation] = validation_result.message
-
- if len(failed_validations) > 0:
- raise SchemaValidationError(failed_validations)
-
- return self
Sink data to a given source based on the sources_config.
-
Args
-
-
df
-
The data to be written
-
-
-
-Expand source code
-
-
def write(self, df: pd.DataFrame):
- """Sink data to a given source based on the sources_config.
-
- Args:
- df: The data to be written
- """
- source_name = self.sources_config.get("type")
- if set(df.columns) != set(self.schema.keys()): # pylint: disable=E1101
- columns = [column for column in df.columns.to_list() if column in self.schema.keys()]
- df = df[columns]
-
- if self.apply_schema_validations:
- self.validate_from_schema(df)
- if self.log_schema_metrics:
- self.log_metrics_from_schema(df)
-
- getattr(self, f"_write_to_{source_name}")(self._apply_schema(df))
Hosts exception implementations for different errors.
-
-
-Expand source code
-
-
"""Hosts exception implementations for different errors."""
-# pylint: disable=missing-module-docstring, missing-class-docstring, missing-function-docstring, super-init-not-called
-__all__ = [
- "DynamicIOError",
- "DataSourceError",
- "ColumnsDataTypeError",
- "NonUniqueIdColumnError",
- "NullValueInColumnError",
- "NotExpectedCategoricalValue",
- "MissingSchemaDefinition",
- "SchemaNotFoundError",
- "SchemaValidationError",
- "InvalidDatasetTypeError",
- "CASTING_WARNING_MSG",
- "NOTICE_MSG",
-]
-
-from typing import Any, Optional
-
-
-class DynamicIOError(Exception):
- """Base class for DynamicIO errors."""
-
- ERROR_STR: str = ""
- ERROR_STR_DETAILED: str = "{0}"
-
- @property
- def message(self) -> Optional[Any]:
- """Easy access for optional message argument.
-
- Returns:
- Message or `None` if not set
- """
- try:
- return self.args[0]
- except IndexError:
- return None
-
- def __str__(self):
- """Enrich and return error message."""
- message = self.message
-
- if message is None:
- return self.ERROR_STR
-
- return self.ERROR_STR_DETAILED.format(message)
-
-
-class SchemaNotFoundError(DynamicIOError):
- """Error raised when schema is not specified in the provided source."""
-
- ERROR_STR = "Schema not specified in the provided source"
- ERROR_STR_DETAILED = "Schema not specified in the provided source: {0} "
-
-
-class SchemaValidationError(DynamicIOError):
- """Error raised when schema validation fails."""
-
-
-class MissingSchemaDefinition(DynamicIOError):
- """Error raised when schema is not specified in the provided source."""
-
- ERROR_STR = "The resource definition for this class is missing a schema definition"
- ERROR_STR_DETAILED = "The resource definition for this class is missing a schema definition: {0}"
-
-
-class DataSourceError(DynamicIOError):
- """Error raised when the data source fails to load."""
-
-
-class ColumnsDataTypeError(DynamicIOError):
- """Error raised when the validated data does not have the expected data types."""
-
-
-class NonUniqueIdColumnError(DynamicIOError):
- """Error raised when the data source fails to load."""
-
-
-class NullValueInColumnError(DynamicIOError):
- """Error raised when the data source fails to load."""
-
-
-class NotExpectedCategoricalValue(DynamicIOError):
- """Error raised when the data source fails to load."""
-
-
-class InvalidDatasetTypeError(DynamicIOError):
- """Error raised when dataset type is not one of [parquet, json, csv, h5]."""
-
- ERROR_STR = "The dataset provided is not amongst the supported types (parquet, json, csv, h5) handled by dynamicio."
- ERROR_STR_DETAILED = "Dataset: {0} provided is not amongst the supported types (parquet, json, csv, h5) handled by dynamicio."
-
-
-# Warning messages
-CASTING_WARNING_MSG = "Applying casting column: '{0}' to: 'type:{1}' from 'type:{2}' though not advised, as `dtypes`>1 for {0}, which may lead to data corruption!"
-NOTICE_MSG = "Keeping the {0} as is, may anyway cause I/O errors or data corruption issues especially when using `pandas.DataFrame.to_parquet` or `pandas.DataFrame.to_json`."
-
-
-
-
-
-
-
-
-
-
Classes
-
-
-class ColumnsDataTypeError
-(*args, **kwargs)
-
-
-
Error raised when the validated data does not have the expected data types.
-
-
-Expand source code
-
-
class ColumnsDataTypeError(DynamicIOError):
- """Error raised when the validated data does not have the expected data types."""
Error raised when dataset type is not one of [parquet, json, csv, h5].
-
-
-Expand source code
-
-
class InvalidDatasetTypeError(DynamicIOError):
- """Error raised when dataset type is not one of [parquet, json, csv, h5]."""
-
- ERROR_STR = "The dataset provided is not amongst the supported types (parquet, json, csv, h5) handled by dynamicio."
- ERROR_STR_DETAILED = "Dataset: {0} provided is not amongst the supported types (parquet, json, csv, h5) handled by dynamicio."
Error raised when schema is not specified in the provided source.
-
-
-Expand source code
-
-
class MissingSchemaDefinition(DynamicIOError):
- """Error raised when schema is not specified in the provided source."""
-
- ERROR_STR = "The resource definition for this class is missing a schema definition"
- ERROR_STR_DETAILED = "The resource definition for this class is missing a schema definition: {0}"
Error raised when schema is not specified in the provided source.
-
-
-Expand source code
-
-
class SchemaNotFoundError(DynamicIOError):
- """Error raised when schema is not specified in the provided source."""
-
- ERROR_STR = "Schema not specified in the provided source"
- ERROR_STR_DETAILED = "Schema not specified in the provided source: {0} "
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/docs/metrics.html b/docs/metrics.html
deleted file mode 100644
index 30702b1..0000000
--- a/docs/metrics.html
+++ /dev/null
@@ -1,929 +0,0 @@
-
-
-
-
-
-
-dynamicio.metrics API documentation
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
Module dynamicio.metrics
-
-
-
A module responsible for metrics generation and logging.
-
-
-Expand source code
-
-
"""A module responsible for metrics generation and logging."""
-# pylint: disable=missing-function-docstring,missing-class-docstring
-import json
-import logging
-import sys
-from numbers import Number
-from typing import Any, Dict, Mapping, Type
-
-import pandas as pd # type: ignore
-from magic_logger import logger
-from pythonjsonlogger import jsonlogger # type: ignore
-
-logHandler = logging.StreamHandler(sys.stdout)
-formatter = jsonlogger.JsonFormatter()
-logHandler.setFormatter(formatter)
-logger.addHandler(logHandler)
-
-
-__metrics__: Dict[str, Type["Metric"]] = {}
-
-
-def get_metric(name: str) -> Type["Metric"]:
- return __metrics__[name]
-
-
-def log_metric(dataset: str, column: str, metric: str, value: float):
- """Logs a metric in a structured way for a given dataset column.
-
- Args:
- dataset: The dataset for which the metric is logged
- column: Column for which the metric is logged
- metric: name fo the metric, e.g. "unique_vals"
- value: The metric's value, e.g. "10000"
- """
- logger.info(json.dumps({"message": "METRIC", "dataset": dataset, "column": column, "metric": metric, "value": float(value)}))
-
-
-class Metric:
- """A base class for implementing metrics classes."""
-
- def __init__(self, dataset_name: str, df: pd.DataFrame, column: str): # noqa
- self.dataset_name = dataset_name
- self.df = df
- self.column = column
-
- def __init_subclass__(cls): # noqa
- __metrics__[cls.__name__] = cls
- assert "calculate_metric" in cls.__dict__
-
- def __call__(self) -> Any: # noqa
- metric_value = self.calculate_metric()
-
- if isinstance(metric_value, Mapping):
- for entity in sorted(metric_value.keys()): # pylint: disable=no-member
- column = metric_value[entity] # pylint: disable=unsubscriptable-object
- log_metric(self.dataset_name, entity, self.metric_name, column)
- else:
- log_metric(dataset=self.dataset_name, column=self.column, metric=self.metric_name, value=metric_value)
- return metric_value
-
- @property
- def metric_name(self) -> str:
- """Retrieves the name of the metric from the class name.
-
- Returns:
- The name of the metric, e.g. "Min or Mean".
- """
- return self.__class__.__name__
-
- def calculate_metric(self) -> Any:
- """Dictates that subclasses need to implement this method.
-
- Returns:
- NotImplemented is returned if the method is not implemented, by the subclass
- inevitably pointing to the parent implementation.
- """
- return NotImplemented
-
-
-class Min(Metric):
- """A metric instance that enables generating and returning the minimum value of a column."""
-
- def calculate_metric(self) -> Number:
- """Generate and return the minimum value of a column.
-
- Returns:
- The minimum value of a column.
- """
- return self.df[self.column].min()
-
-
-class Max(Metric):
- """A metric instance that enables generating and returning the maximum value of a column."""
-
- def calculate_metric(self) -> Number:
- """Generate and return the maximum value of a column.
-
- Returns:
- The maximum value of a column.
- """
- return self.df[self.column].max()
-
-
-class Mean(Metric):
- """A metric instance that enables generating and returning the mean value of a column."""
-
- def calculate_metric(self) -> Number:
- """Generate and return the mean value of a column.
-
- Returns:
- The mean value of a column.
- """
- return self.df[self.column].mean()
-
-
-class Std(Metric):
- """A metric instance that enables generating and returning the standard deviation of a column."""
-
- def calculate_metric(self) -> Number:
- """Generate and return the standard deviation of a column.
-
- Returns:
- The standard deviation of a column.
- """
- return self.df[self.column].std()
-
-
-class Variance(Metric):
- """A metric instance that generated and returns the variance of a column."""
-
- def calculate_metric(self) -> Number:
- """Generate and return the variance of a column.
-
- Returns:
- The variance of a column.
- """
- return self.df[self.column].var()
-
-
-class Counts(Metric):
- """A metric instance that enables generating and returning the length of a column."""
-
- def calculate_metric(self) -> int:
- """Generate and return the length of a column.
-
- Returns:
- The length of a column.
- """
- return len(self.df[self.column])
-
-
-class UniqueCounts(Metric):
- """A metric instance that enables generating and returning the unique values of a column."""
-
- def calculate_metric(self) -> int:
- """Generate and return the unique values of a column.
-
- Returns:
- The unique values of a column.
- """
- return len(self.df[self.column].unique())
-
-
-class CountsPerLabel(Metric):
- """A metric instance that enables generating and returning the counts per label in a categorical column."""
-
- def calculate_metric(self) -> Mapping:
- """Generate and return the counts per label in a categorical column.
-
- Returns:
- The counts per label in a categorical column
- """
- column_vs_metric_value = self.df[self.column].value_counts().to_dict()
- label_vs_metric_value_with_column_prefix = {}
- for key in column_vs_metric_value.keys():
- new_key = self.column + "-" + key
- label_vs_metric_value_with_column_prefix[new_key] = column_vs_metric_value[key]
- return label_vs_metric_value_with_column_prefix
Logs a metric in a structured way for a given dataset column.
-
Args
-
-
dataset
-
The dataset for which the metric is logged
-
column
-
Column for which the metric is logged
-
metric
-
name fo the metric, e.g. "unique_vals"
-
value
-
The metric's value, e.g. "10000"
-
-
-
-Expand source code
-
-
def log_metric(dataset: str, column: str, metric: str, value: float):
- """Logs a metric in a structured way for a given dataset column.
-
- Args:
- dataset: The dataset for which the metric is logged
- column: Column for which the metric is logged
- metric: name fo the metric, e.g. "unique_vals"
- value: The metric's value, e.g. "10000"
- """
- logger.info(json.dumps({"message": "METRIC", "dataset": dataset, "column": column, "metric": metric, "value": float(value)}))
A metric instance that enables generating and returning the length of a column.
-
-
-Expand source code
-
-
class Counts(Metric):
- """A metric instance that enables generating and returning the length of a column."""
-
- def calculate_metric(self) -> int:
- """Generate and return the length of a column.
-
- Returns:
- The length of a column.
- """
- return len(self.df[self.column])
def calculate_metric(self) -> int:
- """Generate and return the length of a column.
-
- Returns:
- The length of a column.
- """
- return len(self.df[self.column])
A metric instance that enables generating and returning the counts per label in a categorical column.
-
-
-Expand source code
-
-
class CountsPerLabel(Metric):
- """A metric instance that enables generating and returning the counts per label in a categorical column."""
-
- def calculate_metric(self) -> Mapping:
- """Generate and return the counts per label in a categorical column.
-
- Returns:
- The counts per label in a categorical column
- """
- column_vs_metric_value = self.df[self.column].value_counts().to_dict()
- label_vs_metric_value_with_column_prefix = {}
- for key in column_vs_metric_value.keys():
- new_key = self.column + "-" + key
- label_vs_metric_value_with_column_prefix[new_key] = column_vs_metric_value[key]
- return label_vs_metric_value_with_column_prefix
-class Max
-(dataset_name: str, df: pandas.core.frame.DataFrame, column: str)
-
-
-
A metric instance that enables generating and returning the maximum value of a column.
-
-
-Expand source code
-
-
class Max(Metric):
- """A metric instance that enables generating and returning the maximum value of a column."""
-
- def calculate_metric(self) -> Number:
- """Generate and return the maximum value of a column.
-
- Returns:
- The maximum value of a column.
- """
- return self.df[self.column].max()
Generate and return the maximum value of a column.
-
Returns
-
The maximum value of a column.
-
-
-Expand source code
-
-
def calculate_metric(self) -> Number:
- """Generate and return the maximum value of a column.
-
- Returns:
- The maximum value of a column.
- """
- return self.df[self.column].max()
-class Mean
-(dataset_name: str, df: pandas.core.frame.DataFrame, column: str)
-
-
-
A metric instance that enables generating and returning the mean value of a column.
-
-
-Expand source code
-
-
class Mean(Metric):
- """A metric instance that enables generating and returning the mean value of a column."""
-
- def calculate_metric(self) -> Number:
- """Generate and return the mean value of a column.
-
- Returns:
- The mean value of a column.
- """
- return self.df[self.column].mean()
def calculate_metric(self) -> Number:
- """Generate and return the mean value of a column.
-
- Returns:
- The mean value of a column.
- """
- return self.df[self.column].mean()
Retrieves the name of the metric from the class name.
-
Returns
-
The name of the metric, e.g. "Min or Mean".
-
-
-Expand source code
-
-
@property
-def metric_name(self) -> str:
- """Retrieves the name of the metric from the class name.
-
- Returns:
- The name of the metric, e.g. "Min or Mean".
- """
- return self.__class__.__name__
-
-
-
-
Methods
-
-
-def calculate_metric(self) ‑> Any
-
-
-
Dictates that subclasses need to implement this method.
-
Returns
-
NotImplemented is returned if the method is not implemented, by the subclass
-inevitably pointing to the parent implementation.
-
-
-Expand source code
-
-
def calculate_metric(self) -> Any:
- """Dictates that subclasses need to implement this method.
-
- Returns:
- NotImplemented is returned if the method is not implemented, by the subclass
- inevitably pointing to the parent implementation.
- """
- return NotImplemented
-
-
-
-
-
-class Min
-(dataset_name: str, df: pandas.core.frame.DataFrame, column: str)
-
-
-
A metric instance that enables generating and returning the minimum value of a column.
-
-
-Expand source code
-
-
class Min(Metric):
- """A metric instance that enables generating and returning the minimum value of a column."""
-
- def calculate_metric(self) -> Number:
- """Generate and return the minimum value of a column.
-
- Returns:
- The minimum value of a column.
- """
- return self.df[self.column].min()
Generate and return the minimum value of a column.
-
Returns
-
The minimum value of a column.
-
-
-Expand source code
-
-
def calculate_metric(self) -> Number:
- """Generate and return the minimum value of a column.
-
- Returns:
- The minimum value of a column.
- """
- return self.df[self.column].min()
A metric instance that enables generating and returning the standard deviation of a column.
-
-
-Expand source code
-
-
class Std(Metric):
- """A metric instance that enables generating and returning the standard deviation of a column."""
-
- def calculate_metric(self) -> Number:
- """Generate and return the standard deviation of a column.
-
- Returns:
- The standard deviation of a column.
- """
- return self.df[self.column].std()
Generate and return the standard deviation of a column.
-
Returns
-
The standard deviation of a column.
-
-
-Expand source code
-
-
def calculate_metric(self) -> Number:
- """Generate and return the standard deviation of a column.
-
- Returns:
- The standard deviation of a column.
- """
- return self.df[self.column].std()
A metric instance that enables generating and returning the unique values of a column.
-
-
-Expand source code
-
-
class UniqueCounts(Metric):
- """A metric instance that enables generating and returning the unique values of a column."""
-
- def calculate_metric(self) -> int:
- """Generate and return the unique values of a column.
-
- Returns:
- The unique values of a column.
- """
- return len(self.df[self.column].unique())
Generate and return the unique values of a column.
-
Returns
-
The unique values of a column.
-
-
-Expand source code
-
-
def calculate_metric(self) -> int:
- """Generate and return the unique values of a column.
-
- Returns:
- The unique values of a column.
- """
- return len(self.df[self.column].unique())
A metric instance that generated and returns the variance of a column.
-
-
-Expand source code
-
-
class Variance(Metric):
- """A metric instance that generated and returns the variance of a column."""
-
- def calculate_metric(self) -> Number:
- """Generate and return the variance of a column.
-
- Returns:
- The variance of a column.
- """
- return self.df[self.column].var()
def calculate_metric(self) -> Number:
- """Generate and return the variance of a column.
-
- Returns:
- The variance of a column.
- """
- return self.df[self.column].var()
"""Mixin utility functions"""
-# pylint: disable=no-member, protected-access, too-few-public-methods
-
-import inspect
-import string
-from functools import wraps
-from types import FunctionType, MethodType
-from typing import Any, Collection, Iterable, Mapping, MutableMapping, Union
-
-from magic_logger import logger
-
-
-def allow_options(options: Union[Iterable, FunctionType, MethodType]):
- """Validate **options for a decorated reader function.
-
- Args:
- options: A set of valid options for a reader (e.g. `pandas.read_parquet` or `pandas.read_csv`)
-
- Returns:
- read_with_valid_options: The input function called with modified options.
- """
-
- def _filter_out_irrelevant_options(kwargs: Mapping, valid_options: Iterable):
- filtered_options = {}
- invalid_options = {}
- for key_arg in kwargs.keys():
- if key_arg in valid_options:
- filtered_options[key_arg] = kwargs[key_arg]
- else:
- invalid_options[key_arg] = kwargs[key_arg]
- if len(invalid_options) > 0:
- logger.warning(
- f"Options {invalid_options} were not used because they were not supported by the read or write method configured for this source. "
- "Check if you expected any of those to have been used by the operation!"
- )
- return filtered_options
-
- def read_with_valid_options(func):
- @wraps(func)
- def _(*args, **kwargs):
- if callable(options):
- return func(*args, **_filter_out_irrelevant_options(kwargs, args_of(options)))
- return func(*args, **_filter_out_irrelevant_options(kwargs, options))
-
- return _
-
- return read_with_valid_options
-
-
-def args_of(func):
- """Retrieve allowed options for a given function.
-
- Args:
- func: A function like, e.g., pd.read_csv
-
- Returns:
- A set of allowed options
- """
- return set(inspect.signature(func).parameters.keys())
-
-
-def get_string_template_field_names(s: str) -> Collection[str]: # pylint: disable=C0103
- """Given a string `s`, it parses the string to identify any template fields and returns the names of those fields.
-
- If `s` is not a string template, the returned `Collection` is empty.
-
- Args:
- s:
-
- Returns:
- Collection[str]
-
- Example:
-
- >>> get_string_template_field_names("abc{def}{efg}")
- ["def", "efg"]
- >>> get_string_template_field_names("{0}-{1}")
- ["0", "1"]
- >>> get_string_template_field_names("hello world")
- []
- """
- # string.Formatter.parse returns a 4-tuple of:
- # `literal_text`, `field_name`, `form_at_spec`, `conversion`
- # More info here https://docs.python.org/3.8/library/string.html#string.Formatter.parse
- field_names = [group[1] for group in string.Formatter().parse(s) if group[1] is not None]
-
- return field_names
-
-
-def resolve_template(path: str, options: MutableMapping[str, Any]) -> str: # pylint: disable=C0103
- """Given a string `path`, it attempts to replace all templates fields with values provided in `options`.
-
- If `path` is not a string template, `path` is returned.
-
- Args:
- path: A string which is either a template, e.g. /path/to/file/{replace_me}.h5 or just a path /path/to/file/dont_replace_me.h5
- options: A dynamic name for the "replace_me" field in the templated string. e.g. {"replace_me": "name_of_file"}
-
- Returns:
- str: Returns a static path replaced with the value in the options mapping.
-
- Raises:
- ValueError: if any template fields in s are not named using valid Python identifiers
- ValueError: if a given template field cannot be resolved in `options`
- """
- fields = get_string_template_field_names(path)
-
- if len(fields) == 0:
- return path
-
- if not all(field.isidentifier() for field in fields):
- raise ValueError(f"Expected valid Python identifiers, found {fields}")
-
- if not all(field in options for field in fields):
- raise ValueError(f"Expected values for all fields in {fields}, found {list(options.keys())}")
-
- path = path.format(**{field: options[field] for field in fields})
- for field in fields:
- options.pop(field)
-
- return path
Validate **options for a decorated reader function.
-
Args
-
-
options
-
A set of valid options for a reader (e.g. pandas.read_parquet or pandas.read_csv)
-
-
Returns
-
-
read_with_valid_options
-
The input function called with modified options.
-
-
-
-Expand source code
-
-
def allow_options(options: Union[Iterable, FunctionType, MethodType]):
- """Validate **options for a decorated reader function.
-
- Args:
- options: A set of valid options for a reader (e.g. `pandas.read_parquet` or `pandas.read_csv`)
-
- Returns:
- read_with_valid_options: The input function called with modified options.
- """
-
- def _filter_out_irrelevant_options(kwargs: Mapping, valid_options: Iterable):
- filtered_options = {}
- invalid_options = {}
- for key_arg in kwargs.keys():
- if key_arg in valid_options:
- filtered_options[key_arg] = kwargs[key_arg]
- else:
- invalid_options[key_arg] = kwargs[key_arg]
- if len(invalid_options) > 0:
- logger.warning(
- f"Options {invalid_options} were not used because they were not supported by the read or write method configured for this source. "
- "Check if you expected any of those to have been used by the operation!"
- )
- return filtered_options
-
- def read_with_valid_options(func):
- @wraps(func)
- def _(*args, **kwargs):
- if callable(options):
- return func(*args, **_filter_out_irrelevant_options(kwargs, args_of(options)))
- return func(*args, **_filter_out_irrelevant_options(kwargs, options))
-
- return _
-
- return read_with_valid_options
-
-
-
-def args_of(func)
-
-
-
Retrieve allowed options for a given function.
-
Args
-
-
func
-
A function like, e.g., pd.read_csv
-
-
Returns
-
A set of allowed options
-
-
-Expand source code
-
-
def args_of(func):
- """Retrieve allowed options for a given function.
-
- Args:
- func: A function like, e.g., pd.read_csv
-
- Returns:
- A set of allowed options
- """
- return set(inspect.signature(func).parameters.keys())
def get_string_template_field_names(s: str) -> Collection[str]: # pylint: disable=C0103
- """Given a string `s`, it parses the string to identify any template fields and returns the names of those fields.
-
- If `s` is not a string template, the returned `Collection` is empty.
-
- Args:
- s:
-
- Returns:
- Collection[str]
-
- Example:
-
- >>> get_string_template_field_names("abc{def}{efg}")
- ["def", "efg"]
- >>> get_string_template_field_names("{0}-{1}")
- ["0", "1"]
- >>> get_string_template_field_names("hello world")
- []
- """
- # string.Formatter.parse returns a 4-tuple of:
- # `literal_text`, `field_name`, `form_at_spec`, `conversion`
- # More info here https://docs.python.org/3.8/library/string.html#string.Formatter.parse
- field_names = [group[1] for group in string.Formatter().parse(s) if group[1] is not None]
-
- return field_names
Given a string path, it attempts to replace all templates fields with values provided in options.
-
If path is not a string template, path is returned.
-
Args
-
-
path
-
A string which is either a template, e.g. /path/to/file/{replace_me}.h5 or just a path /path/to/file/dont_replace_me.h5
-
options
-
A dynamic name for the "replace_me" field in the templated string. e.g. {"replace_me": "name_of_file"}
-
-
Returns
-
-
str
-
Returns a static path replaced with the value in the options mapping.
-
-
Raises
-
-
ValueError
-
if any template fields in s are not named using valid Python identifiers
-
ValueError
-
if a given template field cannot be resolved in options
-
-
-
-Expand source code
-
-
def resolve_template(path: str, options: MutableMapping[str, Any]) -> str: # pylint: disable=C0103
- """Given a string `path`, it attempts to replace all templates fields with values provided in `options`.
-
- If `path` is not a string template, `path` is returned.
-
- Args:
- path: A string which is either a template, e.g. /path/to/file/{replace_me}.h5 or just a path /path/to/file/dont_replace_me.h5
- options: A dynamic name for the "replace_me" field in the templated string. e.g. {"replace_me": "name_of_file"}
-
- Returns:
- str: Returns a static path replaced with the value in the options mapping.
-
- Raises:
- ValueError: if any template fields in s are not named using valid Python identifiers
- ValueError: if a given template field cannot be resolved in `options`
- """
- fields = get_string_template_field_names(path)
-
- if len(fields) == 0:
- return path
-
- if not all(field.isidentifier() for field in fields):
- raise ValueError(f"Expected valid Python identifiers, found {fields}")
-
- if not all(field in options for field in fields):
- raise ValueError(f"Expected values for all fields in {fields}, found {list(options.keys())}")
-
- path = path.format(**{field: options[field] for field in fields})
- for field in fields:
- options.pop(field)
-
- return path
-
-
-
-
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/docs/mixins/with_kafka.html b/docs/mixins/with_kafka.html
deleted file mode 100644
index e708c09..0000000
--- a/docs/mixins/with_kafka.html
+++ /dev/null
@@ -1,477 +0,0 @@
-
-
-
-
-
-
-dynamicio.mixins.with_kafka API documentation
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
Module dynamicio.mixins.with_kafka
-
-
-
This module provides mixins that are providing Kafka I/O support.
-
-
-Expand source code
-
-
# pylint: disable=no-member, protected-access, too-few-public-methods
-
-"""This module provides mixins that are providing Kafka I/O support."""
-
-
-from typing import Any, Callable, Iterable, Mapping, MutableMapping, Optional
-
-import pandas as pd # type: ignore
-import simplejson
-from kafka import KafkaProducer # type: ignore
-from magic_logger import logger
-
-
-from . import utils
-
-
-class WithKafka:
- """Handles I/O operations for Kafka.
-
- Args:
- - options:
- - Standard: Keyword-arguments passed to the KafkaProducer constructor (see `KafkaProducer.DEFAULT_CONFIG.keys()`).
- - Additional Options:
-
- - `key_generator: Callable[[Any, Mapping], T]`: defines the keying policy to be used for sending keyed-messages to Kafka. It is a `Callable` that takes a
- `tuple(idx, row)` and returns a string that will serve as the message's key, invoked prior to serialising the key. It defaults to the dataframe's index
- (which may not be composed of unique values or string type keys). It goes hand in hand with the default `key-serialiser`, which assumes that the keys
- are strings and encode's them as such.
-
- - `key_serializer: Callable[T, bytes]`: Custom key serialiser; if not provided, a default key-serializer will be used, applied on a string-key (unless key is None).
-
- N.B. Providing a custom key-generator that generates a non-string key is best provided alongside a custom key-serializer best suited to handle the custom key-type.
-
- - `document_transformer: Callable[[Mapping[Any, Any]`: Manipulates the messages/rows sent to Kafka as values. It is a `Callable` taking a `Mapping` as its only
- argument and return a `Mapping`, then this callable will be invoked prior to serializing each document. This can be used, for example, to add metadata to each
- document that will be written to the target Kafka topic.
-
- - `value_serializer: Callable[Mapping, bytes]`: Custom value serialiser; if not provided, a default value-serializer will be used applied on a Mapping..
-
- Example:
- >>> # Given
- >>> keyed_test_df = pd.DataFrame.from_records(
- >>> [
- >>> ["key-01", "cm_1", "id_1", 1000, "ABC"],
- >>> ["key-02", "cm_2", "id_2", 1000, "ABC"],
- >>> ["key-03", "cm_3", "id_3", 1000, "ABC"],
- >>> ],
- >>> columns=["key", "id", "foo", "bar", "baz"],
- >>> ).set_index("key")
- >>>
- >>> kafka_cloud_config = IOConfig(
- >>> path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "processed.yaml")),
- >>> env_identifier="CLOUD",
- >>> dynamic_vars=constants,
- >>> ).get(source_key="WRITE_TO_KAFKA_JSON")
- >>>
- >>> write_kafka_io = WriteKafkaIO(kafka_cloud_config, key_generator=lambda key, _: key, document_transformer=lambda doc: doc["new_field"]="new_value")
- >>>
- >>> # When
- >>> with patch.object(mixins, "KafkaProducer") as mock__kafka_producer:
- >>> mock__kafka_producer.DEFAULT_CONFIG = KafkaProducer.DEFAULT_CONFIG
- >>> mock_producer = MockKafkaProducer()
- >>> mock__kafka_producer.return_value = mock_producer
- >>> write_kafka_io.write(keyed_test_df)
- >>>
- >>> # Then
- >>> assert mock_producer.my_stream == [
- >>> {"key": "key-01", "value": {"bar": 1000, "baz": "ABC", "foo": "id_1", "id": "cm_1", "new_field": "new_value"}},
- >>> {"key": "key-02", "value": {"bar": 1000, "baz": "ABC", "foo": "id_2", "id": "cm_2", "new_field": "new_value"}},
- >>> {"key": "key-03", "value": {"bar": 1000, "baz": "ABC", "foo": "id_3", "id": "cm_3", "new_field": "new_value"}},
- >>> ]
- """
-
- sources_config: Mapping
- schema: Mapping
- options: MutableMapping[str, Any]
- __kafka_config: Optional[Mapping] = None
- __producer: Optional[KafkaProducer] = None
- __key_generator: Optional[Callable[[Any, Mapping[Any, Any]], Optional[str]]] = None
- __document_transformer: Optional[Callable[[Mapping[Any, Any]], Mapping[Any, Any]]] = None
-
- def _write_to_kafka(self, df: pd.DataFrame) -> None:
- """Given a dataframe where each row is a message to be sent to a Kafka Topic, iterate through all rows and send them to a Kafka topic.
-
- The topic is defined in `self.sources_config["kafka"]` and using a kafka producer, which is flushed at the
- end of this process.
-
- Args:
- df: A dataframe where each row is a message to be sent to a Kafka Topic.
- """
- if self.__key_generator is None:
- self.__key_generator = lambda idx, __: idx # default key generator uses the dataframe's index
- if self.options.get("key_generator") is not None:
- self.__key_generator = self.options.pop("key_generator")
-
- if self.__document_transformer is None:
- self.__document_transformer = lambda value: value
- if self.options.get("document_transformer") is not None:
- self.__document_transformer = self.options.pop("document_transformer")
-
- if self.__producer is None:
- self.__producer = self._get_producer(self.sources_config["kafka"]["kafka_server"], **self.options)
-
- self._send_messages(df=df, topic=self.sources_config["kafka"]["kafka_topic"])
-
- @utils.allow_options(KafkaProducer.DEFAULT_CONFIG.keys())
- def _get_producer(self, server: str, **options: MutableMapping[str, Any]) -> KafkaProducer:
- """Generate and return a Kafka Producer.
-
- Default options are used to generate the producer. Specifically:
- - `bootstrap_servers`: Passed on through the source_config
- - `value_serializer`: Uses a default_value_serializer defined in this mixin
-
- More options can be added to the producer by passing them as keyword arguments, through valid options.
-
- These can also override the default options.
-
- Args:
- server: The host name.
- **options: Keyword arguments to pass to the KafkaProducer.
-
- Returns:
- A Kafka producer instance.
- """
- self.__kafka_config = {
- **{
- "bootstrap_servers": server,
- "compression_type": "snappy",
- "key_serializer": self._default_key_serializer,
- "value_serializer": self._default_value_serializer,
- },
- **options,
- }
- return KafkaProducer(**self.__kafka_config)
-
- def _send_messages(self, df: pd.DataFrame, topic: str) -> None:
- logger.info(f"Sending {len(df)} messages to Kafka topic:{topic}.")
-
- messages = df.reset_index(drop=True).to_dict("records")
- for idx, message in zip(df.index.values, messages):
- self.__producer.send(topic, key=self.__key_generator(idx, message), value=self.__document_transformer(message)) # type: ignore
-
- self.__producer.flush() # type: ignore
-
- @staticmethod
- def _default_key_serializer(key: Optional[str]) -> Optional[bytes]:
- if key:
- return key.encode("utf-8")
- return None
-
- @staticmethod
- def _default_value_serializer(value: Mapping) -> bytes:
- return simplejson.dumps(value, ignore_nan=True).encode("utf-8")
-
- def _read_from_kafka(self) -> Iterable[Mapping]: # type: ignore
- """Read messages from a Kafka Topic and convert them to separate dataframes.
-
- Returns:
- Multiple dataframes, one per message read from the Kafka topic of interest.
- """
- # TODO: Implement kafka reader
-
-
-
-
-
-
-
-
-
-
Classes
-
-
-class WithKafka
-
-
-
Handles I/O operations for Kafka.
-
Args
-
-
options:
-
Standard: Keyword-arguments passed to the KafkaProducer constructor (see KafkaProducer.DEFAULT_CONFIG.keys()).
-
-
Additional Options:
-
-
-
key_generator: Callable[[Any, Mapping], T]: defines the keying policy to be used for sending keyed-messages to Kafka. It is a Callable that takes a
-tuple(idx, row) and returns a string that will serve as the message's key, invoked prior to serialising the key. It defaults to the dataframe's index
-(which may not be composed of unique values or string type keys). It goes hand in hand with the default key-serialiser, which assumes that the keys
-are strings and encode's them as such.
-
-
-
key_serializer: Callable[T, bytes]: Custom key serialiser; if not provided, a default key-serializer will be used, applied on a string-key (unless key is None).
-
-
-
N.B. Providing a custom key-generator that generates a non-string key is best provided alongside a custom key-serializer best suited to handle the custom key-type.
-
-
-
document_transformer: Callable[[Mapping[Any, Any]: Manipulates the messages/rows sent to Kafka as values. It is
-a Callable taking a Mapping as its only
-argument and return a Mapping, then this callable will be invoked prior to serializing each document. This can be used, for example, to add metadata to each
-document that will be written to the target
-Kafka topic.
-
-
-
value_serializer: Callable[Mapping, bytes]: Custom value serialiser; if not provided, a default value-serializer will be used applied on a Mapping..
class WithKafka:
- """Handles I/O operations for Kafka.
-
- Args:
- - options:
- - Standard: Keyword-arguments passed to the KafkaProducer constructor (see `KafkaProducer.DEFAULT_CONFIG.keys()`).
- - Additional Options:
-
- - `key_generator: Callable[[Any, Mapping], T]`: defines the keying policy to be used for sending keyed-messages to Kafka. It is a `Callable` that takes a
- `tuple(idx, row)` and returns a string that will serve as the message's key, invoked prior to serialising the key. It defaults to the dataframe's index
- (which may not be composed of unique values or string type keys). It goes hand in hand with the default `key-serialiser`, which assumes that the keys
- are strings and encode's them as such.
-
- - `key_serializer: Callable[T, bytes]`: Custom key serialiser; if not provided, a default key-serializer will be used, applied on a string-key (unless key is None).
-
- N.B. Providing a custom key-generator that generates a non-string key is best provided alongside a custom key-serializer best suited to handle the custom key-type.
-
- - `document_transformer: Callable[[Mapping[Any, Any]`: Manipulates the messages/rows sent to Kafka as values. It is a `Callable` taking a `Mapping` as its only
- argument and return a `Mapping`, then this callable will be invoked prior to serializing each document. This can be used, for example, to add metadata to each
- document that will be written to the target Kafka topic.
-
- - `value_serializer: Callable[Mapping, bytes]`: Custom value serialiser; if not provided, a default value-serializer will be used applied on a Mapping..
-
- Example:
- >>> # Given
- >>> keyed_test_df = pd.DataFrame.from_records(
- >>> [
- >>> ["key-01", "cm_1", "id_1", 1000, "ABC"],
- >>> ["key-02", "cm_2", "id_2", 1000, "ABC"],
- >>> ["key-03", "cm_3", "id_3", 1000, "ABC"],
- >>> ],
- >>> columns=["key", "id", "foo", "bar", "baz"],
- >>> ).set_index("key")
- >>>
- >>> kafka_cloud_config = IOConfig(
- >>> path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "processed.yaml")),
- >>> env_identifier="CLOUD",
- >>> dynamic_vars=constants,
- >>> ).get(source_key="WRITE_TO_KAFKA_JSON")
- >>>
- >>> write_kafka_io = WriteKafkaIO(kafka_cloud_config, key_generator=lambda key, _: key, document_transformer=lambda doc: doc["new_field"]="new_value")
- >>>
- >>> # When
- >>> with patch.object(mixins, "KafkaProducer") as mock__kafka_producer:
- >>> mock__kafka_producer.DEFAULT_CONFIG = KafkaProducer.DEFAULT_CONFIG
- >>> mock_producer = MockKafkaProducer()
- >>> mock__kafka_producer.return_value = mock_producer
- >>> write_kafka_io.write(keyed_test_df)
- >>>
- >>> # Then
- >>> assert mock_producer.my_stream == [
- >>> {"key": "key-01", "value": {"bar": 1000, "baz": "ABC", "foo": "id_1", "id": "cm_1", "new_field": "new_value"}},
- >>> {"key": "key-02", "value": {"bar": 1000, "baz": "ABC", "foo": "id_2", "id": "cm_2", "new_field": "new_value"}},
- >>> {"key": "key-03", "value": {"bar": 1000, "baz": "ABC", "foo": "id_3", "id": "cm_3", "new_field": "new_value"}},
- >>> ]
- """
-
- sources_config: Mapping
- schema: Mapping
- options: MutableMapping[str, Any]
- __kafka_config: Optional[Mapping] = None
- __producer: Optional[KafkaProducer] = None
- __key_generator: Optional[Callable[[Any, Mapping[Any, Any]], Optional[str]]] = None
- __document_transformer: Optional[Callable[[Mapping[Any, Any]], Mapping[Any, Any]]] = None
-
- def _write_to_kafka(self, df: pd.DataFrame) -> None:
- """Given a dataframe where each row is a message to be sent to a Kafka Topic, iterate through all rows and send them to a Kafka topic.
-
- The topic is defined in `self.sources_config["kafka"]` and using a kafka producer, which is flushed at the
- end of this process.
-
- Args:
- df: A dataframe where each row is a message to be sent to a Kafka Topic.
- """
- if self.__key_generator is None:
- self.__key_generator = lambda idx, __: idx # default key generator uses the dataframe's index
- if self.options.get("key_generator") is not None:
- self.__key_generator = self.options.pop("key_generator")
-
- if self.__document_transformer is None:
- self.__document_transformer = lambda value: value
- if self.options.get("document_transformer") is not None:
- self.__document_transformer = self.options.pop("document_transformer")
-
- if self.__producer is None:
- self.__producer = self._get_producer(self.sources_config["kafka"]["kafka_server"], **self.options)
-
- self._send_messages(df=df, topic=self.sources_config["kafka"]["kafka_topic"])
-
- @utils.allow_options(KafkaProducer.DEFAULT_CONFIG.keys())
- def _get_producer(self, server: str, **options: MutableMapping[str, Any]) -> KafkaProducer:
- """Generate and return a Kafka Producer.
-
- Default options are used to generate the producer. Specifically:
- - `bootstrap_servers`: Passed on through the source_config
- - `value_serializer`: Uses a default_value_serializer defined in this mixin
-
- More options can be added to the producer by passing them as keyword arguments, through valid options.
-
- These can also override the default options.
-
- Args:
- server: The host name.
- **options: Keyword arguments to pass to the KafkaProducer.
-
- Returns:
- A Kafka producer instance.
- """
- self.__kafka_config = {
- **{
- "bootstrap_servers": server,
- "compression_type": "snappy",
- "key_serializer": self._default_key_serializer,
- "value_serializer": self._default_value_serializer,
- },
- **options,
- }
- return KafkaProducer(**self.__kafka_config)
-
- def _send_messages(self, df: pd.DataFrame, topic: str) -> None:
- logger.info(f"Sending {len(df)} messages to Kafka topic:{topic}.")
-
- messages = df.reset_index(drop=True).to_dict("records")
- for idx, message in zip(df.index.values, messages):
- self.__producer.send(topic, key=self.__key_generator(idx, message), value=self.__document_transformer(message)) # type: ignore
-
- self.__producer.flush() # type: ignore
-
- @staticmethod
- def _default_key_serializer(key: Optional[str]) -> Optional[bytes]:
- if key:
- return key.encode("utf-8")
- return None
-
- @staticmethod
- def _default_value_serializer(value: Mapping) -> bytes:
- return simplejson.dumps(value, ignore_nan=True).encode("utf-8")
-
- def _read_from_kafka(self) -> Iterable[Mapping]: # type: ignore
- """Read messages from a Kafka Topic and convert them to separate dataframes.
-
- Returns:
- Multiple dataframes, one per message read from the Kafka topic of interest.
- """
- # TODO: Implement kafka reader
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/docs/mixins/with_local.html b/docs/mixins/with_local.html
deleted file mode 100644
index 507ecb4..0000000
--- a/docs/mixins/with_local.html
+++ /dev/null
@@ -1,698 +0,0 @@
-
-
-
-
-
-
-dynamicio.mixins.with_local API documentation
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
Module dynamicio.mixins.with_local
-
-
-
This module provides mixins that are providing Local FS I/O support.
-
-
-Expand source code
-
-
# pylint: disable=no-member, protected-access, too-few-public-methods
-
-"""This module provides mixins that are providing Local FS I/O support."""
-
-import glob
-import os
-from contextlib import contextmanager
-from threading import Lock
-from typing import Any, Mapping, MutableMapping, Optional
-
-import pandas as pd # type: ignore
-from fastparquet import ParquetFile, write # type: ignore
-from pyarrow.parquet import read_table, write_table # type: ignore # pylint: disable=no-name-in-module
-
-from . import utils
-
-hdf_lock = Lock()
-
-
-@contextmanager
-def pickle_protocol(protocol: Optional[int]):
- """Downgrade to the provided pickle protocol within the context manager.
-
- Args:
- protocol: The number of the protocol HIGHEST_PROTOCOL to downgrade to. Defaults to 4, which covers python 3.4 and higher.
- """
- import pickle # pylint: disable=import-outside-toplevel
-
- previous = pickle.HIGHEST_PROTOCOL
- try:
- pickle.HIGHEST_PROTOCOL = 4
- if protocol:
- pickle.HIGHEST_PROTOCOL = protocol
- yield
- finally:
- pickle.HIGHEST_PROTOCOL = previous
-
-
-class WithLocal:
- """Handles local I/O operations."""
-
- sources_config: Mapping
- schema: Mapping
- options: MutableMapping[str, Any]
-
- def _read_from_local(self) -> pd.DataFrame:
- """Read a local file as a `DataFrame`.
-
- The configuration object is expected to have two keys:
- - `file_path`
- - `file_type`
-
- To actually read the file, a method is dynamically invoked by name, using
- "_read_{file_type}_file".
-
- Returns:
- DataFrame
- """
- local_config = self.sources_config["local"]
- file_path = utils.resolve_template(local_config["file_path"], self.options)
- file_type = local_config["file_type"]
-
- return getattr(self, f"_read_{file_type}_file")(file_path, self.schema, **self.options)
-
- def _write_to_local(self, df: pd.DataFrame):
- """Write a dataframe locally based on the {file_type} of the config_io configuration.
-
- The configuration object is expected to have two keys:
-
- - `file_path`
- - `file_type`
-
- To actually write the file, a method is dynamically invoked by name, using
- "_write_{file_type}_file".
-
- Args:
- df: The dataframe to be written out.
- """
- local_config = self.sources_config["local"]
- file_path = utils.resolve_template(local_config["file_path"], self.options)
- file_type = local_config["file_type"]
-
- getattr(self, f"_write_{file_type}_file")(df, file_path, **self.options)
-
- @staticmethod
- @utils.allow_options(pd.read_hdf)
- def _read_hdf_file(file_path: str, schema: Mapping[str, str], **options: Any) -> pd.DataFrame:
- """Read a HDF file as a DataFrame using `pd.read_hdf`.
-
- All `options` are passed directly to `pd.read_hdf`.
-
- Caveats: As HDFs are not thread-safe, we use a Lock on this operation. This, practically means
- that when used with asyncio through `async_read()` HDF files will be read sequentially.
- For more information see: https://pandas.pydata.org/pandas-docs/dev/user_guide/io.html#caveats
-
- Args:
- file_path: The path to the hdf file to be read.
- options: The pandas `read_hdf` options.
-
- Returns:
- DataFrame: The dataframe read from the hdf file.
- """
- with hdf_lock:
- df = pd.read_hdf(file_path, **options)
-
- columns = [column for column in df.columns.to_list() if column in schema.keys()]
- df = df[columns]
- return df
-
- @staticmethod
- @utils.allow_options(pd.read_csv)
- def _read_csv_file(file_path: str, schema: Mapping[str, str], **options: Any) -> pd.DataFrame:
- """Read a CSV file as a DataFrame using `pd.read_csv`.
-
- All `options` are passed directly to `pd.read_csv`.
-
- Args:
- file_path: The path to the csv file to be read.
- options: The pandas `read_csv` options.
-
- Returns:
- DataFrame: The dataframe read from the csv file.
- """
- options["usecols"] = list(schema.keys())
- return pd.read_csv(file_path, **options)
-
- @staticmethod
- @utils.allow_options(pd.read_json)
- def _read_json_file(file_path: str, schema: Mapping[str, str], **options: Any) -> pd.DataFrame:
- """Read a json file as a DataFrame using `pd.read_hdf`.
-
- All `options` are passed directly to `pd.read_hdf`.
-
- Args:
- file_path:
- options:
-
- Returns:
- DataFrame
- """
- df = pd.read_json(file_path, **options)
- columns = [column for column in df.columns.to_list() if column in schema.keys()]
- df = df[columns]
- return df
-
- @staticmethod
- def _read_parquet_file(file_path: str, schema: Mapping[str, str], **options: Any) -> pd.DataFrame:
- """Read a Parquet file as a DataFrame using `pd.read_parquet`.
-
- All `options` are passed directly to `pd.read_parquet`.
-
- Args:
- file_path: The path to the parquet file to be read.
- options: The pandas `read_parquet` options.
-
- Returns:
- DataFrame: The dataframe read from the parquet file.
- """
- options["columns"] = list(schema.keys())
-
- if options.get("engine") == "fastparquet":
- return WithLocal.__read_with_fastparquet(file_path, **options)
- return WithLocal.__read_with_pyarrow(file_path, **options)
-
- @classmethod
- @utils.allow_options([*utils.args_of(pd.read_parquet), *utils.args_of(read_table)])
- def __read_with_pyarrow(cls, file_path: str, **options: Any) -> pd.DataFrame:
- return pd.read_parquet(file_path, **options)
-
- @classmethod
- @utils.allow_options([*utils.args_of(pd.read_parquet), *utils.args_of(ParquetFile)])
- def __read_with_fastparquet(cls, file_path: str, **options: Any) -> pd.DataFrame:
- return pd.read_parquet(file_path, **options)
-
- @staticmethod
- @utils.allow_options([*utils.args_of(pd.DataFrame.to_hdf), *["protocol"]])
- def _write_hdf_file(df: pd.DataFrame, file_path: str, **options: Any):
- """Write a dataframe to hdf using `df.to_hdf`.
-
- All `options` are passed directly to `df.to_hdf`.
-
- Caveats: As HDFs are not thread-safe, we use a Lock on this operation. This, practically means
- that when used with asyncio through `async_read()` HDF files will be written sequentially.
- For more information see: https://pandas.pydata.org/pandas-docs/dev/user_guide/io.html#caveats
-
- Args:
- df: A dataframe write out.
- file_path: The location where the file needs to be written.
- options: The pandas `to_hdf` options.
-
- - The pandas `to_hdf` options, &;
- - protocol: The pickle protocol to use for writing the hdf file out; a value <=5.
- """
- with pickle_protocol(protocol=options.pop("protocol", None)), hdf_lock:
- df.to_hdf(file_path, key="df", mode="w", **options)
-
- @staticmethod
- @utils.allow_options(pd.DataFrame.to_csv)
- def _write_csv_file(df: pd.DataFrame, file_path: str, **options: Any):
- """Write a dataframe as a CSV file using `df.to_csv`.
-
- All `options` are passed directly to `df.to_csv`.
-
- Args:
- df: A dataframe write out.
- file_path: The location where the file needs to be written.
- options: Options relative to writing a csv file.
- """
- df.to_csv(file_path, **options)
-
- @staticmethod
- @utils.allow_options(pd.DataFrame.to_json)
- def _write_json_file(df: pd.DataFrame, file_path: str, **options: Any):
- """Write a dataframe as a json file using `df.to_json`.
-
- All `options` are passed directly to `df.to_json`.
-
- Args:
- df: A dataframe write out.
- file_path: The location where the file needs to be written.
- options: Options relative to writing a json file.
- """
- df.to_json(file_path, **options)
-
- @staticmethod
- def _write_parquet_file(df: pd.DataFrame, file_path: str, **options: Any):
- """Write a dataframe as a parquet file using `df.to_parquet`.
-
- All `options` are passed directly to `df.to_parquet`.
-
- Args:
- df: A dataframe write out.
- file_path: The location where the file needs to be written.
- options: Options relative to writing a parquet file.
- """
- if options.get("engine") == "fastparquet":
- return WithLocal.__write_with_fastparquet(df, file_path, **options)
- return WithLocal.__write_with_pyarrow(df, file_path, **options)
-
- @classmethod
- @utils.allow_options([*utils.args_of(pd.DataFrame.to_parquet), *utils.args_of(write_table)])
- def __write_with_pyarrow(cls, df: pd.DataFrame, filepath: str, **options: Any) -> pd.DataFrame:
- return df.to_parquet(filepath, **options)
-
- @classmethod
- @utils.allow_options([*utils.args_of(pd.DataFrame.to_parquet), *utils.args_of(write)])
- def __write_with_fastparquet(cls, df: pd.DataFrame, filepath: str, **options: Any) -> pd.DataFrame:
- return df.to_parquet(filepath, **options)
-
-
-class WithLocalBatch(WithLocal):
- """Responsible for batch reading local files."""
-
- def _read_from_local_batch(self) -> pd.DataFrame:
- """Reads a set of files for a specified file type, concatenates them and returns a dataframe.
-
- Returns:
- A concatenated dataframe composed of all files read through local_batch.
- """
- local_batch_config = self.sources_config["local"]
-
- file_type = local_batch_config["file_type"]
- filtering_file_type = file_type
- if filtering_file_type == "hdf":
- filtering_file_type = "h5"
-
- files = glob.glob(f"{local_batch_config['path_prefix']}/*.{filtering_file_type}")
-
- dfs_to_concatenate = []
- for file in files:
- file_to_load = os.path.join(local_batch_config["path_prefix"], file)
- dfs_to_concatenate.append(getattr(self, f"_read_{file_type}_file")(file_to_load, self.schema, **self.options)) # type: ignore
-
- return pd.concat(dfs_to_concatenate).reset_index(drop=True)
-
-
-
-
-
-
-
-
Functions
-
-
-def pickle_protocol(protocol: Optional[int])
-
-
-
Downgrade to the provided pickle protocol within the context manager.
-
Args
-
-
protocol
-
The number of the protocol HIGHEST_PROTOCOL to downgrade to. Defaults to 4, which covers python 3.4 and higher.
-
-
-
-Expand source code
-
-
@contextmanager
-def pickle_protocol(protocol: Optional[int]):
- """Downgrade to the provided pickle protocol within the context manager.
-
- Args:
- protocol: The number of the protocol HIGHEST_PROTOCOL to downgrade to. Defaults to 4, which covers python 3.4 and higher.
- """
- import pickle # pylint: disable=import-outside-toplevel
-
- previous = pickle.HIGHEST_PROTOCOL
- try:
- pickle.HIGHEST_PROTOCOL = 4
- if protocol:
- pickle.HIGHEST_PROTOCOL = protocol
- yield
- finally:
- pickle.HIGHEST_PROTOCOL = previous
-
-
-
-
-
-
Classes
-
-
-class WithLocal
-
-
-
Handles local I/O operations.
-
-
-Expand source code
-
-
class WithLocal:
- """Handles local I/O operations."""
-
- sources_config: Mapping
- schema: Mapping
- options: MutableMapping[str, Any]
-
- def _read_from_local(self) -> pd.DataFrame:
- """Read a local file as a `DataFrame`.
-
- The configuration object is expected to have two keys:
- - `file_path`
- - `file_type`
-
- To actually read the file, a method is dynamically invoked by name, using
- "_read_{file_type}_file".
-
- Returns:
- DataFrame
- """
- local_config = self.sources_config["local"]
- file_path = utils.resolve_template(local_config["file_path"], self.options)
- file_type = local_config["file_type"]
-
- return getattr(self, f"_read_{file_type}_file")(file_path, self.schema, **self.options)
-
- def _write_to_local(self, df: pd.DataFrame):
- """Write a dataframe locally based on the {file_type} of the config_io configuration.
-
- The configuration object is expected to have two keys:
-
- - `file_path`
- - `file_type`
-
- To actually write the file, a method is dynamically invoked by name, using
- "_write_{file_type}_file".
-
- Args:
- df: The dataframe to be written out.
- """
- local_config = self.sources_config["local"]
- file_path = utils.resolve_template(local_config["file_path"], self.options)
- file_type = local_config["file_type"]
-
- getattr(self, f"_write_{file_type}_file")(df, file_path, **self.options)
-
- @staticmethod
- @utils.allow_options(pd.read_hdf)
- def _read_hdf_file(file_path: str, schema: Mapping[str, str], **options: Any) -> pd.DataFrame:
- """Read a HDF file as a DataFrame using `pd.read_hdf`.
-
- All `options` are passed directly to `pd.read_hdf`.
-
- Caveats: As HDFs are not thread-safe, we use a Lock on this operation. This, practically means
- that when used with asyncio through `async_read()` HDF files will be read sequentially.
- For more information see: https://pandas.pydata.org/pandas-docs/dev/user_guide/io.html#caveats
-
- Args:
- file_path: The path to the hdf file to be read.
- options: The pandas `read_hdf` options.
-
- Returns:
- DataFrame: The dataframe read from the hdf file.
- """
- with hdf_lock:
- df = pd.read_hdf(file_path, **options)
-
- columns = [column for column in df.columns.to_list() if column in schema.keys()]
- df = df[columns]
- return df
-
- @staticmethod
- @utils.allow_options(pd.read_csv)
- def _read_csv_file(file_path: str, schema: Mapping[str, str], **options: Any) -> pd.DataFrame:
- """Read a CSV file as a DataFrame using `pd.read_csv`.
-
- All `options` are passed directly to `pd.read_csv`.
-
- Args:
- file_path: The path to the csv file to be read.
- options: The pandas `read_csv` options.
-
- Returns:
- DataFrame: The dataframe read from the csv file.
- """
- options["usecols"] = list(schema.keys())
- return pd.read_csv(file_path, **options)
-
- @staticmethod
- @utils.allow_options(pd.read_json)
- def _read_json_file(file_path: str, schema: Mapping[str, str], **options: Any) -> pd.DataFrame:
- """Read a json file as a DataFrame using `pd.read_hdf`.
-
- All `options` are passed directly to `pd.read_hdf`.
-
- Args:
- file_path:
- options:
-
- Returns:
- DataFrame
- """
- df = pd.read_json(file_path, **options)
- columns = [column for column in df.columns.to_list() if column in schema.keys()]
- df = df[columns]
- return df
-
- @staticmethod
- def _read_parquet_file(file_path: str, schema: Mapping[str, str], **options: Any) -> pd.DataFrame:
- """Read a Parquet file as a DataFrame using `pd.read_parquet`.
-
- All `options` are passed directly to `pd.read_parquet`.
-
- Args:
- file_path: The path to the parquet file to be read.
- options: The pandas `read_parquet` options.
-
- Returns:
- DataFrame: The dataframe read from the parquet file.
- """
- options["columns"] = list(schema.keys())
-
- if options.get("engine") == "fastparquet":
- return WithLocal.__read_with_fastparquet(file_path, **options)
- return WithLocal.__read_with_pyarrow(file_path, **options)
-
- @classmethod
- @utils.allow_options([*utils.args_of(pd.read_parquet), *utils.args_of(read_table)])
- def __read_with_pyarrow(cls, file_path: str, **options: Any) -> pd.DataFrame:
- return pd.read_parquet(file_path, **options)
-
- @classmethod
- @utils.allow_options([*utils.args_of(pd.read_parquet), *utils.args_of(ParquetFile)])
- def __read_with_fastparquet(cls, file_path: str, **options: Any) -> pd.DataFrame:
- return pd.read_parquet(file_path, **options)
-
- @staticmethod
- @utils.allow_options([*utils.args_of(pd.DataFrame.to_hdf), *["protocol"]])
- def _write_hdf_file(df: pd.DataFrame, file_path: str, **options: Any):
- """Write a dataframe to hdf using `df.to_hdf`.
-
- All `options` are passed directly to `df.to_hdf`.
-
- Caveats: As HDFs are not thread-safe, we use a Lock on this operation. This, practically means
- that when used with asyncio through `async_read()` HDF files will be written sequentially.
- For more information see: https://pandas.pydata.org/pandas-docs/dev/user_guide/io.html#caveats
-
- Args:
- df: A dataframe write out.
- file_path: The location where the file needs to be written.
- options: The pandas `to_hdf` options.
-
- - The pandas `to_hdf` options, &;
- - protocol: The pickle protocol to use for writing the hdf file out; a value <=5.
- """
- with pickle_protocol(protocol=options.pop("protocol", None)), hdf_lock:
- df.to_hdf(file_path, key="df", mode="w", **options)
-
- @staticmethod
- @utils.allow_options(pd.DataFrame.to_csv)
- def _write_csv_file(df: pd.DataFrame, file_path: str, **options: Any):
- """Write a dataframe as a CSV file using `df.to_csv`.
-
- All `options` are passed directly to `df.to_csv`.
-
- Args:
- df: A dataframe write out.
- file_path: The location where the file needs to be written.
- options: Options relative to writing a csv file.
- """
- df.to_csv(file_path, **options)
-
- @staticmethod
- @utils.allow_options(pd.DataFrame.to_json)
- def _write_json_file(df: pd.DataFrame, file_path: str, **options: Any):
- """Write a dataframe as a json file using `df.to_json`.
-
- All `options` are passed directly to `df.to_json`.
-
- Args:
- df: A dataframe write out.
- file_path: The location where the file needs to be written.
- options: Options relative to writing a json file.
- """
- df.to_json(file_path, **options)
-
- @staticmethod
- def _write_parquet_file(df: pd.DataFrame, file_path: str, **options: Any):
- """Write a dataframe as a parquet file using `df.to_parquet`.
-
- All `options` are passed directly to `df.to_parquet`.
-
- Args:
- df: A dataframe write out.
- file_path: The location where the file needs to be written.
- options: Options relative to writing a parquet file.
- """
- if options.get("engine") == "fastparquet":
- return WithLocal.__write_with_fastparquet(df, file_path, **options)
- return WithLocal.__write_with_pyarrow(df, file_path, **options)
-
- @classmethod
- @utils.allow_options([*utils.args_of(pd.DataFrame.to_parquet), *utils.args_of(write_table)])
- def __write_with_pyarrow(cls, df: pd.DataFrame, filepath: str, **options: Any) -> pd.DataFrame:
- return df.to_parquet(filepath, **options)
-
- @classmethod
- @utils.allow_options([*utils.args_of(pd.DataFrame.to_parquet), *utils.args_of(write)])
- def __write_with_fastparquet(cls, df: pd.DataFrame, filepath: str, **options: Any) -> pd.DataFrame:
- return df.to_parquet(filepath, **options)
Connect to a database using connection_string and returns an active session to that connection.
-
Args
-
connection_string:
-
Yields
-
Active session
-
-
-Expand source code
-
-
@contextmanager
-def session_for(connection_string: str) -> Generator[SqlAlchemySession, None, None]:
- """Connect to a database using `connection_string` and returns an active session to that connection.
-
- Args:
- connection_string:
-
- Yields:
- Active session
- """
- engine = create_engine(connection_string)
- session = Session(bind=engine)
-
- try:
- yield session
- finally:
- session.close() # pylint: disable=no-member
-
-
-
-
-
-
Classes
-
-
-class WithPostgres
-
-
-
Handles I/O operations for Postgres.
-
Args
-
-
options:
-
truncate_and_append: bool: If set to True, truncates the table and then appends the new rows. Otherwise, it drops the table and recreates it with the new rows.
-
-
-
-
-
-Expand source code
-
-
class WithPostgres:
- """Handles I/O operations for Postgres.
-
- Args:
- - options:
- - `truncate_and_append: bool`: If set to `True`, truncates the table and then appends the new rows. Otherwise, it drops the table and recreates it with the new rows.
- """
-
- sources_config: Mapping
- schema: Mapping
- options: MutableMapping[str, Any]
-
- def _read_from_postgres(self) -> pd.DataFrame:
- """Read data from postgres as a `DataFrame`.
-
- The configuration object is expected to have the following keys:
- - `db_user`
- - `db_password`
- - `db_host`
- - `db_port`
- - `db_name`
-
- Returns:
- DataFrame
- """
- postgres_config = self.sources_config["postgres"]
- db_user = postgres_config["db_user"]
- db_password = postgres_config["db_password"]
- db_host = postgres_config["db_host"]
- db_port = postgres_config["db_port"]
- db_name = postgres_config["db_name"]
-
- connection_string = f"postgresql://{db_user}:{db_password}@{db_host}:{db_port}/{db_name}"
-
- sql_query = self.options.pop("sql_query", None)
-
- if "schema" not in self.sources_config:
- schema_dict = self.schema
- else:
- schema_dict = self.sources_config["schema"]
- schema_name = self.sources_config["name"]
-
- model = self._generate_model_from_schema(schema_dict, schema_name)
-
- query = Query(self._get_table_columns(model))
- if sql_query:
- query = sql_query
-
- logger.info(f"[postgres] Started downloading table: {schema_name} from: {db_host}:{db_name}")
- with session_for(connection_string) as session:
- return self._read_database(session, query, **self.options)
-
- @staticmethod
- def _generate_model_from_schema(schema_dict: Mapping, schema_name: str) -> DeclarativeMeta:
- json_cls_schema: Dict[str, Any] = {"tablename": schema_name, "columns": []}
-
- for col, dtype in schema_dict.items():
- new_col = {"name": col}
-
- if dtype in _type_lookup:
- new_col.update({"name": col, "type": _type_lookup[dtype]})
- json_cls_schema["columns"].append(new_col)
-
- class_name = "".join(word.capitalize() or "_" for word in schema_name.split("_")) + "Model"
-
- class_dict = {"clsname": class_name, "__tablename__": schema_name, "__table_args__": {"extend_existing": True}}
- class_dict.update({column["name"]: Column(column["type"], primary_key=True) if idx == 0 else Column(column["type"]) for idx, column in enumerate(json_cls_schema["columns"])})
-
- generated_model = type(class_name, (Base,), class_dict)
- return generated_model
-
- @staticmethod
- def _get_table_columns(model):
- tables_colums = []
- if model:
- for col in list(model.__table__.columns):
- tables_colums.append(getattr(model, col.name))
- return tables_colums
-
- @staticmethod
- @utils.allow_options(pd.read_sql)
- def _read_database(session: SqlAlchemySession, query: Union[str, Query], **options: Any) -> pd.DataFrame:
- """Run `query` against active `session` and returns the result as a `DataFrame`.
-
- Args:
- session: Active session
- query: If a `Query` object is given, it should be unbound. If a `str` is given, the
- value is used as-is.
-
- Returns:
- DataFrame
- """
- if isinstance(query, Query):
- query = query.with_session(session).statement
- return pd.read_sql(sql=query, con=session.get_bind(), **options)
-
- def _write_to_postgres(self, df: pd.DataFrame):
- """Write a dataframe to postgres based on the {file_type} of the config_io configuration.
-
- Args:
- df: The dataframe to be written
- """
- postgres_config = self.sources_config["postgres"]
- db_user = postgres_config["db_user"]
- db_password = postgres_config["db_password"]
- db_host = postgres_config["db_host"]
- db_port = postgres_config["db_port"]
- db_name = postgres_config["db_name"]
-
- connection_string = f"postgresql://{db_user}:{db_password}@{db_host}:{db_port}/{db_name}"
-
- schema_dict = self.sources_config["schema"]
- schema_name = self.sources_config["name"]
- model = self._generate_model_from_schema(schema_dict, schema_name)
-
- is_truncate_and_append = self.options.get("truncate_and_append", False)
-
- logger.info(f"[postgres] Started downloading table: {schema_name} from: {db_host}:{db_name}")
- with session_for(connection_string) as session:
- self._write_to_database(session, model.__tablename__, df, is_truncate_and_append) # type: ignore
-
- @staticmethod
- def _write_to_database(session: SqlAlchemySession, table_name: str, df: pd.DataFrame, is_truncate_and_append: bool):
- """Write a dataframe to any database provided a session with a data model and a table name.
-
- Args:
- session: Generated from a data model and a table name
- table_name: The name of the table to read from a DB
- df: The dataframe to be written out
- is_truncate_and_append: Supply to truncate the table and append new rows to it; otherwise, delete and replace
- """
- if is_truncate_and_append:
- session.execute(f"TRUNCATE TABLE {table_name};")
-
- # Below is a speedup hack in place of `df.to_csv` with the multipart option. As of today, even with
- # `method="multi"`, uploading to Postgres is painfully slow. Hence, we're resorting to dumping the file as
- # csv and using Postgres's CSV import function.
- # https://stackoverflow.com/questions/2987433/how-to-import-csv-file-data-into-a-postgresql-table
- with tempfile.NamedTemporaryFile(mode="r+") as temp_file:
- df.to_csv(temp_file, index=False, header=False, sep="\t", doublequote=False, escapechar="\\", quoting=csv.QUOTE_NONE)
- temp_file.flush()
- temp_file.seek(0)
-
- cur = session.connection().connection.cursor()
- cur.copy_from(temp_file, table_name, columns=df.columns, null="")
- else:
- df.to_sql(name=table_name, con=session.get_bind(), if_exists="replace", index=False)
-
- session.commit()
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/docs/mixins/with_s3.html b/docs/mixins/with_s3.html
deleted file mode 100644
index a7c053c..0000000
--- a/docs/mixins/with_s3.html
+++ /dev/null
@@ -1,659 +0,0 @@
-
-
-
-
-
-
-dynamicio.mixins.with_s3 API documentation
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
Module dynamicio.mixins.with_s3
-
-
-
This module provides mixins that are providing S3 I/O support.
-
-
-Expand source code
-
-
# pylint: disable=no-member, protected-access, too-few-public-methods
-
-"""This module provides mixins that are providing S3 I/O support."""
-
-import os
-import tempfile
-from contextlib import contextmanager
-from typing import Generator
-
-import boto3 # type: ignore
-import pandas as pd # type: ignore
-from awscli.clidriver import create_clidriver # type: ignore
-from magic_logger import logger
-
-
-from . import (
- utils,
- with_local,
-)
-
-
-def awscli_runner(*cmd: str):
- """Runs the awscli command provided.
-
- Args:
- *cmd: A list of args used in the command.
-
- Raises:
- A runtime error exception is raised if download fails.
-
- Example:
-
- >>> awscli_runner("s3", "sync", "s3://mock-bucket/mock-key", ".")
- """
- # Run
- exit_code = create_clidriver().main(cmd)
-
- if exit_code > 0:
- raise RuntimeError(f"AWS CLI exited with code {exit_code}")
-
-
-class WithS3PathPrefix(with_local.WithLocal):
- """Handles I/O operations for AWS S3; implements read operations only.
-
- This mixin assumes that the directories it reads from will only contain a single file-type.
- """
-
- def _write_to_s3_path_prefix(self, df: pd.DataFrame):
- """Write a DataFrame to an S3 path prefix.
-
- The configuration object is expected to have the following keys:
- - `bucket`
- - `path_prefix`
- - `file_type`
-
- Args:
- df (pd.DataFrame): the DataFrame to be written to S3
-
- Raises:
- ValueError: In case `path_prefix` is missing from config
- ValueError: In case the `partition_cols` arg is missing while trying to write a parquet file
- """
- s3_config = self.sources_config["s3"]
- if "path_prefix" not in s3_config:
- raise ValueError("`path_prefix` is required to write multiple files to an S3 key")
-
- file_type = s3_config["file_type"]
- if file_type != "parquet":
- raise ValueError(f"File type not supported: {file_type}, only parquet files can be written to an S3 key")
- if "partition_cols" not in self.options:
- raise ValueError("`partition_cols` is required as an option to write partitioned parquet files to S3")
-
- bucket = s3_config["bucket"]
- path_prefix = s3_config["path_prefix"]
- full_path_prefix = utils.resolve_template(f"s3://{bucket}/{path_prefix}", self.options)
-
- with tempfile.TemporaryDirectory() as temp_dir:
- self._write_parquet_file(df, temp_dir, **self.options)
- awscli_runner(
- "s3",
- "sync",
- temp_dir,
- full_path_prefix,
- "--acl",
- "bucket-owner-full-control",
- "--only-show-errors",
- "--exact-timestamps",
- )
-
- def _read_from_s3_path_prefix(self) -> pd.DataFrame:
- """Read all files under a path prefix from an S3 bucket as a `DataFrame`.
-
- The configuration object is expected to have the following keys:
- - `bucket`
- - `path_prefix`
- - `file_type`
-
- To actually read the file, a method is dynamically invoked by name, using
- "_read_{file_type}_path_prefix".
-
- Returns:
- DataFrame
- """
- s3_config = self.sources_config["s3"]
- if "path_prefix" not in s3_config:
- raise ValueError("`path_prefix` is required to read multiple files from an S3 source")
-
- file_type = s3_config["file_type"]
- if file_type not in {"parquet", "csv", "hdf", "json"}:
- raise ValueError(f"File type not supported: {file_type}")
-
- bucket = s3_config["bucket"]
- path_prefix = s3_config["path_prefix"]
- full_path_prefix = utils.resolve_template(f"s3://{bucket}/{path_prefix}", self.options)
-
- # The `no_disk_space` option should be used only when reading a subset of columns from S3
- if self.options.pop("no_disk_space", False) and file_type == "parquet":
- return self._read_parquet_file(full_path_prefix, self.schema, **self.options)
-
- with tempfile.TemporaryDirectory() as temp_dir:
- # aws-cli is shown to be up to 6 times faster when downloading the complete dataset from S3 than using the boto3
- # client or pandas directly. This is because aws-cli uses the parallel downloader, which is much faster than the
- # boto3 client.
- awscli_runner(
- "s3",
- "sync",
- full_path_prefix,
- temp_dir,
- "--acl",
- "bucket-owner-full-control",
- "--only-show-errors",
- "--exact-timestamps",
- )
-
- dfs = []
- for file in os.listdir(temp_dir):
- df = getattr(self, f"_read_{file_type}_file")(os.path.join(temp_dir, file), self.schema, **self.options) # type: ignore
- if len(df) > 0:
- dfs.append(df)
-
- return pd.concat(dfs, ignore_index=True)
-
-
-class WithS3File(with_local.WithLocal):
- """Handles I/O operations for AWS S3.
-
- All files are persisted to disk first using boto3 as this has proven to be faster than reading them into memory.
- Note that reading things into memory is available for csv, json and parquet types only. Unfortunately, until support
- for generic buffer is added to read_hdf, we need to download and persists the file to disk first anyway.
-
- Options:
- no_disk_space: If `True`, then s3fs + fsspec will be used to read data directly into memory.
- """
-
- boto3_client = boto3.client("s3")
-
- @contextmanager
- def _s3_reader(self, s3_bucket: str, s3_key: str) -> Generator:
- """Contextmanager to abstract reading different file types in S3.
-
- Args:
- s3_bucket: The S3 bucket from where to read the file.
- s3_key: The file-path to the target file to be read.
-
- Returns:
- The local file path from where the file can be read, once it has been downloaded there by the boto3.client.
-
- """
- with tempfile.NamedTemporaryFile("wb") as target_file:
- # Download the file from S3
- self.boto3_client.download_fileobj(s3_bucket, s3_key, target_file)
- # Yield local file path to body of `with` statement
- target_file.flush()
- yield target_file
-
- @contextmanager
- def _s3_writer(self, s3_bucket: str, s3_key: str) -> Generator:
- """Contextmanager to abstract loading different file types to S3.
-
- Args:
- s3_bucket: The S3 bucket to upload the file to.
- s3_key: The file-path where the target file should be uploaded to.
-
- Returns:
- The local file path where to actually write the file, to be read and uploaded by boto3.client.
- """
- with tempfile.NamedTemporaryFile("wb") as target_file:
- # Yield local file path to body of `with` statement
- yield target_file
- target_file.flush()
-
- # Upload the file to S3
- self.boto3_client.upload_file(target_file.name, s3_bucket, s3_key, ExtraArgs={"ACL": "bucket-owner-full-control"})
-
- def _read_from_s3_file(self) -> pd.DataFrame:
- """Read a file from an S3 bucket as a `DataFrame`.
-
- The configuration object is expected to have the following keys:
- - `bucket`
- - `file_path`
- - `file_type`
-
- To actually read the file, a method is dynamically invoked by name, using "_read_{file_type}_file".
-
- Returns:
- DataFrame
- """
- s3_config = self.sources_config["s3"]
- if "file_path" not in s3_config:
- raise ValueError("`file_path` is required for reading a file from an S3 source")
-
- file_type = s3_config["file_type"]
- file_path = utils.resolve_template(s3_config["file_path"], self.options)
- bucket = s3_config["bucket"]
-
- logger.info(f"[s3] Started downloading: s3://{s3_config['bucket']}/{file_path}")
- if file_type in ["csv", "json", "parquet"] and self.options.pop("no_disk_space", None):
- return getattr(self, f"_read_{file_type}_file")(f"s3://{s3_config['bucket']}/{file_path}", self.schema, **self.options) # type: ignore
- with self._s3_reader(s3_bucket=bucket, s3_key=file_path) as target_file: # type: ignore
- return getattr(self, f"_read_{file_type}_file")(target_file.name, self.schema, **self.options) # type: ignore
-
- def _write_to_s3_file(self, df: pd.DataFrame):
- """Write a dataframe to s3 based on the {file_type} of the config_io configuration.
-
- The configuration object is expected to have two keys:
-
- - `file_path`
- - `file_type`
-
- To actually write the file, a method is dynamically invoked by name, using "_write_{file_type}_file".
-
- Args:
- df: The dataframe to be written out
- """
- s3_config = self.sources_config["s3"]
- file_path = utils.resolve_template(s3_config["file_path"], self.options)
- file_type = s3_config["file_type"]
-
- logger.info(f"[s3] Started uploading: s3://{s3_config['bucket']}/{file_path}")
- if file_type in ["csv", "json", "parquet"]:
- getattr(self, f"_write_{file_type}_file")(df, f"s3://{s3_config['bucket']}/{file_path}", **self.options) # type: ignore
- elif file_type == "hdf":
- with self._s3_writer(s3_bucket=s3_config["bucket"], s3_key=file_path) as target_file: # type: ignore
- self._write_hdf_file(df, target_file.name, **self.options) # type: ignore
- else:
- raise ValueError(f"File type: {file_type} not supported!")
- logger.info(f"[s3] Finished uploading: s3://{s3_config['bucket']}/{file_path}")
-
-
-
-
-
-
-
-
Functions
-
-
-def awscli_runner(*cmd: str)
-
-
-
Runs the awscli command provided.
-
Args
-
-
*cmd
-
A list of args used in the command.
-
-
Raises
-
A runtime error exception is raised if download fails.
def awscli_runner(*cmd: str):
- """Runs the awscli command provided.
-
- Args:
- *cmd: A list of args used in the command.
-
- Raises:
- A runtime error exception is raised if download fails.
-
- Example:
-
- >>> awscli_runner("s3", "sync", "s3://mock-bucket/mock-key", ".")
- """
- # Run
- exit_code = create_clidriver().main(cmd)
-
- if exit_code > 0:
- raise RuntimeError(f"AWS CLI exited with code {exit_code}")
-
-
-
-
-
-
Classes
-
-
-class WithS3File
-
-
-
Handles I/O operations for AWS S3.
-
All files are persisted to disk first using boto3 as this has proven to be faster than reading them into memory.
-Note that reading things into memory is available for csv, json and parquet types only. Unfortunately, until support
-for generic buffer is added to read_hdf, we need to download and persists the file to disk first anyway.
-
Options
-
no_disk_space: If True, then s3fs + fsspec will be used to read data directly into memory.
-
-
-Expand source code
-
-
class WithS3File(with_local.WithLocal):
- """Handles I/O operations for AWS S3.
-
- All files are persisted to disk first using boto3 as this has proven to be faster than reading them into memory.
- Note that reading things into memory is available for csv, json and parquet types only. Unfortunately, until support
- for generic buffer is added to read_hdf, we need to download and persists the file to disk first anyway.
-
- Options:
- no_disk_space: If `True`, then s3fs + fsspec will be used to read data directly into memory.
- """
-
- boto3_client = boto3.client("s3")
-
- @contextmanager
- def _s3_reader(self, s3_bucket: str, s3_key: str) -> Generator:
- """Contextmanager to abstract reading different file types in S3.
-
- Args:
- s3_bucket: The S3 bucket from where to read the file.
- s3_key: The file-path to the target file to be read.
-
- Returns:
- The local file path from where the file can be read, once it has been downloaded there by the boto3.client.
-
- """
- with tempfile.NamedTemporaryFile("wb") as target_file:
- # Download the file from S3
- self.boto3_client.download_fileobj(s3_bucket, s3_key, target_file)
- # Yield local file path to body of `with` statement
- target_file.flush()
- yield target_file
-
- @contextmanager
- def _s3_writer(self, s3_bucket: str, s3_key: str) -> Generator:
- """Contextmanager to abstract loading different file types to S3.
-
- Args:
- s3_bucket: The S3 bucket to upload the file to.
- s3_key: The file-path where the target file should be uploaded to.
-
- Returns:
- The local file path where to actually write the file, to be read and uploaded by boto3.client.
- """
- with tempfile.NamedTemporaryFile("wb") as target_file:
- # Yield local file path to body of `with` statement
- yield target_file
- target_file.flush()
-
- # Upload the file to S3
- self.boto3_client.upload_file(target_file.name, s3_bucket, s3_key, ExtraArgs={"ACL": "bucket-owner-full-control"})
-
- def _read_from_s3_file(self) -> pd.DataFrame:
- """Read a file from an S3 bucket as a `DataFrame`.
-
- The configuration object is expected to have the following keys:
- - `bucket`
- - `file_path`
- - `file_type`
-
- To actually read the file, a method is dynamically invoked by name, using "_read_{file_type}_file".
-
- Returns:
- DataFrame
- """
- s3_config = self.sources_config["s3"]
- if "file_path" not in s3_config:
- raise ValueError("`file_path` is required for reading a file from an S3 source")
-
- file_type = s3_config["file_type"]
- file_path = utils.resolve_template(s3_config["file_path"], self.options)
- bucket = s3_config["bucket"]
-
- logger.info(f"[s3] Started downloading: s3://{s3_config['bucket']}/{file_path}")
- if file_type in ["csv", "json", "parquet"] and self.options.pop("no_disk_space", None):
- return getattr(self, f"_read_{file_type}_file")(f"s3://{s3_config['bucket']}/{file_path}", self.schema, **self.options) # type: ignore
- with self._s3_reader(s3_bucket=bucket, s3_key=file_path) as target_file: # type: ignore
- return getattr(self, f"_read_{file_type}_file")(target_file.name, self.schema, **self.options) # type: ignore
-
- def _write_to_s3_file(self, df: pd.DataFrame):
- """Write a dataframe to s3 based on the {file_type} of the config_io configuration.
-
- The configuration object is expected to have two keys:
-
- - `file_path`
- - `file_type`
-
- To actually write the file, a method is dynamically invoked by name, using "_write_{file_type}_file".
-
- Args:
- df: The dataframe to be written out
- """
- s3_config = self.sources_config["s3"]
- file_path = utils.resolve_template(s3_config["file_path"], self.options)
- file_type = s3_config["file_type"]
-
- logger.info(f"[s3] Started uploading: s3://{s3_config['bucket']}/{file_path}")
- if file_type in ["csv", "json", "parquet"]:
- getattr(self, f"_write_{file_type}_file")(df, f"s3://{s3_config['bucket']}/{file_path}", **self.options) # type: ignore
- elif file_type == "hdf":
- with self._s3_writer(s3_bucket=s3_config["bucket"], s3_key=file_path) as target_file: # type: ignore
- self._write_hdf_file(df, target_file.name, **self.options) # type: ignore
- else:
- raise ValueError(f"File type: {file_type} not supported!")
- logger.info(f"[s3] Finished uploading: s3://{s3_config['bucket']}/{file_path}")
Handles I/O operations for AWS S3; implements read operations only.
-
This mixin assumes that the directories it reads from will only contain a single file-type.
-
-
-Expand source code
-
-
class WithS3PathPrefix(with_local.WithLocal):
- """Handles I/O operations for AWS S3; implements read operations only.
-
- This mixin assumes that the directories it reads from will only contain a single file-type.
- """
-
- def _write_to_s3_path_prefix(self, df: pd.DataFrame):
- """Write a DataFrame to an S3 path prefix.
-
- The configuration object is expected to have the following keys:
- - `bucket`
- - `path_prefix`
- - `file_type`
-
- Args:
- df (pd.DataFrame): the DataFrame to be written to S3
-
- Raises:
- ValueError: In case `path_prefix` is missing from config
- ValueError: In case the `partition_cols` arg is missing while trying to write a parquet file
- """
- s3_config = self.sources_config["s3"]
- if "path_prefix" not in s3_config:
- raise ValueError("`path_prefix` is required to write multiple files to an S3 key")
-
- file_type = s3_config["file_type"]
- if file_type != "parquet":
- raise ValueError(f"File type not supported: {file_type}, only parquet files can be written to an S3 key")
- if "partition_cols" not in self.options:
- raise ValueError("`partition_cols` is required as an option to write partitioned parquet files to S3")
-
- bucket = s3_config["bucket"]
- path_prefix = s3_config["path_prefix"]
- full_path_prefix = utils.resolve_template(f"s3://{bucket}/{path_prefix}", self.options)
-
- with tempfile.TemporaryDirectory() as temp_dir:
- self._write_parquet_file(df, temp_dir, **self.options)
- awscli_runner(
- "s3",
- "sync",
- temp_dir,
- full_path_prefix,
- "--acl",
- "bucket-owner-full-control",
- "--only-show-errors",
- "--exact-timestamps",
- )
-
- def _read_from_s3_path_prefix(self) -> pd.DataFrame:
- """Read all files under a path prefix from an S3 bucket as a `DataFrame`.
-
- The configuration object is expected to have the following keys:
- - `bucket`
- - `path_prefix`
- - `file_type`
-
- To actually read the file, a method is dynamically invoked by name, using
- "_read_{file_type}_path_prefix".
-
- Returns:
- DataFrame
- """
- s3_config = self.sources_config["s3"]
- if "path_prefix" not in s3_config:
- raise ValueError("`path_prefix` is required to read multiple files from an S3 source")
-
- file_type = s3_config["file_type"]
- if file_type not in {"parquet", "csv", "hdf", "json"}:
- raise ValueError(f"File type not supported: {file_type}")
-
- bucket = s3_config["bucket"]
- path_prefix = s3_config["path_prefix"]
- full_path_prefix = utils.resolve_template(f"s3://{bucket}/{path_prefix}", self.options)
-
- # The `no_disk_space` option should be used only when reading a subset of columns from S3
- if self.options.pop("no_disk_space", False) and file_type == "parquet":
- return self._read_parquet_file(full_path_prefix, self.schema, **self.options)
-
- with tempfile.TemporaryDirectory() as temp_dir:
- # aws-cli is shown to be up to 6 times faster when downloading the complete dataset from S3 than using the boto3
- # client or pandas directly. This is because aws-cli uses the parallel downloader, which is much faster than the
- # boto3 client.
- awscli_runner(
- "s3",
- "sync",
- full_path_prefix,
- temp_dir,
- "--acl",
- "bucket-owner-full-control",
- "--only-show-errors",
- "--exact-timestamps",
- )
-
- dfs = []
- for file in os.listdir(temp_dir):
- df = getattr(self, f"_read_{file_type}_file")(os.path.join(temp_dir, file), self.schema, **self.options) # type: ignore
- if len(df) > 0:
- dfs.append(df)
-
- return pd.concat(dfs, ignore_index=True)
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/docs/validations.html b/docs/validations.html
deleted file mode 100644
index 4a52e20..0000000
--- a/docs/validations.html
+++ /dev/null
@@ -1,923 +0,0 @@
-
-
-
-
-
-
-dynamicio.validations API documentation
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
Module dynamicio.validations
-
-
-
Implements the Validator class responsible for various generic data validations and metrics generation.
-
-
-Expand source code
-
-
"""Implements the Validator class responsible for various generic data validations and metrics generation."""
-__all__ = [
- "has_unique_values",
- "has_no_null_values",
- "has_acceptable_percentage_of_nulls",
- "is_in",
- "is_greater_than",
- "is_greater_than_or_equal",
- "is_lower_than",
- "is_lower_than_or_equal",
- "is_between",
-]
-
-import operator
-from typing import NamedTuple, Set
-
-import pandas as pd # type: ignore
-
-
-class ValidationResult(NamedTuple):
- """A NamedTuple for capturing different outputs after a validation."""
-
- valid: bool
- message: str
- value: float
-
-
-def has_unique_values(dataset: str, df: pd.DataFrame, column: str) -> ValidationResult:
- """Checks if values in column are unique.
-
- Args:
- dataset: Name fo the dataset_name
- df: A pandas DataFrame
- column: The column to be validated
-
- Returns:
- An instance of ValidationResult where `Validation.Result.valid` is a bool indicate the success of the validation,
- `Validation.Result.message` is a message (usually used in exceptions), and `Validation.Result.value` is no_of_duplicated_elements
- """
- counts = df[column].value_counts()
- if not (counts > 1).any():
- return ValidationResult(valid=True, message=f"{dataset}[{column}] has unique values", value=0)
-
- duplicates = counts[counts > 1].index.to_list()
- return ValidationResult(valid=False, message=f"Values {duplicates} for {dataset}[{column}] are duplicated!", value=len(duplicates))
-
-
-def has_no_null_values(dataset: str, df: pd.DataFrame, column: str) -> ValidationResult:
- """Checks if column has any null values (including NaN and NaT values).
-
- Args:
- dataset: Name fo the dataset_name
- df: A pandas DataFrame
- column: The column to be validated
-
- Returns:
- An instance of ValidationResult where `Validation.Result.valid` is a bool indicate the success of the validation,
- `Validation.Result.message` is a message (usually used in exceptions), and `Validation.Result.value` is no_of_nulls
- """
- mask = df[column].isnull()
- no_of_nulls = mask.sum()
- return ValidationResult(valid=not mask.any(), message=f"{dataset}[{column}] has {no_of_nulls} nulls", value=no_of_nulls)
-
-
-def has_acceptable_percentage_of_nulls(
- dataset: str,
- df: pd.DataFrame,
- column: str,
- threshold: float,
-) -> ValidationResult:
- """Checks if a provided threshold of max nulls has been exceeded.
-
- Note: For an empty df the validation will always be successful
-
- Args:
- dataset: Name fo the dataset_name
- df: A pandas DataFrame
- column: The column to be validated
- threshold: Maximum allowed threshold
-
- Returns:
- An instance of ValidationResult where `Validation.Result.valid` is a bool indicate the success of the validation,
- `Validation.Result.message` is a message (usually used in exceptions), and `Validation.Result.value` is percentage_of_nulls
- """
- if threshold <= 0 or threshold >= 1:
- raise ValueError(f"Threshold value: {threshold} must be a value between 0 and 1.")
-
- no_of_nulls = df[column].isnull().sum()
- if len(df) == 0:
- percentage_of_nulls = 0
- else:
- percentage_of_nulls = no_of_nulls / len(df)
-
- if percentage_of_nulls < threshold:
- return ValidationResult(
- valid=True,
- message=f"Percentage of nulls of for {dataset}[{column}] is {percentage_of_nulls}",
- value=percentage_of_nulls,
- )
- return ValidationResult(
- valid=False,
- message=f"Percentage of nulls of for {dataset}[{column}] is {percentage_of_nulls} which exceeds threshold: {threshold}",
- value=percentage_of_nulls,
- )
-
-
-def is_in(dataset: str, df: pd.DataFrame, column: str, categorical_values: Set[str], match_all: bool = True) -> ValidationResult:
- """Checks if the column only has allowed categorical values as per the set provided.
-
- Note:
- Ignores nulls
-
- Args:
- dataset: Name fo the dataset_name
- df: A DataFrame
- column: The DataFrame column to be validated
- categorical_values: The allowed set of categorical values
- match_all: If True, the categorical values must be a subset of the allowed set, otherwise they must be equal
-
- Returns:
- An instance of ValidationResult where `Validation.Result.valid` is a bool indicate the success of the validation,
- `Validation.Result.message` is a message (usually used in exceptions), and `Validation.Result.value` is no_of_not_acceptable
- """
- unique_values = set(df[column][df[column].notna()].unique())
-
- if match_all:
- return _validate_categoricals_are_a_subset_of_the_acceptable(categorical_values, unique_values, column, dataset, df)
- return _validate_all_acceptable_categoricals_are_present(categorical_values, unique_values, column, dataset, df)
-
-
-def _validate_all_acceptable_categoricals_are_present(acceptable_categoricals: Set[str], unique_values: Set[str], column: str, dataset: str, df: pd.DataFrame) -> ValidationResult:
- if unique_values == acceptable_categoricals:
- validation_result = ValidationResult(valid=True, message=f"All acceptable categorical values for {dataset}[{column}] are present", value=0)
- elif unique_values < acceptable_categoricals:
- validation_result = ValidationResult(
- valid=False,
- message=f"Missing categorical values for {dataset}[{column}]: {acceptable_categoricals - unique_values}",
- value=len(acceptable_categoricals - unique_values),
- )
- else:
- count_invalid = (~df[column].isin(acceptable_categoricals)).sum()
- validation_result = ValidationResult(
- valid=False,
- message=f"Values {unique_values - set(acceptable_categoricals)} for {dataset}[{column}] are not acceptable for {count_invalid} cells",
- value=count_invalid,
- )
- return validation_result
-
-
-def _validate_categoricals_are_a_subset_of_the_acceptable(acceptable_categoricals: Set[str], unique_values: Set[str], column: str, dataset: str, df: pd.DataFrame) -> ValidationResult:
- if unique_values.issubset(acceptable_categoricals):
- return ValidationResult(valid=True, message=f"Categorical values for {dataset}[{column}] are acceptable", value=0)
- count_invalid = (~df[column].isin(acceptable_categoricals)).sum()
- return ValidationResult(
- valid=False,
- message=f"Values {unique_values - set(acceptable_categoricals)} for {dataset}[{column}] are not acceptable for {count_invalid} cells",
- value=count_invalid,
- )
-
-
-def is_greater_than(
- dataset: str,
- df: pd.DataFrame,
- column: str,
- threshold: float,
-) -> ValidationResult:
- """Confirms column values are above a given threshold.
-
- Args:
- dataset: Name fo the dataset_name
- df: A DataFrame
- column: The DataFrame column to be validated
- threshold: A lower bound threshold not to be exceeded
-
- Returns:
- An instance of ValidationResult where `Validation.Result.valid` is a bool indicate the success of the validation,
- `Validation.Result.message` is a message (usually used in exceptions), and `Validation.Result.value` is the
- percentage of invalid values
- """
- no_nulls_for_column_df = df[~df[column].isnull()][column]
- valid = no_nulls_for_column_df > threshold
-
- if valid.all():
- return ValidationResult(valid=True, message=f"All values of {dataset}[{column}] are above {threshold}", value=0)
-
- no_of_invalid = (~valid).sum()
- return ValidationResult(
- valid=False,
- message=f"{no_of_invalid} cell values for {dataset}[{column}] are below {threshold}",
- value=no_of_invalid / len(no_nulls_for_column_df),
- )
-
-
-def is_greater_than_or_equal(
- dataset: str,
- df: pd.DataFrame,
- column: str,
- threshold: float,
-) -> ValidationResult:
- """Confirms column values are above a given threshold.
-
- Args:
- dataset: Name fo the dataset_name
- df: A DataFrame
- column: The DataFrame column to be validated
- threshold: A lower bound threshold not to be exceeded
-
- Returns:
- An instance of ValidationResult where `Validation.Result.valid` is a bool indicate the success of the validation,
- `Validation.Result.message` is a message (usually used in exceptions), and `Validation.Result.value` is the
- percentage of invalid values
- """
- no_nulls_for_column_df = df[~df[column].isnull()][column]
- valid = no_nulls_for_column_df >= threshold
-
- if valid.all():
- return ValidationResult(valid=True, message=f"All values of {dataset}[{column}] are above {threshold}", value=0)
-
- no_of_invalid = (~valid).sum()
- return ValidationResult(
- valid=False,
- message=f"{no_of_invalid} cell values for {dataset}[{column}] are below {threshold}",
- value=no_of_invalid / len(no_nulls_for_column_df),
- )
-
-
-def is_lower_than(
- dataset: str,
- df: pd.DataFrame,
- column: str,
- threshold: float,
-) -> ValidationResult:
- """Confirms column values are below a given threshold.
-
- IMPORTANT NOTE: Ignores nulls!
-
- Args:
- dataset: Name fo the dataset_name
- df: A DataFrame
- column: The DataFrame column to be validated
- threshold: A lower bound threshold not to be exceeded
-
- Returns:
- An instance of ValidationResult where `Validation.Result.valid` is a bool indicate the success of the validation,
- `Validation.Result.message` is a message (usually used in exceptions), and `Validation.Result.value` is the percentage of
- invalid values
- """
- no_nulls_for_column_df = df[~df[column].isnull()][column]
- valid = no_nulls_for_column_df < threshold # pd.DataFrame
-
- if valid.all():
- return ValidationResult(valid=True, message=f"All values of {dataset}[{column}] are below {threshold}", value=0)
-
- no_of_invalid = (~valid).sum()
- return ValidationResult(
- valid=False,
- message=f"{no_of_invalid} cell values for {dataset}[{column}] are above {threshold}",
- value=no_of_invalid / len(no_nulls_for_column_df),
- )
-
-
-def is_lower_than_or_equal(
- dataset: str,
- df: pd.DataFrame,
- column: str,
- threshold: float,
-) -> ValidationResult:
- """Confirms column values are below a given threshold.
-
- IMPORTANT NOTE: Ignores nulls!
-
- Args:
- dataset: Name fo the dataset_name
- df: A DataFrame
- column: The DataFrame column to be validated
- threshold: A lower bound threshold not to be exceeded
-
- Returns:
- An instance of ValidationResult where `Validation.Result.valid` is a bool indicate the success of the validation,
- `Validation.Result.message` is a message (usually used in exceptions), and `Validation.Result.value` is the percentage of
- invalid values
- """
- no_nulls_for_column_df = df[~df[column].isnull()][column]
- valid = no_nulls_for_column_df <= threshold
-
- if valid.all():
- return ValidationResult(valid=True, message=f"All values of {dataset}[{column}] are below {threshold}", value=0)
-
- no_of_invalid = (~valid).sum()
- return ValidationResult(
- valid=False,
- message=f"{no_of_invalid} cell values for {dataset}[{column}] are above {threshold}",
- value=no_of_invalid / len(no_nulls_for_column_df),
- )
-
-
-def is_between(
- dataset: str,
- df: pd.DataFrame,
- column: str,
- lower: float,
- upper: float,
- include_left: bool = False,
- include_right: bool = False,
-) -> ValidationResult:
- """Confirms column values are between a lower bound and an upper bound thresholds.
-
- IMPORTANT NOTE: Ignores nulls!
-
- Args:
- dataset: Name fo the dataset_name
- df: A DataFrame
- column: The DataFrame column to be validated
- lower: The lower bound (left)
- upper: The upper bound (right)
- include_left: `left <= df[column]`
- include_right: `df[column] <=right`
-
- Returns:
- An instance of ValidationResult where `Validation.Result.valid` is a bool indicate the success of the validation,
- `Validation.Result.message` is a message (usually used in exceptions), and `Validation.Result.value` is the percentage of
- invalid values
- """
- no_nulls_for_column_df = df[~df[column].isnull()][column]
- lower_bound_operator = operator.ge if include_left else operator.gt
- upper_bound_operator = operator.le if include_right else operator.lt
-
- valid = lower_bound_operator(no_nulls_for_column_df, lower) & upper_bound_operator(no_nulls_for_column_df, upper)
-
- if valid.all():
- return ValidationResult(valid=True, message=f"All values of {dataset}[{column}] is between {lower} and {upper} thresholds", value=0)
-
- no_of_invalid = (~valid).sum()
- return ValidationResult(
- valid=False,
- message=f"{no_of_invalid} cell values for {dataset}[{column}] are either below {lower} or above {upper}",
- value=no_of_invalid / len(no_nulls_for_column_df),
- )
Checks if a provided threshold of max nulls has been exceeded.
-
Note: For an empty df the validation will always be successful
-
Args
-
-
dataset
-
Name fo the dataset_name
-
df
-
A pandas DataFrame
-
column
-
The column to be validated
-
threshold
-
Maximum allowed threshold
-
-
Returns
-
An instance of ValidationResult where Validation.Result.valid is a bool indicate the success of the validation,
-Validation.Result.message is a message (usually used in exceptions), and
-Validation.Result.value is percentage_of_nulls
-
-
-Expand source code
-
-
def has_acceptable_percentage_of_nulls(
- dataset: str,
- df: pd.DataFrame,
- column: str,
- threshold: float,
-) -> ValidationResult:
- """Checks if a provided threshold of max nulls has been exceeded.
-
- Note: For an empty df the validation will always be successful
-
- Args:
- dataset: Name fo the dataset_name
- df: A pandas DataFrame
- column: The column to be validated
- threshold: Maximum allowed threshold
-
- Returns:
- An instance of ValidationResult where `Validation.Result.valid` is a bool indicate the success of the validation,
- `Validation.Result.message` is a message (usually used in exceptions), and `Validation.Result.value` is percentage_of_nulls
- """
- if threshold <= 0 or threshold >= 1:
- raise ValueError(f"Threshold value: {threshold} must be a value between 0 and 1.")
-
- no_of_nulls = df[column].isnull().sum()
- if len(df) == 0:
- percentage_of_nulls = 0
- else:
- percentage_of_nulls = no_of_nulls / len(df)
-
- if percentage_of_nulls < threshold:
- return ValidationResult(
- valid=True,
- message=f"Percentage of nulls of for {dataset}[{column}] is {percentage_of_nulls}",
- value=percentage_of_nulls,
- )
- return ValidationResult(
- valid=False,
- message=f"Percentage of nulls of for {dataset}[{column}] is {percentage_of_nulls} which exceeds threshold: {threshold}",
- value=percentage_of_nulls,
- )
Checks if column has any null values (including NaN and NaT values).
-
Args
-
-
dataset
-
Name fo the dataset_name
-
df
-
A pandas DataFrame
-
column
-
The column to be validated
-
-
Returns
-
An instance of
-ValidationResult where Validation.Result.valid is a bool indicate the success of the validation,
-Validation.Result.message is a message (usually used in exceptions), and
-Validation.Result.value is no_of_nulls
-
-
-Expand source code
-
-
def has_no_null_values(dataset: str, df: pd.DataFrame, column: str) -> ValidationResult:
- """Checks if column has any null values (including NaN and NaT values).
-
- Args:
- dataset: Name fo the dataset_name
- df: A pandas DataFrame
- column: The column to be validated
-
- Returns:
- An instance of ValidationResult where `Validation.Result.valid` is a bool indicate the success of the validation,
- `Validation.Result.message` is a message (usually used in exceptions), and `Validation.Result.value` is no_of_nulls
- """
- mask = df[column].isnull()
- no_of_nulls = mask.sum()
- return ValidationResult(valid=not mask.any(), message=f"{dataset}[{column}] has {no_of_nulls} nulls", value=no_of_nulls)
An instance of
-ValidationResult where Validation.Result.valid is a bool indicate the success of the validation,
-Validation.Result.message is a message (usually used in exceptions), and
-Validation.Result.value is no_of_duplicated_elements
-
-
-Expand source code
-
-
def has_unique_values(dataset: str, df: pd.DataFrame, column: str) -> ValidationResult:
- """Checks if values in column are unique.
-
- Args:
- dataset: Name fo the dataset_name
- df: A pandas DataFrame
- column: The column to be validated
-
- Returns:
- An instance of ValidationResult where `Validation.Result.valid` is a bool indicate the success of the validation,
- `Validation.Result.message` is a message (usually used in exceptions), and `Validation.Result.value` is no_of_duplicated_elements
- """
- counts = df[column].value_counts()
- if not (counts > 1).any():
- return ValidationResult(valid=True, message=f"{dataset}[{column}] has unique values", value=0)
-
- duplicates = counts[counts > 1].index.to_list()
- return ValidationResult(valid=False, message=f"Values {duplicates} for {dataset}[{column}] are duplicated!", value=len(duplicates))
Confirms column values are between a lower bound and an upper bound thresholds.
-
IMPORTANT NOTE: Ignores nulls!
-
Args
-
-
dataset
-
Name fo the dataset_name
-
df
-
A DataFrame
-
column
-
The DataFrame column to be validated
-
lower
-
The lower bound (left)
-
upper
-
The upper bound (right)
-
include_left
-
left <= df[column]
-
include_right
-
df[column] <=right
-
-
Returns
-
An instance of ValidationResult where Validation.Result.valid is a bool indicate the success of the validation,
-Validation.Result.message is a message (usually used in exceptions), and Validation.Result.value is the percentage of
-invalid values
-
-
-Expand source code
-
-
def is_between(
- dataset: str,
- df: pd.DataFrame,
- column: str,
- lower: float,
- upper: float,
- include_left: bool = False,
- include_right: bool = False,
-) -> ValidationResult:
- """Confirms column values are between a lower bound and an upper bound thresholds.
-
- IMPORTANT NOTE: Ignores nulls!
-
- Args:
- dataset: Name fo the dataset_name
- df: A DataFrame
- column: The DataFrame column to be validated
- lower: The lower bound (left)
- upper: The upper bound (right)
- include_left: `left <= df[column]`
- include_right: `df[column] <=right`
-
- Returns:
- An instance of ValidationResult where `Validation.Result.valid` is a bool indicate the success of the validation,
- `Validation.Result.message` is a message (usually used in exceptions), and `Validation.Result.value` is the percentage of
- invalid values
- """
- no_nulls_for_column_df = df[~df[column].isnull()][column]
- lower_bound_operator = operator.ge if include_left else operator.gt
- upper_bound_operator = operator.le if include_right else operator.lt
-
- valid = lower_bound_operator(no_nulls_for_column_df, lower) & upper_bound_operator(no_nulls_for_column_df, upper)
-
- if valid.all():
- return ValidationResult(valid=True, message=f"All values of {dataset}[{column}] is between {lower} and {upper} thresholds", value=0)
-
- no_of_invalid = (~valid).sum()
- return ValidationResult(
- valid=False,
- message=f"{no_of_invalid} cell values for {dataset}[{column}] are either below {lower} or above {upper}",
- value=no_of_invalid / len(no_nulls_for_column_df),
- )
Confirms column values are above a given threshold.
-
Args
-
-
dataset
-
Name fo the dataset_name
-
df
-
A DataFrame
-
column
-
The DataFrame column to be validated
-
threshold
-
A lower bound threshold not to be exceeded
-
-
Returns
-
An instance of ValidationResult where Validation.Result.valid is a bool indicate the success of the validation,
-Validation.Result.message is a message (usually used in exceptions), and Validation.Result.value is the
-percentage of invalid values
-
-
-Expand source code
-
-
def is_greater_than(
- dataset: str,
- df: pd.DataFrame,
- column: str,
- threshold: float,
-) -> ValidationResult:
- """Confirms column values are above a given threshold.
-
- Args:
- dataset: Name fo the dataset_name
- df: A DataFrame
- column: The DataFrame column to be validated
- threshold: A lower bound threshold not to be exceeded
-
- Returns:
- An instance of ValidationResult where `Validation.Result.valid` is a bool indicate the success of the validation,
- `Validation.Result.message` is a message (usually used in exceptions), and `Validation.Result.value` is the
- percentage of invalid values
- """
- no_nulls_for_column_df = df[~df[column].isnull()][column]
- valid = no_nulls_for_column_df > threshold
-
- if valid.all():
- return ValidationResult(valid=True, message=f"All values of {dataset}[{column}] are above {threshold}", value=0)
-
- no_of_invalid = (~valid).sum()
- return ValidationResult(
- valid=False,
- message=f"{no_of_invalid} cell values for {dataset}[{column}] are below {threshold}",
- value=no_of_invalid / len(no_nulls_for_column_df),
- )
Confirms column values are above a given threshold.
-
Args
-
-
dataset
-
Name fo the dataset_name
-
df
-
A DataFrame
-
column
-
The DataFrame column to be validated
-
threshold
-
A lower bound threshold not to be exceeded
-
-
Returns
-
An instance of ValidationResult where Validation.Result.valid is a bool indicate the success of the validation,
-Validation.Result.message is a message (usually used in exceptions), and Validation.Result.value is the
-percentage of invalid values
-
-
-Expand source code
-
-
def is_greater_than_or_equal(
- dataset: str,
- df: pd.DataFrame,
- column: str,
- threshold: float,
-) -> ValidationResult:
- """Confirms column values are above a given threshold.
-
- Args:
- dataset: Name fo the dataset_name
- df: A DataFrame
- column: The DataFrame column to be validated
- threshold: A lower bound threshold not to be exceeded
-
- Returns:
- An instance of ValidationResult where `Validation.Result.valid` is a bool indicate the success of the validation,
- `Validation.Result.message` is a message (usually used in exceptions), and `Validation.Result.value` is the
- percentage of invalid values
- """
- no_nulls_for_column_df = df[~df[column].isnull()][column]
- valid = no_nulls_for_column_df >= threshold
-
- if valid.all():
- return ValidationResult(valid=True, message=f"All values of {dataset}[{column}] are above {threshold}", value=0)
-
- no_of_invalid = (~valid).sum()
- return ValidationResult(
- valid=False,
- message=f"{no_of_invalid} cell values for {dataset}[{column}] are below {threshold}",
- value=no_of_invalid / len(no_nulls_for_column_df),
- )
Checks if the column only has allowed categorical values as per the set provided.
-
Note
-
Ignores nulls
-
Args
-
-
dataset
-
Name fo the dataset_name
-
df
-
A DataFrame
-
column
-
The DataFrame column to be validated
-
categorical_values
-
The allowed set of categorical values
-
match_all
-
If True, the categorical values must be a subset of the allowed set, otherwise they must be equal
-
-
Returns
-
An instance of ValidationResult where Validation.Result.valid is a bool indicate the success of the validation,
-Validation.Result.message is a message (usually used in exceptions), and Validation.Result.value is no_of_not_acceptable
-
-
-Expand source code
-
-
def is_in(dataset: str, df: pd.DataFrame, column: str, categorical_values: Set[str], match_all: bool = True) -> ValidationResult:
- """Checks if the column only has allowed categorical values as per the set provided.
-
- Note:
- Ignores nulls
-
- Args:
- dataset: Name fo the dataset_name
- df: A DataFrame
- column: The DataFrame column to be validated
- categorical_values: The allowed set of categorical values
- match_all: If True, the categorical values must be a subset of the allowed set, otherwise they must be equal
-
- Returns:
- An instance of ValidationResult where `Validation.Result.valid` is a bool indicate the success of the validation,
- `Validation.Result.message` is a message (usually used in exceptions), and `Validation.Result.value` is no_of_not_acceptable
- """
- unique_values = set(df[column][df[column].notna()].unique())
-
- if match_all:
- return _validate_categoricals_are_a_subset_of_the_acceptable(categorical_values, unique_values, column, dataset, df)
- return _validate_all_acceptable_categoricals_are_present(categorical_values, unique_values, column, dataset, df)
Confirms column values are below a given threshold.
-
IMPORTANT NOTE: Ignores nulls!
-
Args
-
-
dataset
-
Name fo the dataset_name
-
df
-
A DataFrame
-
column
-
The DataFrame column to be validated
-
threshold
-
A lower bound threshold not to be exceeded
-
-
Returns
-
An instance of ValidationResult where Validation.Result.valid is a bool indicate the success of the validation,
-Validation.Result.message is a message (usually used in exceptions), and Validation.Result.value is the percentage of
-invalid values
-
-
-Expand source code
-
-
def is_lower_than(
- dataset: str,
- df: pd.DataFrame,
- column: str,
- threshold: float,
-) -> ValidationResult:
- """Confirms column values are below a given threshold.
-
- IMPORTANT NOTE: Ignores nulls!
-
- Args:
- dataset: Name fo the dataset_name
- df: A DataFrame
- column: The DataFrame column to be validated
- threshold: A lower bound threshold not to be exceeded
-
- Returns:
- An instance of ValidationResult where `Validation.Result.valid` is a bool indicate the success of the validation,
- `Validation.Result.message` is a message (usually used in exceptions), and `Validation.Result.value` is the percentage of
- invalid values
- """
- no_nulls_for_column_df = df[~df[column].isnull()][column]
- valid = no_nulls_for_column_df < threshold # pd.DataFrame
-
- if valid.all():
- return ValidationResult(valid=True, message=f"All values of {dataset}[{column}] are below {threshold}", value=0)
-
- no_of_invalid = (~valid).sum()
- return ValidationResult(
- valid=False,
- message=f"{no_of_invalid} cell values for {dataset}[{column}] are above {threshold}",
- value=no_of_invalid / len(no_nulls_for_column_df),
- )
Confirms column values are below a given threshold.
-
IMPORTANT NOTE: Ignores nulls!
-
Args
-
-
dataset
-
Name fo the dataset_name
-
df
-
A DataFrame
-
column
-
The DataFrame column to be validated
-
threshold
-
A lower bound threshold not to be exceeded
-
-
Returns
-
An instance of ValidationResult where Validation.Result.valid is a bool indicate the success of the validation,
-Validation.Result.message is a message (usually used in exceptions), and Validation.Result.value is the percentage of
-invalid values
-
-
-Expand source code
-
-
def is_lower_than_or_equal(
- dataset: str,
- df: pd.DataFrame,
- column: str,
- threshold: float,
-) -> ValidationResult:
- """Confirms column values are below a given threshold.
-
- IMPORTANT NOTE: Ignores nulls!
-
- Args:
- dataset: Name fo the dataset_name
- df: A DataFrame
- column: The DataFrame column to be validated
- threshold: A lower bound threshold not to be exceeded
-
- Returns:
- An instance of ValidationResult where `Validation.Result.valid` is a bool indicate the success of the validation,
- `Validation.Result.message` is a message (usually used in exceptions), and `Validation.Result.value` is the percentage of
- invalid values
- """
- no_nulls_for_column_df = df[~df[column].isnull()][column]
- valid = no_nulls_for_column_df <= threshold
-
- if valid.all():
- return ValidationResult(valid=True, message=f"All values of {dataset}[{column}] are below {threshold}", value=0)
-
- no_of_invalid = (~valid).sum()
- return ValidationResult(
- valid=False,
- message=f"{no_of_invalid} cell values for {dataset}[{column}] are above {threshold}",
- value=no_of_invalid / len(no_nulls_for_column_df),
- )
-
-
-
-
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/tests/resources/data/processed/.gitkeep b/dynamicio/.gitkeep
similarity index 100%
rename from tests/resources/data/processed/.gitkeep
rename to dynamicio/.gitkeep
diff --git a/dynamicio/__init__.py b/dynamicio/__init__.py
index 6b037c0..6ca418f 100644
--- a/dynamicio/__init__.py
+++ b/dynamicio/__init__.py
@@ -1,53 +1,7 @@
"""A package for wrapping your I/O operations."""
-import os
-from contextlib import suppress
-import pkg_resources
-from magic_logger import logger
+import logging
-with suppress(Exception):
- __version__ = pkg_resources.get_distribution("dynamicio").version
+from dynamicio.io import LocalFileResource, S3Resource, PostgresResource, KafkaResource
-from dynamicio.core import DynamicDataIO
-from dynamicio.mixins import WithKafka, WithLocal, WithLocalBatch, WithPostgres, WithS3File, WithS3PathPrefix
-
-os.environ["LC_CTYPE"] = "en_US.UTF" # Set your locale to a unicode-compatible one
-
-
-class UnifiedIO(WithS3File, WithS3PathPrefix, WithLocalBatch, WithLocal, WithKafka, WithPostgres, DynamicDataIO): # type: ignore
- """A unified io composed of dynamicio.mixins."""
-
-
-logging_config = {
- "version": 1,
- "disable_existing_loggers": True,
- "formatters": {
- "standard": {"format": "%(asctime)s [%(levelname)s] %(name)s: %(message)s"},
- "generic-metrics": {"format": "%(message)s"},
- },
- "handlers": {
- "default": {
- "level": "INFO",
- "formatter": "standard",
- "class": "logging.StreamHandler",
- "stream": "ext://sys.stdout", # Default is stderr
- },
- "metrics": {
- "level": "INFO",
- "formatter": "generic-metrics",
- "class": "logging.StreamHandler",
- "stream": "ext://sys.stdout", # Default is stderr
- },
- },
- "loggers": {
- "": {"handlers": ["default"], "level": "INFO", "propagate": False},
- "dynamicio.metrics": {"handlers": ["metrics"], "level": "INFO", "propagate": False},
- "awscli": {
- "handlers": ["default"],
- "level": "INFO",
- "propagate": False,
- },
- },
-}
-
-logger.dict_config(logging_config)
+logging.getLogger(__name__).addHandler(logging.NullHandler())
diff --git a/dynamicio/__main__.py b/dynamicio/__main__.py
deleted file mode 100644
index ba3addf..0000000
--- a/dynamicio/__main__.py
+++ /dev/null
@@ -1,4 +0,0 @@
-"""Invokes dynamicio cli."""
-from dynamicio.cli import run
-
-run()
diff --git a/dynamicio/cli.py b/dynamicio/cli.py
deleted file mode 100644
index 9dfeb16..0000000
--- a/dynamicio/cli.py
+++ /dev/null
@@ -1,103 +0,0 @@
-"""Implements the dynamicio Command Line Interface (CLI)."""
-import argparse
-import glob
-import os
-import pprint
-from typing import Mapping, MutableMapping, Optional, Sequence
-
-import pandas as pd # type: ignore
-import yaml
-
-from dynamicio.errors import InvalidDatasetTypeError
-
-
-def parse_args(args: Optional[Sequence] = None) -> argparse.Namespace:
- """Arguments parser for dynamicio cli.py.
-
- Args:
- args: List of args to be parsed. Defaults to None, in which case
- sys.argv[1:] is used.
-
- Returns:
- An instance of ArgumentParser populated with the provided args.
- """
- parser = argparse.ArgumentParser(prog="dynamicio", description="Generate dataset schemas")
- group = parser.add_mutually_exclusive_group(required=True)
- group.add_argument(
- "-b",
- "--batch",
- action="store_true",
- help="flag, used to generate multiple schemas provided a datasets directory.",
- )
- group.add_argument(
- "-s",
- "--single",
- action="store_true",
- help="flag, used to generate a schema provided a single dataset.",
- )
- parser.add_argument("-p", "--path", required=True, help="the path to the dataset/datasets-directory.", type=str)
- parser.add_argument("-o", "--output", required=True, help="the path to the schemas output directory.", type=str)
- return parser.parse_args(args)
-
-
-def generate_schema_for(dataset: str) -> Mapping:
- """Generate a schema for a dataset.
-
- Args:
- dataset: The path to the dataset for which we want to generate a schema
-
- Returns:
- A dictionary containing the schema for the dataset, or None if the dataset is not valid.
-
- Raises:
- InvalidDatasetTypeError: If the dataset type is not supported by dynamicio.
- """
- dataset_name, file_type = os.path.splitext(os.path.basename(dataset))
- if file_type == ".parquet":
- df = pd.read_parquet(dataset)
- elif file_type == ".csv":
- df = pd.read_csv(dataset)
- elif file_type == ".json":
- df = pd.read_json(dataset)
- elif file_type == ".h5":
- df = pd.read_hdf(dataset)
- else:
- raise InvalidDatasetTypeError(dataset)
-
- print(f"Generating schema for: {dataset}")
- json_schema: MutableMapping = {"name": dataset_name, "columns": {}}
- for column, d_type in zip(list(df.columns), list(df.dtypes)):
- json_schema["columns"][column] = {"type": "", "validations": {}, "metrics": []}
- json_schema["columns"][column]["type"] = d_type.name
-
- return json_schema
-
-
-def main(args: argparse.Namespace):
- """Main function for dynamicio cli.py.
-
- Args:
- args: Parsed args.
- """
- if args.batch:
- dataset_files = glob.glob(os.path.join(args.path, "*.*"))
- for dataset in dataset_files:
- try:
- json_schema = generate_schema_for(dataset)
- except InvalidDatasetTypeError as exception:
- print(f"Skipping {exception.message}! You may want to remove this file from the datasets directory")
- else:
- with open(os.path.join(args.output, f"{json_schema['name']}.yaml"), "w") as yml: # pylint: disable=unspecified-encoding]
- yaml.safe_dump(json_schema, yml)
-
- if args.single:
- json_schema = generate_schema_for(str(args.path))
- with open(os.path.join(args.output, f"{json_schema['name']}.yaml"), "w") as yml: # pylint: disable=unspecified-encoding]
- yaml.safe_dump(json_schema, yml)
- pprint.pprint(json_schema)
-
-
-def run():
- """Entry point for the dynamicio cli.py."""
- args = parse_args()
- main(args)
diff --git a/dynamicio/config/__init__.py b/dynamicio/config/__init__.py
deleted file mode 100644
index 47a7ec6..0000000
--- a/dynamicio/config/__init__.py
+++ /dev/null
@@ -1,4 +0,0 @@
-"""Dynamicio config file handling routines."""
-
-from dynamicio.config import pydantic
-from dynamicio.config.io_config import IOConfig
diff --git a/dynamicio/config/io_config.py b/dynamicio/config/io_config.py
deleted file mode 100644
index f1f6033..0000000
--- a/dynamicio/config/io_config.py
+++ /dev/null
@@ -1,273 +0,0 @@
-"""Implements the `IOConfig` class, generating objects used as a configuration parameter for the instantiation of`src.utils.dynamicio.dataio.DynamicDataIO` objects.
-
-The `IOConfig` object, essentially parses a yaml file that contains a set of input sources that will be processed by a
-task, converting filtering and converting them into dictionaries.
-
-For example, suppose an `input.yaml` file, containing:
-
- READ_FROM_S3_CSV:
- LOCAL:
- type: "local"
- local:
- file_path: "[[ TEST_RESOURCES ]]/data/input/some_csv_to_read.csv"
- file_type: "csv"
- CLOUD:
- type: "s3"
- s3:
- bucket: "[[ MOCK_BUCKET ]]"
- file_path: "[[ MOCK_KEY ]]"
- file_type: "csv"
-
-would be loaded with:
-
- input_sources_config = IOConfig(
- "path_to/input.yaml",
- env_identifier="CLOUD",
- dynamic_vars=config_module
- )
-
-and:
-
- input_sources_config.config
-
-would return:
-
- {
- "READ_FROM_S3_CSV": {
- "LOCAL": {
- "type": "local",
- "local": {
- "file_path": f"{test_global_vars.TEST_RESOURCES}/data/input/some_csv_to_read.csv",
- "file_type": "csv",
- },
- },
- "CLOUD": {
- "type": "s3",
- "s3": {
- "bucket": "mock-bucket",
- "file_path": "mock-key",
- "file_type": "csv"
- }
- },
- }
- }
-"""
-__all__ = ["IOConfig", "SafeDynamicResourceLoader", "SafeDynamicSchemaLoader"]
-
-import re
-from types import ModuleType
-from typing import Any, List, MutableMapping
-
-import pydantic
-import yaml
-from magic_logger import logger
-
-from dynamicio.config.pydantic import BindingsYaml, IOEnvironment
-
-
-class SafeDynamicResourceLoader(yaml.SafeLoader):
- """Implements a dynamic yaml loader that parses yaml files and replaces strings that map to [[ DYNAMIC_VAR ]].
-
- Dynamic variables defined in a provided module object.
- """
-
- module = None
- dynamic_data_matcher = re.compile(r"(.*)(\[\[\s*(\S+)\s*]])(.*)")
-
- @classmethod
- def with_module(cls, module: ModuleType):
- """Creates a dynamic subclass of SafeDynamicLoader with the `data_module` attribute set to `module`.
-
- Args:
- module: A global vars module with all the dynamic values defined in it.
-
- Returns:
- type
- """
- return type(f"{cls.__name__}_{module.__name__}", (cls,), {"module": module})
-
- def dyn_str_constructor(self, node: yaml.nodes.ScalarNode) -> str:
- """Responsible for the switching of one or more "[[ DYNAMIC_VAR ]]" strings with the respective attributes value in a given module.
-
- Args:
- node: Parsed item whose dynamic values that map to the "[[ DYNAMIC_VAR ]]" convention
- are replaced with the respective attributes in te provided module.
-
- Returns:
- Constructed `str` or numerical.
- """
- value = node.value
-
- while result := self.dynamic_data_matcher.match(value):
- ref = result.group(3)
- replacement = getattr(self.module, ref)
-
- value = self.dynamic_data_matcher.sub(f"\\g<1>{replacement}\\g<4>", value)
-
- return value
-
-
-class SafeDynamicSchemaLoader(yaml.SafeLoader):
- """Implements a dynamic yaml loader that parses yaml files and replaces strings that map to [[ DYNAMIC_VAR ]].
-
- Dynamic variables defined in a provided module object.
- """
-
- module = None
- dynamic_data_matcher = re.compile(r"(.*)(\[\[\s*(\S+)\s*]])(.*)")
-
- @classmethod
- def with_module(cls, module: ModuleType):
- """Creates a dynamic subclass of SafeDynamicLoader with the `data_module` attribute set to `module`.
-
- Args:
- module: A global vars module with all the dynamic values defined in it.
-
- Returns:
- type
- """
- return type(f"{cls.__name__}_{module.__name__}", (cls,), {"module": module})
-
- def dyn_value_constructor(self, node: yaml.nodes.ScalarNode) -> Any:
- """Responsible for the switching of one or more "[[ DYNAMIC_VAR ]]" strings with the respective attributes value in a given module.
-
- Args:
- node: Parsed item whose dynamic values that map to the "[[ DYNAMIC_VAR ]]" convention
- are replaced with the respective attributes in te provided module.
-
- Returns:
- Constructed `str` or numerical.
- """
- value = node.value
-
- while result := self.dynamic_data_matcher.match(value):
- ref = result.group(3)
- replacement = getattr(self.module, ref)
-
- value = self.dynamic_data_matcher.sub(f"\\g<1>{replacement}\\g<4>", value)
-
- try:
- value = float(value)
- return value
- except ValueError:
- pass
-
- return value
-
-
-class IOConfig:
- """Generates an object that returns a sub-dictionary of the elements of that yaml file.
-
- The file serves as a config for setting up DynamicDataIO objects. Requires a resources yaml file,
- an ENVIRONMENT value {CLOUD or LOCAL} and a vars module.
-
- Example:
- input_sources_config = IOConfig(
- "path_to/input.yaml",
- env_identifier="CLOUD",
- dynamic_vars=config_module
- )
- """
-
- YAML_TAG = "tag:yaml.org,2002:str"
- SafeDynamicResourceLoader.add_constructor(YAML_TAG, SafeDynamicResourceLoader.dyn_str_constructor)
- SafeDynamicSchemaLoader.add_constructor(YAML_TAG, SafeDynamicSchemaLoader.dyn_value_constructor)
-
- path_to_source_yaml: str
- env_identifier: str
- config: BindingsYaml
-
- def __init__(self, path_to_source_yaml: str, env_identifier: str, dynamic_vars: ModuleType):
- """Class constructor.
-
- Args:
- path_to_source_yaml: Absolute file path to yaml file containing source definitions
- env_identifier: "LOCAL" or "CLOUD".
- dynamic_vars: module containing values for dynamic values that the source yaml
- may reference.
- """
- self.path_to_source_yaml = path_to_source_yaml
- self.env_identifier = env_identifier
- self.dynamic_vars = dynamic_vars
- self.config = self._parse_sources_config()
-
- def _parse_sources_config(self) -> BindingsYaml:
- """Parses the yaml input and return a dictionary.
-
- Returns:
- A dictionary with the list of all file paths pointing to various input sources as those
- are defined in their respective data/*.yaml files.
- """
- used_file_inputs = [self.path_to_source_yaml]
- with open(self.path_to_source_yaml, "r") as stream: # pylint: disable=unspecified-encoding]
- logger.debug(f"Parsing {self.path_to_source_yaml}...")
- data = yaml.load(stream, SafeDynamicResourceLoader.with_module(self.dynamic_vars))
-
- # Load any file_path's found in schema definitions
- for io_binding in data.values():
- if isinstance(io_binding, MutableMapping) and io_binding.get("schema", {}).get("file_path"):
- file_path = io_binding["schema"]["file_path"]
- used_file_inputs.append(file_path)
- # schema has `file_path`` in it
- with open(file_path, "r", encoding="utf8") as stream:
- io_binding["schema"] = yaml.load(stream, SafeDynamicSchemaLoader.with_module(self.dynamic_vars))
-
- try:
- config = BindingsYaml(bindings=data)
- config.update_config_refs()
- except pydantic.ValidationError:
- logger.exception(f"Error loading {data=!r}, {used_file_inputs=!r}")
- raise
- return config
-
- @property
- def sources(self) -> List[str]:
- """Class property for easy access to a list of sources.
-
- Returns:
- All top level names of the available resources for the used resources yaml config.
- """
- return list(self.config.bindings.keys())
-
- def get(self, source_key: str) -> IOEnvironment:
- """A getter.
-
- Args:
- source_key: The name of the resource for which we want to create a config.
-
- Returns:
- A dictionary with the necessary fields for loading the data from a source.
-
- Example:
-
- Given:
-
- VOYAGE_DATA:
- LOCAL:
- type: "local"
- local:
- file_path: "[[ TEST_RESOURCES ]]/data/processed/voyage_data.parquet"
- file_type: "parquet"
- CLOUD:
- type: "kafka"
- KAFKA:
- KAFKA_SERVER: "[[ KAFKA_SERVER ]]"
- KAFKA_TOPIC: "[[ KAFKA_TOPIC ]]"
-
- If you do:
-
- input_sources_config = IOConfig(
- "path_to/input.yaml",
- env_identifier="CLOUD",
- dynamic_vars=globals
- )
- voyage_data_cloud_mapping = input_config.get(source_key="VOYAGE_DATA")
-
- then `voyage_data_cloud_mapping` is:
-
- "KAFKA": {
- "KAFKA_SERVER": "mock-kafka-server",
- "KAFKA_TOPIC": "mock-kafka-topic"
- }
- """
- return self.config.bindings[source_key].get_binding_for_environment(self.env_identifier)
diff --git a/dynamicio/config/pydantic/__init__.py b/dynamicio/config/pydantic/__init__.py
deleted file mode 100644
index 68a6fe8..0000000
--- a/dynamicio/config/pydantic/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-"""Pydantic config models."""
-
-from dynamicio.config.pydantic.config import BindingsYaml
-from dynamicio.config.pydantic.io_resources import (
- IOEnvironment,
- KafkaDataEnvironment,
- LocalBatchDataEnvironment,
- LocalDataEnvironment,
- PostgresDataEnvironment,
- S3DataEnvironment,
- S3PathPrefixEnvironment,
-)
-from dynamicio.config.pydantic.table_schema import DataframeSchema
diff --git a/dynamicio/config/pydantic/config.py b/dynamicio/config/pydantic/config.py
deleted file mode 100644
index acc8772..0000000
--- a/dynamicio/config/pydantic/config.py
+++ /dev/null
@@ -1,38 +0,0 @@
-# pylint: disable=no-member, no-self-argument, unused-argument
-"""Pydantic schema for YAML files"""
-
-from typing import Mapping, MutableMapping
-
-import pydantic
-
-import dynamicio.config.pydantic.io_resources as env_spec
-
-
-class BindingsYaml(pydantic.BaseModel):
- """Class controlling structure of the top-level IOConfig yaml file.
-
- The top-level config is a dictionary of ->
- """
-
- bindings: Mapping[str, env_spec.IOBinding]
-
- @pydantic.validator("bindings", pre=True)
- def _validate_bindings(cls, value: Mapping):
- if not isinstance(value, Mapping):
- raise ValueError(f"Bindings must be a mapping. (got {value!r} instead).")
- # Tell each binding its name
- for (name, sub_config) in value.items():
- if not isinstance(sub_config, MutableMapping):
- raise ValueError(f"Each element for the name binding must be a dict. (got {sub_config!r} instead)")
- sub_config["__binding_name__"] = name
- return value
-
- def update_config_refs(self) -> "BindingsYaml":
- """Updates dynamic parts of the config:
- - Configure _parent for all `IOEnvironment`s
- - Replace all IOSchemaRef with actual schema objects
- """
- for binding in self.bindings.values():
- for io_env in binding.environments.values():
- io_env.set_parent(binding)
- return self
diff --git a/dynamicio/config/pydantic/io_resources.py b/dynamicio/config/pydantic/io_resources.py
deleted file mode 100644
index d799c97..0000000
--- a/dynamicio/config/pydantic/io_resources.py
+++ /dev/null
@@ -1,220 +0,0 @@
-# pylint: disable=no-member, no-self-argument, unused-argument
-
-"""This module contains pylint models for physical data sources (places the bytes are being read from)"""
-
-import enum
-import posixpath
-from typing import Mapping, Optional, Union
-
-import pydantic
-
-import dynamicio.config.pydantic.table_schema as table_spec
-
-
-@enum.unique
-class DataBackendType(str, enum.Enum):
- """Input file types"""
-
- # pylint: disable=invalid-name
- local = "local"
- local_batch = "local_batch"
- s3 = "s3" # is there a difference between 's3' and 's3_file' ?
- s3_file = "s3_file"
- s3_path_prefix = "s3_path_prefix"
- postgres = "postgres"
- athena = "athena"
- kafka = "kafka"
-
-
-@enum.unique
-class FileType(str, enum.Enum):
- """List of supported file formats."""
-
- # pylint: disable=invalid-name
- parquet = "parquet"
- csv = "csv"
- json = "json"
- hdf = "hdf"
-
-
-class IOBinding(pydantic.BaseModel):
- """A binding for a single i/o object"""
-
- name: str = pydantic.Field(alias="__binding_name__")
- environments: Mapping[str, "IOEnvironment"]
- dynamicio_schema: Union[table_spec.DataframeSchema, None] = pydantic.Field(default=None, alias="schema")
-
- def get_binding_for_environment(self, environment: str) -> "IOEnvironment":
- """Fetch the IOEnvironment spec for the name provided."""
- return self.environments[environment]
-
- @pydantic.validator("environments", pre=True, always=True)
- def pick_correct_env_cls(cls, value, values, config, field):
- """This pre-validator picks an appropriate IOEnvironment subclass for the `data_backend_type`"""
- if not isinstance(value, Mapping):
- raise ValueError(f"Environments input should be a dict. Got {value!r} instead.")
- config_cls_overrides = {
- DataBackendType.local: LocalDataEnvironment,
- DataBackendType.local_batch: LocalBatchDataEnvironment,
- DataBackendType.s3: S3DataEnvironment,
- DataBackendType.s3_file: S3DataEnvironment,
- DataBackendType.s3_path_prefix: S3PathPrefixEnvironment,
- DataBackendType.kafka: KafkaDataEnvironment,
- DataBackendType.postgres: PostgresDataEnvironment,
- }
- out_dict = {}
- for (env_name, env_data) in value.items():
- base_obj: IOEnvironment = field.type_(**env_data)
- override_cls = config_cls_overrides.get(base_obj.data_backend_type)
- if override_cls:
- use_obj = override_cls(**env_data)
- else:
- use_obj = base_obj
- out_dict[env_name] = use_obj
- return out_dict
-
- @pydantic.root_validator(pre=True)
- def _preprocess_raw_config(cls, values):
- if not isinstance(values, Mapping):
- raise ValueError(f"IOBinding must be a dict at the top level. (got {values!r} instead)")
- remapped_value = {"environments": {}}
- for (key, value) in values.items():
- if key in ("__binding_name__", "schema"):
- # Passthrough params
- remapped_value[key] = value
- else:
- # Assuming an environment config
- remapped_value["environments"][key] = value
- return remapped_value
-
-
-class IOEnvironment(pydantic.BaseModel):
- """A section specifiing an data source backed by a particular data backend"""
-
- _parent: Optional[IOBinding] = None # noqa: F821
- options: Mapping = pydantic.Field(default_factory=dict)
- data_backend_type: DataBackendType = pydantic.Field(alias="type", const=None)
-
- class Config:
- """Additional pydantic configuration for the model."""
-
- underscore_attrs_are_private = True
-
- @property
- def dynamicio_schema(self) -> Union[table_spec.DataframeSchema, None]:
- """Returns tabular data structure definition for the data source (if available)"""
- if not self._parent:
- raise Exception("Parent field is not set.")
- return self._parent.dynamicio_schema
-
- def set_parent(self, parent: IOBinding): # noqa: F821
- """Helper method to set parent config object."""
- assert self._parent is None
- self._parent = parent
-
-
-class LocalDataSubSection(pydantic.BaseModel):
- """Config section for local data provider"""
-
- file_path: str
- file_type: FileType
-
-
-class LocalDataEnvironment(IOEnvironment):
- """The data is provided by local storage"""
-
- local: LocalDataSubSection
-
-
-class LocalBatchDataSubSection(pydantic.BaseModel):
- """Config section for local batch data (multiple input files)"""
-
- path_prefix: str
- file_type: FileType
-
-
-class LocalBatchDataEnvironment(IOEnvironment):
- """Parent section for local batch (multiple files) config."""
-
- local: LocalBatchDataSubSection
-
-
-class S3DataSubSection(pydantic.BaseModel):
- """Config section for S3 data source"""
-
- file_path: str
- file_type: FileType
- bucket: str
-
-
-class S3DataEnvironment(IOEnvironment):
- """Parent section for s3 data source config"""
-
- s3: S3DataSubSection
-
-
-class S3PathPrefixSubSection(pydantic.BaseModel):
- """Config section for s3 prefix data source (multiple s3 objects)"""
-
- path_prefix: str
- file_type: FileType
- bucket: str
-
- @pydantic.root_validator(pre=True)
- def support_legacy_config_path_prefix(cls, values):
- """
- This validator implements support for legacy config format where the
- bucket & path_prefix path could've been passed as a single param in 'bucket' field.
-
- E.g.
- bucket: "[[ MOCK_BUCKET ]]/data/input/{file_name_to_replace}.hdf"
- """
- bucket = values.get("bucket")
- path_prefix = values.get("path_prefix")
- if (bucket and isinstance(bucket, str) and posixpath.sep in bucket) and (not path_prefix):
- (new_bucket, new_path_prefix) = bucket.split(posixpath.sep, 1)
- values.update(
- {
- "bucket": new_bucket,
- "path_prefix": new_path_prefix,
- }
- )
- return values
-
-
-class S3PathPrefixEnvironment(IOEnvironment):
- """Parent section for the multi-object s3 data source"""
-
- s3: S3PathPrefixSubSection
-
-
-class KafkaDataSubSection(pydantic.BaseModel):
- """Kafka configuration section."""
-
- kafka_server: str
- kafka_topic: str
-
-
-class KafkaDataEnvironment(IOEnvironment):
- """Parent section for kafka data source config"""
-
- kafka: KafkaDataSubSection
-
-
-class PostgresDataSubSection(pydantic.BaseModel):
- """Postgres data source configuration."""
-
- db_host: str
- db_port: str
- db_name: str
- db_user: str
- db_password: str
-
-
-class PostgresDataEnvironment(IOEnvironment):
- """Parent section for postgres data source."""
-
- postgres: PostgresDataSubSection
-
-
-IOBinding.update_forward_refs()
diff --git a/dynamicio/config/pydantic/table_schema.py b/dynamicio/config/pydantic/table_schema.py
deleted file mode 100644
index f61f383..0000000
--- a/dynamicio/config/pydantic/table_schema.py
+++ /dev/null
@@ -1,131 +0,0 @@
-# pylint: disable=no-member, no-self-argument, unused-argument
-
-"""This module defines Config schema for data source (pandas dataframe)"""
-
-import enum
-from typing import Mapping, Sequence
-
-import pydantic
-
-
-@enum.unique
-class MetricsName(str, enum.Enum):
- """The list of valid metrics names."""
-
- # pylint: disable=invalid-name
- min = "Min"
- max = "Max"
- mean = "Mean"
- stddev = "Std"
- variance = "Variance"
- counts = "Counts"
- counts_per_label = "CountsPerLabel"
- unique_counts = "UniqueCounts"
-
-
-@enum.unique
-class ColumnType(str, enum.Enum):
- """The list of valid column types."""
-
- # pylint: disable=invalid-name
- object = "object"
- string = "string"
- array = "array"
- number = "number"
-
- float = "float"
- float32 = "float32"
- float64 = "float64"
- double = "double"
-
- int = "int"
- integer = "integer"
-
- int8 = "int8"
- int32 = "int32"
- int64 = "int64"
-
- Int8 = "Int8"
- Int32 = "Int32"
- Int64 = "Int64"
-
- uint8 = "uint8"
- uint32 = "uint32"
- uint64 = "uint64"
-
- bool = "bool"
- boolean = "boolean"
-
- datetime64_ns = "datetime64[ns]"
- datetime64_ns_utc = "datetime64[ns,UTC]"
- datetime64_ms = "datetime64[ms]"
-
- timedelta64_ns = "timedelta64[ns]"
-
-
-class ColumnValidationBase(pydantic.BaseModel):
- """A single column validator."""
-
- name: str
- apply: bool
- options: Mapping[str, object]
-
-
-class SchemaColumn(pydantic.BaseModel):
- """Definition os a single data source column."""
-
- name: str
- data_type: ColumnType = pydantic.Field(alias="type")
- validations: Sequence[ColumnValidationBase] = pydantic.Field(default_factory=list)
- metrics: Sequence[MetricsName] = ()
-
- @pydantic.validator("validations", pre=True)
- def remap_validations(cls, field):
- """Remap the yaml structure of {validation_type: } to a list with validation_type as a key"""
- if not isinstance(field, dict):
- raise ValueError(f"{field!r} should be a dict")
- out = []
- for (key, params) in field.items():
- new_el = params.copy()
- new_el.update({"name": key})
- out.append(new_el)
- return out
-
- @pydantic.validator("metrics", pre=True, always=True)
- def validate_metrics(cls, field):
- """Remap any false-ish `metrics` value to an empty list."""
- if field:
- out = field
- else:
- out = []
- return out
-
-
-class DataframeSchema(pydantic.BaseModel):
- """Pydantic model describing the tabular data provided by the data source."""
-
- name: str
- columns: Mapping[str, SchemaColumn]
-
- @pydantic.validator("columns", pre=True)
- def supply_column_names(cls, field):
- """Tell each column its name (the key it is listed under)"""
- if not isinstance(field, Mapping):
- raise ValueError(f"{field!r} shoudl be a dict.")
-
- return {col_name: {**{"name": col_name}, **col_data} for (col_name, col_data) in field.items()}
-
- @property
- def validations(self) -> Mapping[str, Sequence[ColumnValidationBase]]:
- """A short-hand property to access the validators for each column."""
- return {col_name: col.validations for (col_name, col) in self.columns.items()}
-
- @property
- def metrics(self) -> Mapping[str, Sequence[MetricsName]]:
- """A short-hand property to access the metrics for each column."""
- return {col_name: col.metrics for (col_name, col) in self.columns.items()}
-
- @property
- def column_names(self) -> Sequence[str]:
- """Property providing the list of all column names."""
- return tuple(self.columns.keys())
diff --git a/dynamicio/core.py b/dynamicio/core.py
deleted file mode 100644
index 0bee13f..0000000
--- a/dynamicio/core.py
+++ /dev/null
@@ -1,315 +0,0 @@
-"""Implements the DynamicDataIO class which provides functionality for data: loading; sinking, and; schema validation."""
-# pylint: disable=no-member
-__all__ = ["DynamicDataIO", "SCHEMA_FROM_FILE"]
-
-import asyncio
-import inspect
-import re
-from concurrent.futures import ThreadPoolExecutor
-from typing import Any, Mapping, MutableMapping, Optional
-
-import pandas as pd # type: ignore
-import pydantic
-from magic_logger import logger
-
-from dynamicio import validations
-from dynamicio.config.pydantic import DataframeSchema, IOEnvironment
-from dynamicio.errors import CASTING_WARNING_MSG, ColumnsDataTypeError, NOTICE_MSG, SchemaNotFoundError, SchemaValidationError
-from dynamicio.metrics import get_metric
-
-SCHEMA_FROM_FILE = {"schema": object()}
-
-pool = ThreadPoolExecutor()
-
-
-class DynamicDataIO:
- """Given a `src.utils.dynamicio.config.IOConfig` object, it generates an object with access to a series of methods for cloud I/O operations and data validations.
-
- Example:
- >>> input_sources_config = IOConfig(
- >>> "path_to/input.yaml",
- >>> os.getenv("ENVIRONMENT",default="LOCAL")
- >>> )
- >>>
- >>> class IO(WithS3File, WithLocal, DynamicDataIO):
- >>> schema = S
- >>>
- >>> my_dataset_local_mapping = input_config.get(source_key="MY_DATASET")
- >>> my_dataset_io = IO(my_dataset_local_mapping)
- >>> my_dataset_df = my_dataset_io.read()
- """
-
- schema: DataframeSchema
- sources_config: IOEnvironment
-
- def __init__(
- self,
- source_config: IOEnvironment,
- apply_schema_validations: bool = False,
- log_schema_metrics: bool = False,
- show_casting_warnings: bool = False,
- **options: MutableMapping[str, Any],
- ):
- """Class constructor.
-
- Args:
- source_config: Configuration to use when reading/writing data from/to a source
- apply_schema_validations: Applies schema validations on either read() or write()
- log_schema_metrics: Logs schema metrics on either read() or write()
- show_casting_warnings: Logs casting warnings on either read() or write() if set to True
- options: Any additional kwargs that may be used throughout the lifecycle of the object
- """
- if type(self) is DynamicDataIO: # pylint: disable=unidiomatic-typecheck
- raise TypeError("Abstract class DynamicDataIO cannot be used to instantiate an object...")
-
- self.sources_config = source_config
- self.name = self._transform_class_name_to_dataset_name(self.__class__.__name__)
- self.apply_schema_validations = apply_schema_validations
- self.log_schema_metrics = log_schema_metrics
- self.show_casting_warnings = show_casting_warnings
- self.options = self._get_options(options, source_config.options)
- source_name = self.sources_config.data_backend_type
- if self.schema is SCHEMA_FROM_FILE:
- active_schema = self.sources_config.dynamicio_schema
- else:
- active_schema = self._schema_from_obj(self)
-
- if not active_schema:
- raise SchemaNotFoundError()
-
- assert isinstance(active_schema, DataframeSchema)
- self.schema = active_schema
- self.name = self.schema.name.upper()
- self.schema_validations = self.schema.validations
- self.schema_metrics = self.schema.metrics
-
- assert hasattr(self, f"_read_from_{source_name}") or hasattr(
- self, f"_write_to_{source_name}"
- ), f"No method '_read_from_{source_name}' or '_write_to_{source_name}'. Have you registered a mixin for {source_name}?"
-
- @staticmethod
- def _schema_from_obj(target) -> DataframeSchema:
- """Construct `DataframeSchema` from an object.
-
- The object:
- - MUST have `schema` attribute that is a dictionary specifying columns and datatypes
- - CAN have `schema_validations` and `schema_metrics` attributes
- """
- col_info = {}
- for (col_name, dtype) in target.schema.items():
- col_validations = {}
- col_metrics = []
- try:
- col_validations = target.schema_validations[col_name]
- except (KeyError, AttributeError):
- pass
- try:
- col_metrics = target.schema_metrics[col_name]
- except (KeyError, AttributeError):
- pass
- col_info[col_name] = {
- "type": dtype,
- "validations": col_validations,
- "metrics": col_metrics,
- }
- try:
- out = DataframeSchema(name=target.name, columns=col_info)
- except pydantic.ValidationError:
- logger.exception(f"Error parsing {target.name=!r} {col_info=!r}")
- raise
- return out
-
- def __init_subclass__(cls):
- """Ensure that all subclasses have a `schema` attribute and a `validate` method.
-
- Raises:
- AssertionError: If either of the attributes is not implemented
- """
- if not inspect.getmodule(cls).__name__.startswith("dynamicio"):
- assert "schema" in cls.__dict__
-
- if cls.schema is None or (cls.schema is not SCHEMA_FROM_FILE and len(cls.schema) == 0):
- raise ValueError(f"schema for class {cls} cannot be None or empty...")
-
- async def async_read(self):
- """Allows the use of asyncio to concurrently read files in memory.
-
- Returns:
- A pandas dataframe or an iterable.
- """
- loop = asyncio.get_running_loop()
- return await loop.run_in_executor(pool, self.read)
-
- def read(self) -> pd.DataFrame:
- """Reads data source and returns a schema validated dataframe (by means of _apply_schema).
-
- Returns:
- A pandas dataframe or an iterable.
- """
- source_name = self.sources_config.data_backend_type
- df = getattr(self, f"_read_from_{source_name}")()
-
- df = self._apply_schema(df)
- if self.apply_schema_validations:
- self.validate_from_schema(df)
- if self.log_schema_metrics:
- self.log_metrics_from_schema(df)
-
- return df
-
- async def async_write(self, df: pd.DataFrame):
- """Allows the use of asyncio to concurrently write files out.
-
- Args:
- df: The data to be written
- """
- loop = asyncio.get_running_loop()
- return await loop.run_in_executor(pool, self.write, df)
-
- def write(self, df: pd.DataFrame):
- """Sink data to a given source based on the sources_config.
-
- Args:
- df: The data to be written
- """
- source_name = self.sources_config.data_backend_type
- if set(df.columns) != self.schema.column_names: # pylint: disable=E1101
- columns = [column for column in df.columns.to_list() if column in self.schema.column_names]
- df = df[columns]
-
- if self.apply_schema_validations:
- self.validate_from_schema(df)
- if self.log_schema_metrics:
- self.log_metrics_from_schema(df)
-
- getattr(self, f"_write_to_{source_name}")(self._apply_schema(df))
-
- def validate_from_schema(self, df: pd.DataFrame) -> "DynamicDataIO":
- """Validates a dataframe based on the validations present in its schema definition.
-
- All validations are checked and if any of them fails, a `SchemaValidationError` is raised.
-
- Args:
- df:
-
- Returns:
- self (to allow for method chaining).
-
- Raises:
- SchemaValidationError: if any of the validations failed. The `message` attribute of
- the exception object is a `List[str]`, where each element is the name of a
- validation that failed.
- """
-
- failed_validations = {}
- for column in self.schema_validations.keys():
- col_validations = self.schema_validations[column]
- for validation in col_validations:
- if validation.apply:
- validator = validations.ALL_VALIDATORS[validation.name]
- validation_result = validator(self.name, df, column, **validation.options)
- if not validation_result.valid:
- failed_validations[validation.name] = validation_result.message
-
- if len(failed_validations) > 0:
- raise SchemaValidationError(failed_validations)
-
- return self
-
- def log_metrics_from_schema(self, df: pd.DataFrame) -> "DynamicDataIO":
- """Calculates and logs metrics based on the metrics present in its schema definition.
-
- Args:
- df: A dataframe for which metrics are generated and logged
-
- Returns:
- self (to allow for method chaining).
- """
-
- for column in self.schema_metrics.keys():
- for metric in self.schema_metrics[column]:
- get_metric(metric)(self.name, df, column)() # type: ignore
-
- return self
-
- def _apply_schema(self, df: pd.DataFrame) -> pd.DataFrame:
- """Called by the `self.read()` and the `self._write_to_local()` methods.
-
- Contrasts a dataframe's read from a given source against the class's schema dictionary,
- checking that columns are the same (by means of _has_columns and _has_valid_dtypes). Then,
- check if the columns are fine, it further validates if the types of columns conform to the
- expected schema. Finally, if schema types are different, then it attempts to apply schema;
- if possible then the schema validation is successful.
-
- Args:
- df: A pandas dataframe.
-
- Returns:
- A schema validated dataframe.
- """
- if not self._has_valid_dtypes(df):
- raise ColumnsDataTypeError()
- return df
-
- @staticmethod
- def _transform_class_name_to_dataset_name(string_to_transform: str) -> str:
- """Called by the init function to fetch dataset names from class name.
-
- Used to create dataset name from class name, turns camel case into upper snake case.
- For example: 'ThisNameABC' -> 'THIS_NAME_ABC'.
- """
- words = re.findall(r"\d[A-Z]+|[A-Z]?[a-z\d]+|[A-Z]{2,}(?=[A-Z][a-z]|\d|\W|$)|\d+|[A-Z]{2,}|[A-Z]", string_to_transform)
- return "_".join(map(str.lower, words)).upper()
-
- def _has_valid_dtypes(self, df: pd.DataFrame) -> bool:
- """Checks if `df` has the expected dtypes defined in `schema`.
-
- Schema is a dictionary object where keys are column names and values are dtypes in string format as returned by e.g.
- `df[column].dtype.name`.
-
- This function issues `error` level logs describing the first column that caused the check to fail.
-
- It is assumed that `df` only has the columns defined in `schema`.
-
- Args:
- df:
-
- Returns:
- bool - `True` if `df` has the given dtypes, `False` otherwise
- """
- dtypes = df.dtypes
-
- for col_info in self.schema.columns.values():
- column_name = col_info.name
- expected_dtype = col_info.data_type
- found_dtype = dtypes[column_name].name
- if found_dtype != expected_dtype:
- if self.show_casting_warnings:
- logger.info(f"Expected: '{expected_dtype}' dtype for {self.name}['{column_name}]', found '{found_dtype}'")
- try:
- if len(set(type(v) for v in df[column_name].values)) > 1: # pylint: disable=consider-using-set-comprehension
- logger.warning(CASTING_WARNING_MSG.format(column_name, expected_dtype, found_dtype)) # pylint: disable=logging-format-interpolation
- logger.info(NOTICE_MSG.format(column_name)) # pylint: disable=logging-format-interpolation
- df[column_name] = df[column_name].astype(self.schema.columns[column_name].data_type)
- except (ValueError, TypeError):
- logger.exception(f"ValueError: Tried casting column {self.name}['{column_name}'] to '{expected_dtype}' from '{found_dtype}', but failed")
- return False
- return True
-
- @staticmethod
- def _get_options(options_from_code: MutableMapping[str, Any], options_from_resource_definition: Optional[Mapping[str, Any]]) -> MutableMapping[str, Any]:
- """Retrieves options either from code or from a resource-definition.
-
- Options are merged if they are provided by both sources, while in the case of conflicts, the options from the code
- take precedence.
-
- Args:
- options_from_code (Optional[Mapping])
- options_from_resource_definition (Optional[Mapping])
-
- Returns:
- [Optional[Mapping]]: options that are going to be used
- """
- if options_from_resource_definition:
- return {**options_from_resource_definition, **options_from_code}
- return options_from_code
diff --git a/dynamicio/errors.py b/dynamicio/errors.py
deleted file mode 100644
index 7c41fe8..0000000
--- a/dynamicio/errors.py
+++ /dev/null
@@ -1,96 +0,0 @@
-"""Hosts exception implementations for different errors."""
-# pylint: disable=missing-module-docstring, missing-class-docstring, missing-function-docstring, super-init-not-called
-__all__ = [
- "DynamicIOError",
- "DataSourceError",
- "ColumnsDataTypeError",
- "NonUniqueIdColumnError",
- "NullValueInColumnError",
- "NotExpectedCategoricalValue",
- "MissingSchemaDefinition",
- "SchemaNotFoundError",
- "SchemaValidationError",
- "InvalidDatasetTypeError",
- "CASTING_WARNING_MSG",
- "NOTICE_MSG",
-]
-
-from typing import Any, Optional
-
-
-class DynamicIOError(Exception):
- """Base class for DynamicIO errors."""
-
- ERROR_STR: str = ""
- ERROR_STR_DETAILED: str = "{0}"
-
- @property
- def message(self) -> Optional[Any]:
- """Easy access for optional message argument.
-
- Returns:
- Message or `None` if not set
- """
- try:
- return self.args[0]
- except IndexError:
- return None
-
- def __str__(self):
- """Enrich and return error message."""
- message = self.message
-
- if message is None:
- return self.ERROR_STR
-
- return self.ERROR_STR_DETAILED.format(message)
-
-
-class SchemaNotFoundError(DynamicIOError):
- """Error raised when schema is not specified in the provided source."""
-
- ERROR_STR = "Schema not specified in the provided source"
- ERROR_STR_DETAILED = "Schema not specified in the provided source: {0} "
-
-
-class SchemaValidationError(DynamicIOError):
- """Error raised when schema validation fails."""
-
-
-class MissingSchemaDefinition(DynamicIOError):
- """Error raised when schema is not specified in the provided source."""
-
- ERROR_STR = "The resource definition for this class is missing a schema definition"
- ERROR_STR_DETAILED = "The resource definition for this class is missing a schema definition: {0}"
-
-
-class DataSourceError(DynamicIOError):
- """Error raised when the data source fails to load."""
-
-
-class ColumnsDataTypeError(DynamicIOError):
- """Error raised when the validated data does not have the expected data types."""
-
-
-class NonUniqueIdColumnError(DynamicIOError):
- """Error raised when the data source fails to load."""
-
-
-class NullValueInColumnError(DynamicIOError):
- """Error raised when the data source fails to load."""
-
-
-class NotExpectedCategoricalValue(DynamicIOError):
- """Error raised when the data source fails to load."""
-
-
-class InvalidDatasetTypeError(DynamicIOError):
- """Error raised when dataset type is not one of [parquet, json, csv, h5]."""
-
- ERROR_STR = "The dataset provided is not amongst the supported types (parquet, json, csv, h5) handled by dynamicio."
- ERROR_STR_DETAILED = "Dataset: {0} provided is not amongst the supported types (parquet, json, csv, h5) handled by dynamicio."
-
-
-# Warning messages
-CASTING_WARNING_MSG = "Applying casting column: '{0}' to: 'type:{1}' from 'type:{2}' though not advised, as `dtypes`>1 for {0}, which may lead to data corruption!"
-NOTICE_MSG = "Keeping the {0} as is, may anyway cause I/O errors or data corruption issues especially when using `pandas.DataFrame.to_parquet` or `pandas.DataFrame.to_json`."
diff --git a/dynamicio/inject.py b/dynamicio/inject.py
new file mode 100644
index 0000000..8f4baa3
--- /dev/null
+++ b/dynamicio/inject.py
@@ -0,0 +1,90 @@
+"""Injects dynamic values into a string."""
+from __future__ import annotations
+
+import re
+from pathlib import Path
+from typing import Any, Dict, overload
+
+curly_braces_matcher = re.compile(r"(.*)(\{\s*(\S+)\s*\})(.*)")
+
+
+class InjectionError(ValueError):
+ """Raised when a string has any dynamic values in the form of "{DYNAMIC_VAR}" or "[[ DYNAMIC_VAR ]]"."""
+
+
+@overload
+def inject(value: None, **kwargs: dict[str, Any]) -> None:
+ ...
+
+
+@overload
+def inject(value: Path, **kwargs: dict[str, Any]) -> Path:
+ ...
+
+
+@overload
+def inject(value: str, **kwargs: dict[str, Any]) -> str:
+ ...
+
+
+def inject(value: str | Path | None, **kwargs: dict[str, Any]) -> str | Path | None:
+ """Parse a string and replace any "{DYNAMIC_VAR}" and "[[ DYNAMIC_VAR ]]" with the respective values in the kwargs.
+
+ Matching of variable names against the kwargs is case-insensitive.
+ Args:
+ value: An injectable value (str | Path | None) with dynamic values in the form of "{DYNAMIC_VAR}" or "[[ DYNAMIC_VAR ]]".
+ kwargs: A mapping of values to replace in the path.
+
+ Returns:
+ str | Path | None: Injectable with all dynamic values replaced.
+ """
+ if value is None:
+ return value
+ to_inject = str(value)
+ injected = _inject_with_matcher(to_inject, curly_braces_matcher, **kwargs)
+ return type(value)(injected)
+
+
+def check_injections(value: str | Path | None) -> None:
+ """Raise if a string has any dynamic values in the form of "{DYNAMIC_VAR}" or "[[ DYNAMIC_VAR ]]"."""
+ if value is None:
+ return value
+ to_check: str = str(value)
+ while _ := curly_braces_matcher.search(to_check):
+ raise InjectionError(f'Path is not fully injected: "{to_check!r}"')
+
+
+def _inject_with_matcher(value: str, matcher, **kwargs) -> str:
+ """Replaces any matching dynamic values.
+
+ Args:
+ value: A string with dynamic values.
+ matcher: A regex matcher to find the dynamic values.
+ kwargs: A mapping of values to replace in the path.
+
+ Returns:
+ str: The path with the dynamic values replaced with the respective values in the kwargs.
+ """
+ kwargs_lower = {k.lower(): v for k, v in kwargs.items()} # case-insensitive
+
+ replacements: Dict[str, Any] = {}
+
+ temp_suffix_value = ""
+
+ while result := matcher.search(value):
+ str_to_replace = result.group(3).lower() # we want to be case-insensitive
+ replacement = kwargs_lower.get(str_to_replace, None)
+
+ if replacement is None:
+ suffix = matcher.sub("\\g<2>\\g<4>", value)
+ temp_suffix_value = f"{suffix}{temp_suffix_value}"
+ value = matcher.sub("\\g<1>", value)
+ else:
+ replacements[str_to_replace] = replacement
+
+ # the greedy leading group makes this the right-most placeholder; replace it
+ value = matcher.sub(f"\\g<1>{replacement}\\g<4>", value)
+
+ value = f"{value}{temp_suffix_value}"
+
+ return value
diff --git a/dynamicio/io/__init__.py b/dynamicio/io/__init__.py
new file mode 100644
index 0000000..06951c0
--- /dev/null
+++ b/dynamicio/io/__init__.py
@@ -0,0 +1,4 @@
+from .file import LocalFileResource
+from .s3 import S3Resource
+from .postgres import PostgresResource
+from .kafka import KafkaResource
diff --git a/dynamicio/io/file.py b/dynamicio/io/file.py
new file mode 100644
index 0000000..3eda2bc
--- /dev/null
+++ b/dynamicio/io/file.py
@@ -0,0 +1,60 @@
+from functools import partial
+from pathlib import Path
+from typing import Any, Dict, List, Literal, Optional
+
+import pandas as pd
+
+from dynamicio.io.resource import BaseResource
+from dynamicio.io.serde import CsvSerde, HdfSerde, JsonSerde, ParquetSerde, PickleSerde
+
+
+class LocalFileResource(BaseResource):
+ path: Path
+ read_kwargs: Dict[str, Any] = {}
+ write_kwargs: Dict[str, Any] = {}
+ injectables: List[str] = ["path"]
+ file_type: Optional[Literal["parquet", "hdf", "csv", "json", "pickle", "h5"]] = None
+
+ def _read(self) -> pd.DataFrame:
+ return self.get_serde()._read(self.path)
+
+ def _write(self, df: pd.DataFrame) -> None:
+ self.path.parent.mkdir(parents=True, exist_ok=True)
+ return self.get_serde()._write(self.path, df)
+
+ def cache_key(self) -> Path:
+ if self.test_path is not None:
+ return self.test_path
+ else:
+ return self.path
+
+ @property
+ def serde_class(self):
+ file_type = self.file_type or (self.path.suffix.replace(".", "") if self.path.suffix else None)
+
+ if file_type == "parquet":
+ serde_class = ParquetSerde
+ elif file_type == "hdf" or file_type == "h5":
+ serde_class = HdfSerde
+ elif file_type == "csv":
+ serde_class = CsvSerde
+ elif file_type == "json":
+ serde_class = JsonSerde
+ elif file_type == "pickle":
+ serde_class = PickleSerde
+ elif file_type is None:
+ raise ValueError(f"File type not specified for {self.path}")
+ else:
+ raise ValueError(f"Unknown file type {file_type}")
+
+ serde_class_with_kwargs = partial(serde_class, read_kwargs=self.read_kwargs, write_kwargs=self.write_kwargs)
+
+ return serde_class_with_kwargs
+
+ def get_serde(self):
+ """Return the serde instance, with baked-in validation."""
+ validations = []
+ if self.pa_schema is not None:
+ validations.append(self.pa_schema.validate)
+
+ return self.serde_class(validations=validations)
diff --git a/dynamicio/io/hdf.py b/dynamicio/io/hdf.py
new file mode 100644
index 0000000..d7082c4
--- /dev/null
+++ b/dynamicio/io/hdf.py
@@ -0,0 +1,78 @@
+"""Hdf ReaderWriter."""
+from __future__ import annotations
+
+import uuid
+from contextlib import contextmanager
+from typing import Any, Dict, Generator, IO, Optional
+
+import boto3 # type: ignore
+import pandas as pd # type: ignore
+import tables # type: ignore
+from pydantic import BaseModel # type: ignore
+
+
+class InMemStore(pd.io.pytables.HDFStore):
+ """A subclass of pandas HDFStore that does not manage the pytables File object."""
+
+ _in_mem_table = None
+
+ def __init__(self, path: str, table: tables.File, mode: str = "r"):
+ """Initialize the store."""
+ self._in_mem_table = table
+ super().__init__(path=path, mode=mode) # type: ignore
+
+ def open(self, *_args, **_kwargs): # noqa: D102
+ pd.io.pytables._tables()
+ self._handle = self._in_mem_table
+
+ def close(self, *_args, **_kwargs): # noqa: D102
+ pass
+
+ @property
+ def is_open(self): # noqa: D102
+ return self._handle is not None
+
+
+class HdfIO: # noqa: D102
+ """Class providing stream support for HDF tables."""
+
+ @contextmanager
+ def create_file(self, label: str, mode: str, data: Optional[bytes] = None) -> Generator[tables.File, None, None]:
+ """Create an in-memory pytables file and close it when the context exits."""
+ extra_kw = {}
+ if data:
+ extra_kw["driver_core_image"] = data
+ file_handle = tables.File(
+ f"{label}_{uuid.uuid4()}.h5",
+ mode,
+ title=label,
+ root_uep="/",
+ filters=None,
+ driver="H5FD_CORE",
+ driver_core_backing_store=0,
+ **extra_kw,
+ )
+ try:
+ yield file_handle
+ finally:
+ file_handle.close()
+
+ def load(self, fobj: IO[bytes], label: str = "unknown_file.h5") -> pd.DataFrame:
+ """Load the dataframe from a file-like object."""
+ with self.create_file(label, mode="r", data=fobj.read()) as file_handle:
+ return pd.read_hdf(InMemStore(label, file_handle)) # type: ignore
+
+ def save(
+ self,
+ df: pd.DataFrame,
+ fobj: IO[bytes],
+ label: str = "unknown_file.h5",
+ **kwargs,
+ ):
+ """Save the dataframe to a file-like object."""
+ if not kwargs:
+ kwargs = {}
+ with self.create_file(label, mode="w", data=fobj.read()) as file_handle:
+ store = InMemStore(path=label, table=file_handle, mode="w")
+ store.put(key="df", value=df, **kwargs)
+ fobj.write(file_handle.get_file_image())
diff --git a/dynamicio/io/kafka.py b/dynamicio/io/kafka.py
new file mode 100644
index 0000000..0489423
--- /dev/null
+++ b/dynamicio/io/kafka.py
@@ -0,0 +1,87 @@
+"""I/O functions and Resource class for kafka targeted operations."""
+import logging
+from pathlib import Path
+from typing import Any, Callable, Dict, List, Mapping, Optional, Type, Literal
+
+import pandas as pd # type: ignore
+import simplejson
+from kafka import KafkaProducer # type: ignore
+from pandera import SchemaModel
+from pydantic import Field
+
+from dynamicio.io.resource import BaseResource
+from dynamicio.io.serde import BaseSerde, JsonSerde
+
+
+class KafkaResource(BaseResource):
+ # Required
+ topic: str
+ server: str
+
+ # Defaults
+ key_generator: Callable[[Any, Mapping[Any, Any]], Optional[str]] = Field(
+ lambda idx, _: idx,
+ description="""Gets called with dataframe's (idx, row). Defaults to `idx`.""",
+ )
+ key_serializer: Callable[[Any], bytes] = lambda key: key.encode("utf-8") if key else None
+ value_serializer: Callable[[Mapping], bytes] = lambda val: simplejson.dumps(val, ignore_nan=True).encode("utf-8")
+ document_transformer: Callable[[Mapping[Any, Any]], Mapping[Any, Any]] = lambda value: value
+ # TODO: Give descriptions to all these callables that describe what they're being called with
+
+ # Options
+ kafka_producer: Optional[KafkaProducer] = None # gets instantiated in get_kafka_producer
+ compression_type: Literal["gzip", "snappy", "lz4", "zstd"] = "snappy" # type: ignore
+ producer_kwargs: Dict[str, Any] = {}
+
+ # Resource
+ injectables: List[str] = ["topic", "server"]
+ pa_schema: Optional[Type[SchemaModel]] = None
+ test_path: Optional[str] = None
+
+ def get_kafka_producer(self) -> KafkaProducer:
+ """Get a KafkaProducer instance."""
+ if self.kafka_producer is None:
+ return KafkaProducer(
+ bootstrap_servers=self.server,
+ compression_type=self.compression_type,
+ key_serializer=self.key_serializer,
+ value_serializer=self.value_serializer,
+ **self.producer_kwargs,
+ )
+ return self.kafka_producer
+
+ def _write(self, df: pd.DataFrame) -> None:
+ """Handles Write operations for Kafka."""
+ kafka_producer = self.get_kafka_producer()
+
+ logging.info(f"Sending {len(df)} messages to Kafka topic:{self.topic}")
+
+ messages = df.reset_index(drop=True).to_dict("records")
+
+ for idx, message in zip(df.index.values, messages):
+ kafka_producer.send(
+ self.topic,
+ key=self.key_generator(idx, message),
+ value=self.document_transformer(message),
+ ) # type: ignore
+
+ kafka_producer.flush() # type: ignore
+
+ def _read(self) -> pd.DataFrame:
+ raise NotImplementedError
+
+ def cache_key(self):
+ """Return the path to the fixture file."""
+ if self.test_path is not None:
+ return Path(self.test_path)
+ return Path(f"kafka/{self.topic}.json") # Should server be added here?
+
+ @property
+ def serde_class(self) -> Type[BaseSerde]:
+ return JsonSerde
+
+ class Config:
+ """Pydantic Config class."""
+
+ arbitrary_types_allowed = True
+ validate_assignment = True
diff --git a/dynamicio/io/postgres.py b/dynamicio/io/postgres.py
new file mode 100644
index 0000000..19c0de6
--- /dev/null
+++ b/dynamicio/io/postgres.py
@@ -0,0 +1,155 @@
+import csv
+import logging
+import tempfile
+from contextlib import contextmanager
+from pathlib import Path
+from typing import Any, Dict, Generator, List, Optional, Type
+
+import pandas as pd # type: ignore
+from pandera import SchemaModel
+from pydantic import Field # pylint: disable=no-name-in-module
+from sqlalchemy import create_engine # type: ignore
+from sqlalchemy.orm import Session as SqlAlchemySession # type: ignore
+from sqlalchemy.orm import sessionmaker # type: ignore
+
+from dynamicio.io.resource import BaseResource
+from dynamicio.io.serde import ParquetSerde
+
+Session = sessionmaker()
+
+
+@contextmanager
+def session_scope(connection_string: str, application_name: Optional[str]) -> Generator[SqlAlchemySession, None, None]:
+ """Connect to a database using `connection_string` and yield an active session to that connection.
+
+ Args:
+ connection_string: Database connection URL that is passed to SQLAlchemy's `create_engine`.
+ application_name [optional]: Name of the application that is connecting to the database (repo name).
+
+
+ Yields:
+ Active session
+ """
+ application_name = application_name or "unknown-dynamicio-app"
+ engine = create_engine(connection_string, connect_args={"application_name": application_name})
+ session = Session(bind=engine)
+
+ try:
+ yield session
+ session.commit()
+ except Exception as exc:
+ session.rollback()
+ raise exc
+ finally:
+ session.close() # pylint: disable=no-member
+
+
+class PostgresResource(BaseResource):
+ # Postgres Connection
+ db_user: str
+ db_password: Optional[str]
+ db_host: str
+ db_port: int = 5432
+ db_name: str
+ db_schema: str = "public"
+ application_name: Optional[str] = Field(None, description="Application name to use for postgres connection.")
+
+ # Postgres IO
+ truncate_and_append: bool = False
+ table_name: Optional[str] = Field(None, description="SQL table name. Needs to be given if no sql_query is given")
+ sql_query: Optional[str] = Field(
+ None, description="SQL query. Will fetch schema defined columns if this is not given."
+ )
+ read_kwargs: Dict[str, Any] = {}
+ write_kwargs: Dict[str, Any] = {}
+
+ # Resource
+ injectables: List[str] = ["table_name", "sql_query", "db_user", "db_password", "db_host", "db_name", "db_schema"]
+ pa_schema: Optional[Type[SchemaModel]] = None
+ test_path: Optional[str] = None
+
+ @property
+ def connection_string(self) -> str:
+ """Build connection string out of components."""
+ password = f":{self.db_password}" if self.db_password else ""
+ return f"postgresql://{self.db_user}{password}@{self.db_host}:{self.db_port}/{self.db_name}"
+
+ @property
+ def final_table_name(self) -> str:
+ """Return schema and table name in a format of schema.table_name."""
+ return f"{self.db_schema}.{self.table_name}"
+
+ def _read(self) -> pd.DataFrame:
+ """Handles Read operations for Postgres."""
+ if not (bool(self.sql_query) ^ bool(self.table_name)): # Xor
+ raise ValueError("PostgresResource must define EITHER sql_query OR table_name.")
+
+ if self.pa_schema is not None and (not self.sql_query and self.pa_schema.Config.strict):
+ # filtering can now be done at sql level
+ columns: List[str] = list(self.pa_schema.to_schema().columns.keys()) # type: ignore
+ columns_str = ", ".join(columns)
+ sql_query = f"SELECT {columns_str} FROM {self.final_table_name}"
+ elif self.sql_query is None:
+ sql_query = f"SELECT * FROM {self.final_table_name}"
+ else:
+ sql_query = self.sql_query
+
+ logging.info(f"Downloading table: {self.final_table_name} from: {self.db_host}:{self.db_name}")
+ with session_scope(self.connection_string, self.application_name) as session:
+ df = pd.read_sql(sql=sql_query, con=session.get_bind(), **self.read_kwargs)
+
+ return df
+
+ def _write(self, df: pd.DataFrame) -> None:
+ """Handles Write operations for Postgres."""
+ if not self.table_name:
+ raise ValueError("PostgresResource must specify table_name for writing.")
+
+ with session_scope(self.connection_string, self.application_name) as session:
+ session: SqlAlchemySession # type: ignore # this is done for IDE purposes
+ if self.truncate_and_append:
+ logging.info(
+ f"Writing to table (csv-hack): {self.final_table_name} from: {self.db_host}:{self.db_name}"
+ )
+ session.execute(f"TRUNCATE TABLE {self.final_table_name};")
+
+ # Speed hack: dump file as csv, use Postgres' CSV import function.
+ # https://stackoverflow.com/questions/2987433/how-to-import-csv-file-data-into-a-postgresql-table
+ with tempfile.NamedTemporaryFile(mode="r+") as temp_file:
+ df.to_csv(
+ temp_file,
+ index=False,
+ header=False,
+ sep="\t",
+ doublequote=False,
+ escapechar="\\",
+ quoting=csv.QUOTE_NONE,
+ )
+ temp_file.flush()
+ temp_file.seek(0)
+
+ cur = session.connection().connection.cursor()
+ cur.execute(f"SET search_path TO {self.db_schema};")
+ cur.copy_from(temp_file, self.table_name, columns=df.columns, null="")
+ else:
+ logging.info(f"Writing to table: {self.final_table_name} from: {self.db_host}:{self.db_name}")
+ df.to_sql(
+ name=self.table_name,
+ con=session.get_bind(),
+ if_exists="replace",
+ index=False,
+ schema=self.db_schema,
+ )
+
+ def cache_key(self) -> Path:
+ if self.test_path is not None:
+ return self.test_path
+ elif self.table_name:
+ return Path("postgres") / (self.final_table_name + ".parquet")
+ elif self.sql_query:
+ raise ValueError("test_path must be set if using custom sql query.")
+
+ @property
+ def serde_class(self):
+ """Postgres uses a plain ParquetSerde for testing."""
+ return ParquetSerde
diff --git a/dynamicio/io/resource.py b/dynamicio/io/resource.py
new file mode 100644
index 0000000..1257b73
--- /dev/null
+++ b/dynamicio/io/resource.py
@@ -0,0 +1,99 @@
+from abc import ABC, abstractmethod
+from copy import deepcopy
+from pathlib import Path
+from typing import Callable, List, Optional, Type
+
+import pandas as pd
+from pandera import SchemaModel
+from pydantic import BaseModel
+from uhura import Readable, Writable
+
+from dynamicio.inject import InjectionError, inject
+from dynamicio.io.serde import BaseSerde, PickleSerde
+
+
+def create_schema_validator(schema) -> Callable[[pd.DataFrame], pd.DataFrame]:
+    """Build a function that validates a dataframe against ``schema``.
+
+    Args:
+        schema: Any object exposing ``validate(df)``.
+
+    Returns:
+        A one-argument callable that forwards to ``schema.validate`` and
+        returns whatever it returns.
+    """
+
+    def validate_schema(df: pd.DataFrame):
+        validated = schema.validate(df)
+        return validated
+
+    return validate_schema
+
+
+class BaseResource(BaseModel, Readable[pd.DataFrame], Writable[pd.DataFrame], ABC):
+    """Common base for every resource.
+
+    :injectables: Names of the attributes whose values may contain format-string placeholders.
+    :test_path (optional): Location of the test data; also subject to injection.
+    :pa_schema (optional): Pandera schema applied after reading and before writing.
+    """
+
+    injectables: List[str]
+    test_path: Optional[Path] = None
+    pa_schema: Optional[Type[SchemaModel]] = None
+
+    def inject(self, **kwargs) -> "BaseResource":
+        """Return a deep copy with all injectable attributes formatted using ``kwargs``.
+
+        Each attribute named in ``injectables`` must hold a ``str``, a ``Path`` or
+        ``None``; anything else raises ``InjectionError``. ``test_path`` is injected
+        as well whenever it is set. The original instance is left untouched.
+        """
+        injected = deepcopy(self)
+
+        for attr_name in self.injectables:
+            current = getattr(injected, attr_name)
+            if not (current is None or isinstance(current, (str, Path))):
+                raise InjectionError(
+                    f"Cannot inject {attr_name} of type {type(current)} in {self.__class__.__name__}"
+                )
+            setattr(injected, attr_name, inject(current, **kwargs))
+
+        # The test path is always handled, independently of ``injectables``.
+        if self.test_path is not None:
+            injected.test_path = inject(self.test_path, **kwargs)
+        return injected
+
+    @abstractmethod
+    def _read(self) -> pd.DataFrame:
+        """Backend-specific read; subclasses override this, callers use read().
+
+        The public read() method is replaced when in uhura testing mode."""
+        raise NotImplementedError()
+
+    @abstractmethod
+    def _write(self, df: pd.DataFrame) -> None:
+        """Backend-specific write; subclasses override this, callers use write().
+
+        The public write() method is replaced when in uhura testing mode."""
+        raise NotImplementedError()
+
+    def read(self) -> pd.DataFrame:
+        """Read the resource, then run the serde's validations on the result."""
+        loaded = self._read()
+        return self.get_serde().validate(loaded)
+
+    def write(self, df: pd.DataFrame) -> None:
+        """Run the serde's validations on ``df``, then write it."""
+        checked = self.get_serde().validate(df)
+        self._write(checked)
+
+    def cache_key(self):
+        """Return ``test_path`` as a string; raise ``ValueError`` when it is unset."""
+        if self.test_path is not None:
+            return str(self.test_path)
+        raise ValueError("No test path set.")
+
+    @property
+    def serde_class(self) -> Type[BaseSerde]:
+        """Serde class for this resource; ``PickleSerde`` unless overridden."""
+        return PickleSerde
+
+    def get_serde(self) -> BaseSerde:
+        """Instantiate the serde, wiring in the Pandera validation when a schema is set."""
+        validations = [self.pa_schema.validate] if self.pa_schema is not None else []
+        return self.serde_class(validations=validations)
diff --git a/dynamicio/io/s3.py b/dynamicio/io/s3.py
new file mode 100644
index 0000000..dcb48d6
--- /dev/null
+++ b/dynamicio/io/s3.py
@@ -0,0 +1,70 @@
+from functools import partial
+from pathlib import Path
+from typing import Any, Dict, List, Literal, Optional, Type
+
+import boto3
+import pandas as pd
+
+from dynamicio.io.s3_contexts import s3_named_file_reader, s3_writer, s3_reader
+from dynamicio.io.resource import BaseResource
+from dynamicio.io.serde import BaseSerde, CsvSerde, HdfSerde, JsonSerde, ParquetSerde, PickleSerde
+
+
+class S3Resource(BaseResource):
+    """Resource stored as a single object in an S3 bucket.
+
+    :bucket: Name of the S3 bucket.
+    :path: Object key inside the bucket; its suffix selects the serde when ``file_type`` is unset.
+    :read_kwargs: Extra keyword arguments bound into the serde for reading.
+    :write_kwargs: Extra keyword arguments bound into the serde for writing.
+    :injectables: Attributes that accept format-string injection (``path``).
+    :file_type (optional): Explicit file type, taking precedence over the path suffix.
+    :force_read_to_memory: Download into an in-memory buffer instead of a temporary file.
+    """
+
+    bucket: str
+    path: Path
+    read_kwargs: Dict[str, Any] = {}
+    write_kwargs: Dict[str, Any] = {}
+    injectables: List[str] = ["path"]
+    file_type: Optional[Literal["parquet", "hdf", "csv", "json", "pickle"]] = None
+    force_read_to_memory: bool = False
+
+    @property
+    def _s3_path(self) -> str:
+        """For logging purposes only."""
+        return f"s3://{self.bucket}/{self.path}"
+
+    def _read(self) -> pd.DataFrame:
+        """Download the object and deserialise it with the selected serde.
+
+        Raises:
+            ValueError: If the in-memory read yields ``None``, or if the file type
+                cannot be determined (raised by ``serde_class``).
+        """
+        # ``read_kwargs`` are already bound into the serde by ``serde_class``. The
+        # serde ``_read`` methods accept only the file argument, so forwarding the
+        # kwargs again raised TypeError whenever ``read_kwargs`` was non-empty.
+        if self.force_read_to_memory:
+            with s3_reader(boto3.client("s3"), s3_bucket=self.bucket, s3_key=str(self.path)) as fobj:  # type: ignore
+                df = self.get_serde()._read(fobj)
+                if df is None:
+                    raise ValueError(f"Could not read {self._s3_path}")
+                return df
+
+        with s3_named_file_reader(boto3.client("s3"), s3_bucket=self.bucket, s3_key=str(self.path)) as target_file:
+            return self.get_serde()._read(target_file.name)
+
+    def _write(self, df: pd.DataFrame) -> None:
+        """Serialise ``df`` into the upload buffer; ``s3_writer`` uploads it on exit."""
+        with s3_writer(boto3.client("s3"), s3_bucket=self.bucket, s3_key=str(self.path)) as fobj:
+            return self.get_serde()._write(fobj, df)
+
+    @property
+    def serde_class(self):
+        """Return the serde factory for this file, with read/write kwargs pre-bound.
+
+        The file type is ``file_type`` when set, otherwise the path suffix with its
+        dots removed.
+
+        Raises:
+            ValueError: If no file type can be determined, or it is not supported.
+        """
+        file_type = self.file_type or (self.path.suffix.replace(".", "") if self.path.suffix else None)
+        if file_type is None:
+            raise ValueError(f"File type not specified for {self.path}")
+
+        serde_by_type = {
+            "parquet": ParquetSerde,
+            "hdf": HdfSerde,
+            "h5": HdfSerde,
+            "csv": CsvSerde,
+            "json": JsonSerde,
+            "pickle": PickleSerde,
+        }
+        serde_class = serde_by_type.get(file_type)
+        if serde_class is None:
+            raise ValueError(f"Unknown file type {file_type}")
+
+        return partial(serde_class, read_kwargs=self.read_kwargs, write_kwargs=self.write_kwargs)
+
+    def cache_key(self) -> Path:
+        """Return ``test_path`` when set, otherwise ``s3/<bucket>/<path>``."""
+        if self.test_path is not None:
+            return self.test_path
+        return Path("s3") / self.bucket / self.path
diff --git a/dynamicio/io/s3_contexts.py b/dynamicio/io/s3_contexts.py
new file mode 100644
index 0000000..a8489ef
--- /dev/null
+++ b/dynamicio/io/s3_contexts.py
@@ -0,0 +1,68 @@
+# flake8: noqa: I101
+"""Context managers for reading and writing to S3."""
+import io
+import tempfile
+from contextlib import contextmanager
+from pathlib import Path
+from typing import IO, Generator
+
+
+@contextmanager
+def s3_named_file_reader(boto3_client, s3_bucket: str, s3_key: str) -> Generator:
+    """Download an S3 object into a named temporary file for the ``with`` body.
+
+    Args:
+        boto3_client: Client object used for its ``download_fileobj`` method.
+        s3_bucket: The S3 bucket from where to read the file.
+        s3_key: The key of the object to be read.
+
+    Yields:
+        The open temporary file object; its ``name`` attribute is the local path
+        holding the downloaded data. The file is closed when the context exits.
+    """
+    with tempfile.NamedTemporaryFile("wb") as local_copy:
+        boto3_client.download_fileobj(s3_bucket, s3_key, local_copy)
+        # Push buffered bytes to disk so readers opening the path see all of them.
+        local_copy.flush()
+        yield local_copy
+
+
+@contextmanager
+def s3_reader(boto3_client, s3_bucket: str, s3_key: Path) -> Generator[io.BytesIO, None, None]:
+    """Download an S3 object into memory for the ``with`` body.
+
+    No temporary file is created; the data lives only in a ``BytesIO`` buffer.
+
+    Args:
+        boto3_client: Client object used for its ``download_fileobj`` method.
+        s3_bucket: The S3 bucket from where to read the file.
+        s3_key: The key of the object to be read (converted with ``str``).
+
+    Yields:
+        The buffer holding the downloaded bytes, positioned at offset 0.
+    """
+    buffer = io.BytesIO()
+    boto3_client.download_fileobj(s3_bucket, str(s3_key), buffer)
+    # Rewind so the consumer reads from the first byte.
+    buffer.seek(0)
+    yield buffer
+
+
+@contextmanager
+def s3_writer(boto3_client, s3_bucket: str, s3_key: str) -> Generator[IO[bytes], None, None]:
+    """Collect bytes in memory and upload them to S3 once the ``with`` body ends.
+
+    Args:
+        boto3_client: Client object used for its ``upload_fileobj`` method.
+        s3_bucket: The S3 bucket to upload the file to.
+        s3_key: The key under which the object is uploaded.
+
+    Yields:
+        An in-memory buffer to write the payload into. After the body completes
+        the buffer is rewound and uploaded with the
+        ``bucket-owner-full-control`` ACL.
+    """
+    payload = io.BytesIO()
+    yield payload
+    # Only reached when the body finished without raising.
+    payload.seek(0)
+    boto3_client.upload_fileobj(payload, s3_bucket, s3_key, ExtraArgs={"ACL": "bucket-owner-full-control"})
diff --git a/dynamicio/io/serde.py b/dynamicio/io/serde.py
new file mode 100644
index 0000000..ab58caa
--- /dev/null
+++ b/dynamicio/io/serde.py
@@ -0,0 +1,115 @@
+"""These are the base serde classes, used for testing & when appropriate for actual IO."""
+import pickle
+from abc import ABC, abstractmethod
+from io import BytesIO
+from threading import Lock
+from typing import Callable, Optional, TypeVar, Union
+
+import pandas as pd
+from uhura.serde import Serde
+
+from dynamicio import utils
+from dynamicio.io.hdf import HdfIO
+
+SerdeType = TypeVar("SerdeType")
+
+
+class BaseSerde(ABC, Serde[pd.DataFrame]):
+    """Abstract dataframe serde that carries a list of validation callables."""
+
+    file_extension = "_"
+
+    def __init__(self, validations: Optional[Callable] = None, **kwargs):
+        # Extra keyword arguments are accepted and ignored here.
+        self.validations = validations if validations else []
+
+    def read_from_file(self, file) -> pd.DataFrame:
+        """Read ``file`` through ``_read`` and validate the result."""
+        loaded = self._read(file)
+        return self.validate(loaded)
+
+    @abstractmethod
+    def _read(self, file) -> pd.DataFrame:
+        raise NotImplementedError
+
+    def write_to_file(self, path: str, obj: pd.DataFrame) -> None:
+        """Write ``obj`` to ``path`` through ``_write``; no validation is run here."""
+        return self._write(path, obj)
+
+    @abstractmethod
+    def _write(self, path: str, obj: pd.DataFrame) -> None:
+        raise NotImplementedError
+
+    def validate(self, df: pd.DataFrame):
+        """Run every validation on ``df`` and return ``df`` itself.
+
+        Validation lives on the serde to avoid double validations in the framework.
+        """
+        # NOTE(review): validator return values are discarded, so any coerced or
+        # modified frame a validator returns is not propagated - confirm intent.
+        for check in self.validations:
+            check(df)
+        return df
+
+
+class PickleSerde(BaseSerde):
+    """Serde that stores objects with ``pickle`` at a filesystem path."""
+
+    def _read(self, file) -> SerdeType:
+        # NOTE: unpickling can execute arbitrary code; only use on trusted files.
+        with open(file, "rb") as source:
+            loaded = pickle.load(source)
+        return loaded
+
+    def _write(self, file, obj: SerdeType) -> None:
+        with open(file, "wb") as sink:
+            pickle.dump(obj, sink)
+
+
+class ParquetSerde(BaseSerde):
+    """Serde for parquet files, with optional pandas read/write keyword arguments."""
+
+    def __init__(self, read_kwargs=None, write_kwargs=None, **kwargs):
+        self._read_kwargs = read_kwargs if read_kwargs else {}
+        self._write_kwargs = write_kwargs if write_kwargs else {}
+        super().__init__(**kwargs)
+
+    def _read(self, file: str) -> pd.DataFrame:
+        frame = pd.read_parquet(file, **self._read_kwargs)
+        return frame
+
+    def _write(self, file: str, obj: pd.DataFrame) -> None:
+        obj.to_parquet(file, **self._write_kwargs)
+
+
+HDF_LOCK = Lock()
+
+
+class HdfSerde(BaseSerde):
+    """Serde for HDF data, handling both filesystem paths and ``BytesIO`` buffers."""
+
+    def __init__(self, read_kwargs=None, write_kwargs=None, **kwargs):
+        self._read_kwargs = read_kwargs if read_kwargs else {}
+        self._write_kwargs = write_kwargs if write_kwargs else {}
+        super().__init__(**kwargs)
+
+    def _read(self, file: Union[str, BytesIO]) -> pd.DataFrame:
+        # Path-like input goes through pandas under the module-wide HDF lock.
+        if not isinstance(file, BytesIO):
+            with HDF_LOCK:
+                return pd.read_hdf(file, **self._read_kwargs)
+        # In-memory buffers are delegated to HdfIO; read kwargs are not applied
+        # and the lock is not taken on this path.
+        return HdfIO().load(file)
+
+    def _write(self, file: Union[str, BytesIO], obj: pd.DataFrame) -> None:
+        # Both targets are written at pickle protocol 4 while holding the HDF lock.
+        with utils.pickle_protocol(protocol=4), HDF_LOCK:
+            if isinstance(file, BytesIO):
+                HdfIO().save(obj, file, **self._write_kwargs)
+            else:
+                obj.to_hdf(file, key="df", mode="w", **self._write_kwargs)
+
+
+class CsvSerde(BaseSerde):
+    """Serde for CSV files, with optional pandas read/write keyword arguments.
+
+    Writes default to ``index=False`` so that a written file reads back without
+    an extra unnamed index column. Callers can still opt in with
+    ``write_kwargs={"index": True}``.
+    """
+
+    def __init__(self, read_kwargs=None, write_kwargs=None, **kwargs):
+        self._read_kwargs = read_kwargs or {}
+        # Merge rather than replace: previously the ``index=False`` default was
+        # dropped as soon as any write kwarg was supplied.
+        self._write_kwargs = {"index": False, **(write_kwargs or {})}
+        super().__init__(**kwargs)
+
+    def _read(self, file: str) -> pd.DataFrame:
+        return pd.read_csv(file, **self._read_kwargs)
+
+    def _write(self, file: str, obj: pd.DataFrame) -> None:
+        obj.to_csv(file, **self._write_kwargs)
+
+
+class JsonSerde(BaseSerde):
+    """Serde for JSON files, with optional pandas read/write keyword arguments."""
+
+    def __init__(self, read_kwargs=None, write_kwargs=None, **kwargs):
+        self._read_kwargs = read_kwargs if read_kwargs else {}
+        self._write_kwargs = write_kwargs if write_kwargs else {}
+        super().__init__(**kwargs)
+
+    def _read(self, file: str) -> pd.DataFrame:
+        frame = pd.read_json(file, **self._read_kwargs)
+        return frame
+
+    def _write(self, file: str, obj: pd.DataFrame) -> None:
+        obj.to_json(file, **self._write_kwargs)
diff --git a/dynamicio/metrics.py b/dynamicio/metrics.py
index 711a9e4..a193c45 100644
--- a/dynamicio/metrics.py
+++ b/dynamicio/metrics.py
@@ -1,178 +1,149 @@
-"""A module responsible for metrics generation and logging."""
-# pylint: disable=missing-function-docstring,missing-class-docstring
import json
import logging
-import sys
-from numbers import Number
-from typing import Any, Dict, Mapping, Type
+from enum import Enum
+from typing import Mapping
-import pandas as pd # type: ignore
-from magic_logger import logger
-from pythonjsonlogger import jsonlogger # type: ignore
+import pandas as pd
+from pandera import extensions
-logHandler = logging.StreamHandler(sys.stdout)
-formatter = jsonlogger.JsonFormatter()
-logHandler.setFormatter(formatter)
-logger.addHandler(logHandler)
+logger = logging.getLogger(__name__)
-__metrics__: Dict[str, Type["Metric"]] = {}
+class Metric(str, Enum):
+ """Names of the column metrics that ``log_statistics`` can compute.
+
+ Members are also ``str`` instances, so they compare equal to their string values.
+ """
+
+ MIN = "Min"
+ MAX = "Max"
+ MEAN = "Mean"
+ STD = "Std"
+ VARIANCE = "Variance"
+ COUNTS = "Counts"
+ UNIQUE_COUNTS = "UniqueCounts"
+ COUNTS_PER_LABEL = "CountsPerLabel"
-def get_metric(name: str) -> Type["Metric"]:
- return __metrics__[name]
-
-
-def log_metric(dataset: str, column: str, metric: str, value: float):
+def log_metric(column: str, metric: str, value: float):
 """Logs a metric in a structured way for a given dataset column.
 Args:
- dataset: The dataset for which the metric is logged
 column: Column for which the metric is logged
 metric: name fo the metric, e.g. "unique_vals"
 value: The metric's value, e.g. "10000"
 """
- logger.info(json.dumps({"message": "METRIC", "dataset": dataset, "column": column, "metric": metric, "value": float(value)}))
-
-
-class Metric:
- """A base class for implementing metrics classes."""
-
- def __init__(self, dataset_name: str, df: pd.DataFrame, column: str): # noqa
- self.dataset_name = dataset_name
- self.df = df
- self.column = column
-
- def __init_subclass__(cls): # noqa
- __metrics__[cls.__name__] = cls
- assert "calculate_metric" in cls.__dict__
-
- def __call__(self) -> Any: # noqa
- metric_value = self.calculate_metric()
+ # float() coerces the value to a plain Python float before it is JSON-encoded.
+ logger.info(json.dumps({"message": "METRIC", "column": column, "metric": metric, "value": float(value)}))
+
+# This function needs to be specifically in this file. Pandera needs a chance to initialise this custom validation
+# before the user can specify metrics in their Pandera classes. The moment they import any metric
+# (such as `Metric.MIN`), this whole module gets executed, and this custom validation is simultaneously executed,
+# so it's available on demand
+@extensions.register_check_method(statistics=["metrics"])
+def log_statistics(pandas_obj, *, metrics):
+    """Compute and log the requested metrics for one column.
+
+    Registered as a custom Pandera check so that column-level metrics can be
+    declared on a schema. It never rejects data.
+
+    Args:
+        pandas_obj: The column being checked; its ``name`` is used as the column label.
+        metrics: Iterable of ``Metric`` members (or their string values) to compute.
+
+    Returns:
+        Always ``True``.
+    """
- if isinstance(metric_value, Mapping):
- for entity in sorted(metric_value.keys()): # pylint: disable=no-member
- column = metric_value[entity] # pylint: disable=unsubscriptable-object
- log_metric(self.dataset_name, entity, self.metric_name, column)
+    col_name = str(pandas_obj.name)
+
+    for metric in metrics:
+        computed_metric = None
+
+        if metric == Metric.MIN:
+            computed_metric = calculate_min(pandas_obj)
+        elif metric == Metric.MAX:
+            computed_metric = calculate_max(pandas_obj)
+        elif metric == Metric.MEAN:
+            computed_metric = calculate_mean(pandas_obj)
+        elif metric == Metric.STD:
+            computed_metric = calculate_std(pandas_obj)
+        elif metric == Metric.VARIANCE:
+            computed_metric = calculate_variance(pandas_obj)
+        elif metric == Metric.COUNTS:
+            computed_metric = calculate_counts(pandas_obj)
+        elif metric == Metric.UNIQUE_COUNTS:
+            computed_metric = calculate_unique_counts(pandas_obj)
+        elif metric == Metric.COUNTS_PER_LABEL:
+            computed_metric = calculate_counts_per_label(pandas_obj)
+
+        if isinstance(computed_metric, Mapping):
+            # Mapping results hold one value per "<column>-<label>" key. Log each
+            # under its own key, as the removed implementation did; logging them all
+            # under the bare column name made the labels indistinguishable.
+            for entity in sorted(computed_metric.keys()):  # pylint: disable=no-member
+                value = computed_metric[entity]  # pylint: disable=unsubscriptable-object
+                log_metric(column=entity, metric=metric, value=value)
         else:
- log_metric(dataset=self.dataset_name, column=self.column, metric=self.metric_name, value=metric_value)
- return metric_value
-
- @property
- def metric_name(self) -> str:
- """Retrieves the name of the metric from the class name.
-
- Returns:
- The name of the metric, e.g. "Min or Mean".
- """
- return self.__class__.__name__
-
- def calculate_metric(self) -> Any:
- """Dictates that subclasses need to implement this method.
-
- Returns:
- NotImplemented is returned if the method is not implemented, by the subclass
- inevitably pointing to the parent implementation.
- """
- return NotImplemented
-
+            log_metric(column=col_name, metric=metric, value=computed_metric)
-class Min(Metric):
- """A metric instance that enables generating and returning the minimum value of a column."""
+    return True
- def calculate_metric(self) -> Number:
- """Generate and return the minimum value of a column.
- Returns:
- The minimum value of a column.
- """
- return self.df[self.column].min()
+def calculate_min(series: pd.Series) -> float:
+    """Return the smallest value in ``series``.
+
+    Args:
+        series: The column to reduce.
+
+    Returns:
+        The result of ``series.min()``.
+    """
+    smallest = series.min()
+    return smallest
-class Max(Metric):
- """A metric instance that enables generating and returning the maximum value of a column."""
-
- def calculate_metric(self) -> Number:
- """Generate and return the maximum value of a column.
-
- Returns:
- The maximum value of a column.
- """
- return self.df[self.column].max()
-
-
-class Mean(Metric):
- """A metric instance that enables generating and returning the mean value of a column."""
-
- def calculate_metric(self) -> Number:
- """Generate and return the mean value of a column.
-
- Returns:
- The mean value of a column.
- """
- return self.df[self.column].mean()
+def calculate_max(series: pd.Series) -> float:
+    """Return the largest value in ``series``.
-class Std(Metric):
- """A metric instance that enables generating and returning the standard deviation of a column."""
+
+    Args:
+        series: The column to reduce.
+
+    Returns:
+        The result of ``series.max()``.
+    """
+    largest = series.max()
+    return largest
- def calculate_metric(self) -> Number:
- """Generate and return the standard deviation of a column.
- Returns:
- The standard deviation of a column.
- """
- return self.df[self.column].std()
+def calculate_mean(series: pd.Series) -> float:
+    """Return the arithmetic mean of ``series``.
+
+    Args:
+        series: The column to reduce.
+
+    Returns:
+        The result of ``series.mean()``.
+    """
+    average = series.mean()
+    return average
-class Variance(Metric):
- """A metric instance that generated and returns the variance of a column."""
- def calculate_metric(self) -> Number:
- """Generate and return the variance of a column.
+def calculate_std(series: pd.Series) -> float:
+    """Return the standard deviation of ``series``.
- Returns:
- The variance of a column.
- """
- return self.df[self.column].var()
+
+    Args:
+        series: The column to reduce.
+
+    Returns:
+        The result of ``series.std()``.
+    """
+    deviation = series.std()
+    return deviation
-class Counts(Metric):
- """A metric instance that enables generating and returning the length of a column."""
+def calculate_variance(series: pd.Series) -> float:
+    """Return the variance of ``series``.
- def calculate_metric(self) -> int:
- """Generate and return the length of a column.
+
+    Args:
+        series: The column to reduce.
+
+    Returns:
+        The result of ``series.var()``.
+    """
+    variance = series.var()
+    return variance
- Returns:
- The length of a column.
- """
- return len(self.df[self.column])
+def calculate_counts(series: pd.Series) -> int:
+    """Return the number of entries in ``series``.
-class UniqueCounts(Metric):
- """A metric instance that enables generating and returning the unique values of a column."""
+
+    Args:
+        series: The column to measure.
+
+    Returns:
+        ``len(series)``.
+    """
+    row_count = len(series)
+    return row_count
- def calculate_metric(self) -> int:
- """Generate and return the unique values of a column.
- Returns:
- The unique values of a column.
- """
- return len(self.df[self.column].unique())
+def calculate_unique_counts(series: pd.Series) -> int:
+    """Return how many distinct values ``series`` contains.
+
+    Args:
+        series: The column to measure.
+
+    Returns:
+        The length of ``series.unique()``.
+    """
+    distinct_values = series.unique()
+    return len(distinct_values)
-class CountsPerLabel(Metric):
- """A metric instance that enables generating and returning the counts per label in a categorical column."""
- def calculate_metric(self) -> Mapping:
- """Generate and return the counts per label in a categorical column.
+def calculate_counts_per_label(series: pd.Series) -> dict:
+    """Return the number of occurrences of each label in a categorical column.
- Returns:
- The counts per label in a categorical column
- """
- column_vs_metric_value = self.df[self.column].value_counts().to_dict()
- label_vs_metric_value_with_column_prefix = {}
- for key in column_vs_metric_value.keys():
- new_key = self.column + "-" + key
- label_vs_metric_value_with_column_prefix[new_key] = column_vs_metric_value[key]
- return label_vs_metric_value_with_column_prefix
+
+    Args:
+        series: The column whose labels are counted; its ``name`` prefixes every key.
+
+    Returns:
+        A dict mapping ``"<series name>-<label>"`` to that label's count, in
+        ``value_counts`` order.
+    """
+    counts_by_label = series.value_counts().to_dict()
+    # Format the key instead of concatenating strings, so that non-string labels
+    # (ints, bools, ...) no longer raise TypeError.
+    return {f"{series.name}-{label}": count for label, count in counts_by_label.items()}
diff --git a/dynamicio/mixins/__init__.py b/dynamicio/mixins/__init__.py
deleted file mode 100644
index f928c7f..0000000
--- a/dynamicio/mixins/__init__.py
+++ /dev/null
@@ -1,16 +0,0 @@
-"""Default dynamicio mixins module"""
-
-from dynamicio.mixins.with_kafka import (
- WithKafka,
-)
-from dynamicio.mixins.with_local import (
- WithLocal,
- WithLocalBatch,
-)
-from dynamicio.mixins.with_postgres import (
- WithPostgres,
-)
-from dynamicio.mixins.with_s3 import (
- WithS3File,
- WithS3PathPrefix,
-)
diff --git a/dynamicio/mixins/utils.py b/dynamicio/mixins/utils.py
deleted file mode 100644
index 23a66c7..0000000
--- a/dynamicio/mixins/utils.py
+++ /dev/null
@@ -1,141 +0,0 @@
-"""Mixin utility functions"""
-# pylint: disable=no-member, protected-access, too-few-public-methods
-
-import inspect
-import string
-from contextlib import contextmanager
-from functools import wraps
-from types import FunctionType, MethodType
-from typing import Any, Collection, Iterable, Mapping, MutableMapping, Optional, Union
-
-from magic_logger import logger
-
-
-def allow_options(options: Union[Iterable, FunctionType, MethodType]):
- """Validate **options for a decorated reader function.
-
- Args:
- options: A set of valid options for a reader (e.g. `pandas.read_parquet` or `pandas.read_csv`)
-
- Returns:
- read_with_valid_options: The input function called with modified options.
- """
-
- def _filter_out_irrelevant_options(kwargs: Mapping, valid_options: Iterable):
- filtered_options = {}
- invalid_options = {}
- for key_arg in kwargs.keys():
- if key_arg in valid_options:
- filtered_options[key_arg] = kwargs[key_arg]
- else:
- invalid_options[key_arg] = kwargs[key_arg]
- if len(invalid_options) > 0:
- logger.warning(
- f"Options {invalid_options} were not used because they were not supported by the read or write method configured for this source. "
- "Check if you expected any of those to have been used by the operation!"
- )
- return filtered_options
-
- def read_with_valid_options(func):
- @wraps(func)
- def _(*args, **kwargs):
- if callable(options):
- return func(*args, **_filter_out_irrelevant_options(kwargs, args_of(options)))
- return func(*args, **_filter_out_irrelevant_options(kwargs, options))
-
- return _
-
- return read_with_valid_options
-
-
-def args_of(func):
- """Retrieve allowed options for a given function.
-
- Args:
- func: A function like, e.g., pd.read_csv
-
- Returns:
- A set of allowed options
- """
- return set(inspect.signature(func).parameters.keys())
-
-
-def get_string_template_field_names(s: str) -> Collection[str]: # pylint: disable=C0103
- """Given a string `s`, it parses the string to identify any template fields and returns the names of those fields.
-
- If `s` is not a string template, the returned `Collection` is empty.
-
- Args:
- s:
-
- Returns:
- Collection[str]
-
- Example:
-
- >>> get_string_template_field_names("abc{def}{efg}")
- ["def", "efg"]
- >>> get_string_template_field_names("{0}-{1}")
- ["0", "1"]
- >>> get_string_template_field_names("hello world")
- []
- """
- # string.Formatter.parse returns a 4-tuple of:
- # `literal_text`, `field_name`, `form_at_spec`, `conversion`
- # More info here https://docs.python.org/3.8/library/string.html#string.Formatter.parse
- field_names = [group[1] for group in string.Formatter().parse(s) if group[1] is not None]
-
- return field_names
-
-
-def resolve_template(path: str, options: MutableMapping[str, Any]) -> str: # pylint: disable=C0103
- """Given a string `path`, it attempts to replace all templates fields with values provided in `options`.
-
- If `path` is not a string template, `path` is returned.
-
- Args:
- path: A string which is either a template, e.g. /path/to/file/{replace_me}.h5 or just a path /path/to/file/dont_replace_me.h5
- options: A dynamic name for the "replace_me" field in the templated string. e.g. {"replace_me": "name_of_file"}
-
- Returns:
- str: Returns a static path replaced with the value in the options mapping.
-
- Raises:
- ValueError: if any template fields in s are not named using valid Python identifiers
- ValueError: if a given template field cannot be resolved in `options`
- """
- fields = get_string_template_field_names(path)
-
- if len(fields) == 0:
- return path
-
- if not all(field.isidentifier() for field in fields):
- raise ValueError(f"Expected valid Python identifiers, found {fields}")
-
- if not all(field in options for field in fields):
- raise ValueError(f"Expected values for all fields in {fields}, found {list(options.keys())}")
-
- path = path.format(**{field: options[field] for field in fields})
- for field in fields:
- options.pop(field)
-
- return path
-
-
-@contextmanager
-def pickle_protocol(protocol: Optional[int]):
- """Downgrade to the provided pickle protocol within the context manager.
-
- Args:
- protocol: The number of the protocol HIGHEST_PROTOCOL to downgrade to. Defaults to 4, which covers python 3.4 and higher.
- """
- import pickle # pylint: disable=import-outside-toplevel
-
- previous = pickle.HIGHEST_PROTOCOL
- try:
- pickle.HIGHEST_PROTOCOL = 4
- if protocol:
- pickle.HIGHEST_PROTOCOL = protocol
- yield
- finally:
- pickle.HIGHEST_PROTOCOL = previous
diff --git a/dynamicio/mixins/with_kafka.py b/dynamicio/mixins/with_kafka.py
deleted file mode 100644
index 13b1019..0000000
--- a/dynamicio/mixins/with_kafka.py
+++ /dev/null
@@ -1,162 +0,0 @@
-# pylint: disable=no-member, protected-access, too-few-public-methods
-
-"""This module provides mixins that are providing Kafka I/O support."""
-
-
-from typing import Any, Callable, Iterable, Mapping, MutableMapping, Optional
-
-import pandas as pd # type: ignore
-import simplejson
-from kafka import KafkaProducer # type: ignore
-from magic_logger import logger
-
-
-from dynamicio.config.pydantic import DataframeSchema, KafkaDataEnvironment
-from dynamicio.mixins import utils
-
-
-class WithKafka:
- """Handles I/O operations for Kafka.
-
- Args:
- - options:
- - Standard: Keyword-arguments passed to the KafkaProducer constructor (see `KafkaProducer.DEFAULT_CONFIG.keys()`).
- - Additional Options:
-
- - `key_generator: Callable[[Any, Mapping], T]`: defines the keying policy to be used for sending keyed-messages to Kafka. It is a `Callable` that takes a
- `tuple(idx, row)` and returns a string that will serve as the message's key, invoked prior to serialising the key. It defaults to the dataframe's index
- (which may not be composed of unique values or string type keys). It goes hand in hand with the default `key-serialiser`, which assumes that the keys
- are strings and encode's them as such.
-
- - `key_serializer: Callable[T, bytes]`: Custom key serialiser; if not provided, a default key-serializer will be used, applied on a string-key (unless key is None).
-
- N.B. Providing a custom key-generator that generates a non-string key is best provided alongside a custom key-serializer best suited to handle the custom key-type.
-
- - `document_transformer: Callable[[Mapping[Any, Any]`: Manipulates the messages/rows sent to Kafka as values. It is a `Callable` taking a `Mapping` as its only
- argument and return a `Mapping`, then this callable will be invoked prior to serializing each document. This can be used, for example, to add metadata to each
- document that will be written to the target Kafka topic.
-
- - `value_serializer: Callable[Mapping, bytes]`: Custom value serialiser; if not provided, a default value-serializer will be used applied on a Mapping..
-
- Example:
- >>> # Given
- >>> keyed_test_df = pd.DataFrame.from_records(
- >>> [
- >>> ["key-01", "cm_1", "id_1", 1000, "ABC"],
- >>> ["key-02", "cm_2", "id_2", 1000, "ABC"],
- >>> ["key-03", "cm_3", "id_3", 1000, "ABC"],
- >>> ],
- >>> columns=["key", "id", "foo", "bar", "baz"],
- >>> ).set_index("key")
- >>>
- >>> kafka_cloud_config = IOConfig(
- >>> path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "processed.yaml")),
- >>> env_identifier="CLOUD",
- >>> dynamic_vars=constants,
- >>> ).get(source_key="WRITE_TO_KAFKA_JSON")
- >>>
- >>> write_kafka_io = WriteKafkaIO(kafka_cloud_config, key_generator=lambda key, _: key, document_transformer=lambda doc: doc["new_field"]="new_value")
- >>>
- >>> # When
- >>> with patch.object(mixins, "KafkaProducer") as mock__kafka_producer:
- >>> mock__kafka_producer.DEFAULT_CONFIG = KafkaProducer.DEFAULT_CONFIG
- >>> mock_producer = MockKafkaProducer()
- >>> mock__kafka_producer.return_value = mock_producer
- >>> write_kafka_io.write(keyed_test_df)
- >>>
- >>> # Then
- >>> assert mock_producer.my_stream == [
- >>> {"key": "key-01", "value": {"bar": 1000, "baz": "ABC", "foo": "id_1", "id": "cm_1", "new_field": "new_value"}},
- >>> {"key": "key-02", "value": {"bar": 1000, "baz": "ABC", "foo": "id_2", "id": "cm_2", "new_field": "new_value"}},
- >>> {"key": "key-03", "value": {"bar": 1000, "baz": "ABC", "foo": "id_3", "id": "cm_3", "new_field": "new_value"}},
- >>> ]
- """
-
- sources_config: KafkaDataEnvironment
- schema: DataframeSchema
- options: MutableMapping[str, Any]
- __kafka_config: Optional[Mapping] = None
- __producer: Optional[KafkaProducer] = None
- __key_generator: Optional[Callable[[Any, Mapping[Any, Any]], Optional[str]]] = None
- __document_transformer: Optional[Callable[[Mapping[Any, Any]], Mapping[Any, Any]]] = None
-
- def _write_to_kafka(self, df: pd.DataFrame) -> None:
- """Given a dataframe where each row is a message to be sent to a Kafka Topic, iterate through all rows and send them to a Kafka topic.
-
- The topic is defined in `self.sources_config["kafka"]` and using a kafka producer, which is flushed at the
- end of this process.
-
- Args:
- df: A dataframe where each row is a message to be sent to a Kafka Topic.
- """
- if self.__key_generator is None:
- self.__key_generator = lambda idx, __: idx # default key generator uses the dataframe's index
- if self.options.get("key_generator") is not None:
- self.__key_generator = self.options.pop("key_generator")
-
- if self.__document_transformer is None:
- self.__document_transformer = lambda value: value
- if self.options.get("document_transformer") is not None:
- self.__document_transformer = self.options.pop("document_transformer")
-
- if self.__producer is None:
- self.__producer = self._get_producer(self.sources_config.kafka.kafka_server, **self.options)
-
- self._send_messages(df=df, topic=self.sources_config.kafka.kafka_topic)
-
- @utils.allow_options(KafkaProducer.DEFAULT_CONFIG.keys())
- def _get_producer(self, server: str, **options: MutableMapping[str, Any]) -> KafkaProducer:
- """Generate and return a Kafka Producer.
-
- Default options are used to generate the producer. Specifically:
- - `bootstrap_servers`: Passed on through the source_config
- - `value_serializer`: Uses a default_value_serializer defined in this mixin
-
- More options can be added to the producer by passing them as keyword arguments, through valid options.
-
- These can also override the default options.
-
- Args:
- server: The host name.
- **options: Keyword arguments to pass to the KafkaProducer.
-
- Returns:
- A Kafka producer instance.
- """
- self.__kafka_config = {
- **{
- "bootstrap_servers": server,
- "compression_type": "snappy",
- "key_serializer": self._default_key_serializer,
- "value_serializer": self._default_value_serializer,
- },
- **options,
- }
- return KafkaProducer(**self.__kafka_config)
-
- def _send_messages(self, df: pd.DataFrame, topic: str) -> None:
- logger.info(f"Sending {len(df)} messages to Kafka topic:{topic}.")
-
- messages = df.reset_index(drop=True).to_dict("records")
- for idx, message in zip(df.index.values, messages):
- self.__producer.send(topic, key=self.__key_generator(idx, message), value=self.__document_transformer(message)) # type: ignore
-
- self.__producer.flush() # type: ignore
-
- @staticmethod
- def _default_key_serializer(key: Optional[str]) -> Optional[bytes]:
- if key:
- return key.encode("utf-8")
- return None
-
- @staticmethod
- def _default_value_serializer(value: Mapping) -> bytes:
- return simplejson.dumps(value, ignore_nan=True).encode("utf-8")
-
- def _read_from_kafka(self) -> Iterable[Mapping]: # type: ignore
- """Read messages from a Kafka Topic and convert them to separate dataframes.
-
- Returns:
- Multiple dataframes, one per message read from the Kafka topic of interest.
- """
- # TODO: Implement kafka reader
diff --git a/dynamicio/mixins/with_local.py b/dynamicio/mixins/with_local.py
deleted file mode 100644
index 89c951d..0000000
--- a/dynamicio/mixins/with_local.py
+++ /dev/null
@@ -1,257 +0,0 @@
-# pylint: disable=no-member, protected-access, too-few-public-methods
-
-"""This module provides mixins that are providing Local FS I/O support."""
-
-import glob
-import os
-from threading import Lock
-from typing import Any, MutableMapping
-
-import pandas as pd # type: ignore
-from fastparquet import ParquetFile, write # type: ignore
-from pyarrow.parquet import read_table, write_table # type: ignore # pylint: disable=no-name-in-module
-
-from dynamicio.config.pydantic import DataframeSchema, LocalBatchDataEnvironment, LocalDataEnvironment
-from dynamicio.mixins import utils
-
-hdf_lock = Lock()
-
-
-class WithLocal:
- """Handles local I/O operations."""
-
- schema: DataframeSchema
- sources_config: LocalDataEnvironment
- options: MutableMapping[str, Any]
-
- def _read_from_local(self) -> pd.DataFrame:
- """Read a local file as a `DataFrame`.
-
- The configuration object is expected to have two keys:
- - `file_path`
- - `file_type`
-
- To actually read the file, a method is dynamically invoked by name, using
- "_read_{file_type}_file".
-
- Returns:
- DataFrame
- """
- local_config = self.sources_config.local
- file_path = utils.resolve_template(local_config.file_path, self.options)
- file_type = local_config.file_type
-
- return getattr(self, f"_read_{file_type}_file")(file_path, self.schema, **self.options)
-
- def _write_to_local(self, df: pd.DataFrame):
- """Write a dataframe locally based on the {file_type} of the config_io configuration.
-
- The configuration object is expected to have two keys:
-
- - `file_path`
- - `file_type`
-
- To actually write the file, a method is dynamically invoked by name, using
- "_write_{file_type}_file".
-
- Args:
- df: The dataframe to be written out.
- """
- local_config = self.sources_config.local
- file_path = utils.resolve_template(local_config.file_path, self.options)
- file_type = local_config.file_type
-
- getattr(self, f"_write_{file_type}_file")(df, file_path, **self.options)
-
- @staticmethod
- @utils.allow_options(pd.read_hdf)
- def _read_hdf_file(file_path: str, schema: DataframeSchema, **options: Any) -> pd.DataFrame:
- """Read a HDF file as a DataFrame using `pd.read_hdf`.
-
- All `options` are passed directly to `pd.read_hdf`.
-
- Caveats: As HDFs are not thread-safe, we use a Lock on this operation. This, practically means
- that when used with asyncio through `async_read()` HDF files will be read sequentially.
- For more information see: https://pandas.pydata.org/pandas-docs/dev/user_guide/io.html#caveats
-
- Args:
- file_path: The path to the hdf file to be read.
- options: The pandas `read_hdf` options.
-
- Returns:
- DataFrame: The dataframe read from the hdf file.
- """
- with hdf_lock:
- df = pd.read_hdf(file_path, **options)
-
- columns = [column for column in df.columns.to_list() if column in schema.column_names]
- df = df[columns]
- return df
-
- @staticmethod
- @utils.allow_options(pd.read_csv)
- def _read_csv_file(file_path: str, schema: DataframeSchema, **options: Any) -> pd.DataFrame:
- """Read a CSV file as a DataFrame using `pd.read_csv`.
-
- All `options` are passed directly to `pd.read_csv`.
-
- Args:
- file_path: The path to the csv file to be read.
- options: The pandas `read_csv` options.
-
- Returns:
- DataFrame: The dataframe read from the csv file.
- """
- options["usecols"] = list(schema.column_names)
- return pd.read_csv(file_path, **options)
-
- @staticmethod
- @utils.allow_options(pd.read_json)
- def _read_json_file(file_path: str, schema: DataframeSchema, **options: Any) -> pd.DataFrame:
- """Read a json file as a DataFrame using `pd.read_hdf`.
-
- All `options` are passed directly to `pd.read_hdf`.
-
- Args:
- file_path:
- options:
-
- Returns:
- DataFrame
- """
- df = pd.read_json(file_path, **options)
- columns = [column for column in df.columns.to_list() if column in schema.column_names]
- df = df[columns]
- return df
-
- @staticmethod
- def _read_parquet_file(file_path: str, schema: DataframeSchema, **options: Any) -> pd.DataFrame:
- """Read a Parquet file as a DataFrame using `pd.read_parquet`.
-
- All `options` are passed directly to `pd.read_parquet`.
-
- Args:
- file_path: The path to the parquet file to be read.
- options: The pandas `read_parquet` options.
-
- Returns:
- DataFrame: The dataframe read from the parquet file.
- """
- options["columns"] = list(schema.column_names)
-
- if options.get("engine") == "fastparquet":
- return WithLocal.__read_with_fastparquet(file_path, **options)
- return WithLocal.__read_with_pyarrow(file_path, **options)
-
- @classmethod
- @utils.allow_options([*utils.args_of(pd.read_parquet), *utils.args_of(read_table)])
- def __read_with_pyarrow(cls, file_path: str, **options: Any) -> pd.DataFrame:
- return pd.read_parquet(file_path, **options)
-
- @classmethod
- @utils.allow_options([*utils.args_of(pd.read_parquet), *utils.args_of(ParquetFile)])
- def __read_with_fastparquet(cls, file_path: str, **options: Any) -> pd.DataFrame:
- return pd.read_parquet(file_path, **options)
-
- @staticmethod
- @utils.allow_options([*utils.args_of(pd.DataFrame.to_hdf), *["protocol"]])
- def _write_hdf_file(df: pd.DataFrame, file_path: str, **options: Any):
- """Write a dataframe to hdf using `df.to_hdf`.
-
- All `options` are passed directly to `df.to_hdf`.
-
- Caveats: As HDFs are not thread-safe, we use a Lock on this operation. This, practically means
- that when used with asyncio through `async_read()` HDF files will be written sequentially.
- For more information see: https://pandas.pydata.org/pandas-docs/dev/user_guide/io.html#caveats
-
- Args:
- df: A dataframe write out.
- file_path: The location where the file needs to be written.
- options: The pandas `to_hdf` options.
-
- - The pandas `to_hdf` options, &;
- - protocol: The pickle protocol to use for writing the hdf file out; a value <=5.
- """
- with utils.pickle_protocol(protocol=options.pop("protocol", None)), hdf_lock:
- df.to_hdf(file_path, key="df", mode="w", **options)
-
- @staticmethod
- @utils.allow_options(pd.DataFrame.to_csv)
- def _write_csv_file(df: pd.DataFrame, file_path: str, **options: Any):
- """Write a dataframe as a CSV file using `df.to_csv`.
-
- All `options` are passed directly to `df.to_csv`.
-
- Args:
- df: A dataframe write out.
- file_path: The location where the file needs to be written.
- options: Options relative to writing a csv file.
- """
- df.to_csv(file_path, **options)
-
- @staticmethod
- @utils.allow_options(pd.DataFrame.to_json)
- def _write_json_file(df: pd.DataFrame, file_path: str, **options: Any):
- """Write a dataframe as a json file using `df.to_json`.
-
- All `options` are passed directly to `df.to_json`.
-
- Args:
- df: A dataframe write out.
- file_path: The location where the file needs to be written.
- options: Options relative to writing a json file.
- """
- df.to_json(file_path, **options)
-
- @staticmethod
- def _write_parquet_file(df: pd.DataFrame, file_path: str, **options: Any):
- """Write a dataframe as a parquet file using `df.to_parquet`.
-
- All `options` are passed directly to `df.to_parquet`.
-
- Args:
- df: A dataframe write out.
- file_path: The location where the file needs to be written.
- options: Options relative to writing a parquet file.
- """
- if options.get("engine") == "fastparquet":
- return WithLocal.__write_with_fastparquet(df, file_path, **options)
- return WithLocal.__write_with_pyarrow(df, file_path, **options)
-
- @classmethod
- @utils.allow_options([*utils.args_of(pd.DataFrame.to_parquet), *utils.args_of(write_table)])
- def __write_with_pyarrow(cls, df: pd.DataFrame, filepath: str, **options: Any) -> pd.DataFrame:
- return df.to_parquet(filepath, **options)
-
- @classmethod
- @utils.allow_options([*utils.args_of(pd.DataFrame.to_parquet), *utils.args_of(write)])
- def __write_with_fastparquet(cls, df: pd.DataFrame, filepath: str, **options: Any) -> pd.DataFrame:
- return df.to_parquet(filepath, **options)
-
-
-class WithLocalBatch(WithLocal):
- """Responsible for batch reading local files."""
-
- sources_config: LocalBatchDataEnvironment # type: ignore
-
- def _read_from_local_batch(self) -> pd.DataFrame:
- """Reads a set of files for a specified file type, concatenates them and returns a dataframe.
-
- Returns:
- A concatenated dataframe composed of all files read through local_batch.
- """
- local_batch_config = self.sources_config.local
-
- file_type = local_batch_config.file_type
- filtering_file_type = file_type.value
- if filtering_file_type == "hdf":
- filtering_file_type = "h5"
-
- files = glob.glob(os.path.join(local_batch_config.path_prefix, f"*.{filtering_file_type}"))
-
- dfs_to_concatenate = []
- for file in files:
- file_to_load = os.path.join(local_batch_config.path_prefix, file)
- dfs_to_concatenate.append(getattr(self, f"_read_{file_type}_file")(file_to_load, self.schema, **self.options)) # type: ignore
-
- return pd.concat(dfs_to_concatenate).reset_index(drop=True)
diff --git a/dynamicio/mixins/with_postgres.py b/dynamicio/mixins/with_postgres.py
deleted file mode 100644
index 051d893..0000000
--- a/dynamicio/mixins/with_postgres.py
+++ /dev/null
@@ -1,197 +0,0 @@
-# pylint: disable=no-member, protected-access, too-few-public-methods
-
-"""This module provides mixins that are providing Postgres I/O support."""
-
-import csv
-import tempfile
-from contextlib import contextmanager
-from typing import Any, Dict, Generator, MutableMapping, Union
-
-import pandas as pd # type: ignore
-from magic_logger import logger
-from sqlalchemy import BigInteger, Boolean, Column, create_engine, Date, DateTime, Float, Integer, String # type: ignore
-from sqlalchemy.ext.declarative import declarative_base # type: ignore
-from sqlalchemy.orm import Query # type: ignore
-from sqlalchemy.orm.decl_api import DeclarativeMeta # type: ignore
-from sqlalchemy.orm.session import Session as SqlAlchemySession # type: ignore
-from sqlalchemy.orm.session import sessionmaker # type: ignore
-
-from dynamicio.config.pydantic import DataframeSchema, PostgresDataEnvironment
-from dynamicio.mixins import utils
-
-Session = sessionmaker(autoflush=True)
-
-Base = declarative_base()
-_type_lookup = {
- "bool": Boolean,
- "boolean": Boolean,
- "object": String(64),
- "int64": Integer,
- "float64": Float,
- "int": Integer,
- "date": Date,
- "datetime64[ns]": DateTime,
- "bigint": BigInteger,
-}
-
-
-@contextmanager
-def session_for(connection_string: str) -> Generator[SqlAlchemySession, None, None]:
- """Connect to a database using `connection_string` and returns an active session to that connection.
-
- Args:
- connection_string:
-
- Yields:
- Active session
- """
- engine = create_engine(connection_string)
- session = Session(bind=engine)
-
- try:
- yield session
- finally:
- session.close() # pylint: disable=no-member
-
-
-class WithPostgres:
- """Handles I/O operations for Postgres.
-
- Args:
- - options:
- - `truncate_and_append: bool`: If set to `True`, truncates the table and then appends the new rows. Otherwise, it drops the table and recreates it with the new rows.
- """
-
- sources_config: PostgresDataEnvironment
- schema: DataframeSchema
- options: MutableMapping[str, Any]
-
- def _read_from_postgres(self) -> pd.DataFrame:
- """Read data from postgres as a `DataFrame`.
-
- The configuration object is expected to have the following keys:
- - `db_user`
- - `db_password`
- - `db_host`
- - `db_port`
- - `db_name`
-
- Returns:
- DataFrame
- """
- postgres_config = self.sources_config.postgres
- db_user = postgres_config.db_user
- db_password = postgres_config.db_password
- db_host = postgres_config.db_host
- db_port = postgres_config.db_port
- db_name = postgres_config.db_name
-
- connection_string = f"postgresql://{db_user}:{db_password}@{db_host}:{db_port}/{db_name}"
-
- sql_query = self.options.pop("sql_query", None)
-
- assert self.sources_config.dynamicio_schema is not None, "The schema must be specified for SQL tables"
- model = self._generate_model_from_schema(self.sources_config.dynamicio_schema)
-
- query = Query(self._get_table_columns(model))
- if sql_query:
- query = sql_query
-
- logger.info(f"[postgres] Started downloading table: {self.sources_config.dynamicio_schema.name} from: {db_host}:{db_name}")
- with session_for(connection_string) as session:
- return self._read_database(session, query, **self.options)
-
- @staticmethod
- def _generate_model_from_schema(schema: DataframeSchema) -> DeclarativeMeta:
- json_cls_schema: Dict[str, Any] = {"tablename": schema.name, "columns": []}
-
- for col in schema.columns.values():
- sql_type = _type_lookup.get(col.data_type)
- if sql_type:
- json_cls_schema["columns"].append({"name": col.name, "type": sql_type})
-
- class_name = "".join(word.capitalize() or "_" for word in schema.name.split("_")) + "Model"
-
- class_dict = {"clsname": class_name, "__tablename__": schema.name, "__table_args__": {"extend_existing": True}}
- class_dict.update({column["name"]: Column(column["type"], primary_key=True) if idx == 0 else Column(column["type"]) for idx, column in enumerate(json_cls_schema["columns"])})
-
- generated_model = type(class_name, (Base,), class_dict)
- return generated_model
-
- @staticmethod
- def _get_table_columns(model):
- tables_colums = []
- if model:
- for col in list(model.__table__.columns):
- tables_colums.append(getattr(model, col.name))
- return tables_colums
-
- @staticmethod
- @utils.allow_options(pd.read_sql)
- def _read_database(session: SqlAlchemySession, query: Union[str, Query], **options: Any) -> pd.DataFrame:
- """Run `query` against active `session` and returns the result as a `DataFrame`.
-
- Args:
- session: Active session
- query: If a `Query` object is given, it should be unbound. If a `str` is given, the
- value is used as-is.
-
- Returns:
- DataFrame
- """
- if isinstance(query, Query):
- query = query.with_session(session).statement
- return pd.read_sql(sql=query, con=session.get_bind(), **options)
-
- def _write_to_postgres(self, df: pd.DataFrame):
- """Write a dataframe to postgres based on the {file_type} of the config_io configuration.
-
- Args:
- df: The dataframe to be written
- """
- postgres_config = self.sources_config.postgres
- db_user = postgres_config.db_user
- db_password = postgres_config.db_password
- db_host = postgres_config.db_host
- db_port = postgres_config.db_port
- db_name = postgres_config.db_name
-
- connection_string = f"postgresql://{db_user}:{db_password}@{db_host}:{db_port}/{db_name}"
-
- assert self.sources_config.dynamicio_schema is not None, "The schema must be specified for SQL tables"
- model = self._generate_model_from_schema(self.sources_config.dynamicio_schema)
-
- is_truncate_and_append = self.options.get("truncate_and_append", False)
-
- logger.info(f"[postgres] Started downloading table: {self.sources_config.dynamicio_schema.name} from: {db_host}:{db_name}")
- with session_for(connection_string) as session:
- self._write_to_database(session, model.__tablename__, df, is_truncate_and_append) # type: ignore
-
- @staticmethod
- def _write_to_database(session: SqlAlchemySession, table_name: str, df: pd.DataFrame, is_truncate_and_append: bool):
- """Write a dataframe to any database provided a session with a data model and a table name.
-
- Args:
- session: Generated from a data model and a table name
- table_name: The name of the table to read from a DB
- df: The dataframe to be written out
- is_truncate_and_append: Supply to truncate the table and append new rows to it; otherwise, delete and replace
- """
- if is_truncate_and_append:
- session.execute(f"TRUNCATE TABLE {table_name};")
-
- # Below is a speedup hack in place of `df.to_csv` with the multipart option. As of today, even with
- # `method="multi"`, uploading to Postgres is painfully slow. Hence, we're resorting to dumping the file as
- # csv and using Postgres's CSV import function.
- # https://stackoverflow.com/questions/2987433/how-to-import-csv-file-data-into-a-postgresql-table
- with tempfile.NamedTemporaryFile(mode="r+") as temp_file:
- df.to_csv(temp_file, index=False, header=False, sep="\t", doublequote=False, escapechar="\\", quoting=csv.QUOTE_NONE)
- temp_file.flush()
- temp_file.seek(0)
-
- cur = session.connection().connection.cursor()
- cur.copy_from(temp_file, table_name, columns=df.columns, null="")
- else:
- df.to_sql(name=table_name, con=session.get_bind(), if_exists="replace", index=False)
-
- session.commit()
diff --git a/dynamicio/mixins/with_s3.py b/dynamicio/mixins/with_s3.py
deleted file mode 100644
index 92ad958..0000000
--- a/dynamicio/mixins/with_s3.py
+++ /dev/null
@@ -1,397 +0,0 @@
-# pylint: disable=no-member, protected-access, too-few-public-methods
-
-"""This module provides mixins that are providing S3 I/O support."""
-
-import dataclasses
-import io
-import os
-import tempfile
-import urllib.parse
-import uuid
-from contextlib import contextmanager
-from typing import Generator, IO, Optional
-
-import boto3 # type: ignore
-import pandas as pd # type: ignore
-import s3transfer.futures # type: ignore
-import tables # type: ignore
-from awscli.clidriver import create_clidriver # type: ignore
-from magic_logger import logger
-
-from dynamicio.config.pydantic import DataframeSchema, S3DataEnvironment, S3PathPrefixEnvironment
-from dynamicio.mixins import (
- utils,
- with_local,
-)
-
-
-class InMemStore(pd.io.pytables.HDFStore):
- """A subclass of pandas HDFStore that does not manage the pytables File object"""
-
- _in_mem_table = None
-
- def __init__(self, path: str, table: tables.File, mode: str = "r"):
- self._in_mem_table = table
- super().__init__(path=path, mode=mode)
-
- def open(self, *_args, **_kwargs):
- pd.io.pytables._tables()
- self._handle = self._in_mem_table
-
- def close(self, *_args, **_kwargs):
- pass
-
- @property
- def is_open(self):
- return self._handle is not None
-
-
-class HdfIO:
- """Class providing stream support for HDF tables"""
-
- @contextmanager
- def create_file(self, label: str, mode: str, data: bytes = None) -> Generator[tables.File, None, None]:
- """Create an in-memory pytables table"""
- extra_kw = {}
- if data:
- extra_kw["driver_core_image"] = data
- file_handle = tables.File(f"{label}_{uuid.uuid4()}.h5", mode, title=label, root_uep="/", filters=None, driver="H5FD_CORE", driver_core_backing_store=0, **extra_kw)
- try:
- yield file_handle
- finally:
- file_handle.close()
-
- def load(self, fobj: IO[bytes], label: str = "unknown_file.h5") -> pd.DataFrame:
- """Load the dataframe from an file-like object"""
- with self.create_file(label, mode="r", data=fobj.read()) as file_handle:
- return pd.read_hdf(InMemStore(label, file_handle))
-
- def save(self, df: pd.DataFrame, fobj: IO[bytes], label: str = "unknown_file.h5", options: Optional[dict] = None):
- """Load the dataframe to a file-like object"""
- if not options:
- options = {}
- with self.create_file(label, mode="w", data=fobj.read()) as file_handle:
- store = InMemStore(path=label, table=file_handle, mode="w")
- store.put(key="df", value=df, **options)
- fobj.write(file_handle.get_file_image())
-
-
-def awscli_runner(*cmd: str):
- """Runs the awscli command provided.
-
- Args:
- *cmd: A list of args used in the command.
-
- Raises:
- A runtime error exception is raised if download fails.
-
- Example:
-
- >>> awscli_runner("s3", "sync", "s3://mock-bucket/mock-key", ".")
- """
- # Run
- exit_code = create_clidriver().main(cmd)
-
- if exit_code > 0:
- raise RuntimeError(f"AWS CLI exited with code {exit_code}")
-
-
-@dataclasses.dataclass
-class S3TransferHandle:
- """A dataclass used to track an ongoing data download from the s3"""
-
- s3_object: object # boto3.resource('s3').ObjectSummary
- fobj: IO[bytes] # file-like object the data is being downloaded to
- done_future: s3transfer.futures.BaseTransferFuture
-
-
-class WithS3PathPrefix(with_local.WithLocal):
- """Handles I/O operations for AWS S3; implements read operations only.
-
- This mixin assumes that the directories it reads from will only contain a single file-type.
- """
-
- sources_config: S3PathPrefixEnvironment # type: ignore
- schema: DataframeSchema
-
- boto3_resource = boto3.resource("s3")
- boto3_client = boto3.client("s3")
-
- def _write_to_s3_path_prefix(self, df: pd.DataFrame):
- """Write a DataFrame to an S3 path prefix.
-
- The configuration object is expected to have the following keys:
- - `bucket`
- - `path_prefix`
- - `file_type`
-
- Args:
- df (pd.DataFrame): the DataFrame to be written to S3
-
- Raises:
- ValueError: In case `path_prefix` is missing from config
- ValueError: In case the `partition_cols` arg is missing while trying to write a parquet file
- """
- s3_config = self.sources_config.s3
-
- file_type = s3_config.file_type
- if file_type != "parquet":
- raise ValueError(f"File type not supported: {file_type}, only parquet files can be written to an S3 key")
- if "partition_cols" not in self.options:
- raise ValueError("`partition_cols` is required as an option to write partitioned parquet files to S3")
-
- bucket = s3_config.bucket
- path_prefix = s3_config.path_prefix
- full_path_prefix = utils.resolve_template(f"s3://{bucket}/{path_prefix}", self.options)
-
- with tempfile.TemporaryDirectory() as temp_dir:
- self._write_parquet_file(df, temp_dir, **self.options)
- awscli_runner(
- "s3",
- "sync",
- temp_dir,
- full_path_prefix,
- "--acl",
- "bucket-owner-full-control",
- "--only-show-errors",
- "--exact-timestamps",
- )
-
- def _read_from_s3_path_prefix(self) -> pd.DataFrame:
- """Read all files under a path prefix from an S3 bucket as a `DataFrame`.
-
- The configuration object is expected to have the following keys:
- - `bucket`
- - `path_prefix`
- - `file_type`
-
- To actually read the file, a method is dynamically invoked by name, using
- "_read_{file_type}_path_prefix".
-
- Returns:
- DataFrame
- """
- s3_config = self.sources_config.s3
- file_type = s3_config.file_type
- if file_type not in {"parquet", "csv", "hdf", "json"}:
- raise ValueError(f"File type not supported: {file_type}")
-
- bucket = s3_config.bucket
- path_prefix = s3_config.path_prefix
- full_path_prefix = utils.resolve_template(f"s3://{bucket}/{path_prefix}", self.options)
-
- # The `no_disk_space` option should be used only when reading a subset of columns from S3
- if self.options.pop("no_disk_space", False):
- if file_type == "parquet":
- return self._read_parquet_file(full_path_prefix, self.schema, **self.options)
- if file_type == "hdf":
- dfs = []
- for fobj in self._iter_s3_files(
- full_path_prefix,
- file_ext=".h5",
- max_memory_use=1024**3, # 1 gib
- ):
- dfs.append(HdfIO().load(fobj))
- df = pd.concat(dfs, ignore_index=True)
- columns = [column for column in df.columns.to_list() if column in self.schema.columns.keys()]
- return df[columns]
-
- with tempfile.TemporaryDirectory() as temp_dir:
- # aws-cli is shown to be up to 6 times faster when downloading the complete dataset from S3 than using the boto3
- # client or pandas directly. This is because aws-cli uses the parallel downloader, which is much faster than the
- # boto3 client.
- awscli_runner(
- "s3",
- "sync",
- full_path_prefix,
- temp_dir,
- "--acl",
- "bucket-owner-full-control",
- "--only-show-errors",
- "--exact-timestamps",
- )
-
- dfs = []
- for file in os.listdir(temp_dir):
- df = getattr(self, f"_read_{file_type}_file")(os.path.join(temp_dir, file), self.schema, **self.options) # type: ignore
- if len(df) > 0:
- dfs.append(df)
-
- return pd.concat(dfs, ignore_index=True)
-
- def _iter_s3_files(self, s3_prefix: str, file_ext: Optional[str] = None, max_memory_use: int = -1) -> Generator[IO[bytes], None, None]: # pylint: disable=too-many-locals
- """Download sways of S3 objects.
-
- Parameters:
- s3_prefix: s3 url to fetch objects with
- file_ext: extension of s3 objects to allow through
- max_memory_use: The approximate number of bytes to allocate on each yield of Generator
- """
- parsed_url = urllib.parse.urlparse(s3_prefix)
- assert parsed_url.scheme == "s3", f"{s3_prefix!r} should be an s3 url"
- bucket_name = parsed_url.netloc
- file_prefix = f"{parsed_url.path.strip('/')}/"
- s3_objects_to_fetch = []
- # Collect objects to be loaded
- for s3_object in self.boto3_resource.Bucket(bucket_name).objects.filter(Prefix=file_prefix):
- good_object = (not file_ext) or (s3_object.key.endswith(file_ext))
- if good_object:
- s3_objects_to_fetch.append(s3_object)
-
- if max_memory_use < 0:
- # Unlimited memory use - fetch ALL
- max_memory_use = sum(s3_obj.size for s3_obj in s3_objects_to_fetch) * 2
- transfer_config = boto3.s3.transfer.TransferConfig(max_concurrency=20)
- while s3_objects_to_fetch:
- mem_use_left = max_memory_use
- handles = []
- with boto3.s3.transfer.create_transfer_manager(self.boto3_client, transfer_config) as transfer_manager:
- while mem_use_left > 0 and s3_objects_to_fetch:
- s3_object = s3_objects_to_fetch.pop()
- fobj = io.BytesIO()
- future = transfer_manager.download(bucket_name, s3_object.key, fobj)
- handles.append(S3TransferHandle(s3_object, fobj, future))
- mem_use_left -= s3_object.size
- # Leaving the `transfer_manager` context implicitly waits for all downloads to complete
- # Rewind and yield all fobjs
- for handle in handles:
- handle.fobj.seek(0)
- yield handle.fobj
-
-
-class WithS3File(with_local.WithLocal):
- """Handles I/O operations for AWS S3.
-
- All files are persisted to disk first using boto3 as this has proven to be faster than reading them into memory.
- Note that reading things into memory is available for csv, json and parquet types only. Unfortunately, until support
- for generic buffer is added to read_hdf, we need to download and persists the file to disk first anyway.
-
- Options:
- no_disk_space: If `True`, then s3fs + fsspec will be used to read data directly into memory.
- """
-
- sources_config: S3DataEnvironment # type: ignore
- schema: DataframeSchema
-
- boto3_client = boto3.client("s3")
-
- @contextmanager
- def _s3_named_file_reader(self, s3_bucket: str, s3_key: str) -> Generator:
- """Contextmanager to abstract reading different file types in S3.
-
- This implementation saves the downloaded data to a temporary file.
-
- Args:
- s3_bucket: The S3 bucket from where to read the file.
- s3_key: The file-path to the target file to be read.
-
- Returns:
- The local file path from where the file can be read, once it has been downloaded there by the boto3.client.
-
- """
- with tempfile.NamedTemporaryFile("wb") as target_file:
- # Download the file from S3
- self.boto3_client.download_fileobj(s3_bucket, s3_key, target_file)
- # Yield local file path to body of `with` statement
- target_file.flush()
- yield target_file
-
- @contextmanager
- def _s3_reader(self, s3_bucket: str, s3_key: str) -> Generator[io.BytesIO, None, None]:
- """Contextmanager to abstract reading different file types in S3.
-
- This implementation only retains data in-memory, avoiding creating any temp files.
-
- Args:
- s3_bucket: The S3 bucket from where to read the file.
- s3_key: The file-path to the target file to be read.
-
- Returns:
- The local file path from where the file can be read, once it has been downloaded there by the boto3.client.
-
- """
- fobj = io.BytesIO()
- # Download the file from S3
- self.boto3_client.download_fileobj(s3_bucket, s3_key, fobj)
- # Yield the buffer
- fobj.seek(0)
- yield fobj
-
- @contextmanager
- def _s3_writer(self, s3_bucket: str, s3_key: str) -> Generator[IO[bytes], None, None]:
- """Contextmanager to abstract loading different file types to S3.
-
- Args:
- s3_bucket: The S3 bucket to upload the file to.
- s3_key: The file-path where the target file should be uploaded to.
-
- Returns:
- The local file path where to actually write the file, to be read and uploaded by boto3.client.
- """
- fobj = io.BytesIO()
- yield fobj
- fobj.seek(0)
- self.boto3_client.upload_fileobj(fobj, s3_bucket, s3_key, ExtraArgs={"ACL": "bucket-owner-full-control"})
-
- def _read_from_s3_file(self) -> pd.DataFrame:
- """Read a file from an S3 bucket as a `DataFrame`.
-
- The configuration object is expected to have the following keys:
- - `bucket`
- - `file_path`
- - `file_type`
-
- To actually read the file, a method is dynamically invoked by name, using "_read_{file_type}_file".
-
- Returns:
- DataFrame
- """
- s3_config = self.sources_config.s3
- file_type = s3_config.file_type
- file_path = utils.resolve_template(s3_config.file_path, self.options)
- bucket = s3_config.bucket
-
- logger.info(f"[s3] Started downloading: s3://{s3_config.bucket}/{file_path}")
- if self.options.pop("no_disk_space", None):
- no_disk_space_rv = None
- if file_type in ["csv", "json", "parquet"]:
- no_disk_space_rv = getattr(self, f"_read_{file_type}_file")(f"s3://{s3_config.bucket}/{file_path}", self.schema, **self.options) # type: ignore
- elif file_type == "hdf":
- with self._s3_reader(s3_bucket=bucket, s3_key=file_path) as fobj: # type: ignore
- no_disk_space_rv = HdfIO().load(fobj) # type: ignore
- else:
- raise NotImplementedError(f"Unsupported file type {file_type!r}.")
- if no_disk_space_rv is not None:
- return no_disk_space_rv
- with self._s3_named_file_reader(s3_bucket=bucket, s3_key=file_path) as target_file: # type: ignore
- return getattr(self, f"_read_{file_type}_file")(target_file.name, self.schema, **self.options) # type: ignore
-
- def _write_to_s3_file(self, df: pd.DataFrame):
- """Write a dataframe to s3 based on the {file_type} of the config_io configuration.
-
- The configuration object is expected to have two keys:
-
- - `file_path`
- - `file_type`
-
- To actually write the file, a method is dynamically invoked by name, using "_write_{file_type}_file".
-
- Args:
- df: The dataframe to be written out
- """
- s3_config = self.sources_config.s3
- bucket = s3_config.bucket
- file_path = utils.resolve_template(s3_config.file_path, self.options)
- file_type = s3_config.file_type
-
- logger.info(f"[s3] Started uploading: s3://{bucket}/{file_path}")
- if file_type in ["csv", "json", "parquet"]:
- getattr(self, f"_write_{file_type}_file")(df, f"s3://{bucket}/{file_path}", **self.options) # type: ignore
- elif file_type == "hdf":
- hdf_options = dict(self.options)
- pickle_protocol = hdf_options.pop("pickle_protocol", None)
- with self._s3_writer(s3_bucket=s3_config.bucket, s3_key=file_path) as target_file, utils.pickle_protocol(protocol=pickle_protocol):
- HdfIO().save(df, target_file, hdf_options) # type: ignore
- else:
- raise ValueError(f"File type: {file_type} not supported!")
- logger.info(f"[s3] Finished uploading: s3://{bucket}/{file_path}")
diff --git a/dynamicio/py.typed b/dynamicio/py.typed
deleted file mode 100644
index e69de29..0000000
diff --git a/dynamicio/utils.py b/dynamicio/utils.py
new file mode 100644
index 0000000..9bc6cbb
--- /dev/null
+++ b/dynamicio/utils.py
@@ -0,0 +1,20 @@
+"""Utilities for dynamicio."""
+
+from contextlib import contextmanager
+
+
+@contextmanager
+def pickle_protocol(protocol: int):
+ """Temporarily set `pickle.HIGHEST_PROTOCOL` to `protocol` for the duration of the `with` block.
+
+ Args:
+ protocol: The value assigned to `pickle.HIGHEST_PROTOCOL`; the previous value is restored on exit.
+ """
+ import pickle # pylint: disable=import-outside-toplevel
+
+ previous = pickle.HIGHEST_PROTOCOL  # remembered so the finally-clause can put it back
+ try:
+ pickle.HIGHEST_PROTOCOL = protocol  # module-level attribute: visible to all code in the process while active
+ yield
+ finally:  # runs on normal exit and when the with-body raises
+ pickle.HIGHEST_PROTOCOL = previous
diff --git a/demo/__init__.py b/dynamicio/v5_migration/__init__.py
similarity index 100%
rename from demo/__init__.py
rename to dynamicio/v5_migration/__init__.py
diff --git a/dynamicio/v5_migration/__main__.py b/dynamicio/v5_migration/__main__.py
new file mode 100644
index 0000000..013ae30
--- /dev/null
+++ b/dynamicio/v5_migration/__main__.py
@@ -0,0 +1,8 @@
+# pylint: skip-file
+# noqa
+# type: ignore
+
+
+from dynamicio.v5_migration.app import app
+
+app()
diff --git a/dynamicio/v5_migration/app.py b/dynamicio/v5_migration/app.py
new file mode 100644
index 0000000..1927f4d
--- /dev/null
+++ b/dynamicio/v5_migration/app.py
@@ -0,0 +1,203 @@
+# pylint: skip-file
+# noqa
+# type: ignore
+
+from __future__ import annotations
+
+from copy import deepcopy
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Callable
+
+import typer
+import yaml
+from rich import print as rich_print
+
+from dynamicio.v5_migration.resource_migration import (
+ convert_single_resource_file,
+ is_resource_dict,
+ resources_import_str,
+)
+from dynamicio.v5_migration.schema_migration import convert_single_schema_file, is_schema_dict, schema_import_str
+
+app = typer.Typer()
+
+
+@app.command()
+def convert_everything(source: Path, destination: Path):
+ """Converts every item as far as possible. Paths can be dirs or files."""
+ schemas_source_destination, schemas_to_be_written = gather_schema_migration_actions(source, destination)
+ resources_source_destination, resources_to_be_written = gather_resource_migration_actions(source, destination)
+
+ source_destination_pairs = schemas_source_destination + resources_source_destination
+ files_to_be_written = schemas_to_be_written + resources_to_be_written
+
+ confirm_migration_actions(source_destination_pairs, files_to_be_written)
+ write_files(files_to_be_written)
+
+
+@dataclass
+class SourceDestinationPair:
+ source: Path
+ destination: Path
+
+
+@dataclass
+class FilesToBeWritten:
+ target_file: Path
+ target_content: str
+
+
+@app.command()
+def convert_resources(source: Path, destination: Path):
+    """Converts only resource yamls."""
+    # The helper takes (source, destination) and returns (pairs, files); both orders used to be swapped here.
+    source_destination_pairs, files_to_be_written = gather_resource_migration_actions(source, destination)
+    confirm_migration_actions(source_destination_pairs, files_to_be_written)
+    write_files(files_to_be_written)
+
+
+def gather_resource_migration_actions(source: Path, destination: Path):
+ source_content, source_path = handle_source_path(source)
+ # make lower case 2 levels
+ source_content = {
+ source: {
+ k.lower(): {kk.lower(): vv for kk, vv in v.items()} if isinstance(v, dict) else v
+ for k, v in deepcopy(contents).items()
+ }
+ for source, contents in source_content.items()
+ }
+
+ source_content = {source: contents for source, contents in source_content.items() if is_resource_dict(contents)}
+ source_destination_pairs: list[SourceDestinationPair]
+ files_to_be_written: list[FilesToBeWritten]
+ source_destination_pairs, files_to_be_written = generate_source_destination_actions(
+ source_path,
+ source_content,
+ destination,
+ resources_import_str,
+ convert_single_resource_file,
+ )
+ return source_destination_pairs, files_to_be_written
+
+
+@app.command()
+def convert_schemas(source: Path, destination: Path):
+ """Converts only schemas."""
+ source_destination_pairs, files_to_be_written = gather_schema_migration_actions(source, destination)
+
+ confirm_migration_actions(source_destination_pairs, files_to_be_written)
+ write_files(files_to_be_written)
+
+
+def gather_schema_migration_actions(
+ source: Path, destination: Path
+) -> tuple[list[SourceDestinationPair], list[FilesToBeWritten]]:
+ """Gathers the source destination pairs and files to be written."""
+
+ source_content, source_path = handle_source_path(source)
+ source_content = {source: contents for source, contents in source_content.items() if is_schema_dict(contents)}
+
+ source_destination_pairs: list[SourceDestinationPair]
+ files_to_be_written: list[FilesToBeWritten]
+ source_destination_pairs, files_to_be_written = generate_source_destination_actions(
+ source_path,
+ source_content,
+ destination,
+ schema_import_str,
+ convert_single_schema_file,
+ )
+
+ return source_destination_pairs, files_to_be_written
+
+
+# ------------------
+
+
+def generate_source_destination_actions(
+ source_path: Path,
+ source_content: dict[Path, dict],
+ destination: Path,
+ import_str: str,
+ contents_to_code_conversion_func: Callable[[dict], str],
+) -> tuple[list[SourceDestinationPair], list[FilesToBeWritten]]:
+ """Generates the source destination pairs and files to be written."""
+ source_destination_pairs: list[SourceDestinationPair] = []
+ files_to_be_written: list[FilesToBeWritten] = []
+
+ if not source_content:
+ return [], []
+
+ if destination.suffix == ".py":
+ python_str = import_str
+ for _source, contents in source_content.items():
+ rich_print(f"Converting [green]{len(source_destination_pairs)}[/green]")
+ python_str += contents_to_code_conversion_func(contents)
+ source_destination_pairs.append(SourceDestinationPair(_source, destination))
+
+ files_to_be_written.append(FilesToBeWritten(destination.with_suffix(".py"), python_str))
+
+ elif destination.suffix == "":
+ for _source, contents in source_content.items():
+ python_str = import_str
+ python_str += contents_to_code_conversion_func(contents)
+
+ sub_path = _source.relative_to(source_path)
+ destination_path = destination / sub_path.with_suffix(".py")
+
+ source_destination_pairs.append(SourceDestinationPair(_source, destination_path))
+ files_to_be_written.append(FilesToBeWritten(destination_path, python_str))
+ else:
+ raise ValueError(
+ f"Destination {destination} is not a directory or python file. Found suffix {destination.suffix}."
+ )
+ return source_destination_pairs, files_to_be_written
+
+
+def handle_source_path(source: Path) -> tuple[dict[Path, dict], Path]:
+    """Loads the yaml file(s) at `source` and returns a tuple of (source_content, source_path).
+
+    source_content maps each yaml path to its parsed mapping with lower-cased top-level keys; files whose
+    yaml is not a mapping (e.g. empty files) are left out. source_path is `source` for a directory, else its parent.
+    """
+    if source.is_file():
+        sources = [source]
+        source_path = source.parent
+    elif source.is_dir():
+        sources = list(source.glob("**/*.yaml"))
+        source_path = source
+    else:
+        raise ValueError(f"Source {source} is not a file or directory")
+
+    # read_text() closes each file right after reading, unlike a bare open() handed to the yaml loader.
+    loaded = {path: yaml.safe_load(path.read_text()) for path in sources}
+    content = {p: {k.lower(): v for k, v in d.items()} for p, d in loaded.items() if isinstance(d, dict)}
+    return content, source_path
+
+
+def write_files(files_to_be_written: list[FilesToBeWritten]):
+ """Writes the files to be written."""
+ for write_file in files_to_be_written:
+ write_file.target_file.parent.mkdir(parents=True, exist_ok=True)
+ write_file.target_file.write_text(write_file.target_content)
+ rich_print(
+ f"[red]WARNING [/red][blue]Fix warnings emitted above (if any), some validations may need manual edits.[/blue]"
+ )
+
+
+def confirm_migration_actions(
+ source_destination_pairs: list[SourceDestinationPair],
+ files_to_be_written: list[FilesToBeWritten],
+):
+ """Confirms the migration actions."""
+ rich_print(f"[bold red]Found [green]{len(source_destination_pairs)}[/green] source destination pairs:[/bold red]")
+ for pair in source_destination_pairs:
+ rich_print(f"[blue] - [/blue]{pair.source} -> {pair.destination}")
+
+ rich_print(f"[bold red]Found [green]{len(files_to_be_written)}[/green] files to be written:[/bold red]")
+
+ for write_file in files_to_be_written:
+ loc = write_file.target_content.count("\n")
+ rich_print(f"[bold blue] - [/bold blue]{write_file.target_file} - ({loc} lines of code.)")
+
+ typer.confirm("\nProceed writing?", abort=True)
diff --git a/dynamicio/v5_migration/resource_migration.py b/dynamicio/v5_migration/resource_migration.py
new file mode 100644
index 0000000..a2d61fb
--- /dev/null
+++ b/dynamicio/v5_migration/resource_migration.py
@@ -0,0 +1,86 @@
+# pylint: skip-file
+# noqa
+# type: ignore
+
+
+from __future__ import annotations
+
+from copy import deepcopy
+
+from rich import print as rich_print
+
+from dynamicio.v5_migration.resource_templates import (
+ KafkaTemplate,
+ LocalTemplate,
+ PostgresTemplate,
+ ReadyTemplate,
+ S3Template,
+)
+
+
+def is_resource_dict(candidate_dict: dict) -> bool:
+    """Tells whether every top-level entry is a mapping that has a (case-insensitive) `cloud` key."""
+    lowered = {}
+    for name, body in deepcopy(candidate_dict).items():
+        # Normalise the resource name and, for mappings, the first level of nested keys.
+        lowered[name.lower()] = {key.lower(): val for key, val in body.items()} if isinstance(body, dict) else body
+    for resource_name, resource_body in lowered.items():
+        if not isinstance(resource_body, dict):
+            return False
+        if "cloud" in resource_body:
+            continue
+        rich_print(f"[red]No cloud key in {resource_name} resource - not parsing as resource.[/red]")
+        return False
+    return True
+
+
+def convert_single_resource_file(file_contents: dict) -> str:
+ """Converts a single resource file (yaml) to valid python code without imports."""
+
+ result = convert_resource_dict(file_contents)
+
+ return "\n".join([resource.render_template() for resource in result])
+
+
+def convert_resource_dict(parsed_yaml: dict) -> list[ReadyTemplate]:
+    """Converts a single resource dict to a list of resource templates; unparseable entries are reported."""
+    ready_templates = []
+    for resource_key, resource_dict in parsed_yaml.items():
+        has_parsed = False
+        for resource_type in [S3Template, LocalTemplate, KafkaTemplate, PostgresTemplate]:
+            try:
+                parseable = resource_type.is_dict_parseable(resource_dict)  # type: ignore
+            except (KeyError, TypeError):  # the probe indexed keys this resource lacks: not this kind
+                parseable = False
+            if parseable:
+                try:
+                    ready_templates.append(resource_type.from_dict(resource_dict, f"{resource_key.lower()}_resource"))
+                    has_parsed = True  # sticky: a failure of another template must not undo this success
+                except Exception as e:  # best-effort migration: report the problem and carry on
+                    print(e)
+        if not has_parsed:
+            rich_print(f"Could not parse resource [red]{resource_key}[/red]")
+    return ready_templates
+
+
+def parse_resource_configs(parsed_yaml_entry: dict[str, str]) -> list:
+ """Parses a single resource config dict."""
+ resource_configs = []
+
+ for key, val in parsed_yaml_entry.items():
+ if key == "schema":
+ continue
+
+ for resource_type in [S3Template, LocalTemplate, KafkaTemplate, PostgresTemplate]:
+ if resource_type.is_dict_parseable(val): # type: ignore
+ resource_configs.append(resource_type.from_dict(val, key.lower())) # type: ignore
+
+ return resource_configs
+
+
+resources_import_str = (
+ "from dynamicio import "
+ "ParquetResource, CsvResource, JsonResource, HdfResource,"
+ "S3ParquetResource, S3CsvResource, S3JsonResource, S3HdfResource, "
+ "KafkaResource, PostgresResource\n"
+)
diff --git a/dynamicio/v5_migration/resource_templates.py b/dynamicio/v5_migration/resource_templates.py
new file mode 100644
index 0000000..e6e6e20
--- /dev/null
+++ b/dynamicio/v5_migration/resource_templates.py
@@ -0,0 +1,231 @@
+# pylint: skip-file
+# noqa
+# type: ignore
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import Optional
+
+
+# @dataclass
+# class KeyedResourceTemplate:
+# resources: list
+# resource_name: str
+# template: str = """
+# {resource_name} = KeyedResource(
+# {{
+# {resources}
+# }}
+# )
+# """
+#
+# def render_template(self) -> str:
+# return self.template.format(
+# resource_name=self.resource_name,
+# resources="\n".join([resource.render_own_resource() for resource in self.resources]),
+# )
+
+
+class ReadyTemplate(ABC):
+ @abstractmethod
+ def render_template(self) -> str:
+ raise NotImplementedError
+
+
+s3_file_type_class_map = {
+ "parquet": "S3ParquetResource",
+ "csv": "S3CsvResource",
+ "json": "S3JsonResource",
+ "hdf": "S3HdfResource",
+}
+
+
+@dataclass
+class S3Template(ReadyTemplate):
+ resource_name: str
+ bucket: str
+ file_path: str
+ test_path: Optional[str]
+ class_name: str
+ template: str = """
+{resource_name} = {class_name}(
+ bucket="{bucket}",
+ path="{file_path}"{test_path_str}
+)
+"""
+
+ @classmethod
+ def from_dict(cls, resource_dict: dict[str, ...], resource_name: str) -> "S3Template":
+ file_type = resource_dict["cloud"]["s3"]["file_type"]
+ test_path = resource_dict.get("local", {}).get("local", {}).get("file_path", None)
+ # Warning: if local filetype does not match cloud filetype. This will not work.
+ return cls(
+ resource_name=resource_name,
+ bucket=resource_dict["cloud"]["s3"]["bucket"],
+ file_path=resource_dict["cloud"]["s3"]["file_path"],
+ class_name=s3_file_type_class_map[file_type],
+ test_path=test_path,
+ )
+
+ @staticmethod
+ def is_dict_parseable(resource_dict: dict[str, ...]):
+ return resource_dict["cloud"]["type"] == "s3_file" and resource_dict["cloud"]["s3"]["file_type"] in list(
+ s3_file_type_class_map.keys()
+ )
+
+ def render_template(self) -> str:
+ test_path_str = f',\n test_path="{self.test_path}"' if self.test_path else ""
+ return self.template.format(
+ resource_name=self.resource_name,
+ class_name=self.class_name,
+ bucket=self.bucket,
+ file_path=self.file_path,
+ test_path_str=test_path_str,
+ )
+
+
+FILE_TYPES = ["parquet", "csv", "json", "hdf"]
+
+
+def replace_double_brackets(string: Optional[str]) -> Optional[str]:
+    """Rewrites `[[`/`]]` delimiters as `{`/`}`; returns None when `string` is empty or None."""
+    swapped = string.replace("[[", "{").replace("]]", "}") if string else None
+    return swapped
+
+
+@dataclass
+class LocalTemplate(ReadyTemplate):
+    """Renders a resource whose cloud type is `local` as a `FileResource(...)` assignment."""
+
+    resource_name: str
+    file_path: str
+    file_type: Optional[str]
+    test_path: Optional[str]
+    template: str = """
+{resource_name} = FileResource(
+ path="{file_path}"{test_path_str}{file_type_str}
+)
+"""
+
+    @classmethod
+    def from_dict(cls, resource_dict: dict[str, ...], resource_name: str) -> "LocalTemplate":
+        """Builds the template from a resource dict, rewriting `[[x]]` path placeholders as `{x}`."""
+        test_path = replace_double_brackets(resource_dict.get("local", {}).get("local", {}).get("file_path", None))
+        file_path = replace_double_brackets(resource_dict["cloud"]["local"]["file_path"])
+        file_type = resource_dict["cloud"]["local"]["file_type"]
+        # these can be inferred from the file_path
+        if file_path.endswith(tuple("." + ext for ext in FILE_TYPES)):
+            file_type = None
+        return cls(resource_name=resource_name, file_path=file_path, file_type=file_type, test_path=test_path)
+
+    @staticmethod
+    def is_dict_parseable(resource_dict: dict[str, ...]) -> bool:
+        """Tells whether `resource_dict` is a `local` resource with a supported file type.
+
+        The type is tested first: other resource kinds have no `cloud.local` section, so reading it eagerly raised KeyError.
+        """
+        cloud = resource_dict["cloud"]
+        return cloud["type"] == "local" and cloud["local"]["file_type"] in FILE_TYPES
+
+    def render_template(self) -> str:
+        """Renders the python source for this resource; unset optional arguments are left out."""
+        test_path_str = f',\n test_path="{self.test_path}"' if self.test_path else ""
+        file_type_str = f',\n file_type="{self.file_type}"' if self.file_type else ""
+        return self.template.format(
+            resource_name=self.resource_name,
+            file_type_str=file_type_str,
+            file_path=self.file_path,
+            test_path_str=test_path_str,
+        )
+
+
+@dataclass
+class KafkaTemplate(ReadyTemplate):
+ resource_name: str
+ topic: str
+ server: str
+ test_path: Optional[str]
+ template: str = """
+{resource_name} = KafkaResource(
+ server="{server}",
+ topic="{topic}"{test_path_str}
+)
+"""
+
+ @classmethod
+ def from_dict(cls, resource_dict: dict[str, ...], resource_name: str) -> "KafkaTemplate":
+ test_path = resource_dict.get("local", {}).get("local", {}).get("file_path", None)
+ return cls(
+ resource_name=resource_name,
+ topic=resource_dict["cloud"]["kafka"]["kafka_topic"],
+ server=resource_dict["cloud"]["kafka"]["kafka_server"],
+ test_path=test_path,
+ )
+
+ @staticmethod
+ def is_dict_parseable(resource_dict: dict[str, ...]):
+ return resource_dict["cloud"]["type"] == "kafka"
+
+ def render_template(self) -> str:
+ test_path_str = f',\n test_path="{self.test_path}"' if self.test_path else ""
+ return self.template.format(
+ resource_name=self.resource_name,
+ server=self.server,
+ topic=self.topic,
+ test_path_str=test_path_str,
+ )
+
+
+@dataclass
+class PostgresTemplate(ReadyTemplate):
+ resource_name: str
+ db_host: str
+ db_port: str
+ db_name: str
+ db_user: str
+ db_password: str
+ class_name: str = "PostgresResource"
+ test_path: Optional[str] = None
+ template: str = """
+{resource_name} = {class_name}(
+ db_host="{db_host}",
+ db_port="{db_port}",
+ db_name="{db_name}",
+ db_user="{db_user}",
+ db_password="{db_password}",
+ table_name=None,
+ sql_query=...{test_path_str}
+)
+"""
+
+ @classmethod
+ def from_dict(cls, resource_dict: dict[str, dict[str, str]], resource_name: str) -> "PostgresTemplate":
+ test_path = resource_dict.get("local", {}).get("local", {}).get("file_path", None)
+ return cls(
+ resource_name=resource_name,
+ db_host=resource_dict["cloud"]["postgres"]["db_host"],
+ db_port=resource_dict["cloud"]["postgres"]["db_port"],
+ db_name=resource_dict["cloud"]["postgres"]["db_name"],
+ db_user=resource_dict["cloud"]["postgres"]["db_user"],
+ db_password=resource_dict["cloud"]["postgres"]["db_password"],
+ test_path=test_path,
+ )
+
+ @staticmethod
+ def is_dict_parseable(resource_dict: dict[str, ...]):
+ return resource_dict["cloud"]["type"] == "postgres"
+
+ def render_template(self) -> str:
+ test_path_str = f',\n test_path="{self.test_path}"' if self.test_path else ""
+ return self.template.format(
+ resource_name=self.resource_name,
+ class_name=self.class_name,
+ db_host=self.db_host,
+ db_port=self.db_port,
+ db_name=self.db_name,
+ db_user=self.db_user,
+ db_password=self.db_password,
+ test_path_str=test_path_str,
+ )
diff --git a/dynamicio/v5_migration/schema_migration.py b/dynamicio/v5_migration/schema_migration.py
new file mode 100644
index 0000000..5d7275d
--- /dev/null
+++ b/dynamicio/v5_migration/schema_migration.py
@@ -0,0 +1,498 @@
+# pylint: skip-file
+# noqa
+
+
+from __future__ import annotations
+
+import abc
+import re
+from dataclasses import dataclass
+from string import ascii_lowercase, digits
+from typing import Any
+
+from rich import print as rich_print
+
+from dynamicio.metrics import Metric
+
+schema_import_str = """from datetime import datetime
+
+import pandera as pa
+from pandera import SchemaModel
+from pandera.typing import Series
+from dynamicio.metrics import Metric
+
+"""
+
+_numpy_type_to_pandera_mapping = {
+ r"object": "str",
+ r"float.*": "float",
+ r"int.*": "int",
+ r"datetime.*": "datetime",
+ r"bool": "bool",
+}
+
+
+def is_schema_dict(yaml_schema: dict) -> bool:
+    """Checks if a dict is a schema dict: a `columns` mapping of string names to dicts with a `type` key."""
+    columns = yaml_schema.get("columns") if isinstance(yaml_schema, dict) else None
+    if not isinstance(columns, dict):
+        return False
+    # A malformed column (e.g. `name:` with no body, which yaml parses as None) means "not a schema", not a crash.
+    return all(
+        isinstance(name, str) and isinstance(info, dict) and "type" in info for name, info in columns.items()
+    )
+
+
+def convert_single_schema_file(yaml_contents: dict) -> str:
+ name = yaml_contents["name"]
+ columns = _collect_columns(yaml_contents)
+
+ schema_class = SchemaClass(name=name, columns=columns)
+
+ return schema_class.render_template()
+
+
+class Validation(abc.ABC):
+ @abc.abstractmethod
+ def render_own_template(self) -> str:
+ raise NotImplementedError()
+
+
+@dataclass
+class HasNoNulls(Validation):
+ @staticmethod
+ def is_matched(validation_name: str) -> bool:
+ return validation_name == "has_no_null_values"
+
+ @classmethod
+ def parse_from_dict(cls, candidate: dict[str, Any]) -> "HasNoNulls":
+ return cls()
+
+ def render_own_template(self) -> str:
+ return "nullable=False"
+
+
+@dataclass
+class IsIn(Validation):
+ categories: list[str]
+ match_all: bool
+ template: str = "isin=[{categories}]"
+
+ @staticmethod
+ def is_matched(validation_name: str) -> bool:
+ return validation_name == "is_in"
+
+ @classmethod
+ def parse_from_dict(cls, candidate: dict[str, Any]) -> "IsIn":
+ match_all = candidate["options"].get("match_all", True)
+ if not match_all:
+ rich_print(
+ f"[bold red]The migration of validation `is_in` with `match_all = False` is not supported. "
+ f"`match_all: false` actually means that unique values of column should be equal to given categories, "
+ f"without any missing (yes that sounds the wrong way round). "
+ f"Please implement it manually by specifying the a custom check in your pandera schema "
+ f"as follows: [/bold red]"
+ )
+ rich_print(
+ f"\n"
+ f'@pa.check("column_name")\n'
+ f"def is_in_check(cls, series: Series[str]) -> Series[bool]:\n"
+ f" # Implementation\n"
+ f" return ...\n\n"
+ )
+ return cls(categories=candidate["options"]["categorical_values"], match_all=match_all)
+
+ def render_own_template(self) -> str:
+ if not self.match_all:
+ return ""
+ return self.template.format(categories=",".join(f'"{cat}"' for cat in self.categories))
+
+
+@dataclass
+class HasUniqueValues(Validation):
+ @staticmethod
+ def is_matched(validation_name: str) -> bool:
+ return validation_name == "has_unique_values"
+
+ @classmethod
+ def parse_from_dict(cls, candidate: dict[str, Any]) -> "HasUniqueValues":
+ return cls()
+
+ def render_own_template(self) -> str:
+ return "unique=True"
+
+
+@dataclass
+class IsGreaterThan(Validation):
+ threshold: float
+ template: str = "gt={threshold}"
+
+ @staticmethod
+ def is_matched(validation_name: str) -> bool:
+ return validation_name == "is_greater_than"
+
+ @classmethod
+ def parse_from_dict(cls, candidate: dict[str, Any]) -> "IsGreaterThan":
+ return cls(threshold=candidate["options"]["threshold"])
+
+ def render_own_template(self) -> str:
+ return self.template.format(threshold=self.threshold)
+
+
+@dataclass
+class IsGreaterThanOrEquals(Validation):
+ threshold: float
+ template: str = "ge={threshold}"
+
+ @staticmethod
+ def is_matched(validation_name: str) -> bool:
+ return validation_name == "is_greater_than_or_equal"
+
+ @classmethod
+ def parse_from_dict(cls, candidate: dict[str, Any]) -> "IsGreaterThanOrEquals":
+ return cls(threshold=candidate["options"]["threshold"])
+
+ def render_own_template(self) -> str:
+ return self.template.format(threshold=self.threshold)
+
+
+@dataclass
+class IsLessThan(Validation):
+ threshold: float
+ template: str = "lt={threshold}"
+
+ @staticmethod
+ def is_matched(validation_name: str) -> bool:
+ return validation_name == "is_lower_than"
+
+ @classmethod
+ def parse_from_dict(cls, candidate: dict[str, Any]) -> "IsLessThan":
+ return cls(threshold=candidate["options"]["threshold"])
+
+ def render_own_template(self) -> str:
+ return self.template.format(threshold=self.threshold)
+
+
+@dataclass
+class IsLessThanOrEquals(Validation):
+ threshold: float
+ template: str = "le={threshold}"
+
+ @staticmethod
+ def is_matched(validation_name: str) -> bool:
+ return validation_name == "is_lower_than_or_equal"
+
+ @classmethod
+ def parse_from_dict(cls, candidate: dict[str, Any]) -> "IsLessThanOrEquals":
+ return cls(threshold=candidate["options"]["threshold"])
+
+ def render_own_template(self) -> str:
+ return self.template.format(threshold=self.threshold)
+
+
+@dataclass
+class IsBetween(Validation):
+ min_value: float
+ max_value: float
+ include_min: bool
+ include_max: bool
+ template: str = 'in_range={{"min_value":{min_value}, "max_value":{max_value}, "include_min":{include_min}, "include_max":{include_max}}}'
+
+ @staticmethod
+ def is_matched(validation_name: str) -> bool:
+ return validation_name == "is_between"
+
+ @classmethod
+ def parse_from_dict(cls, candidate: dict[str, Any]) -> "IsBetween":
+ return cls(
+ min_value=candidate["options"]["lower"],
+ max_value=candidate["options"]["upper"],
+ include_min=candidate["options"]["include_left"] if "include_left" in candidate["options"] else False,
+ include_max=candidate["options"]["include_right"] if "include_right" in candidate["options"] else False,
+ )
+
+ def render_own_template(self) -> str:
+ return self.template.format(
+ min_value=self.min_value,
+ max_value=self.max_value,
+ include_min=self.include_min,
+ include_max=self.include_max,
+ )
+
+
+@dataclass
+class HasAcceptablePercentageOfNulls(Validation):
+ @staticmethod
+ def is_matched(validation_name: str) -> bool:
+ template: str = """
+@pa.check("column_name")
+def has_acceptable_percentage_of_nulls_check(cls, series: Series[str]) -> Series[bool]:
+ # Implementation
+ return ...
+ """
+
+ if validation_name == "has_acceptable_percentage_of_nulls":
+ rich_print(
+ f"[bold red]The migration of validation `has_acceptable_percentage_of_nulls` is not supported. "
+ f"Please implement it manually by specifying the a custom check in your pandera schema "
+ f"as follows: [/bold red]"
+ )
+ rich_print(template)
+
+ return False
+
+ def render_own_template(self) -> str:
+ return ""
+
+
+_supported_validations = [
+ HasNoNulls,
+ IsIn,
+ HasUniqueValues,
+ IsGreaterThan,
+ IsGreaterThanOrEquals,
+ IsLessThan,
+ IsLessThanOrEquals,
+ IsBetween,
+ HasAcceptablePercentageOfNulls,
+]
+
+
+class MetricLogger(abc.ABC):
+ @abc.abstractmethod
+ def render_own_template(self) -> str:
+ raise NotImplementedError()
+
+
+class Min(MetricLogger):
+ @staticmethod
+ def is_matched(metric_name: str) -> bool:
+ return metric_name == Metric.MIN.value
+
+ def render_own_template(self) -> str:
+ return "Metric.MIN"
+
+
+class Max(MetricLogger):
+ @staticmethod
+ def is_matched(metric_name: str) -> bool:
+ return metric_name == Metric.MAX.value
+
+ def render_own_template(self) -> str:
+ return "Metric.MAX"
+
+
+class Mean(MetricLogger):
+ @staticmethod
+ def is_matched(metric_name: str) -> bool:
+ return metric_name == Metric.MEAN.value
+
+ def render_own_template(self) -> str:
+ return "Metric.MEAN"
+
+
+class Std(MetricLogger):
+ @staticmethod
+ def is_matched(metric_name: str) -> bool:
+ return metric_name == Metric.STD.value
+
+ def render_own_template(self) -> str:
+ return "Metric.STD"
+
+
+class Variance(MetricLogger):
+ @staticmethod
+ def is_matched(metric_name: str) -> bool:
+ return metric_name == Metric.VARIANCE.value
+
+ def render_own_template(self) -> str:
+ return "Metric.VARIANCE"
+
+
+class Counts(MetricLogger):
+ @staticmethod
+ def is_matched(metric_name: str) -> bool:
+ return metric_name == Metric.COUNTS.value
+
+ def render_own_template(self) -> str:
+ return "Metric.COUNTS"
+
+
+class UniqueCounts(MetricLogger):
+ @staticmethod
+ def is_matched(metric_name: str) -> bool:
+ return metric_name == Metric.UNIQUE_COUNTS.value
+
+ def render_own_template(self) -> str:
+ return "Metric.UNIQUE_COUNTS"
+
+
+class CountsPerLabel(MetricLogger):
+ @staticmethod
+ def is_matched(metric_name: str) -> bool:
+ return metric_name == Metric.COUNTS_PER_LABEL.value
+
+ def render_own_template(self) -> str:
+ return "Metric.COUNTS_PER_LABEL"
+
+
+_supported_metrics = [
+ Min,
+ Max,
+ Mean,
+ Std,
+ Variance,
+ Counts,
+ UniqueCounts,
+ CountsPerLabel,
+]
+
+
+@dataclass
+class Column:
+    """A schema column rendered as a pandera field: `<name>: Series[<type>] = pa.Field(<options>)`.
+
+    Column names that are not usable as python attribute names are replaced by a normalized
+    name, with the original name kept through the field's `alias` option.
+    """
+
+    name: str
+    data_type: str
+    validations: list[Validation]
+    metrics: list[MetricLogger]
+    template_python_compatible = "{name}: Series[{data_type}] = pa.Field({options})"
+    _allowed_chars: str = ascii_lowercase + digits + "_"
+
+    @property
+    def is_python_normalized(self) -> bool:
+        """True when the name can be used verbatim as the attribute name of the generated class."""
+        import keyword  # pylint: disable=import-outside-toplevel
+
+        assert len(self.name) >= 1, "Column name cannot be empty"
+
+        s = self.name
+        is_lowercase = s == s.lower()
+        is_starts_with_alpha = s[0].isalpha()
+        is_only_alpha_num_and_underscore = all(c in self._allowed_chars for c in s)
+
+        # A reserved word such as `from` or `class` passes the character checks but is not a valid attribute name.
+        return is_lowercase and is_starts_with_alpha and is_only_alpha_num_and_underscore and not keyword.iskeyword(s)
+
+    def _python_normalize(self) -> str:
+        """Derives a valid python attribute name from the column name."""
+        import keyword  # pylint: disable=import-outside-toplevel
+
+        # Lowercase the name and replace all non-allowed characters (including spaces) with underscores
+        normalized_name = "".join(c if c in self._allowed_chars else "_" for c in self.name.lower())
+
+        # Make sure the name doesn't begin with a number: drop every leading digit, not just the first one
+        normalized_name = normalized_name.lstrip(digits)
+
+        # Accounts for the edge case when the unnormalized column name consists of digits only,
+        # which results in an empty normalized name
+        if not normalized_name:
+            return f"_{self.name}"
+
+        # Reserved words get a trailing underscore, the usual python convention (`from` -> `from_`)
+        if keyword.iskeyword(normalized_name):
+            return f"{normalized_name}_"
+
+        return normalized_name
+
+    def render_template(self) -> str:
+        """Renders the field definition; a name that had to be normalized is kept via `alias`."""
+        options = [option for option in self._render_options() if option]  # Remove empty options
+
+        if self.is_python_normalized:
+            field_name = self.name
+        else:
+            options.append(f'alias="{self.name}"')
+            field_name = self._python_normalize()
+
+        return self.template_python_compatible.format(
+            name=field_name, data_type=self.data_type, options=",".join(options)
+        )
+
+    def _render_options(self) -> list[str]:
+        """Collects the rendered `pa.Field` options: validations, nullability and logged metrics."""
+        options = [v.render_own_template() for v in self.validations]
+
+        # We default to all fields being nullable unless otherwise specified by the validations
+        if "nullable=False" not in options:
+            options.append("nullable=True")
+
+        # Optionally parse and append the metrics
+        if self.metrics:
+            metrics_template = 'log_statistics={{"metrics": [{metrics}]}}'.format(
+                metrics=",".join([m.render_own_template() for m in self.metrics])
+            )
+            options.append(metrics_template)
+
+        return options
+
+
+@dataclass
+class SchemaClass:
+ name: str
+ columns: list[Column]
+
+ template = """
+class {class_name}(SchemaModel):
+{columns}
+
+ class Config:
+ coerce = True
+ strict = "filter"
+ """
+
+ def _python_normalize(self) -> str:
+ normalized_class_name = ""
+
+ for word in self.name.split("_"):
+ normalized_class_name += word.lower().capitalize()
+
+ return normalized_class_name
+
+ def render_template(self) -> str:
+ rendered_columns = "\n".join([" " + col.render_template() for col in self.columns])
+
+ return self.template.format(
+ class_name=f"{self._python_normalize()}Schema",
+ columns=rendered_columns,
+ )
+
+
+def _collect_columns(yaml_schema) -> list[Column]:
+    """Builds a `Column` for every entry under `columns`, mapping its dtype, validations and metrics."""
+    columns = []
+    for col_name, col_info in yaml_schema["columns"].items():
+        parsed_numpy_dtype = col_info["type"]
+        parsed_validations = []
+        parsed_metrics = []
+
+        # Reset per column, otherwise an unmatched dtype silently reuses the previous column's type.
+        derived_pandera_type = None
+        for candidate_type in _numpy_type_to_pandera_mapping:
+            if re.search(candidate_type, parsed_numpy_dtype) is not None:
+                derived_pandera_type = _numpy_type_to_pandera_mapping[candidate_type]
+
+        if col_info.get("validations"):
+            for validation_name, validation_body in col_info.get("validations").items():
+                for candidate_validation in _supported_validations:
+                    if candidate_validation.is_matched(validation_name):
+                        parsed_validations.append(candidate_validation.parse_from_dict(validation_body))
+
+        if col_info.get("metrics"):
+            for metric_name in col_info["metrics"]:
+                for metric_candidate in _supported_metrics:
+                    if metric_candidate.is_matched(metric_name):
+                        parsed_metrics.append(metric_candidate())
+
+        if derived_pandera_type is None:
+            raise ValueError(f"Could not match the numpy dtype {parsed_numpy_dtype!r} of column {col_name!r} to pandera type")
+
+        columns.append(
+            Column(name=col_name, data_type=derived_pandera_type, validations=parsed_validations, metrics=parsed_metrics)
+        )
+
+    return columns
diff --git a/dynamicio/validations.py b/dynamicio/validations.py
deleted file mode 100644
index 26bd843..0000000
--- a/dynamicio/validations.py
+++ /dev/null
@@ -1,347 +0,0 @@
-"""Implements the Validator class responsible for various generic data validations and metrics generation."""
-import operator
-from typing import Callable, NamedTuple, Set
-
-import pandas as pd # type: ignore
-
-ALL_VALIDATORS = {} # name -> function
-
-
-def validator(func: Callable):
- """A decorator to add the function to the ALL_VALIDATORS dict"""
- name = func.__name__
- assert name not in ALL_VALIDATORS
- ALL_VALIDATORS[name] = func
- return func
-
-
-class ValidationResult(NamedTuple):
- """A NamedTuple for capturing different outputs after a validation."""
-
- valid: bool
- message: str
- value: float
-
-
-@validator
-def has_unique_values(dataset: str, df: pd.DataFrame, column: str) -> ValidationResult:
- """Checks if values in column are unique.
-
- Args:
- dataset: Name fo the dataset_name
- df: A pandas DataFrame
- column: The column to be validated
-
- Returns:
- An instance of ValidationResult where `Validation.Result.valid` is a bool indicate the success of the validation,
- `Validation.Result.message` is a message (usually used in exceptions), and `Validation.Result.value` is no_of_duplicated_elements
- """
- counts = df[column].value_counts()
- if not (counts > 1).any():
- return ValidationResult(valid=True, message=f"{dataset}[{column}] has unique values", value=0)
-
- duplicates = counts[counts > 1].index.to_list()
- return ValidationResult(valid=False, message=f"Values {duplicates} for {dataset}[{column}] are duplicated!", value=len(duplicates))
-
-
-@validator
-def has_no_null_values(dataset: str, df: pd.DataFrame, column: str) -> ValidationResult:
- """Checks if column has any null values (including NaN and NaT values).
-
- Args:
- dataset: Name fo the dataset_name
- df: A pandas DataFrame
- column: The column to be validated
-
- Returns:
- An instance of ValidationResult where `Validation.Result.valid` is a bool indicate the success of the validation,
- `Validation.Result.message` is a message (usually used in exceptions), and `Validation.Result.value` is no_of_nulls
- """
- mask = df[column].isnull()
- no_of_nulls = mask.sum()
- return ValidationResult(valid=not mask.any(), message=f"{dataset}[{column}] has {no_of_nulls} nulls", value=no_of_nulls)
-
-
-@validator
-def has_acceptable_percentage_of_nulls(
- dataset: str,
- df: pd.DataFrame,
- column: str,
- threshold: float,
-) -> ValidationResult:
- """Checks if a provided threshold of max nulls has been exceeded.
-
- Note: For an empty df the validation will always be successful
-
- Args:
- dataset: Name fo the dataset_name
- df: A pandas DataFrame
- column: The column to be validated
- threshold: Maximum allowed threshold
-
- Returns:
- An instance of ValidationResult where `Validation.Result.valid` is a bool indicate the success of the validation,
- `Validation.Result.message` is a message (usually used in exceptions), and `Validation.Result.value` is percentage_of_nulls
- """
- if threshold <= 0 or threshold >= 1:
- raise ValueError(f"Threshold value: {threshold} must be a value between 0 and 1.")
-
- no_of_nulls = df[column].isnull().sum()
- if len(df) == 0:
- percentage_of_nulls = 0
- else:
- percentage_of_nulls = no_of_nulls / len(df)
-
- if percentage_of_nulls < threshold:
- return ValidationResult(
- valid=True,
- message=f"Percentage of nulls of for {dataset}[{column}] is {percentage_of_nulls}",
- value=percentage_of_nulls,
- )
- return ValidationResult(
- valid=False,
- message=f"Percentage of nulls of for {dataset}[{column}] is {percentage_of_nulls} which exceeds threshold: {threshold}",
- value=percentage_of_nulls,
- )
-
-
-@validator
-def is_in(dataset: str, df: pd.DataFrame, column: str, categorical_values: Set[str], match_all: bool = True) -> ValidationResult:
- """Checks if the column only has allowed categorical values as per the set provided.
-
- Note:
- Ignores nulls
-
- Args:
- dataset: Name fo the dataset_name
- df: A DataFrame
- column: The DataFrame column to be validated
- categorical_values: The allowed set of categorical values
- match_all: If True, the categorical values must be a subset of the allowed set, otherwise they must be equal
-
- Returns:
- An instance of ValidationResult where `Validation.Result.valid` is a bool indicate the success of the validation,
- `Validation.Result.message` is a message (usually used in exceptions), and `Validation.Result.value` is no_of_not_acceptable
- """
- unique_values = set(df[column][df[column].notna()].unique())
-
- if match_all:
- return _validate_categoricals_are_a_subset_of_the_acceptable(categorical_values, unique_values, column, dataset, df)
- return _validate_all_acceptable_categoricals_are_present(categorical_values, unique_values, column, dataset, df)
-
-
-@validator
-def _validate_all_acceptable_categoricals_are_present(acceptable_categoricals: Set[str], unique_values: Set[str], column: str, dataset: str, df: pd.DataFrame) -> ValidationResult:
- if unique_values == acceptable_categoricals:
- validation_result = ValidationResult(valid=True, message=f"All acceptable categorical values for {dataset}[{column}] are present", value=0)
- elif unique_values < acceptable_categoricals:
- validation_result = ValidationResult(
- valid=False,
- message=f"Missing categorical values for {dataset}[{column}]: {acceptable_categoricals - unique_values}",
- value=len(acceptable_categoricals - unique_values),
- )
- else:
- count_invalid = (~df[column].isin(acceptable_categoricals)).sum()
- validation_result = ValidationResult(
- valid=False,
- message=f"Values {unique_values - set(acceptable_categoricals)} for {dataset}[{column}] are not acceptable for {count_invalid} cells",
- value=count_invalid,
- )
- return validation_result
-
-
-@validator
-def _validate_categoricals_are_a_subset_of_the_acceptable(acceptable_categoricals: Set[str], unique_values: Set[str], column: str, dataset: str, df: pd.DataFrame) -> ValidationResult:
- if unique_values.issubset(acceptable_categoricals):
- return ValidationResult(valid=True, message=f"Categorical values for {dataset}[{column}] are acceptable", value=0)
- count_invalid = (~df[column].isin(acceptable_categoricals)).sum()
- return ValidationResult(
- valid=False,
- message=f"Values {unique_values - set(acceptable_categoricals)} for {dataset}[{column}] are not acceptable for {count_invalid} cells",
- value=count_invalid,
- )
-
-
-@validator
-def is_greater_than(
- dataset: str,
- df: pd.DataFrame,
- column: str,
- threshold: float,
-) -> ValidationResult:
- """Confirms column values are above a given threshold.
-
- Args:
- dataset: Name fo the dataset_name
- df: A DataFrame
- column: The DataFrame column to be validated
- threshold: A lower bound threshold not to be exceeded
-
- Returns:
- An instance of ValidationResult where `Validation.Result.valid` is a bool indicate the success of the validation,
- `Validation.Result.message` is a message (usually used in exceptions), and `Validation.Result.value` is the
- percentage of invalid values
- """
- no_nulls_for_column_df = df[~df[column].isnull()][column]
- valid = no_nulls_for_column_df > threshold
-
- if valid.all():
- return ValidationResult(valid=True, message=f"All values of {dataset}[{column}] are above {threshold}", value=0)
-
- no_of_invalid = (~valid).sum()
- return ValidationResult(
- valid=False,
- message=f"{no_of_invalid} cell values for {dataset}[{column}] are below {threshold}",
- value=no_of_invalid / len(no_nulls_for_column_df),
- )
-
-
-@validator
-def is_greater_than_or_equal(
- dataset: str,
- df: pd.DataFrame,
- column: str,
- threshold: float,
-) -> ValidationResult:
- """Confirms column values are above a given threshold.
-
- Args:
- dataset: Name fo the dataset_name
- df: A DataFrame
- column: The DataFrame column to be validated
- threshold: A lower bound threshold not to be exceeded
-
- Returns:
- An instance of ValidationResult where `Validation.Result.valid` is a bool indicate the success of the validation,
- `Validation.Result.message` is a message (usually used in exceptions), and `Validation.Result.value` is the
- percentage of invalid values
- """
- no_nulls_for_column_df = df[~df[column].isnull()][column]
- valid = no_nulls_for_column_df >= threshold
-
- if valid.all():
- return ValidationResult(valid=True, message=f"All values of {dataset}[{column}] are above {threshold}", value=0)
-
- no_of_invalid = (~valid).sum()
- return ValidationResult(
- valid=False,
- message=f"{no_of_invalid} cell values for {dataset}[{column}] are below {threshold}",
- value=no_of_invalid / len(no_nulls_for_column_df),
- )
-
-
-@validator
-def is_lower_than(
- dataset: str,
- df: pd.DataFrame,
- column: str,
- threshold: float,
-) -> ValidationResult:
- """Confirms column values are below a given threshold.
-
- IMPORTANT NOTE: Ignores nulls!
-
- Args:
- dataset: Name fo the dataset_name
- df: A DataFrame
- column: The DataFrame column to be validated
- threshold: A lower bound threshold not to be exceeded
-
- Returns:
- An instance of ValidationResult where `Validation.Result.valid` is a bool indicate the success of the validation,
- `Validation.Result.message` is a message (usually used in exceptions), and `Validation.Result.value` is the percentage of
- invalid values
- """
- no_nulls_for_column_df = df[~df[column].isnull()][column]
- valid = no_nulls_for_column_df < threshold # pd.DataFrame
-
- if valid.all():
- return ValidationResult(valid=True, message=f"All values of {dataset}[{column}] are below {threshold}", value=0)
-
- no_of_invalid = (~valid).sum()
- return ValidationResult(
- valid=False,
- message=f"{no_of_invalid} cell values for {dataset}[{column}] are above {threshold}",
- value=no_of_invalid / len(no_nulls_for_column_df),
- )
-
-
-@validator
-def is_lower_than_or_equal(
- dataset: str,
- df: pd.DataFrame,
- column: str,
- threshold: float,
-) -> ValidationResult:
- """Confirms column values are below a given threshold.
-
- IMPORTANT NOTE: Ignores nulls!
-
- Args:
- dataset: Name fo the dataset_name
- df: A DataFrame
- column: The DataFrame column to be validated
- threshold: A lower bound threshold not to be exceeded
-
- Returns:
- An instance of ValidationResult where `Validation.Result.valid` is a bool indicate the success of the validation,
- `Validation.Result.message` is a message (usually used in exceptions), and `Validation.Result.value` is the percentage of
- invalid values
- """
- no_nulls_for_column_df = df[~df[column].isnull()][column]
- valid = no_nulls_for_column_df <= threshold
-
- if valid.all():
- return ValidationResult(valid=True, message=f"All values of {dataset}[{column}] are below {threshold}", value=0)
-
- no_of_invalid = (~valid).sum()
- return ValidationResult(
- valid=False,
- message=f"{no_of_invalid} cell values for {dataset}[{column}] are above {threshold}",
- value=no_of_invalid / len(no_nulls_for_column_df),
- )
-
-
-@validator
-def is_between(
- dataset: str,
- df: pd.DataFrame,
- column: str,
- lower: float,
- upper: float,
- include_left: bool = False,
- include_right: bool = False,
-) -> ValidationResult:
- """Confirms column values are between a lower bound and an upper bound thresholds.
-
- IMPORTANT NOTE: Ignores nulls!
-
- Args:
- dataset: Name fo the dataset_name
- df: A DataFrame
- column: The DataFrame column to be validated
- lower: The lower bound (left)
- upper: The upper bound (right)
- include_left: `left <= df[column]`
- include_right: `df[column] <=right`
-
- Returns:
- An instance of ValidationResult where `Validation.Result.valid` is a bool indicate the success of the validation,
- `Validation.Result.message` is a message (usually used in exceptions), and `Validation.Result.value` is the percentage of
- invalid values
- """
- no_nulls_for_column_df = df[~df[column].isnull()][column]
- lower_bound_operator = operator.ge if include_left else operator.gt
- upper_bound_operator = operator.le if include_right else operator.lt
-
- valid = lower_bound_operator(no_nulls_for_column_df, lower) & upper_bound_operator(no_nulls_for_column_df, upper)
-
- if valid.all():
- return ValidationResult(valid=True, message=f"All values of {dataset}[{column}] is between {lower} and {upper} thresholds", value=0)
-
- no_of_invalid = (~valid).sum()
- return ValidationResult(
- valid=False,
- message=f"{no_of_invalid} cell values for {dataset}[{column}] are either below {lower} or above {upper}",
- value=no_of_invalid / len(no_nulls_for_column_df),
- )
diff --git a/dynamicio/validators.py b/dynamicio/validators.py
new file mode 100644
index 0000000..fa45d12
--- /dev/null
+++ b/dynamicio/validators.py
@@ -0,0 +1 @@
+"""Custom validators for dynamicio, to be used with pandera schemas."""
diff --git a/mypy.ini b/mypy.ini
new file mode 100644
index 0000000..886e142
--- /dev/null
+++ b/mypy.ini
@@ -0,0 +1,4 @@
+[mypy-pandera]
+ignore_errors = True
+[mypy]
+warn_unused_configs = True
diff --git a/pyproject.toml b/pyproject.toml
index 855818d..2386f63 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
[tool.black]
py38 = true
-line-length = 185
+line-length = 120
include = '\.pyi?$'
exclude = '''
(
@@ -30,7 +30,3 @@ exclude = '''
]
addopts = "-p no:warnings"
log_cli = false
-
-[tool.pydocstyle]
-convention = 'google'
-add_ignore = 'D103' # Ignore missing docstring in public function
diff --git a/requirements-dev.txt b/requirements-dev.txt
index e96b75e..9633d9c 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -8,7 +8,6 @@ gitlint==0.17.0
mock==4.0.3
mypy==0.990
pre-commit==2.20.0
-pydocstyle==6.1.1
pylint==2.15.5
pytest-asyncio==0.20.2
pytest-cov==4.0.0
diff --git a/requirements.txt b/requirements.txt
index bb39097..4a6a521 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,14 +4,16 @@ fastparquet>=0.8.0
fsspec==2022.3.0
kafka-python~=2.0.2
logzero>=1.7.0
-magic-logger>=1.0.2
-pandas>=1.2.4
+pandas~=1.2
psycopg2-binary~=2.9.3
pyarrow>=7.0.0
python-json-logger~=2.0.1
-PyYAML~=5.4.1
+PyYAML>=5.4.1
s3fs==0.4.2
simplejson~=3.17.2
SQLAlchemy~=1.4.11
tables~=3.7.0
pydantic~=1.10.2
+pandera~=0.14.5
+typer==0.9.0
+uhura~=1.5.0
diff --git a/tests/conftest.py b/tests/conftest.py
index e03a7c4..1540430 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,913 +1,34 @@
-# pylint: disable=missing-module-docstring, missing-class-docstring, missing-function-docstring,
-import os
-import pickle
-import pickletools
-import tempfile
-from unittest.mock import Mock, patch
-
-import numpy as np
import pandas as pd
import pytest
-from dynamicio import WithS3PathPrefix
-from tests import constants
-from tests.mocking.models import ERModel
-
-TEST_SQL_DIR = os.path.dirname(os.path.abspath(__file__)) + "/test_sql/"
-__pickle_loads = pickle.loads
-
-
-def mock_pickle_loads(data):
- global MAX_PROTO_FOUND # pylint: disable=global-variable-undefined
- op, fst, _ = next(pickletools.genops(data)) # pylint: disable=invalid-name)
- if op.name == "PROTO":
- proto = fst
- MAX_PROTO_FOUND = max(MAX_PROTO_FOUND, proto)
- return __pickle_loads(data)
-
-
-def max_pklproto_hdf(hdf_filename):
- global MAX_PROTO_FOUND # pylint: disable=global-variable-undefined
- MAX_PROTO_FOUND = -1
- with pytest.MonkeyPatch().context() as mocked_context:
- mocked_context.setattr(pickle, "loads", mock_pickle_loads)
- try:
- pd.read_hdf(hdf_filename)
- except ValueError:
- pass
- return MAX_PROTO_FOUND
-
-
-class DummyYaml:
- def __init__(self, path):
- self.path = path
-
- def __repr__(self):
- return f"DummyYaml({self.path!r})"
-
- def __enter__(self):
- return Mock(), None
-
- def __exit__(self, *args):
- return None
-
-
-@pytest.fixture
-def expected_input_yaml_dict():
- return {
- "bindings": {
- "READ_FROM_S3_CSV_ALT": {
- "name": "READ_FROM_S3_CSV_ALT",
- "environments": {
- "LOCAL": {
- "options": {},
- "data_backend_type": "local",
- "local": {
- "file_path": f"{constants.TEST_RESOURCES}/data/input/some_csv_to_read.csv",
- "file_type": "csv",
- },
- },
- "CLOUD": {
- "options": {},
- "data_backend_type": "s3",
- "s3": {
- "file_path": "mock-key",
- "file_type": "csv",
- "bucket": "mock-bucket",
- },
- },
- },
- "dynamicio_schema": None,
- },
- "READ_FROM_S3_CSV": {
- "name": "READ_FROM_S3_CSV",
- "environments": {
- "LOCAL": {
- "options": {},
- "data_backend_type": "local",
- "local": {
- "file_path": f"{constants.TEST_RESOURCES}/data/input/some_csv_to_read.csv",
- "file_type": "csv",
- },
- },
- "CLOUD": {
- "options": {},
- "data_backend_type": "s3",
- "s3": {
- "file_path": "mock-key",
- "file_type": "csv",
- "bucket": "mock-bucket",
- },
- },
- },
- "dynamicio_schema": {
- "name": "read_from_s3_csv",
- "columns": {
- "id": {
- "name": "id",
- "data_type": "int64",
- "validations": [
- {"name": "has_unique_values", "apply": True, "options": {}},
- {
- "name": "has_no_null_values",
- "apply": True,
- "options": {},
- },
- ],
- "metrics": ["UniqueCounts", "Counts"],
- },
- "foo_name": {
- "name": "foo_name",
- "data_type": "object",
- "validations": [
- {
- "name": "has_no_null_values",
- "apply": True,
- "options": {},
- },
- {
- "name": "is_in",
- "apply": True,
- "options": {
- "categorical_values": [
- "class_a",
- "class_b",
- "class_c",
- ]
- },
- },
- ],
- "metrics": ["CountsPerLabel"],
- },
- "bar": {
- "name": "bar",
- "data_type": "int64",
- "validations": [
- {
- "name": "has_no_null_values",
- "apply": True,
- "options": {},
- },
- {
- "name": "is_greater_than",
- "apply": True,
- "options": {"threshold": 1000},
- },
- {
- "name": "is_lower_than",
- "apply": True,
- "options": {"threshold": 2000},
- },
- ],
- "metrics": ["Min", "Max", "Mean", "Std", "Variance"],
- },
- },
- },
- },
- "READ_FROM_S3_JSON": {
- "name": "READ_FROM_S3_JSON",
- "environments": {
- "LOCAL": {
- "options": {},
- "data_backend_type": "local",
- "local": {
- "file_path": f"{constants.TEST_RESOURCES}/data/input/some_json_to_read.json",
- "file_type": "json",
- },
- },
- "CLOUD": {
- "options": {},
- "data_backend_type": "s3",
- "s3": {
- "file_path": "mock-key",
- "file_type": "json",
- "bucket": "mock-bucket",
- },
- },
- },
- "dynamicio_schema": None,
- },
- "READ_FROM_S3_HDF": {
- "name": "READ_FROM_S3_HDF",
- "environments": {
- "LOCAL": {
- "options": {},
- "data_backend_type": "local",
- "local": {
- "file_path": f"{constants.TEST_RESOURCES}/data/input/some_hdf_to_read.h5",
- "file_type": "hdf",
- },
- },
- "CLOUD": {
- "options": {},
- "data_backend_type": "s3",
- "s3": {
- "file_path": "mock-key",
- "file_type": "hdf",
- "bucket": "mock-bucket",
- },
- },
- },
- "dynamicio_schema": None,
- },
- "READ_FROM_S3_PARQUET": {
- "name": "READ_FROM_S3_PARQUET",
- "environments": {
- "LOCAL": {
- "options": {},
- "data_backend_type": "local",
- "local": {
- "file_path": f"{constants.TEST_RESOURCES}/data/input/some_parquet_to_read.parquet",
- "file_type": "parquet",
- },
- },
- "CLOUD": {
- "options": {},
- "data_backend_type": "s3",
- "s3": {
- "file_path": "s3:sample-prefix/mock-key",
- "file_type": "parquet",
- "bucket": "mock-bucket",
- },
- },
- },
- "dynamicio_schema": None,
- },
- "READ_FROM_POSTGRES": {
- "name": "READ_FROM_POSTGRES",
- "environments": {
- "LOCAL": {
- "options": {},
- "data_backend_type": "local",
- "local": {
- "file_path": f"{constants.TEST_RESOURCES}/data/input/some_pg_parquet_to_read.parquet",
- "file_type": "parquet",
- },
- },
- "CLOUD": {
- "options": {},
- "data_backend_type": "postgres",
- "postgres": {
- "db_host": "127.0.0.1",
- "db_port": "17039",
- "db_name": "backend",
- "db_user": "user",
- "db_password": "pass",
- },
- },
- },
- "dynamicio_schema": None,
- },
- "READ_FROM_KAFKA": {
- "name": "READ_FROM_KAFKA",
- "environments": {
- "LOCAL": {
- "options": {},
- "data_backend_type": "local",
- "local": {
- "file_path": f"{constants.TEST_RESOURCES}/data/input/some_parquet_to_read.parquet",
- "file_type": "parquet",
- },
- },
- "CLOUD": {
- "options": {},
- "data_backend_type": "kafka",
- "kafka": {
- "kafka_server": "mock-kafka-server",
- "kafka_topic": "mock-kafka-topic",
- },
- },
- },
- "dynamicio_schema": None,
- },
- "TEMPLATED_FILE_PATH": {
- "name": "TEMPLATED_FILE_PATH",
- "environments": {
- "LOCAL": {
- "options": {},
- "data_backend_type": "local",
- "local": {
- "file_path": f"{constants.TEST_RESOURCES}/data/input/{{file_name_to_replace}}.csv",
- "file_type": "csv",
- },
- },
- "CLOUD": {
- "options": {},
- "data_backend_type": "s3",
- "s3": {
- "file_path": "path/to/{file_name_to_replace}.csv",
- "file_type": "csv",
- "bucket": "mock-bucket",
- },
- },
- },
- "dynamicio_schema": None,
- },
- "READ_FROM_PARQUET_TEMPLATED": {
- "name": "READ_FROM_PARQUET_TEMPLATED",
- "environments": {
- "LOCAL": {
- "options": {},
- "data_backend_type": "local",
- "local": {
- "file_path": f"{constants.TEST_RESOURCES}/data/input/{{file_name_to_replace}}.parquet",
- "file_type": "parquet",
- },
- },
- "CLOUD": {
- "options": {},
- "data_backend_type": "s3",
- "s3": {
- "file_path": "path/to/{file_name_to_replace}.parquet",
- "file_type": "parquet",
- "bucket": "mock-bucket",
- },
- },
- },
- "dynamicio_schema": None,
- },
- "REPLACE_SCHEMA_WITH_DYN_VARS": {
- "name": "REPLACE_SCHEMA_WITH_DYN_VARS",
- "environments": {
- "LOCAL": {
- "options": {},
- "data_backend_type": "local",
- "local": {
- "file_path": f"{constants.TEST_RESOURCES}/data/input/{{file_name_to_replace}}.parquet",
- "file_type": "parquet",
- },
- }
- },
- "dynamicio_schema": {
- "name": "bar",
- "columns": {
- "column_a": {
- "name": "column_a",
- "data_type": "object",
- "validations": [
- {"name": "has_unique_values", "apply": True, "options": {}}
- ],
- "metrics": ["Counts"],
- },
- "column_b": {
- "name": "column_b",
- "data_type": "object",
- "validations": [
- {"name": "has_no_null_values", "apply": True, "options": {}}
- ],
- "metrics": ["CountsPerLabel"],
- },
- "column_c": {
- "name": "column_c",
- "data_type": "float64",
- "validations": [
- {
- "name": "is_greater_than",
- "apply": True,
- "options": {"threshold": 1000},
- }
- ],
- "metrics": [],
- },
- "column_d": {
- "name": "column_d",
- "data_type": "float64",
- "validations": [
- {
- "name": "is_lower_than",
- "apply": True,
- "options": {"threshold": 1000.0},
- }
- ],
- "metrics": ["Min", "Max", "Mean", "Std", "Variance"],
- },
- "0": {
- "name": "0",
- "data_type": "object",
- "validations": [],
- "metrics": [],
- },
- "1": {
- "name": "1",
- "data_type": "object",
- "validations": [],
- "metrics": [],
- },
- },
- },
- },
- }
- }
-
-
-@pytest.fixture
-def expected_s3_csv_local_mapping():
- return {
- "name": "READ_FROM_S3_CSV",
- "environments": {
- "LOCAL": {
- "options": {},
- "data_backend_type": "local",
- "local": {
- "file_path": f"{constants.TEST_RESOURCES}/data/input/some_csv_to_read.csv",
- "file_type": "csv",
- },
- },
- "CLOUD": {
- "options": {},
- "data_backend_type": "s3",
- "s3": {
- "file_path": "mock-key",
- "file_type": "csv",
- "bucket": "mock-bucket",
- },
- },
- },
- "dynamicio_schema": {
- "name": "read_from_s3_csv",
- "columns": {
- "id": {
- "name": "id",
- "data_type": "int64",
- "validations": [
- {"name": "has_unique_values", "apply": True, "options": {}},
- {"name": "has_no_null_values", "apply": True, "options": {}},
- ],
- "metrics": ["UniqueCounts", "Counts"],
- },
- "foo_name": {
- "name": "foo_name",
- "data_type": "object",
- "validations": [
- {"name": "has_no_null_values", "apply": True, "options": {}},
- {
- "name": "is_in",
- "apply": True,
- "options": {
- "categorical_values": ["class_a", "class_b", "class_c"]
- },
- },
- ],
- "metrics": ["CountsPerLabel"],
- },
- "bar": {
- "name": "bar",
- "data_type": "int64",
- "validations": [
- {"name": "has_no_null_values", "apply": True, "options": {}},
- {
- "name": "is_greater_than",
- "apply": True,
- "options": {"threshold": 1000},
- },
- {
- "name": "is_lower_than",
- "apply": True,
- "options": {"threshold": 2000},
- },
- ],
- "metrics": ["Min", "Max", "Mean", "Std", "Variance"],
- },
- },
- },
- }
-
-
-@pytest.fixture
-def expected_s3_csv_cloud_mapping():
- return {
- "name": "read_from_s3_csv",
- "columns": {
- "id": {
- "name": "id",
- "data_type": "int64",
- "validations": [
- {"name": "has_unique_values", "apply": True, "options": {}},
- {"name": "has_no_null_values", "apply": True, "options": {}},
- ],
- "metrics": ["UniqueCounts", "Counts"],
- },
- "foo_name": {
- "name": "foo_name",
- "data_type": "object",
- "validations": [
- {"name": "has_no_null_values", "apply": True, "options": {}},
- {
- "name": "is_in",
- "apply": True,
- "options": {
- "categorical_values": ["class_a", "class_b", "class_c"]
- },
- },
- ],
- "metrics": ["CountsPerLabel"],
- },
- "bar": {
- "name": "bar",
- "data_type": "int64",
- "validations": [
- {"name": "has_no_null_values", "apply": True, "options": {}},
- {
- "name": "is_greater_than",
- "apply": True,
- "options": {"threshold": 1000},
- },
- {
- "name": "is_lower_than",
- "apply": True,
- "options": {"threshold": 2000},
- },
- ],
- "metrics": ["Min", "Max", "Mean", "Std", "Variance"],
- },
- },
- }
-
-
-@pytest.fixture
-def expected_postgres_cloud_mapping():
- return {
- "options": {},
- "data_backend_type": "postgres",
- "postgres": {
- "db_host": "127.0.0.1",
- "db_port": "17039",
- "db_name": "backend",
- "db_user": "user",
- "db_password": "pass",
- },
- }
-
-
-@pytest.fixture
-def expected_s3_parquet_df():
- return pd.read_parquet(f"{constants.TEST_RESOURCES}/data/input/some_parquet_to_read.parquet")
-
-
-@pytest.fixture(scope="class")
-def expected_s3_hdf_file_path():
- return f"{constants.TEST_RESOURCES}/data/input/some_hdf_to_read.h5"
-
-
-@pytest.fixture(scope="class")
-def expected_s3_hdf_df(expected_s3_hdf_file_path): # pylint: disable=redefined-outer-name
- return pd.read_hdf(expected_s3_hdf_file_path)
-
-
-@pytest.fixture
-def expected_s3_json_df():
- return pd.read_json(f"{constants.TEST_RESOURCES}/data/input/some_json_to_read.json", orient="columns")
-
-
-@pytest.fixture
-def expected_s3_csv_df():
- return pd.read_csv(f"{constants.TEST_RESOURCES}/data/input/some_csv_to_read.csv")
-
-
-@pytest.fixture
-def expected_df_with_less_columns():
- df = pd.DataFrame.from_records(
- [
- [1, "name_a"],
- [2, "name_b"],
- [3, "name_a"],
- [4, "name_b"],
- [5, "name_a"],
- [6, "name_b"],
- [7, "name_a"],
- [8, "name_b"],
- [9, "name_a"],
- [10, "name_b"],
- [11, "name_a"],
- [12, "name_b"],
- [13, "name_a"],
- [14, "name_b"],
- [15, "name_a"],
- ],
- columns=["id", "foo_name"],
- )
- return df
-
-
-@pytest.fixture
-def dataset_with_more_columns_than_dictated_in_schema():
- df = pd.DataFrame.from_records(
- [
- [1, "foo_a", 1, 1500, 1600, "pass_through"],
- [2, "foo_b", 2, 1500, 1600, "pass_through"],
- [3, "foo_a", 3, 1500, 1600, "pass_through"],
- [4, "foo_b", 4, 1500, 1600, "pass_through"],
- [5, "foo_a", 5, 1500, 1600, "pass_through"],
- [6, "foo_b", 6, 1500, 1600, "pass_through"],
- [7, "foo_a", 7, 1500, 1600, "pass_through"],
- [8, "foo_b", 8, 1500, 1600, "pass_through"],
- [9, "foo_a", 9, 1500, 1600, "pass_through"],
- [10, "foo_b", 10, 1500, 1600, "pass_through"],
- [11, "foo_a", 11, 1500, 1600, "pass_through"],
- [12, "foo_b", 12, 1500, 1600, "pass_through"],
- [13, "foo_a", 13, 1500, 1600, "pass_through"],
- [14, "foo_b", 14, 1500, 1600, "pass_through"],
- [15, "foo_a", 15, 1500, 1600, "pass_through"],
- ],
- columns=["id", "foo_name", "bar", "start_odometer", "end_odometer", "event_type"],
- )
- return df
-
@pytest.fixture
def test_df():
- df = pd.DataFrame.from_records(
- [
- ["cm_1", "id_1", 1000, "ABC"],
- ["cm_2", "id_2", 1000, "ABC"],
- ["cm_3", "id_3", 1000, "ABC"],
- ],
- columns=["id", "foo", "bar", "baz"],
- )
- return df
-
-
-@pytest.fixture
-def expected_columns():
- return [ERModel.id, ERModel.foo, ERModel.bar, ERModel.baz]
-
-
-@pytest.fixture
-def expected_kwargs_for_read_parquet():
- return {"engine", "columns", "kwargs", "path", "use_nullable_dtypes"}
-
-
-@pytest.fixture
-def expected_value_serializer():
- return {'value_serializer': 'WithKafka._default_value_serializer'}
+ return pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"], "c": [True, False, True]})
@pytest.fixture
-def input_messages_df():
- return pd.DataFrame.from_dict(
- [
- {"id": "message01", "foo": "xxxxxxxx", "bar": 0, "baz": ["a", "b", "c"]},
- {"id": "message02", "foo": "yyyyyyyy", "bar": 1, "baz": ["d", "e", "f"]},
- ]
- )
+def injectable_string():
+ """Return a string containing two brace placeholders, `{var1}` and `{var2}`."""
+ return "{var1}/{var2}"
@pytest.fixture
-def input_schema_definition():
- return {
- "columns": {
- "id": {
- "metrics": ["UniqueCounts", "Counts"],
- "type": "int64",
- "validations": {
- "has_no_null_values": {"apply": True, "options": {}},
- "has_unique_values": {"apply": True, "options": {}},
- },
- },
- "bar": {
- "metrics": ["Min", "Max", "Mean", "Std", "Var"],
- "type": "int64",
- "validations": {
- "has_no_null_values": {"apply": True, "options": {}},
- "is_greater_than": {"apply": True, "options": {"threshold": 1000}},
- "is_lower_than": {"apply": True, "options": {"threshold": 2000}},
- },
- },
- "foo_name": {
- "metrics": None,
- "type": "object",
- "validations": {
- "is_in": {
- "apply": True,
- "options": {"categorical_values": ["class_a", "class_b", "class_c"]},
- },
- "has_no_null_values": {"apply": True, "options": {}},
- },
- },
- },
- "name": "read_from_s3_csv",
- }
-
-
-# @pytest.fixture
-# def expected_schema():
-# return {"id": "int64", "foo_name": "object", "bar": "int64"}
-
-
-@pytest.fixture
-def expected_schema_definition():
- return {
- "name": "READ_FROM_S3_CSV",
- "environments": {
- "LOCAL": {
- "options": {},
- "data_backend_type": "local",
- "local": {
- "file_path": f"{constants.TEST_RESOURCES}/data/input/some_csv_to_read.csv",
- "file_type": "csv",
- },
- },
- "CLOUD": {
- "options": {},
- "data_backend_type": "s3",
- "s3": {
- "file_path": "mock-key",
- "file_type": "csv",
- "bucket": "mock-bucket",
- },
- },
- },
- "dynamicio_schema": {
- "name": "read_from_s3_csv",
- "columns": {
- "id": {
- "name": "id",
- "data_type": "int64",
- "validations": [
- {"name": "has_unique_values", "apply": True, "options": {}},
- {"name": "has_no_null_values", "apply": True, "options": {}},
- ],
- "metrics": ["UniqueCounts", "Counts"],
- },
- "foo_name": {
- "name": "foo_name",
- "data_type": "object",
- "validations": [
- {"name": "has_no_null_values", "apply": True, "options": {}},
- {
- "name": "is_in",
- "apply": True,
- "options": {
- "categorical_values": ["class_a", "class_b", "class_c"]
- },
- },
- ],
- "metrics": ["CountsPerLabel"],
- },
- "bar": {
- "name": "bar",
- "data_type": "int64",
- "validations": [
- {"name": "has_no_null_values", "apply": True, "options": {}},
- {
- "name": "is_greater_than",
- "apply": True,
- "options": {"threshold": 1000},
- },
- {
- "name": "is_lower_than",
- "apply": True,
- "options": {"threshold": 2000},
- },
- ],
- "metrics": ["Min", "Max", "Mean", "Std", "Variance"],
- },
- },
- },
- }
-
-
-@pytest.fixture
-def valid_dataframe():
- return pd.DataFrame.from_dict(
- {
- "id": [3, 2, 1, 0],
- "foo_name": ["class_a", "class_b", "class_c", "class_a"],
- "bar": [1500, 1500, 1500, 1500],
- }
- )
-
-
-@pytest.fixture
-def invalid_dataframe():
- return pd.DataFrame.from_dict(
- {
- "id": [3, 2, 0, 0],
- "foo_name": ["class_a", "class_b", "class_d", "class_a"],
- "bar": [999, 1500, 2500, 1500],
- }
- )
-
-
-@pytest.fixture
-def expected_messages():
- return {
- "has_unique_values",
- "is_in",
- "is_greater_than",
- "is_lower_than",
- }
-
-
-@pytest.fixture
-def input_df():
- return pd.DataFrame.from_records(
- [
- ["event_0", "A", "A", "discharge", 10.01234, pd.NA, pd.Timestamp("2021-03-30"), 100.01234, 5, 5, ],
- ["event_1", "B", "B", "pass_through", 10.01234, None, pd.Timestamp("2021-03-30"), 100.01234, 6, 6, ],
- ["event_2", "A", "A", "load", None, None, pd.NaT, pd.NA, 7, 7],
- ["event_3", "B", "B", "pass_through", 10.01234, 10.01234, pd.Timestamp("2021-03-30"), 100.01234, 8, 8, ],
- ["event_4", "C", pd.NA, "load", 10.01234, 10.01234, pd.Timestamp("2021-03-30"), 100.01234, 9, 9, ],
- ["event_5", "A", "A", "pass_through", 10.01234, 10.01234, pd.Timestamp("2021-03-30"), 100.01234, 8, 8, ],
- ["event_6", "C", "C", "discharge", 10.01234, 10.01234, pd.Timestamp("2021-03-30"), 100.01234, 7, 7, ],
- ["event_7", "A", None, "discharge", 10.01234, 10.01234, pd.Timestamp("2021-03-30"), 100.01234, 6, 6, ],
- ["event_8", None, np.nan, "discharge", 10.01234, 10.01234, pd.Timestamp("2021-03-30"), 100.01234, 5, 5, ],
- ["event_9", "A", "A", "discharge", 10.01234, 10.01234, pd.Timestamp("2021-03-30"), 100.01234, 5, None, ],
- ],
- columns=["id", "category_a", "category_b", "activity", "duration_a", "duration_b", "start_time", "load", "weight_a", "weight_b", ],
- )
-
-
-@pytest.fixture
-def empty_df():
- return pd.DataFrame.from_records(
- [],
- columns=["id", "category_a", "category_b", "activity", "duration_a", "duration_b", "start_time", "load", "weight_a", "weight_b", ],
- )
-
-
-# Mocks
-s3_obj_file_names = ["s3://path/to/obj_1.h5", "s3://path/to/obj_2.h5", "s3://path/to/obj_3.h5"]
-invalid_s3_obj_file_names = ["s3://path/to/.gitkeep", "s3://path/to/obj_2.h5", "s3://path/to/obj_3.h5"]
-local_obj_file_names = ["obj_1.h5", "obj_2.h5", "obj_3.h5"]
-invalid_local_obj_file_names = ["obj_2.h5", "obj_3.h5"]
-
-
-@pytest.fixture
-def mock__read_hdf_file():
- def return_mock_df(path, _schema, **_options):
- path_id_map = {"temp/" + f: i + 1 for i, f in enumerate(local_obj_file_names)}
-
- return pd.DataFrame({"id": [path_id_map[path]], "foo_name": ["class_a"], "bar": [1001]})
-
- with patch.object(WithS3PathPrefix, "_read_hdf_file", side_effect=return_mock_df) as mock:
- yield mock
-
-
-@pytest.fixture
-def mock__read_parquet_file():
- def return_mock_df(path, _schema, **_options):
- path_id_map = {"temp/" + f: i + 1 for i, f in enumerate(local_obj_file_names)}
-
- return pd.DataFrame({"id": [path_id_map[path]], "foo_name": ["class_a"], "bar": [1001]})
-
- with patch.object(WithS3PathPrefix, "_read_parquet_file", side_effect=return_mock_df) as mock:
- yield mock
-
-
-@pytest.fixture
-def mock__read_csv_file():
- def return_mock_df(path, _schema, **_options):
- path_id_map = {"temp/" + f: i + 1 for i, f in enumerate(local_obj_file_names)}
-
- return pd.DataFrame({"id": [path_id_map[path]], "foo_name": ["class_a"], "bar": [1001]})
-
- with patch.object(WithS3PathPrefix, "_read_csv_file", side_effect=return_mock_df) as mock:
- yield mock
-
-
-@pytest.fixture
-def mock__read_json_file():
- def return_mock_df(path, _schema, **_options):
- path_id_map = {"temp/" + f: i + 1 for i, f in enumerate(local_obj_file_names)}
-
- return pd.DataFrame({"id": [path_id_map[path]], "foo_name": ["class_a"], "bar": [1001]})
-
- with patch.object(WithS3PathPrefix, "_read_json_file", side_effect=return_mock_df) as mock:
- yield mock
+def failing_injections():
+ """Return injections that map `var1` to an `Exception` instance and omit `var2`.
+
+ NOTE(review): presumably meant to make injection into the `injectable_string` value fail;
+ confirm against the tests that use it.
+ """
+ return {"var1": Exception()}
@pytest.fixture
-# pylint: disable=invalid-name
-def mock_temporary_directory():
- with patch.object(tempfile, "TemporaryDirectory") as mock:
- mock.return_value.__enter__.return_value = "temp"
- yield mock
+def passing_injections():
+ return {"var1": "hello", "var2": "there"}
-@pytest.fixture
-def mock_listdir():
- with patch.object(os, "listdir", return_value=local_obj_file_names) as mock:
- yield mock
-
-
-@pytest.fixture
-def mock_invalid_listdir():
- with patch.object(os, "listdir", return_value=invalid_local_obj_file_names) as mock:
- yield mock
-
-
-@pytest.fixture
-# pylint: disable=invalid-name
-def mock_parquet_temporary_directory():
- with patch.object(tempfile, "TemporaryDirectory") as mock:
- mock.return_value.__enter__.return_value = os.path.join(constants.TEST_RESOURCES, "data/input/batch/parquet")
- yield mock
-
-
-@pytest.fixture
-# pylint: disable=invalid-name
-def mock_parquet_temporary_directory_w_empty_files():
- with patch.object(tempfile, "TemporaryDirectory") as mock:
- mock.return_value.__enter__.return_value = os.path.join(constants.TEST_RESOURCES, "data/input/batch/parquet_w_empty_files")
- yield mock
+@pytest.fixture(
+ params=[
+ "sample.csv",
+ "sample.parquet",
+ "sample.json",
+ "sample.h5",
+ ]
+)
+def file_name(request):
+ return request.param
diff --git a/tests/constants.py b/tests/constants.py
index 62cb1ec..d22b4e4 100644
--- a/tests/constants.py
+++ b/tests/constants.py
@@ -1,17 +1,5 @@
-"""A module for configuring all dynamic environment variables for testing purposes"""
+"""A module with constants used in tests."""
-import os
+from pathlib import Path
-TEST_RESOURCES = os.path.join(os.path.dirname(os.path.realpath(__file__)), "resources")
-
-# Dynamic Vars
-MOCK_BUCKET = "mock-bucket"
-MOCK_KEY = "mock-key"
-KAFKA_SERVER = "mock-kafka-server"
-KAFKA_TOPIC = "mock-kafka-topic"
-DB_HOST = "127.0.0.1"
-DB_PORT = "17039"
-DB_NAME = "backend"
-DB_USER = "user"
-DB_PASS = "pass"
-LOWER_THAN_LIMIT = 1000
+TEST_FIXTURES = Path(__file__).parent / "fixtures"
diff --git a/tests/fixtures/sample.csv b/tests/fixtures/sample.csv
new file mode 100644
index 0000000..1df431d
--- /dev/null
+++ b/tests/fixtures/sample.csv
@@ -0,0 +1,4 @@
+a,b,c
+1,x,True
+2,y,False
+3,z,True
diff --git a/tests/resources/data/input/batch/not_just_hdf/part_01.h5 b/tests/fixtures/sample.h5
similarity index 99%
rename from tests/resources/data/input/batch/not_just_hdf/part_01.h5
rename to tests/fixtures/sample.h5
index f8f5e23..97ea786 100644
Binary files a/tests/resources/data/input/batch/not_just_hdf/part_01.h5 and b/tests/fixtures/sample.h5 differ
diff --git a/tests/resources/data/input/batch/hdf/part_01.h5 b/tests/fixtures/sample.hdf
similarity index 99%
rename from tests/resources/data/input/batch/hdf/part_01.h5
rename to tests/fixtures/sample.hdf
index f8f5e23..6d5af3c 100644
Binary files a/tests/resources/data/input/batch/hdf/part_01.h5 and b/tests/fixtures/sample.hdf differ
diff --git a/tests/fixtures/sample.json b/tests/fixtures/sample.json
new file mode 100644
index 0000000..e7f1aa0
--- /dev/null
+++ b/tests/fixtures/sample.json
@@ -0,0 +1 @@
+{"a":{"0":1,"1":2,"2":3},"b":{"0":"x","1":"y","2":"z"},"c":{"0":true,"1":false,"2":true}}
\ No newline at end of file
diff --git a/tests/fixtures/sample.parquet b/tests/fixtures/sample.parquet
new file mode 100644
index 0000000..46d7e96
Binary files /dev/null and b/tests/fixtures/sample.parquet differ
diff --git a/tests/fixtures/schemas.py b/tests/fixtures/schemas.py
new file mode 100644
index 0000000..aecdab9
--- /dev/null
+++ b/tests/fixtures/schemas.py
@@ -0,0 +1,10 @@
+# pylint: disable=missing-module-docstring, missing-class-docstring, missing-function-docstring, R0801
+
+from pandera import SchemaModel
+from pandera.typing import Series
+
+
+class SampleSchema(SchemaModel):
+ a: Series[int]
+ b: Series[str]
+ c: Series[bool]
diff --git a/tests/mocking/__init__.py b/tests/mocking/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/tests/mocking/io.py b/tests/mocking/io.py
deleted file mode 100644
index 8ed59fb..0000000
--- a/tests/mocking/io.py
+++ /dev/null
@@ -1,188 +0,0 @@
-# pylint: disable=missing-class-docstring, missing-module-docstring, missing-function-docstring
-
-from dynamicio import UnifiedIO
-from dynamicio.core import SCHEMA_FROM_FILE
-
-
-class ReadS3IO(UnifiedIO):
- schema = {"id": "int64"}
-
-
-class ReadMockS3CsvIO(UnifiedIO):
- schema = SCHEMA_FROM_FILE
-
-
-class TemplatedFile(UnifiedIO):
- schema = {"id": "int64", "foo_name": "object", "bar": "int64"}
-
-
-class ReadLocalParquetTemplated(UnifiedIO):
- schema = {"id": "int64", "foo_name": "object", "bar": "int64"}
-
-
-class ReadS3CsvIO(UnifiedIO):
- schema = SCHEMA_FROM_FILE
-
-
-class ReadS3DataWithLessColumnsIO(UnifiedIO):
- schema = {"id": "int64", "foo_name": "object"}
-
-
-class ReadS3DataWithFalseTypes(UnifiedIO):
- schema = {"id": "float64", "foo_name": "object"}
-
-
-class ReadS3DataWithLessColumnsAndMessedOrderOfColumnsIO(UnifiedIO):
- schema = {
- "bar": "int64",
- "foo_name": "object",
- "a_number": "int64",
- "b_number": "int64",
- "bar_type": "object",
- }
-
-
-class ReadS3ParquetIO(UnifiedIO):
- schema = {"id": "int64", "foo_name": "object", "bar": "int64"}
-
-
-class ReadS3ParquetWEmptyFilesIO(UnifiedIO):
- schema = {"id": "object", "bar": "int64"}
-
-
-class ReadS3ParquetWithLessColumnsIO(UnifiedIO):
- schema = {"id": "int64", "foo_name": "object"}
-
-
-class ReadS3HdfIO(UnifiedIO):
- schema = {"id": "int64", "foo_name": "object", "bar": "int64"}
-
-
-class AsyncReadS3HdfIO(UnifiedIO):
- schema = {"col_1": "int64", "col_2": "object"}
-
-
-class ReadS3JsonIO(UnifiedIO):
- schema = {"id": "int64", "foo_name": "object", "bar": "int64"}
-
-
-class WriteS3ParquetIO(UnifiedIO):
- schema = {"col_1": "int64", "col_2": "object"}
-
-
-class WriteS3ParquetExternalIO(UnifiedIO):
- schema = {
- "bar": "int64",
- "event_type": "object",
- "id": "int64",
- "end_odometer": "int64",
- "foo_name": "object",
- }
-
-
-class WriteS3CsvIO(UnifiedIO):
- schema = {"id": "int64", "foo_name": "object", "bar": "int64"}
-
-
-class WriteS3CsvWithSchema(UnifiedIO):
- schema = SCHEMA_FROM_FILE
-
-
-class WriteS3HdfIO(UnifiedIO):
- schema = {"col_1": "int64", "col_2": "object"}
-
-
-class WriteS3JsonIO(UnifiedIO):
- schema = {"col_1": "int64", "col_2": "object"}
-
-
-class ReadPostgresIO(UnifiedIO):
- schema = {"id": "object", "foo": "object", "bar": "int64", "baz": "object"}
-
-
-class WritePostgresIO(UnifiedIO):
- schema = {"id": "object", "foo": "object", "bar": "int64", "baz": "object"}
-
-
-class WriteExtendedPostgresIO(UnifiedIO):
- schema = {"id": "object", "foo": "object", "bar": "int64", "start_date": "datetime64[ns]", "active": "bool", "net": "float64"}
-
-
-class WriteKafkaIO(UnifiedIO):
- schema = {"id": "object", "foo": "object", "bar": "int64", "baz": "object"}
-
-
-class WriteKeyedKafkaIO(UnifiedIO):
- schema = {"key": "object", "id": "object", "foo": "object", "bar": "int64", "baz": "object"}
-
-
-class MockKafkaProducer:
- def __init__(self):
- self.my_stream = []
-
- def send(self, topic: str, value: dict, key: str = None): # pylint: disable=unused-argument
- self.my_stream.append({"key": key, "value": value})
-
- def flush(self):
- pass
-
- def close(self):
- pass
-
-
-class ReadS3ParquetWithDifferentCastableDTypeIO(UnifiedIO):
- schema = {"id": "int64", "foo_name": "object", "bar": "int64"}
-
- # Input format of some_parquet_to_read.parquet is:
- # id,foo_name,bar
- # 1,foo_a,1
- # 2,foo_b,2
- # ...
- # 15,foo_a,15
-
-
-class ReadS3ParquetWithDifferentNonCastableDTypeIO(UnifiedIO):
- schema = {"id": "int64", "foo_name": "int64", "bar": "int64"}
-
- # Input format of some_parquet_to_read.parquet is:
- # id,foo_name,bar
- # 1,foo_a,1
- # 2,foo_b,2
- # ...
- # 15,foo_a,15
-
-
-class ReadFromBatchLocalParquet(UnifiedIO):
- schema = {"id": "int64", "foo_name": "object", "bar": "int64"}
-
-
-class ReadFromBatchLocalHdf(UnifiedIO):
- schema = {"id": "int64", "foo_name": "object", "bar": "int64"}
-
-
-class ParquetWithSomeBool(UnifiedIO):
- schema = {"id": "int64", "foo_name": "object", "bar": "int64", "bool_col": "bool"}
-
-
-class CsvWithSomeBool(UnifiedIO):
- schema = {"id": "int64", "foo_name": "object", "bar": "int64", "bool_col": "bool"}
-
-
-class HdfWithSomeBool(UnifiedIO):
- schema = {"id": "int64", "foo_name": "object", "bar": "int64", "bool_col": "bool"}
-
-
-class JsonWithSomeBool(UnifiedIO):
- schema = {"id": "int64", "foo_name": "object", "bar": "int64", "bool_col": "bool"}
-
-
-class ParquetWithCustomValidate(UnifiedIO):
- schema = {"id": "int64", "foo_name": "object", "bar": "int64", "bool_col": "bool"}
-
- @staticmethod
- def validate(df):
- if not df["id"].is_unique:
- return False
- if df["bar"].isna().any():
- return False
- return True
diff --git a/tests/mocking/models.py b/tests/mocking/models.py
deleted file mode 100644
index e19d09f..0000000
--- a/tests/mocking/models.py
+++ /dev/null
@@ -1,33 +0,0 @@
-"""A module for defining sql_alchemy models."""
-# pylint: disable=too-few-public-methods, R0801, C0104
-__all__ = ["ERModel"]
-
-from sqlalchemy import Column, Integer, String
-from sqlalchemy.ext.declarative import declarative_base
-
-Base = declarative_base()
-
-
-class ERModel(Base):
- """
- Sql_alchemy model for example table
-
- """
-
- __tablename__ = "example"
-
- id = Column(String, primary_key=True)
- foo = Column(String)
- bar = Column(Integer)
- baz = Column(String)
-
-
-clsdict = {
- "clsname": "PgModel",
- "__tablename__": "pg",
- "id": Column(String(64), primary_key=True, nullable=False),
- "foo": Column(String(64)),
- "bar": Column(Integer()),
- "baz": Column(String(64)),
-}
-PgModel = type(clsdict["clsname"], (Base,), clsdict)
diff --git a/tests/resource_tests/test_kafka.py b/tests/resource_tests/test_kafka.py
new file mode 100644
index 0000000..cfaea28
--- /dev/null
+++ b/tests/resource_tests/test_kafka.py
@@ -0,0 +1,52 @@
+# flake8: noqa: I101
+
+from unittest.mock import MagicMock, call, patch
+
+import pytest
+
+from dynamicio import KafkaResource
+
+
+@pytest.fixture
+def mocked_kafka_producer():
+    """Patch ``KafkaProducer`` in ``dynamicio.io.kafka`` and yield the instance it will return."""
+    producer = MagicMock()  # distinct local name: does not shadow the fixture function itself
+    with patch("dynamicio.io.kafka.KafkaProducer", return_value=producer):
+        yield producer
+
+
+@pytest.fixture
+def kafka_resource() -> KafkaResource:
+ return KafkaResource(topic="test_topic", server="test_server")
+
+
+def test_kafka_resource_write(test_df, kafka_resource, mocked_kafka_producer):
+ kafka_resource.write(test_df)
+ mocked_kafka_producer.send.assert_has_calls(
+ [
+ call("test_topic", key=0, value={"a": 1, "b": "x", "c": True}),
+ call("test_topic", key=1, value={"a": 2, "b": "y", "c": False}),
+ call("test_topic", key=2, value={"a": 3, "b": "z", "c": True}),
+ ]
+ )
+
+
+def test_kafka_resource_read(kafka_resource):
+ with pytest.raises(NotImplementedError):
+ kafka_resource.read()
+
+
+def test_kafka_inject_success(kafka_resource, passing_injections, test_df, mocked_kafka_producer):
+ kafka_resource.topic = "{var1}"
+ kafka_resource.server = "{var2}"
+ kafka_resource = kafka_resource.inject(**passing_injections)
+ assert kafka_resource.topic == passing_injections["var1"]
+ assert kafka_resource.server == passing_injections["var2"]
+ kafka_resource.write(test_df)
+ mocked_kafka_producer.send.assert_has_calls(
+ [
+ call(passing_injections["var1"], key=0, value={"a": 1, "b": "x", "c": True}),
+ call(passing_injections["var1"], key=1, value={"a": 2, "b": "y", "c": False}),
+ call(passing_injections["var1"], key=2, value={"a": 3, "b": "z", "c": True}),
+ ]
+ )
diff --git a/tests/resource_tests/test_local_file.py b/tests/resource_tests/test_local_file.py
new file mode 100644
index 0000000..47c0b82
--- /dev/null
+++ b/tests/resource_tests/test_local_file.py
@@ -0,0 +1,28 @@
+import pandas as pd
+
+from dynamicio.io import LocalFileResource
+from tests import constants
+from tests.fixtures.schemas import SampleSchema
+
+
+def test_read(test_df, file_name):
+ resource = LocalFileResource(path=constants.TEST_FIXTURES / file_name)
+ df = resource.read()
+ pd.testing.assert_frame_equal(df, test_df)
+
+
+def test_read_with_schema(test_df, file_name):
+ resource = LocalFileResource(path=constants.TEST_FIXTURES / file_name, pa_schema=SampleSchema)
+ df = resource.read()
+ pd.testing.assert_frame_equal(df, test_df)
+
+
+def test_write(test_df, tmpdir, file_name):
+ resource = LocalFileResource(path=tmpdir / file_name)
+ resource.write(test_df)
+ # reading should probably not be done with the config here
+ df = resource.read()
+ pd.testing.assert_frame_equal(df, test_df)
+
+
+# TODO: test float json thing
diff --git a/tests/resource_tests/test_postgres.py b/tests/resource_tests/test_postgres.py
new file mode 100644
index 0000000..4543cf8
--- /dev/null
+++ b/tests/resource_tests/test_postgres.py
@@ -0,0 +1,241 @@
+from unittest.mock import ANY, MagicMock, Mock, patch
+
+import pandas as pd
+import pytest
+
+from dynamicio import PostgresResource
+from tests import constants
+from tests.fixtures.schemas import SampleSchema
+
+sample_path = f"{constants.TEST_FIXTURES}/sample.parquet"
+
+
+@pytest.fixture
+def postgres_table_resource() -> PostgresResource:
+ return PostgresResource(
+ db_user="test_user",
+ db_host="test_host",
+ db_port=1234,
+ db_name="test_db",
+ db_schema="republic",
+ table_name="test_table",
+ )
+
+
+@pytest.fixture
+def postgres_query_resource() -> PostgresResource:
+ return PostgresResource(
+ db_user="test_user",
+ db_host="test_host",
+ db_port=1234,
+ db_name="test_db",
+ db_schema="republic",
+ sql_query="SELECT * FROM other_table",
+ )
+
+
+@pytest.fixture
+def mock_cursor():
+ return MagicMock()
+
+
+@pytest.fixture
+def mock_binding():
+ return "mock_binding"
+
+
+@pytest.fixture
+def mocked_session(mock_cursor, mock_binding):
+ mock_session = MagicMock()
+ mock_session.connection.return_value.connection.cursor.return_value = mock_cursor
+ mock_session.get_bind.return_value = mock_binding
+ mock_session_maker = Mock(return_value=mock_session)
+ with patch("dynamicio.io.postgres.Session", mock_session_maker):
+ yield mock_session
+
+
+@pytest.fixture
+def postgres_df(postgres_table_resource) -> pd.DataFrame:
+ return pd.read_parquet(sample_path)
+
+
+@pytest.fixture
+def read_sql_mock(postgres_df):
+ with patch("pandas.read_sql", return_value=postgres_df) as mock:
+ yield mock
+
+
+@pytest.fixture
+def to_sql_mock(postgres_df):
+ with patch("pandas.DataFrame.to_sql", return_value=None) as mock:
+ yield mock
+
+
+def test_postgres_resource_read(postgres_table_resource, postgres_df, read_sql_mock, mocked_session, mock_binding):
+ df = postgres_table_resource.read()
+ read_sql_mock.assert_called_once_with(sql="SELECT * FROM republic.test_table", con=mock_binding)
+ pd.testing.assert_frame_equal(df, postgres_df)
+
+
+def test_postgres_resource_read_with_schema(postgres_df, read_sql_mock, mocked_session, mock_binding):
+ resource = PostgresResource(
+ db_user="test_user",
+ db_host="test_host",
+ db_port=1234,
+ db_name="test_db",
+ db_schema="republic",
+ table_name="test_table",
+ pa_schema=SampleSchema,
+ )
+ df = resource.read()
+ read_sql_mock.assert_called_once_with(sql="SELECT * FROM republic.test_table", con=mock_binding)
+ pd.testing.assert_frame_equal(df, postgres_df)
+
+
+def test_postgres_resource_read_without_application_name():
+    """``read()`` calls ``session_scope`` with ``None`` as the application name when none is configured."""
+    mocked_session_scope = MagicMock()
+    with patch("dynamicio.io.postgres.session_scope", mocked_session_scope):
+        resource = PostgresResource(
+            db_user="test_user",
+            db_host="test_host",
+            db_port=1234,
+            db_name="test_db",
+            db_schema="republic",
+            table_name="test_table",
+            pa_schema=SampleSchema,
+        )
+        try:
+            resource.read()  # the returned frame is irrelevant; only the session_scope call is asserted
+        except Exception:  # deliberate best-effort: read() may fail against the MagicMock session
+            pass
+    mocked_session_scope.assert_called_once_with("postgresql://test_user@test_host:1234/test_db", None)
+
+
+def test_postgres_resource_read_with_application_name():
+    """``read()`` forwards the configured ``application_name`` to ``session_scope``."""
+    mocked_session_scope = MagicMock()
+    with patch("dynamicio.io.postgres.session_scope", mocked_session_scope):
+        resource = PostgresResource(
+            db_user="test_user",
+            db_host="test_host",
+            db_port=1234,
+            db_name="test_db",
+            db_schema="republic",
+            table_name="test_table",
+            pa_schema=SampleSchema,
+            application_name="test_app",
+        )
+        try:
+            resource.read()  # the returned frame is irrelevant; only the session_scope call is asserted
+        except Exception:  # deliberate best-effort: read() may fail against the MagicMock session
+            pass
+    mocked_session_scope.assert_called_once_with("postgresql://test_user@test_host:1234/test_db", "test_app")
+
+
+class PgFilterSampleSchema(SampleSchema):
+ class Config:
+ strict = "filter"
+
+
+def test_postgres_resource_read_with_filter_schema(
+ postgres_table_resource, postgres_df, read_sql_mock, mocked_session, mock_binding
+):
+ postgres_table_resource.pa_schema = PgFilterSampleSchema
+ df = postgres_table_resource.read()
+ read_sql_mock.assert_called_once_with(
+ sql="SELECT a, b, c FROM republic.test_table",
+ con=mock_binding,
+ )
+ pd.testing.assert_frame_equal(df, postgres_df)
+
+
+def test_postgres_query_resource_read(
+ postgres_query_resource, postgres_df, read_sql_mock, mocked_session, mock_binding
+):
+ df = postgres_query_resource.read()
+ read_sql_mock.assert_called_once_with(sql="SELECT * FROM other_table", con=mock_binding)
+ pd.testing.assert_frame_equal(df, postgres_df)
+
+
+# --- Write tests ---
+
+
+def test_postgres_resource_write(
+ postgres_table_resource, postgres_df, to_sql_mock, mocked_session, mock_binding, mock_cursor
+):
+ postgres_table_resource.write(postgres_df)
+ to_sql_mock.assert_called_once_with(
+ name="test_table", con=mock_binding, if_exists="replace", index=False, schema="republic"
+ )
+
+
+def test_postgres_resource_write_truncate_and_append(
+ postgres_table_resource, postgres_df, to_sql_mock, mocked_session, mock_binding, mock_cursor
+):
+ postgres_table_resource.truncate_and_append = True
+ postgres_table_resource.write(postgres_df)
+ mocked_session.execute.assert_called_once_with("TRUNCATE TABLE republic.test_table;")
+ mock_cursor.execute.assert_called_once_with("SET search_path TO republic;")
+ mock_cursor.copy_from.assert_called_once_with(ANY, "test_table", columns=postgres_df.columns, null="")
+
+
+def test_postgres_resource_inject_and_read(postgres_df, read_sql_mock, mocked_session, mock_binding):
+ resource = PostgresResource(
+ db_user="{db_user}",
+ db_host="{db_host}",
+ db_port=1234,
+ db_name="that_{db_name}",
+ db_schema="{republic}",
+ table_name="{table}",
+ )
+ resource = resource.inject(
+ db_user="test_user", db_host="test_host", db_name="test_db", table="test_table", republic="republic"
+ )
+ df = resource.read()
+ read_sql_mock.assert_called_once_with(sql="SELECT * FROM republic.test_table", con=mock_binding)
+ pd.testing.assert_frame_equal(df, postgres_df)
+
+
+def test_postgres_resource_inject_and_read_query(postgres_df, read_sql_mock, mocked_session, mock_binding):
+ resource = PostgresResource(
+ db_user="{db_user}",
+ db_host="{db_host}",
+ db_port=1234,
+ db_name="that_{db_name}",
+ db_schema="{republic}",
+ sql_query="SELECT * FROM {republic}.{table}",
+ )
+ resource = resource.inject(
+ db_user="test_user", db_host="test_host", db_name="test_db", table="test_table", republic="republic"
+ )
+ df = resource.read()
+ read_sql_mock.assert_called_once_with(sql="SELECT * FROM republic.test_table", con=mock_binding)
+ pd.testing.assert_frame_equal(df, postgres_df)
+
+
+def test_postgres_resource_raises_on_wrong_read_configuration(postgres_df, read_sql_mock, mocked_session, mock_binding):
+ resource = PostgresResource(
+ db_user="test_user",
+ db_host="test_host",
+ db_port=1234,
+ db_name="test_db",
+ table_name="test_table",
+ sql_query="SELECT * FROM other_table",
+ )
+ with pytest.raises(ValueError):
+ resource.read()
+
+
+def test_postgres_resource_raises_on_wrong_write_configuration(
+ postgres_df, read_sql_mock, mocked_session, mock_binding, to_sql_mock, mock_cursor
+):
+ resource = PostgresResource(
+ db_user="test_user",
+ db_host="test_host",
+ db_port=1234,
+ db_name="test_db",
+ sql_query="SELECT * FROM other_table",
+ )
+ with pytest.raises(ValueError):
+ resource.write(postgres_df)
diff --git a/tests/resource_tests/test_s3.py b/tests/resource_tests/test_s3.py
new file mode 100644
index 0000000..245f275
--- /dev/null
+++ b/tests/resource_tests/test_s3.py
@@ -0,0 +1,75 @@
+from contextlib import contextmanager
+from typing import Generator
+
+import boto3
+import pandas as pd
+import pytest
+from botocore.stub import Stubber
+from unittest.mock import patch
+
+from dynamicio.io import S3Resource
+from tests import constants
+from tests.fixtures.schemas import SampleSchema
+
+
+@pytest.fixture
+def with_s3_stubber():
+    """Yield an S3 client wrapped in an active ``Stubber`` while ``boto3.client`` is patched."""
+    s3_client = boto3.client("s3")  # built before the patch below, so this is a real botocore client
+    # Enter the Stubber so it is actually active: a call with no queued response raises instead of reaching AWS.
+    with Stubber(s3_client), patch("boto3.client"):
+        yield s3_client
+
+
+@pytest.fixture
+def with_mocked_named_reader():
+    """Patch ``dynamicio.io.s3.s3_named_file_reader`` with a stand-in whose ``name`` is ``<bucket>/<key>``."""
+    @contextmanager
+    def mocked_named_reader(s3_client, s3_bucket: str, s3_key: str) -> Generator:
+        target_file = type("MockNamedTemporaryFile", (object,), {"name": s3_bucket + "/" + s3_key})()
+        yield target_file
+
+    with patch("dynamicio.io.s3.s3_named_file_reader", new=mocked_named_reader) as target:
+        yield target
+
+
+@contextmanager
+def mocked_s3_generator(s3_client, s3_bucket: str, s3_key: str) -> Generator:
+ yield s3_bucket + "/" + s3_key
+
+
+@pytest.fixture
+def with_mocked_reader():  # swaps dynamicio.io.s3.s3_reader for mocked_s3_generator while the test runs
+    with patch("dynamicio.io.s3.s3_reader", new=mocked_s3_generator) as target:  # plain literal: nothing to interpolate
+        yield target
+
+
+@pytest.fixture
+def with_mocked_writer():  # swaps dynamicio.io.s3.s3_writer for mocked_s3_generator while the test runs
+    with patch("dynamicio.io.s3.s3_writer", new=mocked_s3_generator) as target:  # plain literal: nothing to interpolate
+        yield target
+
+
+@pytest.fixture(autouse=True)
+def with_mocked_s3(with_mocked_named_reader, with_mocked_reader, with_mocked_writer, with_s3_stubber):
+ yield
+
+
+def test_read(test_df, file_name):
+ resource = S3Resource(bucket=str(constants.TEST_FIXTURES), path=file_name)
+ df = resource.read()
+ pd.testing.assert_frame_equal(df, test_df)
+
+
+def test_read_with_schema(test_df, file_name):
+ resource = S3Resource(bucket=str(constants.TEST_FIXTURES), path=file_name, pa_schema=SampleSchema)
+ df = resource.read()
+ pd.testing.assert_frame_equal(df, test_df)
+
+
+def test_write(test_df, tmpdir, file_name):
+ resource = S3Resource(bucket=str(tmpdir), path=file_name)
+ resource.write(test_df)
+ # reading should probably not be done with the config here
+ df = resource.read()
+ pd.testing.assert_frame_equal(df, test_df)
diff --git a/tests/resources/data/external/h5_with_more_columns.h5 b/tests/resources/data/external/h5_with_more_columns.h5
deleted file mode 100644
index 37b4fa4..0000000
Binary files a/tests/resources/data/external/h5_with_more_columns.h5 and /dev/null differ
diff --git a/tests/resources/data/external/json_with_more_columns.json b/tests/resources/data/external/json_with_more_columns.json
deleted file mode 100644
index 75bd97c..0000000
--- a/tests/resources/data/external/json_with_more_columns.json
+++ /dev/null
@@ -1,104 +0,0 @@
-{
- "id":{
- "0":1,
- "1":2,
- "2":3,
- "3":4,
- "4":5,
- "5":6,
- "6":7,
- "7":8,
- "8":9,
- "9":10,
- "10":11,
- "11":12,
- "12":13,
- "13":14,
- "14":15
- },
- "foo_name":{
- "0":"foo_a",
- "1":"foo_b",
- "2":"foo_a",
- "3":"foo_b",
- "4":"foo_a",
- "5":"foo_b",
- "6":"foo_a",
- "7":"foo_b",
- "8":"foo_a",
- "9":"foo_b",
- "10":"foo_a",
- "11":"foo_b",
- "12":"foo_a",
- "13":"foo_b",
- "14":"foo_a"
- },
- "bar":{
- "0":1,
- "1":2,
- "2":3,
- "3":4,
- "4":5,
- "5":6,
- "6":7,
- "7":8,
- "8":9,
- "9":10,
- "10":11,
- "11":12,
- "12":13,
- "13":14,
- "14":15
- },
- "bar_type":{
- "0":"my-type",
- "1":"my-type",
- "2":"my-type",
- "3":"my-type",
- "4":"my-type",
- "5":"my-type",
- "6":"my-type",
- "7":"my-type",
- "8":"my-type",
- "9":"my-type",
- "10":"my-type",
- "11":"my-type",
- "12":"my-type",
- "13":"my-type",
- "14":"my-type"
- },
- "a_number":{
- "0":1500,
- "1":1500,
- "2":1500,
- "3":1500,
- "4":1500,
- "5":1500,
- "6":1500,
- "7":1500,
- "8":1500,
- "9":1500,
- "10":1500,
- "11":1500,
- "12":1500,
- "13":1500,
- "14":1500
- },
- "b_number":{
- "0":1600,
- "1":1600,
- "2":1600,
- "3":1600,
- "4":1600,
- "5":1600,
- "6":1600,
- "7":1600,
- "8":1600,
- "9":1600,
- "10":1600,
- "11":1600,
- "12":1600,
- "13":1600,
- "14":1600
- }
-}
\ No newline at end of file
diff --git a/tests/resources/data/input/batch/hdf/part_02.h5 b/tests/resources/data/input/batch/hdf/part_02.h5
deleted file mode 100644
index 62b6174..0000000
Binary files a/tests/resources/data/input/batch/hdf/part_02.h5 and /dev/null differ
diff --git a/tests/resources/data/input/batch/not_just_hdf/part_02.h5 b/tests/resources/data/input/batch/not_just_hdf/part_02.h5
deleted file mode 100644
index 62b6174..0000000
Binary files a/tests/resources/data/input/batch/not_just_hdf/part_02.h5 and /dev/null differ
diff --git a/tests/resources/data/input/batch/not_just_hdf/something_to_ignore.txt b/tests/resources/data/input/batch/not_just_hdf/something_to_ignore.txt
deleted file mode 100644
index e69de29..0000000
diff --git a/tests/resources/data/input/batch/not_just_parquet/part_01.parquet b/tests/resources/data/input/batch/not_just_parquet/part_01.parquet
deleted file mode 100644
index fb5deb4..0000000
Binary files a/tests/resources/data/input/batch/not_just_parquet/part_01.parquet and /dev/null differ
diff --git a/tests/resources/data/input/batch/not_just_parquet/part_02.parquet b/tests/resources/data/input/batch/not_just_parquet/part_02.parquet
deleted file mode 100644
index 00934a8..0000000
Binary files a/tests/resources/data/input/batch/not_just_parquet/part_02.parquet and /dev/null differ
diff --git a/tests/resources/data/input/batch/not_just_parquet/something_to_ignore.txt b/tests/resources/data/input/batch/not_just_parquet/something_to_ignore.txt
deleted file mode 100644
index e69de29..0000000
diff --git a/tests/resources/data/input/batch/parquet/part_01.parquet b/tests/resources/data/input/batch/parquet/part_01.parquet
deleted file mode 100644
index fb5deb4..0000000
Binary files a/tests/resources/data/input/batch/parquet/part_01.parquet and /dev/null differ
diff --git a/tests/resources/data/input/batch/parquet/part_02.parquet b/tests/resources/data/input/batch/parquet/part_02.parquet
deleted file mode 100644
index 00934a8..0000000
Binary files a/tests/resources/data/input/batch/parquet/part_02.parquet and /dev/null differ
diff --git a/tests/resources/data/input/batch/parquet_w_empty_files/emptyfile.parquet b/tests/resources/data/input/batch/parquet_w_empty_files/emptyfile.parquet
deleted file mode 100644
index 9eea44b..0000000
Binary files a/tests/resources/data/input/batch/parquet_w_empty_files/emptyfile.parquet and /dev/null differ
diff --git a/tests/resources/data/input/batch/parquet_w_empty_files/fullfile.parquet b/tests/resources/data/input/batch/parquet_w_empty_files/fullfile.parquet
deleted file mode 100644
index 5fdaca4..0000000
Binary files a/tests/resources/data/input/batch/parquet_w_empty_files/fullfile.parquet and /dev/null differ
diff --git a/tests/resources/data/input/some_csv_to_read.csv b/tests/resources/data/input/some_csv_to_read.csv
deleted file mode 100644
index a6f8c5f..0000000
--- a/tests/resources/data/input/some_csv_to_read.csv
+++ /dev/null
@@ -1,16 +0,0 @@
-id,foo_name,bar
-1,name_a,1
-2,name_b,2
-3,name_a,3
-4,name_b,4
-5,name_a,5
-6,name_b,6
-7,name_a,7
-8,name_b,8
-9,name_a,9
-10,name_b,10
-11,name_a,11
-12,name_b,12
-13,name_a,13
-14,name_b,14
-15,name_a,15
\ No newline at end of file
diff --git a/tests/resources/data/input/some_hdf_to_read.h5 b/tests/resources/data/input/some_hdf_to_read.h5
deleted file mode 100644
index 39e6995..0000000
Binary files a/tests/resources/data/input/some_hdf_to_read.h5 and /dev/null differ
diff --git a/tests/resources/data/input/some_json_to_read.json b/tests/resources/data/input/some_json_to_read.json
deleted file mode 100644
index 0e6bd00..0000000
--- a/tests/resources/data/input/some_json_to_read.json
+++ /dev/null
@@ -1,53 +0,0 @@
-{
- "id":{
- "0":1,
- "1":2,
- "2":3,
- "3":4,
- "4":5,
- "5":6,
- "6":7,
- "7":8,
- "8":9,
- "9":10,
- "10":11,
- "11":12,
- "12":13,
- "13":14,
- "14":15
- },
- "foo_name":{
- "0":"name_a",
- "1":"name_b",
- "2":"name_a",
- "3":"name_b",
- "4":"name_a",
- "5":"name_b",
- "6":"name_a",
- "7":"name_b",
- "8":"name_a",
- "9":"name_b",
- "10":"name_a",
- "11":"name_b",
- "12":"name_a",
- "13":"name_b",
- "14":"name_a"
- },
- "bar":{
- "0":1,
- "1":2,
- "2":3,
- "3":4,
- "4":5,
- "5":6,
- "6":7,
- "7":8,
- "8":9,
- "9":10,
- "10":11,
- "11":12,
- "12":13,
- "13":14,
- "14":15
- }
-}
\ No newline at end of file
diff --git a/tests/resources/data/input/some_parquet_to_read.parquet b/tests/resources/data/input/some_parquet_to_read.parquet
deleted file mode 100644
index 9eb054b..0000000
Binary files a/tests/resources/data/input/some_parquet_to_read.parquet and /dev/null differ
diff --git a/tests/resources/data/input/some_pg_parquet_to_read.parquet b/tests/resources/data/input/some_pg_parquet_to_read.parquet
deleted file mode 100644
index ddf1d30..0000000
Binary files a/tests/resources/data/input/some_pg_parquet_to_read.parquet and /dev/null differ
diff --git a/tests/resources/data/temp/.gitkeep b/tests/resources/data/temp/.gitkeep
deleted file mode 100644
index e69de29..0000000
diff --git a/tests/resources/definitions/external.yaml b/tests/resources/definitions/external.yaml
deleted file mode 100644
index 8c17d7e..0000000
--- a/tests/resources/definitions/external.yaml
+++ /dev/null
@@ -1,83 +0,0 @@
----
-READ_FROM_ATHENA:
- LOCAL:
- type: "local"
- local:
- file_path: "[[ TEST_RESOURCES ]]/data/input/some_parquet_to_read.parquet"
- file_type: "parquet"
- CLOUD:
- type: "athena"
- athena:
- db_host: "[[ DB_HOST ]]"
- db_port: "[[ DB_PORT ]]"
- db_name: "[[ DB_NAME ]]"
- db_user: "[[ DB_USER ]]"
- db_password: "[[ DB_PASS ]]"
-
-READ_MOCK_S3_CSV:
- LOCAL:
- type: "local"
- local:
- file_path: "[[ TEST_RESOURCES ]]/data/input/some_csv_to_read.csv"
- file_type: "csv"
- CLOUD:
- type: "s3"
- s3:
- bucket: "[[ MOCK_BUCKET ]]"
- file_path: "[[ MOCK_KEY ]]"
- file_type: "csv"
- # Missing SCHEMA
-
-READ_FROM_S3_JSON:
- LOCAL:
- type: "local"
- local:
- file_path: "[[ TEST_RESOURCES ]]/data/external/json_with_more_columns.json"
- file_type: "json"
- CLOUD:
- type: "s3"
- s3:
- bucket: "[[ MOCK_BUCKET ]]"
- file_path: "[[ MOCK_KEY ]]"
- file_type: "json"
-
-READ_FROM_S3_HDF:
- LOCAL:
- type: "local"
- local:
- file_path: "[[ TEST_RESOURCES ]]/data/external/h5_with_more_columns.h5"
- file_type: "hdf"
- CLOUD:
- type: "s3"
- s3:
- bucket: "[[ MOCK_BUCKET ]]"
- file_path: "[[ MOCK_KEY ]]"
- file_type: "hdf"
-
-WRITE_TO_S3_PARQUET:
- LOCAL:
- type: "local"
- local:
- file_path: "[[ TEST_RESOURCES ]]/data/external/some_parquet_with_schema_dictated_column_order.parquet"
- file_type: "parquet"
- CLOUD:
- type: "s3"
- s3:
- bucket: "[[ MOCK_BUCKET ]]"
- file_path: "test/write_some_parquet.parquet"
- file_type: "parquet"
-
-WRITE_TO_S3_CSV:
- LOCAL:
- type: "local"
- local:
- file_path: "[[ TEST_RESOURCES ]]/data/external/write_some_csv.csv"
- file_type: "csv"
- CLOUD:
- type: "s3"
- s3:
- bucket: "[[ MOCK_BUCKET ]]"
- file_path: "test/write_some_csv.csv"
- file_type: "csv"
- schema:
- file_path: "[[ TEST_RESOURCES ]]/schemas/write_to_s3_csv.yaml"
diff --git a/tests/resources/definitions/input.yaml b/tests/resources/definitions/input.yaml
deleted file mode 100644
index 2edeac3..0000000
--- a/tests/resources/definitions/input.yaml
+++ /dev/null
@@ -1,287 +0,0 @@
----
-READ_FROM_S3_CSV_ALT:
- LOCAL:
- type: "local"
- local:
- file_path: "[[ TEST_RESOURCES ]]/data/input/some_csv_to_read.csv"
- file_type: "csv"
- CLOUD:
- type: "s3_file"
- s3:
- bucket: "[[ MOCK_BUCKET ]]"
- file_path: "[[ MOCK_KEY ]]"
- file_type: "csv"
-
-READ_FROM_S3_CSV:
- LOCAL:
- type: "local"
- local:
- file_path: "[[ TEST_RESOURCES ]]/data/input/some_csv_to_read.csv"
- file_type: "csv"
- CLOUD:
- type: "s3_file"
- s3:
- bucket: "[[ MOCK_BUCKET ]]"
- file_path: "[[ MOCK_KEY ]]"
- file_type: "csv"
- schema:
- file_path: "[[ TEST_RESOURCES ]]/schemas/read_from_s3_csv.yaml"
-
-READ_FROM_S3_JSON:
- LOCAL:
- type: "local"
- local:
- file_path: "[[ TEST_RESOURCES ]]/data/input/some_json_to_read.json"
- file_type: "json"
- CLOUD:
- type: "s3_file"
- s3:
- bucket: "[[ MOCK_BUCKET ]]"
- file_path: "[[ MOCK_KEY ]]"
- file_type: "json"
-
-READ_FROM_S3_HDF:
- LOCAL:
- type: "local"
- local:
- file_path: "[[ TEST_RESOURCES ]]/data/input/some_hdf_to_read.h5"
- file_type: "hdf"
- CLOUD:
- type: "s3_file"
- s3:
- bucket: "[[ MOCK_BUCKET ]]"
- file_path: "[[ MOCK_KEY ]]"
- file_type: "hdf"
-
-READ_FROM_S3_PARQUET:
- LOCAL:
- type: "local"
- local:
- file_path: "[[ TEST_RESOURCES ]]/data/input/some_parquet_to_read.parquet"
- file_type: "parquet"
- CLOUD:
- type: "s3_file"
- s3:
- bucket: "[[ MOCK_BUCKET ]]"
- file_path: "s3:sample-prefix/[[ MOCK_KEY ]]"
- file_type: "parquet"
-
-READ_FROM_S3_PATH_PREFIX_CSV:
- LOCAL:
- type: "local"
- local:
- file_path: "[[ TEST_RESOURCES ]]/data/input/some_csv_to_read.csv"
- file_type: "csv"
- CLOUD:
- type: "s3_path_prefix"
- s3:
- bucket: "[[ MOCK_BUCKET ]]"
- path_prefix: "[[ MOCK_KEY ]]"
- file_type: "csv"
- schema:
- file_path: "[[ TEST_RESOURCES ]]/schemas/read_from_s3_csv.yaml"
-
-READ_FROM_S3_PATH_PREFIX_PARQUET:
- LOCAL:
- type: "local"
- local:
- file_path: "[[ TEST_RESOURCES ]]/data/input/some_parquet_to_read.parquet"
- file_type: "parquet"
- CLOUD:
- type: "s3_path_prefix"
- s3:
- bucket: "[[ MOCK_BUCKET ]]"
- path_prefix: "[[ MOCK_KEY ]]"
- file_type: "parquet"
-
-READ_FROM_S3_PATH_PREFIX_HDF:
- LOCAL:
- type: "local"
- local:
- file_path: "[[ TEST_RESOURCES ]]/data/input/some_hdf_to_read.h5"
- file_type: "hdf"
- CLOUD:
- type: "s3_path_prefix"
- s3:
- bucket: "[[ MOCK_BUCKET ]]"
- path_prefix: "[[ MOCK_KEY ]]"
- file_type: "hdf"
-
-READ_FROM_S3_PATH_PREFIX_JSON:
- LOCAL:
- type: "local"
- local:
- file_path: "[[ TEST_RESOURCES ]]/data/input/some_json_to_read.json"
- file_type: "hdf"
- CLOUD:
- type: "s3_path_prefix"
- s3:
- bucket: "[[ MOCK_BUCKET ]]"
- path_prefix: "[[ MOCK_KEY ]]"
- file_type: "json"
-
-READ_FROM_POSTGRES:
- LOCAL:
- type: "local"
- local:
- file_path: "[[ TEST_RESOURCES ]]/data/input/some_pg_parquet_to_read.parquet"
- file_type: "parquet"
- CLOUD:
- type: "postgres"
- postgres:
- db_host: "[[ DB_HOST ]]"
- db_port: "[[ DB_PORT ]]"
- db_name: "[[ DB_NAME ]]"
- db_user: "[[ DB_USER ]]"
- db_password: "[[ DB_PASS ]]"
- schema:
- file_path: "[[ TEST_RESOURCES ]]/schemas/pg.yaml"
-
-READ_FROM_POSTGRES_WITH_QUERY_IN_OPTIONS:
- CLOUD:
- type: "postgres"
- postgres:
- db_host: "[[ DB_HOST ]]"
- db_port: "[[ DB_PORT ]]"
- db_name: "[[ DB_NAME ]]"
- db_user: "[[ DB_USER ]]"
- db_password: "[[ DB_PASS ]]"
- options:
- sql_query: "SELECT * FROM table_name_from_yaml_options"
- schema:
- file_path: "[[ TEST_RESOURCES ]]/schemas/pg.yaml"
-
-READ_FROM_KAFKA:
- LOCAL:
- type: "local"
- local:
- file_path: "[[ TEST_RESOURCES ]]/data/input/some_parquet_to_read.parquet"
- file_type: "parquet"
- CLOUD:
- type: "kafka"
- kafka:
- kafka_server: "[[ KAFKA_SERVER ]]"
- kafka_topic: "[[ KAFKA_TOPIC ]]"
-
-TEMPLATED_FILE_PATH:
- LOCAL:
- type: "local"
- local:
- file_path: "[[ TEST_RESOURCES ]]/data/input/{file_name_to_replace}.csv"
- file_type: "csv"
- CLOUD:
- type: "s3_file"
- s3:
- bucket: "[[ MOCK_BUCKET ]]"
- file_path: "path/to/{file_name_to_replace}.csv"
- file_type: "csv"
-
-READ_FROM_PARQUET_TEMPLATED:
- LOCAL:
- type: "local"
- local:
- file_path: "[[ TEST_RESOURCES ]]/data/input/{file_name_to_replace}.parquet"
- file_type: "parquet"
- CLOUD:
- type: "s3_file"
- s3:
- bucket: "[[ MOCK_BUCKET ]]"
- file_path: "path/to/{file_name_to_replace}.parquet"
- file_type: "parquet"
-
-READ_FROM_BATCH_LOCAL_PARQUET:
- LOCAL:
- type: "local_batch"
- local:
- path_prefix: "[[ TEST_RESOURCES ]]/data/input/batch/parquet/"
- file_type: "parquet"
- CLOUD:
- type: "s3_path_prefix"
- s3:
- bucket: "[[ MOCK_BUCKET ]]/data/input/{file_name_to_replace}.parquet"
- file_type: "parquet"
-
-READ_FROM_BATCH_LOCAL_NOT_JUST_PARQUET:
- LOCAL:
- type: "local_batch"
- local:
- path_prefix: "[[ TEST_RESOURCES ]]/data/input/batch/not_just_parquet/"
- file_type: "parquet"
- CLOUD:
- type: "s3_path_prefix"
- s3:
- bucket: "[[ MOCK_BUCKET ]]/data/input/{file_name_to_replace}.parquet"
- file_type: "parquet"
-
-READ_FROM_BATCH_LOCAL_HDF:
- LOCAL:
- type: "local_batch"
- local:
- path_prefix: "[[ TEST_RESOURCES ]]/data/input/batch/hdf/"
- file_type: "hdf"
- CLOUD:
- type: "s3_path_prefix"
- s3:
- bucket: "[[ MOCK_BUCKET ]]/data/input/{file_name_to_replace}.hdf"
- file_type: "hdf"
-
-S3_PARQUET_WITH_BOOL:
- LOCAL:
- type: "local"
- local:
- file_path: "[[ TEST_RESOURCES ]]/data/input/some_parquet_with_bool_vals.parquet"
- file_type: "parquet"
-
-S3_CSV_WITH_BOOL:
- LOCAL:
- type: "local"
- local:
- file_path: "[[ TEST_RESOURCES ]]/data/input/some_csv_with_bool_vals.csv"
- file_type: "csv"
-
-S3_HDF_WITH_BOOL:
- LOCAL:
- type: "local"
- local:
- file_path: "[[ TEST_RESOURCES ]]/data/input/some_hdf_with_bool_vals.h5"
- file_type: "hdf"
-
-S3_JSON_WITH_BOOL:
- LOCAL:
- type: "local"
- local:
- file_path: "[[ TEST_RESOURCES ]]/data/input/some_json_with_bool_vals.json"
- file_type: "json"
-
-S3_PARQUET_WITH_CUSTOM_VALIDATE:
- LOCAL:
- type: "local"
- local:
- file_path: "[[ TEST_RESOURCES ]]/data/input/some_parquet_with_bool_vals.parquet"
- file_type: "parquet"
-
-S3_PARQUET_WITH_OPTIONS_IN_CODE:
- LOCAL:
- type: "local"
- local:
- file_path: "[[ TEST_RESOURCES ]]/data/input/some_parquet_to_read.parquet"
- file_type: "parquet"
-
-S3_PARQUET_WITH_OPTIONS_IN_DEFINITION:
- LOCAL:
- type: "local"
- local:
- file_path: "[[ TEST_RESOURCES ]]/data/input/some_parquet_to_read.parquet"
- file_type: "parquet"
- options:
- option_3: false
- option_4: true
-
-WRITE_TO_S3_PATH_PREFIX_PARQUET:
- CLOUD:
- type: "s3_path_prefix"
- s3:
- bucket: "[[ MOCK_BUCKET ]]"
- path_prefix: "[[ MOCK_KEY ]]"
- file_type: "parquet"
-
diff --git a/tests/resources/definitions/processed.yaml b/tests/resources/definitions/processed.yaml
deleted file mode 100644
index b54bfa1..0000000
--- a/tests/resources/definitions/processed.yaml
+++ /dev/null
@@ -1,81 +0,0 @@
----
-WRITE_TO_S3_PARQUET:
- LOCAL:
- type: "local"
- local:
- file_path: "[[ TEST_RESOURCES ]]/data/processed/write_some_parquet.parquet"
- file_type: "parquet"
- CLOUD:
- type: "s3_file"
- s3:
- bucket: "[[ MOCK_BUCKET ]]"
- file_path: "test/write_some_parquet.parquet"
- file_type: "parquet"
-
-WRITE_TO_S3_CSV:
- LOCAL:
- type: "local"
- local:
- file_path: "[[ TEST_RESOURCES ]]/data/processed/write_some_csv.csv"
- file_type: "csv"
- CLOUD:
- type: "s3_file"
- s3:
- bucket: "[[ MOCK_BUCKET ]]"
- file_path: "test/write_some_csv.csv"
- file_type: "csv"
-
-WRITE_TO_S3_JSON:
- LOCAL:
- type: "local"
- local:
- file_path: "[[ TEST_RESOURCES ]]/data/processed/write_some_json.json"
- file_type: "json"
- CLOUD:
- type: "s3_file"
- s3:
- bucket: "[[ MOCK_BUCKET ]]"
- file_path: "test/write_some_json.json"
- file_type: "json"
-
-WRITE_TO_S3_HDF:
- LOCAL:
- type: "local"
- local:
- file_path: "[[ TEST_RESOURCES ]]/data/processed/write_some_h5.h5"
- file_type: "hdf"
- CLOUD:
- type: "s3_file"
- s3:
- bucket: "[[ MOCK_BUCKET ]]"
- file_path: "test/write_some_h5.h5"
- file_type: "hdf"
-
-WRITE_TO_KAFKA_JSON:
- LOCAL:
- type: "local"
- local:
- file_path: "[[ TEST_RESOURCES ]]/data/processed/write_kafka_messages.json"
- file_type: "json"
- options:
- orient: "records"
- CLOUD:
- type: "kafka"
- kafka:
- kafka_server: "[[ KAFKA_SERVER ]]"
- kafka_topic: "[[ KAFKA_TOPIC ]]"
-
-WRITE_TO_PG_PARQUET:
- LOCAL:
- type: "local"
- local:
- file_path: "[[ TEST_RESOURCES ]]/data/processed/write_kafka_messages.parquet"
- file_type: "parquet"
- CLOUD:
- type: "postgres"
- postgres:
- db_host: "[[ DB_HOST ]]"
- db_port: "[[ DB_PORT ]]"
- db_name: "[[ DB_NAME ]]"
- db_user: "[[ DB_USER ]]"
- db_password: "[[ DB_PASS ]]"
diff --git a/tests/resources/definitions/test_input.yaml b/tests/resources/definitions/test_input.yaml
deleted file mode 100644
index 8ad2428..0000000
--- a/tests/resources/definitions/test_input.yaml
+++ /dev/null
@@ -1,129 +0,0 @@
----
-READ_FROM_S3_CSV_ALT:
- LOCAL:
- type: "local"
- local:
- file_path: "[[ TEST_RESOURCES ]]/data/input/some_csv_to_read.csv"
- file_type: "csv"
- CLOUD:
- type: "s3"
- s3:
- bucket: "[[ MOCK_BUCKET ]]"
- file_path: "[[ MOCK_KEY ]]"
- file_type: "csv"
-
-READ_FROM_S3_CSV:
- LOCAL:
- type: "local"
- local:
- file_path: "[[ TEST_RESOURCES ]]/data/input/some_csv_to_read.csv"
- file_type: "csv"
- CLOUD:
- type: "s3"
- s3:
- bucket: "[[ MOCK_BUCKET ]]"
- file_path: "[[ MOCK_KEY ]]"
- file_type: "csv"
- schema:
- file_path: "[[ TEST_RESOURCES ]]/schemas/read_from_s3_csv.yaml"
-
-READ_FROM_S3_JSON:
- LOCAL:
- type: "local"
- local:
- file_path: "[[ TEST_RESOURCES ]]/data/input/some_json_to_read.json"
- file_type: "json"
- CLOUD:
- type: "s3"
- s3:
- bucket: "[[ MOCK_BUCKET ]]"
- file_path: "[[ MOCK_KEY ]]"
- file_type: "json"
-
-READ_FROM_S3_HDF:
- LOCAL:
- type: "local"
- local:
- file_path: "[[ TEST_RESOURCES ]]/data/input/some_hdf_to_read.h5"
- file_type: "hdf"
- CLOUD:
- type: "s3"
- s3:
- bucket: "[[ MOCK_BUCKET ]]"
- file_path: "[[ MOCK_KEY ]]"
- file_type: "hdf"
-
-READ_FROM_S3_PARQUET:
- LOCAL:
- type: "local"
- local:
- file_path: "[[ TEST_RESOURCES ]]/data/input/some_parquet_to_read.parquet"
- file_type: "parquet"
- CLOUD:
- type: "s3"
- s3:
- bucket: "[[ MOCK_BUCKET ]]"
- file_path: "s3:sample-prefix/[[ MOCK_KEY ]]"
- file_type: "parquet"
-
-READ_FROM_POSTGRES:
- LOCAL:
- type: "local"
- local:
- file_path: "[[ TEST_RESOURCES ]]/data/input/some_pg_parquet_to_read.parquet"
- file_type: "parquet"
- CLOUD:
- type: "postgres"
- postgres:
- db_host: "[[ DB_HOST ]]"
- db_port: "[[ DB_PORT ]]"
- db_name: "[[ DB_NAME ]]"
- db_user: "[[ DB_USER ]]"
- db_password: "[[ DB_PASS ]]"
-
-READ_FROM_KAFKA:
- LOCAL:
- type: "local"
- local:
- file_path: "[[ TEST_RESOURCES ]]/data/input/some_parquet_to_read.parquet"
- file_type: "parquet"
- CLOUD:
- type: "kafka"
- kafka:
- kafka_server: "[[ KAFKA_SERVER ]]"
- kafka_topic: "[[ KAFKA_TOPIC ]]"
-
-TEMPLATED_FILE_PATH:
- LOCAL:
- type: "local"
- local:
- file_path: "[[ TEST_RESOURCES ]]/data/input/{file_name_to_replace}.csv"
- file_type: "csv"
- CLOUD:
- type: "s3"
- s3:
- bucket: "[[ MOCK_BUCKET ]]"
- file_path: "path/to/{file_name_to_replace}.csv"
- file_type: "csv"
-
-READ_FROM_PARQUET_TEMPLATED:
- LOCAL:
- type: "local"
- local:
- file_path: "[[ TEST_RESOURCES ]]/data/input/{file_name_to_replace}.parquet"
- file_type: "parquet"
- CLOUD:
- type: "s3"
- s3:
- bucket: "[[ MOCK_BUCKET ]]"
- file_path: "path/to/{file_name_to_replace}.parquet"
- file_type: "parquet"
-
-REPLACE_SCHEMA_WITH_DYN_VARS:
- LOCAL:
- type: "local"
- local:
- file_path: "[[ TEST_RESOURCES ]]/data/input/{file_name_to_replace}.parquet"
- file_type: "parquet"
- schema:
- file_path: "[[ TEST_RESOURCES ]]/schemas/bar.yaml"
diff --git a/tests/resources/schemas/bar.yaml b/tests/resources/schemas/bar.yaml
deleted file mode 100644
index 2a96769..0000000
--- a/tests/resources/schemas/bar.yaml
+++ /dev/null
@@ -1,48 +0,0 @@
----
-name: bar
-columns:
- column_a:
- type: "object"
- validations:
- has_unique_values:
- apply: true
- options: {}
- metrics:
- - Counts
- column_b:
- type: "object"
- validations:
- has_no_null_values:
- apply: true
- options: {}
- metrics:
- - CountsPerLabel
- column_c:
- type: float64
- validations:
- is_greater_than:
- apply: true
- options:
- threshold: 1000
- metrics: []
- column_d:
- type: float64
- validations:
- is_lower_than:
- apply: true
- options:
- threshold: "[[ LOWER_THAN_LIMIT ]]"
- metrics:
- - Min
- - Max
- - Mean
- - Std
- - Variance
- "0":
- type: "object"
- validations: {}
- metrics: []
- 1:
- type: "object"
- validations: {}
- metrics: []
diff --git a/tests/resources/schemas/foo.yaml b/tests/resources/schemas/foo.yaml
deleted file mode 100644
index 8cce75a..0000000
--- a/tests/resources/schemas/foo.yaml
+++ /dev/null
@@ -1,46 +0,0 @@
----
-name: foo
-columns:
- id:
- type: "object"
- validations:
- has_unique_values:
- apply: true
- options:
- name:
- type: "objet"
- validations:
- has_no_null_values:
- apply: true
- options:
- year:
- type: "float64"
- validations:
- is_greater_than:
- apply: true
- options:
- threshold: 1950
- amount:
- type: "float64"
- validations:
- is_between:
- apply: true
- options:
- lower: 0
- upper: 1000
- include_left: false
- include_right: true # true by default
- category:
- type: "object"
- validations:
- is_in:
- apply: true
- options:
- categorical_values:
- - class_a
- - class_b
- - class_c
- match_all: false # true by default, if false, then the column unique categoricals must be equal to the acceptable ones, else they must be a subset
- has_no_null_values:
- apply: true
- options:
diff --git a/tests/resources/schemas/pg.yaml b/tests/resources/schemas/pg.yaml
deleted file mode 100644
index 17fadb5..0000000
--- a/tests/resources/schemas/pg.yaml
+++ /dev/null
@@ -1,40 +0,0 @@
----
-name: pg
-columns:
- id:
- type: "object"
- validations:
- has_no_null_values:
- apply: true
- options: {}
- metrics:
- - CountsPerLabel
- foo:
- type: "object"
- validations:
- has_no_null_values:
- apply: true
- options: {}
- metrics:
- - Max
- - Min
- bar:
- type: "int64"
- validations:
- is_greater_than:
- apply: true
- options:
- threshold: 1950
- metrics: []
- baz:
- type: "object"
- validations:
- is_between:
- apply: true
- options:
- lower: 0
- upper: 1000
- metrics:
- - Min
- - Max
- - Mean
diff --git a/tests/resources/schemas/read_from_s3_csv.yaml b/tests/resources/schemas/read_from_s3_csv.yaml
deleted file mode 100644
index ec1f767..0000000
--- a/tests/resources/schemas/read_from_s3_csv.yaml
+++ /dev/null
@@ -1,50 +0,0 @@
----
-name: read_from_s3_csv
-columns:
- id:
- type: "int64"
- validations:
- has_unique_values:
- apply: true
- options: {}
- has_no_null_values:
- apply: true
- options: {}
- metrics:
- - UniqueCounts
- - Counts
- foo_name:
- type: "object"
- validations:
- has_no_null_values:
- apply: true
- options: {}
- is_in:
- apply: true
- options:
- categorical_values:
- - class_a
- - class_b
- - class_c
- metrics:
- - CountsPerLabel
- bar:
- type: "int64"
- validations:
- has_no_null_values:
- apply: true
- options: {}
- is_greater_than:
- apply: true
- options:
- threshold: 1000
- is_lower_than:
- apply: true
- options:
- threshold: 2000
- metrics:
- - Min
- - Max
- - Mean
- - Std
- - Variance
diff --git a/tests/resources/schemas/some_csv_to_read.yaml b/tests/resources/schemas/some_csv_to_read.yaml
deleted file mode 100644
index 8487043..0000000
--- a/tests/resources/schemas/some_csv_to_read.yaml
+++ /dev/null
@@ -1,14 +0,0 @@
-columns:
- id:
- metrics: []
- type: int64
- validations: {}
- bar:
- metrics: []
- type: int64
- validations: {}
- foo_name:
- metrics: []
- type: object
- validations: {}
-name: some_csv_to_read
diff --git a/tests/resources/schemas/some_hdf_to_read.yaml b/tests/resources/schemas/some_hdf_to_read.yaml
deleted file mode 100644
index 33bc4c6..0000000
--- a/tests/resources/schemas/some_hdf_to_read.yaml
+++ /dev/null
@@ -1,14 +0,0 @@
-columns:
- id:
- metrics: []
- type: int64
- validations: {}
- bar:
- metrics: []
- type: int64
- validations: {}
- foo_name:
- metrics: []
- type: object
- validations: {}
-name: some_hdf_to_read
diff --git a/tests/resources/schemas/some_json_to_read.yaml b/tests/resources/schemas/some_json_to_read.yaml
deleted file mode 100644
index 3b496d0..0000000
--- a/tests/resources/schemas/some_json_to_read.yaml
+++ /dev/null
@@ -1,14 +0,0 @@
-columns:
- id:
- metrics: []
- type: int64
- validations: {}
- bar:
- metrics: []
- type: int64
- validations: {}
- foo_name:
- metrics: []
- type: object
- validations: {}
-name: some_json_to_read
diff --git a/tests/resources/schemas/some_parquet_to_read.yaml b/tests/resources/schemas/some_parquet_to_read.yaml
deleted file mode 100644
index f4c65cd..0000000
--- a/tests/resources/schemas/some_parquet_to_read.yaml
+++ /dev/null
@@ -1,14 +0,0 @@
-columns:
- id:
- metrics: []
- type: int64
- validations: {}
- bar:
- metrics: []
- type: int64
- validations: {}
- foo_name:
- metrics: []
- type: object
- validations: {}
-name: some_parquet_to_read
diff --git a/tests/resources/schemas/some_pg_parquet_to_read.yaml b/tests/resources/schemas/some_pg_parquet_to_read.yaml
deleted file mode 100644
index 9021ae9..0000000
--- a/tests/resources/schemas/some_pg_parquet_to_read.yaml
+++ /dev/null
@@ -1,18 +0,0 @@
-columns:
- bar:
- metrics: []
- type: int64
- validations: {}
- baz:
- metrics: []
- type: object
- validations: {}
- foo:
- metrics: []
- type: object
- validations: {}
- id:
- metrics: []
- type: object
- validations: {}
-name: some_pg_parquet_to_read
diff --git a/tests/resources/schemas/write_to_s3_csv.yaml b/tests/resources/schemas/write_to_s3_csv.yaml
deleted file mode 100644
index 8b5f3a1..0000000
--- a/tests/resources/schemas/write_to_s3_csv.yaml
+++ /dev/null
@@ -1,42 +0,0 @@
----
-name: read_from_s3_csv
-columns:
- id:
- type: "int64"
- validations:
- has_unique_values:
- apply: true
- options: {}
- has_no_null_values:
- apply: true
- options: {}
- metrics: null
- foo_name:
- type: "object"
- validations:
- has_no_null_values:
- apply: true
- options: {}
- is_in:
- apply: true
- options:
- categorical_values:
- - class_a
- - class_b
- - class_c
- metrics: null
- bar:
- type: "int64"
- validations:
- has_no_null_values:
- apply: true
- options: {}
- is_greater_than:
- apply: true
- options:
- threshold: 1000
- is_lower_than:
- apply: true
- options:
- threshold: 2000
- metrics: null
diff --git a/tests/test_cli.py b/tests/test_cli.py
deleted file mode 100644
index 2ffe89e..0000000
--- a/tests/test_cli.py
+++ /dev/null
@@ -1,346 +0,0 @@
-# pylint: disable=missing-module-docstring, missing-class-docstring, missing-function-docstring, too-many-public-methods, too-few-public-methods, protected-access, C0103, C0302
-import argparse
-import os
-from unittest.mock import patch
-
-import pandas as pd
-import pytest
-
-import dynamicio
-from dynamicio import cli
-from dynamicio.cli import parse_args
-from dynamicio.errors import InvalidDatasetTypeError
-from tests.conftest import DummyYaml
-from tests.constants import TEST_RESOURCES
-
-
-class TestCli:
- @pytest.mark.unit
- def test_entrypoint(self):
- print() # Just makes the output more readable in the terminal
-
- # When
- exit_status = os.system("python -m dynamicio --help")
-
- # Then
- assert exit_status == 0
-
- @pytest.mark.unit
- @pytest.mark.parametrize(
- ["args_pattern", "expected_args"],
- [
- (
- ["-b", "-p", "path/to/datasets_dir", "-o", "output_dir"],
- argparse.Namespace(batch=True, output="output_dir", path="path/to/datasets_dir", single=False),
- ),
- (
- ["-s", "-p", "path/to/datasets_dir/the_one.parquet", "-o", "output_dir"],
- argparse.Namespace(
- batch=False,
- output="output_dir",
- path="path/to/datasets_dir/the_one.parquet",
- single=True,
- ),
- ),
- ],
- )
- def test_parser_can_take_one_out_of_two_valid_argument_patters(self, args_pattern, expected_args):
- # When/Then
- assert parse_args(args_pattern) == expected_args
-
- @pytest.mark.unit
- @pytest.mark.parametrize(
- "args_pattern",
- [
- ["-p", "path/to/datasets_dir", "-o", "output_dir"],
- ["-p", "path/to/datasets_dir/the_one.parquet", "-o", "output_dir"],
- ],
- )
- def test_parse_args_raises_system_exit_if_batch_or_single_flags_not_provided(self, args_pattern):
- # When/Then
- with pytest.raises(SystemExit):
- parse_args(args_pattern)
-
- @pytest.mark.unit
- @pytest.mark.parametrize(
- "args_pattern",
- [
- ["-b"],
- ["-b", "-o", "output_dir"],
- ["-b", "-p", "path/to/datasets_dir"],
- ["-s"],
- ["-s", "-o", "output_dir"],
- ["-s", "-p", "path/to/datasets_dir"],
- ],
- )
- def test_parse_args_raises_system_exit_with_approved_flag_without_path_and_output(self, args_pattern):
- # When/Then
- with pytest.raises(SystemExit):
- parse_args(args_pattern)
-
- @pytest.mark.unit
- def test_when_single_flag_is_used__generate_schema_for__is_called_once(self):
- with patch.object(cli.argparse.ArgumentParser, "parse_args") as mocked__parse_args, patch.object(cli, "generate_schema_for") as mocked__generate_schema_for, patch.object(
- cli, "open"
- ) as mocked__open, patch.object(cli.yaml, "safe_dump") as mocked__dump:
- # Given
- mocked__parse_args.return_value = argparse.Namespace(batch=False, single=True, path="the_one.parquet", output=".")
- mocked__generate_schema_for.return_value = {"name": "the_one", "columns": {}}
- mocked__open.return_value = DummyYaml(path="path/to/the_one.yaml")
- mocked__dump.return_value = "The-Matrix"
- # When
- dynamicio.cli.run()
-
- # Then
- assert mocked__generate_schema_for.called_once_with("the_one.parquet", ".")
-
- @pytest.mark.unit
- def test_when_batch_flag_is_used__generate_schema_for__is_called_multiple_times_as_per_the_no_of_files_under_the_datasets_dir(
- self,
- ):
-
- with patch.object(cli.argparse.ArgumentParser, "parse_args") as mocked__parse_args:
- with patch.object(cli, "generate_schema_for") as mocked__generate_schema_for:
- with patch.object(cli.glob, "glob") as mocked__glob:
- with patch.object(cli, "open") as mocked__open:
- with patch.object(cli.yaml, "safe_dump") as mocked__dump:
- # Given
- mocked__parse_args.return_value = argparse.Namespace(batch=True, single=False, path="path/to/datasets_dir", output=".")
- mocked__generate_schema_for.return_value = {
- "name": "random",
- "columns": {},
- }
- mocked__glob.return_value = [
- "path/to/datasets_dir/agent_1.parquet",
- "path/to/datasets_dir/agent_2.parquet",
- ]
- mocked__open.return_value = DummyYaml(path="path/to/the_oracle.yaml")
- mocked__dump.return_value = "file_content"
- # When
- dynamicio.cli.run()
-
- # Then
- assert mocked__generate_schema_for.call_count == 2
-
- @pytest.mark.unit
- @pytest.mark.parametrize(
- ["dataset", "expected_reader"],
- [
- ("path/to/dataset.parquet", "read_parquet"),
- ("path/to/dataset.json", "read_json"),
- ("path/to/dataset.csv", "read_csv"),
- ("path/to/dataset.h5", "read_hdf"),
- ],
- )
- def test_generate_schema_for__uses_the_appropriate_pandas_reader_to_read_a_file(self, dataset, expected_reader):
- # When
- with patch.object(cli.pd, expected_reader) as mocked_reader:
- mocked_reader.return_value = pd.DataFrame()
- cli.generate_schema_for(dataset)
-
- # Then
- mocked_reader.assert_called()
-
- @pytest.mark.unit
- def test_generate_schema_for__throws_exception_InvalidDatasetTypeError(self):
- # Given
- dataset = "path/to/trinity.txt"
-
- # When/Then
- with pytest.raises(InvalidDatasetTypeError):
- cli.generate_schema_for(dataset)
-
- @pytest.mark.unit
- def test_generate_schema_for__returns_a_json_schema_with_a_name_key_populated_with_the_dataset_name(
- self,
- ):
- # Given
- dataset = "path/to/the_matrix.parquet"
-
- # When
- with patch.object(cli.pd, "read_parquet") as mocked_reader:
- mocked_reader.return_value = pd.DataFrame.from_dict({"agents": [1, 2, 3], "zioners": [4, 5, 6]})
- json_schema = cli.generate_schema_for(dataset)
-
- # Then
- assert json_schema["name"] == "the_matrix"
-
- @pytest.mark.unit
- def test_generate_schema_for__returns_a_json_schema_with_all_columns_in_the_provided_dataset(
- self,
- ):
- # Given
- dataset = "path/to/the_matrix.parquet"
-
- # When
- with patch.object(cli.pd, "read_parquet") as mocked_reader:
- mocked_reader.return_value = pd.DataFrame.from_dict({"agents": [1, 2, 3], "zioners": [4, 5, 6]})
- json_schema = cli.generate_schema_for(dataset)
-
- # Then
- assert list(json_schema["columns"].keys()) == ["agents", "zioners"]
-
- @pytest.mark.unit
- def test_generate_schema_for__returns_a_json_schema_with_all_columns_in_the_provided_dataset_with_the_correct_data_types(
- self,
- ):
- # Given
- dataset = "path/to/the_matrix.parquet"
-
- # When
- with patch.object(cli.pd, "read_parquet") as mocked_reader:
- mocked_reader.return_value = pd.DataFrame.from_dict(
- {
- "agents": [1, 2, 3],
- "zioners": ["4", "5", "6"],
- "red_pill": [True, False, True],
- "value": [1.0, 2.0, 3.0],
- }
- )
- json_schema = cli.generate_schema_for(dataset)
-
- # Then
- assert {column["type"] for column in json_schema["columns"].values()} == {
- "bool",
- "object",
- "int64",
- "float64",
- }
-
- @pytest.mark.unit
- def test_generate_schema_for__returns_a_valid_json_schema_for_a_given_dataset(self):
- # Given
- dataset = "path/to/the_matrix.parquet"
-
- # When
- with patch.object(cli.pd, "read_parquet") as mocked_reader:
- mocked_reader.return_value = pd.DataFrame.from_dict(
- {
- "agents": [1, 2, 3],
- "zioners": ["4", "5", "6"],
- "red_pill": [True, False, True],
- "value": [1.0, 2.0, 3.0],
- }
- )
- json_schema = cli.generate_schema_for(dataset)
-
- # Then
- assert json_schema == {
- "columns": {
- "agents": {"metrics": [], "type": "int64", "validations": {}},
- "red_pill": {"metrics": [], "type": "bool", "validations": {}},
- "value": {"metrics": [], "type": "float64", "validations": {}},
- "zioners": {"metrics": [], "type": "object", "validations": {}},
- },
- "name": "the_matrix",
- }
-
- @pytest.mark.unit
- def test_cli_runner_raises_invalid_dataset_type_error_exception_message_when_invoked_with_single_flag_and_invalid_path(
- self,
- ):
- # Given
- dataset = "path/to/trinity.txt"
-
- # When/Then
- with patch.object(cli.argparse.ArgumentParser, "parse_args") as mocked__parse_args:
- mocked__parse_args.return_value = argparse.Namespace(
- batch=False,
- single=True,
- path=dataset,
- output=os.path.join(TEST_RESOURCES, "data/temp/"),
- )
- with pytest.raises(InvalidDatasetTypeError):
- cli.run()
-
- @pytest.mark.unit
- def test_when_single_flag_is_used__the_cli_generates_a_schema_yaml_for_the_provided_dataset(
- self,
- ):
- with patch.object(cli.argparse.ArgumentParser, "parse_args") as mocked__parse_args:
- with patch.object(cli.pd, "read_parquet") as mocked_reader:
- mocked__parse_args.return_value = argparse.Namespace(
- batch=False,
- single=True,
- path="the_one.parquet",
- output=os.path.join(TEST_RESOURCES, "data/temp/"),
- )
- # Given
- mocked_reader.return_value = pd.DataFrame.from_dict({"agents": [1, 2, 3], "zioners": [4, 5, 6]})
-
- # When
- dynamicio.cli.run()
-
- # Then
- output_yaml = os.path.join(TEST_RESOURCES, "data/temp", "the_one.yaml")
- try:
- assert os.path.isfile(output_yaml)
- finally:
- os.remove(output_yaml)
-
- @pytest.mark.unit
- def test_when_batch_flag_is_used__the_cli_generates_a_schema_yaml_for_each_dataset_in_the_provided_dir(
- self,
- ):
- with patch.object(cli.argparse.ArgumentParser, "parse_args") as mocked__parse_args:
- with patch.object(cli.pd, "read_parquet") as mocked_reader:
- with patch.object(cli.glob, "glob") as mocked__glob:
- # Given
- mocked__parse_args.return_value = argparse.Namespace(
- batch=True,
- single=False,
- path="path/to/datasets_dir",
- output=os.path.join(TEST_RESOURCES, "data/temp/"),
- )
- mocked__glob.return_value = [
- "path/to/datasets_dir/agent_1.parquet",
- "path/to/datasets_dir/agent_2.parquet",
- ]
- mocked_reader.return_value = pd.DataFrame.from_dict({"skills": [1, 2, 3], "levels": [4, 5, 6]})
-
- # When
- dynamicio.cli.run()
-
- # Then
- output_yaml_1 = os.path.join(TEST_RESOURCES, "data/temp", "agent_1.yaml")
- output_yaml_2 = os.path.join(TEST_RESOURCES, "data/temp", "agent_2.yaml")
- try:
- assert os.path.isfile(output_yaml_1) & os.path.isfile(output_yaml_2)
- finally:
- os.remove(output_yaml_1)
- os.remove(output_yaml_2)
-
- @pytest.mark.unit
- def test_cli_runner_prints_an_invalid_dataset_type_warning_when_invoked_with_batch_flag_and_a_dir_with_an_invalid_path_but_is_not_interrupted(self, capsys):
-
- with patch.object(cli.argparse.ArgumentParser, "parse_args") as mocked__parse_args:
- with patch.object(cli.glob, "glob") as mocked__glob:
- with patch.object(cli.pd, "read_parquet") as mocked_reader:
- with patch.object(cli, "open") as mocked__open:
- with patch.object(cli.yaml, "safe_dump") as mocked__dump:
- # Given
- mocked__parse_args.return_value = argparse.Namespace(
- batch=True,
- single=False,
- path="a/dummy/path/",
- output=os.path.join(TEST_RESOURCES, "data/temp/"),
- )
- mocked__glob.return_value = [
- "path/to/neo.parquet",
- "path/to/trinity.txt",
- "path/to/morpheus.parquet",
- ]
- mocked_reader.return_value = pd.DataFrame.from_dict({"column_1": [1, 2, 3], "column_2": [4, 5, 6]})
- mocked__open.return_value = DummyYaml(path="path/to/the_oracle.yaml")
- mocked__dump.return_value = "file_content"
- # When
- cli.run()
- captured = capsys.readouterr()
-
- # Then
- std_out = captured.out.split("\n")
- assert (
- (std_out[0] == "Generating schema for: path/to/neo.parquet")
- and (std_out[1] == "Skipping path/to/trinity.txt! You may want to remove this file from the datasets directory")
- and (std_out[2] == "Generating schema for: path/to/morpheus.parquet")
- )
diff --git a/tests/test_config.py b/tests/test_config.py
deleted file mode 100644
index 45da395..0000000
--- a/tests/test_config.py
+++ /dev/null
@@ -1,181 +0,0 @@
-# pylint: disable=missing-module-docstring, missing-class-docstring, missing-function-docstring, R0801
-import io
-import os
-
-import pytest
-import yaml
-
-from dynamicio.config.io_config import IOConfig, SafeDynamicResourceLoader, SafeDynamicSchemaLoader
-from tests import constants
-
-
-class TestIOConfig:
- @pytest.mark.unit
- def test_config_io_parser_returns_a_transformed_dict_version_of_the_yaml_input_with_dynamic_values_replaced(self, expected_input_yaml_dict):
- # Given
- input_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/test_input.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- )
-
- # When
- yaml_dict = input_config.config.dict()
- # Then
- assert yaml_dict == expected_input_yaml_dict
-
- @pytest.mark.unit
- def test_config_io_get_schema_definition_returns_a_schema_definition_from_a_source_config(self, expected_schema_definition):
- # Given
- input_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/test_input.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- )
-
- # When
- schema_definition = input_config.config.bindings["READ_FROM_S3_CSV"].dict()
-
- # Then
- assert schema_definition == expected_schema_definition
-
- @pytest.mark.unit
- def test_config_io_sources_returns_all_available_sources(self):
- # Given
- input_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/test_input.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- )
-
- # When
- sources = list(input_config.config.bindings.keys())
-
- # Then
- assert sources == [
- "READ_FROM_S3_CSV_ALT",
- "READ_FROM_S3_CSV",
- "READ_FROM_S3_JSON",
- "READ_FROM_S3_HDF",
- "READ_FROM_S3_PARQUET",
- "READ_FROM_POSTGRES",
- "READ_FROM_KAFKA",
- "TEMPLATED_FILE_PATH",
- "READ_FROM_PARQUET_TEMPLATED",
- "REPLACE_SCHEMA_WITH_DYN_VARS",
- ]
-
- @pytest.mark.unit
- def test_get_for_config_io_set_for_a_local_env_returns_a_local_mapping_for_a_given_key(self, expected_s3_csv_local_mapping):
- # Given
- input_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/test_input.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- )
-
- # When
- s3_csv_local_mapping = input_config.config.bindings["READ_FROM_S3_CSV"].dict()
-
- # Then
- assert s3_csv_local_mapping == expected_s3_csv_local_mapping
-
- @pytest.mark.unit
- def test_get_for_config_io_set_for_a_cloud_env_returns_a_cloud_mapping_for_an_s3_csv_key(self, expected_s3_csv_cloud_mapping):
- # Given
- input_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/test_input.yaml")),
- env_identifier="CLOUD",
- dynamic_vars=constants,
- )
-
- # When
- s3_csv_cloud_mapping = input_config.get(source_key="READ_FROM_S3_CSV").dynamicio_schema.dict()
-
- # Then
- assert s3_csv_cloud_mapping == expected_s3_csv_cloud_mapping
-
- @pytest.mark.unit
- def test_get_for_config_io_set_for_a_cloud_env_returns_a_cloud_mapping_for_an_postgres_key(self, expected_postgres_cloud_mapping):
- # Given
- input_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/test_input.yaml")),
- env_identifier="CLOUD",
- dynamic_vars=constants,
- )
-
- # When
- postgres_cloud_mapping = input_config.get(source_key="READ_FROM_POSTGRES").dict()
-
- # Then
- assert postgres_cloud_mapping == expected_postgres_cloud_mapping
-
- @pytest.mark.unit
- def test__get_schema_definition_dynamically_replaces_numerical_values_in_schemas(self):
- # Given
- input_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/test_input.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- )
-
- # When
- my_config = input_config.get(source_key="REPLACE_SCHEMA_WITH_DYN_VARS")
-
- # Then
- assert my_config._parent.dynamicio_schema.columns["column_c"].validations[0].dict() == { # pylint: disable=protected-access
- "apply": True,
- "name": "is_greater_than",
- "options": {"threshold": 1000},
- }
-
- @pytest.mark.unit
- def test__get_schema_definition_returns_float_only_in_case_of_replacements(self):
- # Given
- input_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/test_input.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- )
-
- # When
- my_config = input_config.get(source_key="REPLACE_SCHEMA_WITH_DYN_VARS")
- schema_dict = {}
- for col in my_config._parent.dynamicio_schema.columns.values(): # pylint: disable=protected-access
- schema_dict[col.name] = str(col.data_type)
-
- # Then
-
- assert schema_dict == {
- "column_a": "ColumnType.object",
- "column_b": "ColumnType.object",
- "column_c": "ColumnType.float64",
- "column_d": "ColumnType.float64",
- "0": "ColumnType.object",
- "1": "ColumnType.object",
- }
-
-
-class TestSafeDynamicLoader: # pylint: disable=R0903
- @pytest.mark.unit
- def test_replaces_all_resource_template_instances(self):
- file_contents = 'abc: "[[ VALUE_1 ]]/[[ VALUE_2 ]]"'
-
- class MockEnvironmentModule: # pylint: disable=R0903
- VALUE_1 = "abc"
- VALUE_2 = "def"
-
- result = yaml.load(io.StringIO(file_contents), SafeDynamicResourceLoader.with_module(MockEnvironmentModule))
-
- assert result == {"abc": "abc/def"}
-
- @pytest.mark.unit
- def test_replaces_all_schema_template_instances(self):
- file_contents = 'abc: "[[ VALUE_A ]]"'
-
- class MockEnvironmentModule: # pylint: disable=R0903
- VALUE_A = 100
-
- result = yaml.load(io.StringIO(file_contents), SafeDynamicSchemaLoader.with_module(MockEnvironmentModule))
-
- assert result == {"abc": 100}
diff --git a/tests/test_core.py b/tests/test_core.py
deleted file mode 100644
index 9118bcc..0000000
--- a/tests/test_core.py
+++ /dev/null
@@ -1,1002 +0,0 @@
-# pylint: disable=missing-module-docstring, missing-class-docstring, missing-function-docstring, too-many-public-methods, R0801
-import asyncio
-import logging
-import os
-import time
-from typing import Mapping, Tuple
-from unittest.mock import patch
-
-import numpy as np
-import pandas as pd
-import pytest
-
-import dynamicio
-from dynamicio.config import IOConfig
-from dynamicio.core import CASTING_WARNING_MSG, DynamicDataIO
-from dynamicio.errors import ColumnsDataTypeError, SchemaNotFoundError, SchemaValidationError
-from dynamicio.mixins import WithS3File
-from tests import constants
-from tests.mocking.io import (
- CsvWithSomeBool,
- HdfWithSomeBool,
- JsonWithSomeBool,
- ParquetWithCustomValidate,
- ParquetWithSomeBool,
- ReadMockS3CsvIO,
- ReadS3CsvIO,
- ReadS3DataWithFalseTypes,
- ReadS3IO,
- ReadS3ParquetIO,
- WriteS3CsvIO,
- WriteS3CsvWithSchema,
- WriteS3ParquetExternalIO,
-)
-
-
-@pytest.fixture(autouse=True, scope="module")
-def propagate_logger():
- # We need this because otherwise caplog can't capture the logs
- logging.getLogger("dynamicio.metrics").propagate = True
- yield
- logging.getLogger("dynamicio.metrics").propagate = False
-
-
-class TestCoreIO:
- @pytest.mark.unit
- def test_abstract_class_dynamic_data_io_cant_be_used_for_object_instantiation(self):
- # Given
- s3_csv_local_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_S3_CSV")
-
- # When/Then
- with pytest.raises(TypeError):
- DynamicDataIO(source_config=s3_csv_local_config)
-
- @pytest.mark.unit
- def test_objects_of_dynamic_data_io_subclasses_cant_be_instantiated_in_the_absence_of_a_non_empty_schema(
- self,
- ):
- # Given
- s3_csv_local_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_S3_CSV")
-
- # When/Then
- with pytest.raises(AssertionError):
-
- class AbsentSchemaIO(DynamicDataIO):
- pass
-
- AbsentSchemaIO(source_config=s3_csv_local_config)
-
- @pytest.mark.unit
- def test_objects_of_s3io_subclasses_cant_be_instantiated_in_the_presence_of_a_empty_dict_schema(
- self,
- ):
- # Given
- s3_csv_local_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_S3_CSV")
-
- # Given/When/Then
- with pytest.raises(ValueError):
-
- class EmptySchemaIO(WithS3File, DynamicDataIO):
- dataset_name = "EmptySchema"
- schema = {}
-
- EmptySchemaIO(source_config=s3_csv_local_config)
-
- @pytest.mark.unit
- def test_objects_of_dynamic_data_io_subclasses_cant_be_instantiated_in_the_presence_of_a_schema_eq_to_none(
- self,
- ):
- # Given
- s3_csv_local_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_S3_CSV")
-
- # When/Then
- with pytest.raises(ValueError):
-
- class NoneSchemaIO(WithS3File, DynamicDataIO):
- dataset_name = "NoneSchema"
- schema = None
-
- NoneSchemaIO(source_config=s3_csv_local_config)
-
- @pytest.mark.unit
- def test_dynamic_data_io_object_instantiation_is_only_possible_for_subclasses(self):
- # Given
- s3_csv_local_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_S3_CSV")
-
- # When
- s3_csv_io = ReadS3CsvIO(source_config=s3_csv_local_config)
-
- # Then
- assert isinstance(s3_csv_io, ReadS3CsvIO) and isinstance(s3_csv_io, DynamicDataIO)
-
- @pytest.mark.unit
- def test_subclasses_of_dynamic_data_io_need_to_define_a_schema(self):
- # Given/When/Then
- with pytest.raises(AssertionError):
-
- class S3CsvIONoSchema(DynamicDataIO): # pylint: disable=unused-variable
- pass
-
- @pytest.mark.unit
- def test_subclasses_of_dynamic_data_io_need_to_define_a_static_validate_function(self):
- # Given
- s3_csv_local_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_S3_CSV")
-
- # When/Then
- with pytest.raises(AssertionError):
-
- class CMVolumesIONoValidationFunction(DynamicDataIO):
- schema = {"foo": "int64"}
-
- CMVolumesIONoValidationFunction(source_config=s3_csv_local_config)
-
- @pytest.mark.unit
- def test_subclasses_of_dynamic_data_io_need_to_implement_private_reader_for_new_source_types(
- self,
- ):
- # Given
- athena_cloud_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/external.yaml")),
- env_identifier="CLOUD",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_ATHENA")
-
- # When
- with pytest.raises(AssertionError):
- ReadS3IO(source_config=athena_cloud_config)
-
- @pytest.mark.unit
- def test_key_error_is_thrown_for_missing_schema_if_unified_io_subclass_assigns_schema_from_file_but_file_is_missing(
- self,
- ):
- # Given
- read_mock_s3_cloud_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/external.yaml")),
- env_identifier="CLOUD",
- dynamic_vars=constants,
- ).get(source_key="READ_MOCK_S3_CSV")
-
- # When
- with pytest.raises(SchemaNotFoundError):
- ReadMockS3CsvIO(source_config=read_mock_s3_cloud_config)
-
- @pytest.mark.integration
- def test_schema_validations_are_applied_for_an_io_class_with_a_schema_definition(self, valid_dataframe):
- # Given
- df = valid_dataframe
- s3_csv_cloud_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="CLOUD",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_S3_CSV")
- io_instance = ReadS3CsvIO(source_config=s3_csv_cloud_config)
-
- # When
- return_value = io_instance.validate_from_schema(df)
-
- # Then
- assert io_instance == return_value
-
- @pytest.mark.integration
- def test_log_metrics_from_schema_are_applied_for_an_io_class_with_a_schema_definition(self, caplog, valid_dataframe):
- # Given
- df = valid_dataframe
- s3_csv_cloud_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="CLOUD",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_S3_CSV")
- io_instance = ReadS3CsvIO(source_config=s3_csv_cloud_config)
-
- # When
- with caplog.at_level(logging.INFO):
- print()
- return_value = io_instance.log_metrics_from_schema(df)
-
- # Then
- assert (
- io_instance is return_value
- and (len(caplog.records) == 10)
- and (getattr(caplog.records[0], "message") == '{"message": "METRIC", "dataset": "READ_FROM_S3_CSV", "column": "id", "metric": "UniqueCounts", "value": 4.0}')
- and (getattr(caplog.records[1], "message") == '{"message": "METRIC", "dataset": "READ_FROM_S3_CSV", "column": "id", "metric": "Counts", "value": 4.0}')
- and (getattr(caplog.records[2], "message") == '{"message": "METRIC", "dataset": "READ_FROM_S3_CSV", "column": "foo_name-class_a", "metric": "CountsPerLabel", "value": 2.0}')
- and (getattr(caplog.records[3], "message") == '{"message": "METRIC", "dataset": "READ_FROM_S3_CSV", "column": "foo_name-class_b", "metric": "CountsPerLabel", "value": 1.0}')
- and (getattr(caplog.records[4], "message") == '{"message": "METRIC", "dataset": "READ_FROM_S3_CSV", "column": "foo_name-class_c", "metric": "CountsPerLabel", "value": 1.0}')
- and (getattr(caplog.records[5], "message") == '{"message": "METRIC", "dataset": "READ_FROM_S3_CSV", "column": "bar", "metric": "Min", "value": 1500.0}')
- and (getattr(caplog.records[6], "message") == '{"message": "METRIC", "dataset": "READ_FROM_S3_CSV", "column": "bar", "metric": "Max", "value": 1500.0}')
- and (getattr(caplog.records[7], "message") == '{"message": "METRIC", "dataset": "READ_FROM_S3_CSV", "column": "bar", "metric": "Mean", "value": 1500.0}')
- and (getattr(caplog.records[8], "message") == '{"message": "METRIC", "dataset": "READ_FROM_S3_CSV", "column": "bar", "metric": "Std", "value": 0.0}')
- and (getattr(caplog.records[9], "message") == '{"message": "METRIC", "dataset": "READ_FROM_S3_CSV", "column": "bar", "metric": "Variance", "value": 0.0}')
- )
-
- @pytest.mark.integration
- def test_schema_validations_errors_are_thrown_for_each_validation_if_df_does_not_map_to_schema_definition(self, invalid_dataframe):
- # Given
- df = invalid_dataframe
- s3_csv_cloud_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="CLOUD",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_S3_CSV")
-
- # When
- with pytest.raises(SchemaValidationError):
- ReadS3CsvIO(source_config=s3_csv_cloud_config).validate_from_schema(df)
-
- @pytest.mark.integration
- def test_schema_validations_exception_message_is_a_dict_with_all_violated_validations(self, invalid_dataframe, expected_messages):
- # Given
- df = invalid_dataframe
- s3_csv_cloud_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="CLOUD",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_S3_CSV")
-
- # When
- try:
- ReadS3CsvIO(source_config=s3_csv_cloud_config).validate_from_schema(df)
- except SchemaValidationError as _exception:
- # Then
- assert _exception.message.keys() == expected_messages # pylint: disable=no-member
-
- @pytest.mark.integration
- def test_local_writers_only_write_out_castable_columns_according_to_the_io_schema_case_float64_to_int64_id(self, dataset_with_more_columns_than_dictated_in_schema):
-
- # Given
- # Note col_1 will be interpreted with type float64
- input_df = dataset_with_more_columns_than_dictated_in_schema
-
- s3_parquet_local_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/external.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="WRITE_TO_S3_PARQUET")
-
- # When
- # class WriteS3ParquetExternalIO(UnifiedIO):
- # schema = {
- # 'bar': 'int64',
- # 'event_type': 'object',
- # 'id': 'int64',
- # 'end_odometer': 'int64',
- # 'foo_name': 'object',
- # }
- write_s3_io = WriteS3ParquetExternalIO(source_config=s3_parquet_local_config)
- write_s3_io.write(input_df)
-
- # # Then
- try:
- output_df = pd.read_parquet(s3_parquet_local_config.local.file_path)
- assert output_df.columns.to_list() == [
- "id",
- "foo_name",
- "bar",
- "end_odometer",
- "event_type",
- ]
- finally:
- os.remove(s3_parquet_local_config.local.file_path)
-
- @pytest.mark.unit
- @patch.object(dynamicio.core.DynamicDataIO, "validate_from_schema")
- def test_schema_validations_are_not_applied_on_read_if_validate_flag_is_false(self, mock_validate_from_schema):
- # Given
- s3_csv_local_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_S3_CSV")
-
- # When
- # ReadS3CsvIO(source_config=s3_csv_cloud_config, apply_schema_validations=False).read()
- ReadS3CsvIO(source_config=s3_csv_local_config).read() # False is the default value
-
- # Then
- mock_validate_from_schema.assert_not_called()
-
- @pytest.mark.unit
- @patch.object(dynamicio.core.DynamicDataIO, "validate_from_schema")
- def test_schema_validations_are_automatically_applied_on_read_if_validate_flag_is_true(self, mock_validate_from_schema):
- # Given
- s3_csv_local_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_S3_CSV")
-
- # When
- ReadS3CsvIO(source_config=s3_csv_local_config, apply_schema_validations=True).read()
-
- # Then
- mock_validate_from_schema.assert_called()
-
- @pytest.mark.unit
- @patch.object(dynamicio.core.DynamicDataIO, "validate_from_schema")
- def test_schema_validations_are_automatically_applied_on_write_if_validate_flag_is_true(self, mock_validate_from_schema, valid_dataframe):
- # Given
- df = valid_dataframe
- s3_csv_local_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/external.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="WRITE_TO_S3_CSV")
-
- # When
- WriteS3CsvWithSchema(source_config=s3_csv_local_config, apply_schema_validations=True).write(df)
-
- # Then
- try:
- mock_validate_from_schema.assert_called()
- finally:
- os.remove(s3_csv_local_config.local.file_path)
-
- @pytest.mark.unit
- @patch.object(dynamicio.core.DynamicDataIO, "validate_from_schema")
- def test_schema_validations_are_not_applied_on_write_if_validate_flag_is_false(self, mock_validate_from_schema, valid_dataframe):
- # Given
- df = valid_dataframe
- s3_csv_local_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/external.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="WRITE_TO_S3_CSV")
-
- # When
- # WriteS3CsvWithSchema(source_config=s3_csv_cloud_config, apply_schema_validations=False).write(df)
- WriteS3CsvWithSchema(source_config=s3_csv_local_config).write(df) # False is the default value
-
- # Then
- try:
- mock_validate_from_schema.assert_not_called()
- finally:
- os.remove(s3_csv_local_config.local.file_path)
-
- @pytest.mark.unit
- @patch.object(dynamicio.core.DynamicDataIO, "log_metrics_from_schema")
- def test_schema_metrics_are_not_logged_on_read_if_metrics_flag_is_false(self, mock_log_metrics_from_schema):
- # Given
- s3_csv_local_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_S3_CSV")
-
- # When
- # ReadS3CsvIO(source_config=s3_csv_cloud_config, log_schema_metrics=False).read()
- ReadS3CsvIO(source_config=s3_csv_local_config).read() # False is the default value
-
- # Then
- mock_log_metrics_from_schema.assert_not_called()
-
- @pytest.mark.unit
- @patch.object(dynamicio.core.DynamicDataIO, "log_metrics_from_schema")
- def test_schema_metrics_are_automatically_logged_on_read_if_validate_flag_is_true(self, mock_log_metrics_from_schema):
- # Given
- s3_csv_local_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_S3_CSV")
-
- # When
- ReadS3CsvIO(source_config=s3_csv_local_config, log_schema_metrics=True).read()
-
- # Then
- mock_log_metrics_from_schema.assert_called()
-
- @pytest.mark.unit
- @patch.object(dynamicio.core.DynamicDataIO, "log_metrics_from_schema")
- def test_schema_metrics_are_automatically_logged_on_write_if_metrics_flag_is_true(self, mock_log_metrics_from_schema, valid_dataframe):
- # Given
- df = valid_dataframe
- s3_csv_local_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/external.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="WRITE_TO_S3_CSV")
-
- # When
- WriteS3CsvWithSchema(source_config=s3_csv_local_config, log_schema_metrics=True).write(df)
-
- # Then
- try:
- mock_log_metrics_from_schema.assert_called()
- finally:
- os.remove(s3_csv_local_config.local.file_path)
-
- @pytest.mark.unit
- @patch.object(dynamicio.core.DynamicDataIO, "log_metrics_from_schema")
- def test_schema_metrics_are_not_logged_on_write_if_metrics_flag_is_false(self, mock_log_metrics_from_schema, valid_dataframe):
- # Given
- df = valid_dataframe
- s3_csv_local_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/external.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="WRITE_TO_S3_CSV")
-
- # When
- # WriteS3CsvWithSchema(source_config=s3_csv_cloud_config, log_schema_metrics=False).write(df)
- WriteS3CsvWithSchema(source_config=s3_csv_local_config).write(df) # False is the default value
-
- # Then
- try:
- mock_log_metrics_from_schema.assert_not_called()
- finally:
- os.remove(s3_csv_local_config.local.file_path)
-
- @pytest.mark.unit
- @pytest.mark.parametrize(
- "df, expected_dtype, expected_warning",
- [
- (
- pd.DataFrame.from_records([{"id": 1, "foo_name": "A", "bar": 12, "bool_col": True}, {"id": 2, "foo_name": "B", "bar": 12, "bool_col": False}]),
- "bool",
- None,
- ),
- (
- pd.DataFrame.from_records([{"id": 1, "foo_name": "A", "bar": 12, "bool_col": True}, {"id": 2, "foo_name": "B", "bar": 12, "bool_col": None}]),
- "bool",
- CASTING_WARNING_MSG.format("bool_col", "bool", "object"),
- ),
- (
- pd.DataFrame.from_records([{"id": 1, "foo_name": "A", "bar": 12, "bool_col": True}, {"id": 2, "foo_name": "B", "bar": 12, "bool_col": np.NAN}]),
- "bool",
- CASTING_WARNING_MSG.format("bool_col", "bool", "object"),
- ),
- (
- pd.DataFrame.from_records([{"id": 1, "foo_name": "A", "bar": 12, "bool_col": True}, {"id": 2, "foo_name": "B", "bar": 12, "bool_col": pd.NaT}]),
- "bool",
- CASTING_WARNING_MSG.format("bool_col", "bool", "object"),
- ),
- ],
- )
- def test__has_valid_dtypes_does_not_attempt_to_convert_object_type_to_other_type_unless_other_is_bool_and_column_has_no_non_boolean_values_when_writing_a_parquet(
- self, caplog, df, expected_dtype, expected_warning
- ):
- # Note: In the presence of a boolean cell value in a column, if that column also has numbers or strings, df.to_parquet() will not write it out.
- # It will try to convert it to a bool and it will fail throwing an `pyarrow.lib.ArrowInvalid:` error
- #
- # This makes parquet a safer option from the available filetypes.
-
- # Given
- s3_parquet_with_some_bool_col_local_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="S3_PARQUET_WITH_BOOL")
-
- ParquetWithSomeBool(source_config=s3_parquet_with_some_bool_col_local_config).write(df)
-
- # Then
- try:
- if caplog.messages:
- assert caplog.messages[0] == expected_warning
- assert pd.read_parquet(s3_parquet_with_some_bool_col_local_config.local.file_path)["bool_col"].dtype.name == expected_dtype
- finally:
- os.remove(s3_parquet_with_some_bool_col_local_config.local.file_path)
-
- @pytest.mark.unit
- @pytest.mark.parametrize(
- "df, expected_dtype, expected_warning",
- [
- (
- pd.DataFrame.from_records([{"id": 1, "foo_name": "A", "bar": 12, "bool_col": True}, {"id": 2, "foo_name": "B", "bar": 12, "bool_col": False}]),
- "bool",
- None,
- ),
- (
- pd.DataFrame.from_records([{"id": 1, "foo_name": "A", "bar": 12, "bool_col": True}, {"id": 2, "foo_name": "B", "bar": 12, "bool_col": None}]),
- "bool",
- CASTING_WARNING_MSG.format("bool_col", "bool", "object"),
- ),
- (
- pd.DataFrame.from_records([{"id": 1, "foo_name": "A", "bar": 12, "bool_col": True}, {"id": 2, "foo_name": "B", "bar": 12, "bool_col": np.NAN}]),
- "bool",
- CASTING_WARNING_MSG.format("bool_col", "bool", "object"),
- ),
- (
- pd.DataFrame.from_records([{"id": 1, "foo_name": "A", "bar": 12, "bool_col": True}, {"id": 2, "foo_name": "B", "bar": 12, "bool_col": 1}]),
- "bool",
- CASTING_WARNING_MSG.format("bool_col", "bool", "object"),
- ),
- (
- pd.DataFrame.from_records(
- [
- {"id": 1, "foo_name": "A", "bar": 12, "bool_col": True},
- {"id": 2, "foo_name": "B", "bar": 12, "bool_col": "random"},
- ]
- ),
- "bool",
- CASTING_WARNING_MSG.format("bool_col", "bool", "object"),
- ),
- (
- pd.DataFrame.from_records([{"id": 1, "foo_name": "A", "bar": 12, "bool_col": True}, {"id": 2, "foo_name": "B", "bar": 12, "bool_col": pd.NaT}]),
- "bool",
- CASTING_WARNING_MSG.format("bool_col", "bool", "object"),
- ),
- ],
- )
- def test__has_valid_dtypes_does_not_attempt_to_convert_object_type_to_other_type_unless_other_is_bool_and_column_has_no_non_boolean_values_when_writing_a_csv(
- self, caplog, df, expected_dtype, expected_warning
- ):
-
- # Given
- s3_csv_with_some_bool_col_local_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="S3_CSV_WITH_BOOL")
-
- CsvWithSomeBool(source_config=s3_csv_with_some_bool_col_local_config).write(df)
-
- # Then
- try:
- if caplog.messages:
- assert caplog.messages[0] == expected_warning
- assert pd.read_csv(s3_csv_with_some_bool_col_local_config.local.file_path)["bool_col"].dtype.name == expected_dtype
- finally:
- os.remove(s3_csv_with_some_bool_col_local_config.local.file_path)
-
- @pytest.mark.unit
- @pytest.mark.parametrize(
- "df, expected_dtype, expected_warning",
- [
- (
- pd.DataFrame.from_records([{"id": 1, "foo_name": "A", "bar": 12, "bool_col": True}, {"id": 2, "foo_name": "B", "bar": 12, "bool_col": False}]),
- "bool",
- None,
- ),
- (
- pd.DataFrame.from_records([{"id": 1, "foo_name": "A", "bar": 12, "bool_col": True}, {"id": 2, "foo_name": "B", "bar": 12, "bool_col": None}]),
- "bool",
- CASTING_WARNING_MSG.format("bool_col", "bool", "object"),
- ),
- (
- pd.DataFrame.from_records([{"id": 1, "foo_name": "A", "bar": 12, "bool_col": True}, {"id": 2, "foo_name": "B", "bar": 12, "bool_col": np.NAN}]),
- "bool",
- CASTING_WARNING_MSG.format("bool_col", "bool", "object"),
- ),
- (
- pd.DataFrame.from_records([{"id": 1, "foo_name": "A", "bar": 12, "bool_col": True}, {"id": 2, "foo_name": "B", "bar": 12, "bool_col": 1}]),
- "bool",
- CASTING_WARNING_MSG.format("bool_col", "bool", "object"),
- ),
- (
- pd.DataFrame.from_records(
- [
- {"id": 1, "foo_name": "A", "bar": 12, "bool_col": True},
- {"id": 2, "foo_name": "B", "bar": 12, "bool_col": "random"},
- ]
- ),
- "bool",
- CASTING_WARNING_MSG.format("bool_col", "bool", "object"),
- ),
- (
- pd.DataFrame.from_records([{"id": 1, "foo_name": "A", "bar": 12, "bool_col": True}, {"id": 2, "foo_name": "B", "bar": 12, "bool_col": pd.NaT}]),
- "bool",
- CASTING_WARNING_MSG.format("bool_col", "bool", "object"),
- ),
- ],
- )
- def test__has_valid_dtypes_does_not_attempt_to_convert_object_type_to_other_type_unless_other_is_bool_and_column_has_no_non_boolean_values_when_writing_a_hdf(
- self, caplog, df, expected_dtype, expected_warning
- ):
-
- # Given
- s3_hdf_with_some_bool_col_local_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="S3_HDF_WITH_BOOL")
-
- HdfWithSomeBool(source_config=s3_hdf_with_some_bool_col_local_config).write(df)
-
- # Then
- try:
- if caplog.messages:
- assert caplog.messages[0] == expected_warning
- assert pd.read_hdf(s3_hdf_with_some_bool_col_local_config.local.file_path)["bool_col"].dtype.name == expected_dtype
- finally:
- os.remove(s3_hdf_with_some_bool_col_local_config.local.file_path)
-
- @pytest.mark.unit
- @pytest.mark.parametrize(
- "df, expected_dtype, expected_warning",
- [
- (
- pd.DataFrame.from_records([{"id": 1, "foo_name": "A", "bar": 12, "bool_col": True}, {"id": 2, "foo_name": "B", "bar": 12, "bool_col": False}]),
- "bool",
- None,
- ),
- (
- pd.DataFrame.from_records([{"id": 1, "foo_name": "A", "bar": 12, "bool_col": True}, {"id": 2, "foo_name": "B", "bar": 12, "bool_col": None}]),
- "bool",
- CASTING_WARNING_MSG.format("bool_col", "bool", "object"),
- ),
- (
- pd.DataFrame.from_records([{"id": 1, "foo_name": "A", "bar": 12, "bool_col": True}, {"id": 2, "foo_name": "B", "bar": 12, "bool_col": np.NAN}]),
- "bool",
- CASTING_WARNING_MSG.format("bool_col", "bool", "object"),
- ),
- (
- pd.DataFrame.from_records([{"id": 1, "foo_name": "A", "bar": 12, "bool_col": True}, {"id": 2, "foo_name": "B", "bar": 12, "bool_col": 1}]),
- "bool",
- CASTING_WARNING_MSG.format("bool_col", "bool", "object"),
- ),
- (
- pd.DataFrame.from_records(
- [
- {"id": 1, "foo_name": "A", "bar": 12, "bool_col": True},
- {"id": 2, "foo_name": "B", "bar": 12, "bool_col": "random"},
- ]
- ),
- "bool",
- CASTING_WARNING_MSG.format("bool_col", "bool", "object"),
- ),
- (
- pd.DataFrame.from_records([{"id": 1, "foo_name": "A", "bar": 12, "bool_col": True}, {"id": 2, "foo_name": "B", "bar": 12, "bool_col": pd.NaT}]),
- "bool",
- CASTING_WARNING_MSG.format("bool_col", "bool", "object"),
- ),
- ],
- )
- def test__has_valid_dtypes_does_not_attempt_to_convert_object_type_to_other_type_unless_other_is_bool_and_column_has_no_non_boolean_values_when_writing_a_json(
- self, caplog, df, expected_dtype, expected_warning
- ):
-
- # Note: In the presence of a boolean cell value in a column, but with additional values of ambiguous type, df.to_json() will try to convert the column
- # to a type `int` or `float`, converting boolean values to numbers to `1.0 : True` and `0.0 : False`, and the rest to NaN. This can cause data corruption issues.
-
- # Given
- s3_json_with_some_bool_col_local_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="S3_JSON_WITH_BOOL")
-
- JsonWithSomeBool(source_config=s3_json_with_some_bool_col_local_config).write(df)
-
- # Then
- try:
- if caplog.messages:
- assert caplog.messages[0] == expected_warning
- assert pd.read_json(s3_json_with_some_bool_col_local_config.local.file_path)["bool_col"].dtype.name == expected_dtype
- finally:
- os.remove(s3_json_with_some_bool_col_local_config.local.file_path)
-
- @pytest.mark.unit
- @pytest.mark.parametrize(
- "df",
- [
- (pd.DataFrame.from_records([{"id": 1, "foo_name": "A", "bar": 12, "bool_col": True}, {"id": 2, "foo_name": "B", "bar": 12, "bool_col": pd.NA}])),
- (
- pd.DataFrame.from_records(
- [
- {"id": 1, "foo_name": "A", "bar": False, "bool_col": True},
- {"id": 2, "foo_name": "B", "bar": "BAD-VALUE", "bool_col": False},
- ]
- )
- ),
- ],
- )
- def test__has_valid_dtypes_throws_columns_data_type_error_when_casting_fails(self, df):
-
- # Note: In the presence of a boolean cell value in a column, but with additional values of ambiguous type, df.to_json() will try to convert the column
- # to a type `int` or `float`, converting boolean values to numbers to `1.0 : True` and `0.0 : False`, and the rest to NaN. This can cause data corruption issues.
-
- # Given
- s3_parquet_with_some_bool_col_local_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="S3_PARQUET_WITH_BOOL")
-
- # Then
- with pytest.raises(ColumnsDataTypeError):
- ParquetWithSomeBool(source_config=s3_parquet_with_some_bool_col_local_config).write(df)
-
- @pytest.mark.unit
- def test_a_custom_validate_method_can_be_used_to_override_the_default_abstract_one(self):
-
- # Given
- df = pd.DataFrame.from_records([{"id": 1, "foo_name": "A", "bar": 12, "bool_col": True}, {"id": 2, "foo_name": "B", "bar": 13, "bool_col": False}])
- s3_parquet_with_some_bool_col_local_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="S3_PARQUET_WITH_CUSTOM_VALIDATE")
-
- # When
- ParquetWithCustomValidate(source_config=s3_parquet_with_some_bool_col_local_config).write(df)
-
- # Then
- try:
- pd.testing.assert_frame_equal(pd.read_parquet(s3_parquet_with_some_bool_col_local_config.local.file_path), df)
- finally:
- os.remove(s3_parquet_with_some_bool_col_local_config.local.file_path)
-
- @pytest.mark.integration
- def test_show_casting_warnings_flag_default_value_prevents_showing_casting_logs(self, caplog):
- # Given
- s3_csv_cloud_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_S3_CSV")
- io_instance = ReadS3DataWithFalseTypes(source_config=s3_csv_cloud_config) # i.e.show_casting_warnings=False
-
- # When
- with caplog.at_level(logging.INFO):
- io_instance.read()
-
- # Then
- assert len(caplog.records) == 0
-
- @pytest.mark.integration
- def test_show_casting_warnings_flag_allows_casting_logs_to_be_printed_if_set_to_true(self, caplog):
- # Given
- s3_csv_cloud_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_S3_CSV")
- io_instance = ReadS3DataWithFalseTypes(source_config=s3_csv_cloud_config, show_casting_warnings=True)
-
- # When
- with caplog.at_level(logging.INFO):
- io_instance.read()
-
- # Then
- assert getattr(caplog.records[0], "message") == "Expected: 'float64' dtype for READ_S3_DATA_WITH_FALSE_TYPES['id]', found 'int64'"
-
- @pytest.mark.unit
- def test_options_are_read_from_code(self):
-
- # Given
- s3_parquet_local_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="S3_PARQUET_WITH_OPTIONS_IN_CODE")
-
- # When
- config_io = ReadS3ParquetIO(source_config=s3_parquet_local_config, option_1=False, option_2=True)
-
- # Then
- assert config_io.options == {"option_1": False, "option_2": True}
-
- @pytest.mark.unit
- def test_options_are_read_from_resource_definition(self):
- # Given
- s3_parquet_local_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="S3_PARQUET_WITH_OPTIONS_IN_DEFINITION")
-
- # When
- config_io = ReadS3ParquetIO(source_config=s3_parquet_local_config)
-
- # Then
- assert config_io.options == {"option_3": False, "option_4": True}
-
- @pytest.mark.unit
- def test_options_are_that_are_read_from_both_resource_definition_and_code_but_with_no_conflicts_are_merged(self):
- # Given
- s3_parquet_local_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="S3_PARQUET_WITH_OPTIONS_IN_DEFINITION")
-
- # When
- config_io = ReadS3ParquetIO(source_config=s3_parquet_local_config, option_1=False, option_2=True)
-
- # Then
- assert config_io.options == {"option_1": False, "option_2": True, "option_3": False, "option_4": True}
-
- @pytest.mark.unit
- def test_options_from_code_are_prioritized(self):
- # Given
- s3_parquet_local_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="S3_PARQUET_WITH_OPTIONS_IN_DEFINITION")
-
- # When
- config_io = ReadS3ParquetIO(source_config=s3_parquet_local_config, option_1=False, option_2=True, option_3=True) # option_3 is conflicting
-
- # Then
- assert config_io.options == {"option_1": False, "option_2": True, "option_3": True, "option_4": True}
-
- @pytest.mark.unit
- @pytest.mark.parametrize(
- "camel_case_string, expected_string",
- [
- ("TestStringABC", "TEST_STRING_ABC"),
- ("TestString", "TEST_STRING"),
- ("ThisIsAnotherTest", "THIS_IS_ANOTHER_TEST"),
- ("AbstractS3Test", "ABSTRACT_S3_TEST"),
- ("YetAnotherGREATTest", "YET_ANOTHER_GREAT_TEST"),
- ],
- )
- def test_transform_class_names_to_dataset_names(self, camel_case_string, expected_string):
- # Given/When
- transformed_string = DynamicDataIO._transform_class_name_to_dataset_name(camel_case_string) # pylint: disable=W0212
-
- assert transformed_string == expected_string
-
- @pytest.mark.unit
- def test_no_options_at_all_are_provided_with_no_issues(self):
-
- # Given
- s3_parquet_local_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="S3_PARQUET_WITH_OPTIONS_IN_CODE")
-
- # When
- config_io = ReadS3ParquetIO(source_config=s3_parquet_local_config)
-
- # Then
- assert config_io.options == {}
-
- @pytest.mark.unit
- def test_dataset_name_is_defined_by_io_class_if_schema_from_file_is_not_provided(self):
-
- # Given
- s3_parquet_local_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_S3_PARQUET")
-
- # When
- config_io = ReadS3ParquetIO(source_config=s3_parquet_local_config)
-
- # Then
- assert config_io.name == "READ_S3_PARQUET_IO"
-
- @pytest.mark.unit
- def test_dataset_name_is_inferred_from_schema_if_schema_from_file_is_provided(self):
-
- # Given
- s3_read_from_csv_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_S3_CSV")
-
- # When
- config_io = ReadS3CsvIO(source_config=s3_read_from_csv_config)
-
- # Then
- assert config_io.name == "READ_FROM_S3_CSV"
-
-
-class TestAsyncCoreIO:
- @pytest.mark.unit
- def test_read_is_called_through_async_read(self):
- # Given
- s3_csv_local_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_S3_CSV")
-
- # When
- with patch.object(dynamicio.core.DynamicDataIO, "read") as mock_read:
- mock_read.return_value = pd.DataFrame.from_records([[1, "name_a"]], columns=["id", "foo_name"])
- asyncio.run(ReadS3CsvIO(source_config=s3_csv_local_config).async_read())
-
- # Then
- mock_read.assert_called()
-
- @pytest.mark.unit
- @pytest.mark.asyncio
- async def test_write_is_called_through_async_write(self):
- # Given
- df = pd.DataFrame.from_dict({"id": [3, 2, 1, 0], "foo_name": ["a", "b", "c", "d"], "bar": [1, 2, 3, 4]})
-
- s3_csv_local_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="WRITE_TO_S3_CSV")
-
- # When
- with patch.object(dynamicio.core.DynamicDataIO, "write") as mock_write:
- await asyncio.gather(WriteS3CsvIO(source_config=s3_csv_local_config).async_write(df))
-
- # Then
- mock_write.assert_called()
-
- @pytest.mark.unit
- def test_async_read_does_indeed_operate_in_parallel(self):
- # Given
- s3_csv_local_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_S3_CSV")
-
- def dummy_read(self) -> pd.DataFrame: # pylint: disable=unused-argument
- time.sleep(0.1)
- return pd.DataFrame.from_records([[1, "name_a"]], columns=["id", "foo_name"])
-
- async def multi_read(config: Mapping[str, str]) -> Tuple:
- return await asyncio.gather(
- ReadS3CsvIO(source_config=config).async_read(),
- ReadS3CsvIO(source_config=config).async_read(),
- ReadS3CsvIO(source_config=config).async_read(),
- ReadS3CsvIO(source_config=config).async_read(),
- )
-
- # When
- with patch.object(dynamicio.core.DynamicDataIO, "read", new=dummy_read):
- start_time = time.time()
- asyncio.run(multi_read(s3_csv_local_config))
- duration = time.time() - start_time
-
- # Then
- assert duration < 0.125
-
- @pytest.mark.unit
- def test_async_write_does_indeed_operate_in_parallel(self):
- # Given
- df = pd.DataFrame.from_dict({"id": [3, 2, 1, 0], "foo_name": ["a", "b", "c", "d"], "bar": [1, 2, 3, 4]})
-
- s3_csv_local_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="WRITE_TO_S3_CSV")
-
- def dummy_write(self, _df: pd.DataFrame) -> bool: # pylint: disable=unused-argument
- time.sleep(0.1)
- return True
-
- async def multi_write(config: Mapping[str, str], _df: pd.DataFrame) -> Tuple:
- return await asyncio.gather(
- WriteS3CsvIO(source_config=config).async_write(_df),
- WriteS3CsvIO(source_config=config).async_write(_df),
- WriteS3CsvIO(source_config=config).async_write(_df),
- WriteS3CsvIO(source_config=config).async_write(_df),
- )
-
- # When
- with patch.object(dynamicio.core.DynamicDataIO, "read", new=dummy_write):
- start_time = time.time()
- asyncio.run(multi_write(s3_csv_local_config, df))
- duration = time.time() - start_time
-
- # Then
- assert duration < 0.125
diff --git a/tests/test_metrics.py b/tests/test_metrics.py
index 2a504d7..7f74c81 100644
--- a/tests/test_metrics.py
+++ b/tests/test_metrics.py
@@ -1,169 +1,51 @@
-# pylint: disable=missing-module-docstring, missing-class-docstring, missing-function-docstring, too-many-public-methods, too-few-public-methods
-import logging
-
-import pytest
-
-from dynamicio.metrics import Counts, CountsPerLabel, log_metric, Max, Mean, Min, Std, UniqueCounts, Variance
-
-
-@pytest.fixture(autouse=True, scope="module")
-def propagate_logger():
- # We need this because otherwise caplog can't capture the logs
- logging.getLogger("dynamicio.metrics").propagate = True
- yield
- logging.getLogger("dynamicio.metrics").propagate = False
-
-
-class TestMetricsLogger:
- @pytest.mark.unit
- def test_metric_logging_works_even_if_value_is_nan(self, caplog):
- # Given/ When
- with caplog.at_level(logging.INFO):
- log_metric(dataset="Test-DataSet", column="A", metric="B", value=float("nan"))
-
- # Then
- assert getattr(caplog.records[0], "message") == '{"message": "METRIC", "dataset": "Test-DataSet", "column": "A", "metric": "B", "value": NaN}'
-
- @pytest.mark.unit
- def test_metric_logging_works_even_if_value_is_inf(self, caplog):
- # Given/ When
- with caplog.at_level(logging.INFO):
- log_metric(dataset="Test-DataSet", column="A", metric="B", value=float("inf"))
-
- # Then
- assert getattr(caplog.records[0], "message") == '{"message": "METRIC", "dataset": "Test-DataSet", "column": "A", "metric": "B", "value": Infinity}'
-
-
-class TestMin:
- @pytest.mark.unit
- def test_metric_generation_and_logging(self, caplog, input_df):
- # Given
- df = input_df
- log_min = Min(dataset_name="Test-DataSet", df=df, column="weight_a")
-
- # When
- with caplog.at_level(logging.INFO):
- print() # keep this in for a better test output
- log_min()
-
- # Then
- assert getattr(caplog.records[0], "message") == '{"message": "METRIC", "dataset": "Test-DataSet", "column": "weight_a", "metric": "Min", "value": 5.0}'
-
-
-class TestMax:
- @pytest.mark.unit
- def test_metric_generation_and_logging(self, caplog, input_df):
- # Given
- df = input_df
- log_max = Max(dataset_name="Test-DataSet", df=df, column="weight_a")
-
- # When
- with caplog.at_level(logging.INFO):
- print()
- log_max()
-
- # Then
- assert getattr(caplog.records[0], "message") == '{"message": "METRIC", "dataset": "Test-DataSet", "column": "weight_a", "metric": "Max", "value": 9.0}'
-
-
-class TestMean:
- @pytest.mark.unit
- def test_metric_generation_and_logging(self, caplog, input_df):
- # Given
- df = input_df
- log_mean = Mean(dataset_name="Test-DataSet", df=df, column="weight_a")
-
- # When
- with caplog.at_level(logging.INFO):
- print()
- log_mean()
-
- # Then
- assert getattr(caplog.records[0], "message") == '{"message": "METRIC", "dataset": "Test-DataSet", "column": "weight_a", "metric": "Mean", "value": 6.6}'
-
-
-class TestStd:
- @pytest.mark.unit
- def test_metric_generation_and_logging(self, caplog, input_df):
- # Given
- df = input_df
- log_std = Std(dataset_name="Test-DataSet", df=df, column="weight_a")
-
- # When
- with caplog.at_level(logging.INFO):
- print()
- log_std()
-
- # Then
- assert getattr(caplog.records[0], "message") == '{"message": "METRIC", "dataset": "Test-DataSet", "column": "weight_a", "metric": "Std", "value": 1.429840705968481}'
-
-
-class TestVariance:
- @pytest.mark.unit
- def test_metric_generation_and_logging(self, caplog, input_df):
- # Given
- df = input_df
- log_var = Variance(dataset_name="Test-DataSet", df=df, column="weight_a")
-
- # When
- with caplog.at_level(logging.INFO):
- print()
- log_var()
-
- # Then
- assert getattr(caplog.records[0], "message") == '{"message": "METRIC", "dataset": "Test-DataSet", "column": "weight_a", "metric": "Variance", "value": 2.0444444444444443}'
-
-
-class TestCounts:
- @pytest.mark.unit
- def test_metric_generation_and_logging(self, caplog, input_df):
- # Given
- df = input_df
- log_counts = Counts(dataset_name="Test-DataSet", df=df, column="weight_a")
-
- # When
- with caplog.at_level(logging.INFO):
- print()
- log_counts()
-
- # Then
- assert getattr(caplog.records[0], "message") == '{"message": "METRIC", "dataset": "Test-DataSet", "column": "weight_a", "metric": "Counts", "value": 10.0}'
-
-
-class TestUniqueCounts:
- @pytest.mark.unit
- def test_metric_generation_and_logging(self, caplog, input_df):
- # Given
- df = input_df
- log_unique_counts = UniqueCounts(dataset_name="Test-DataSet", df=df, column="weight_a")
-
- # When
- with caplog.at_level(logging.INFO):
- print()
- log_unique_counts()
-
- # Then
- assert getattr(caplog.records[0], "message") == '{"message": "METRIC", "dataset": "Test-DataSet", "column": "weight_a", "metric": "UniqueCounts", "value": 5.0}'
-
-
-class TestCountsPerLabel:
- @pytest.mark.unit
- def test_metric_generation_and_logging(self, caplog, input_df):
- # Given
- df = input_df
- log_counts_per_label = CountsPerLabel(dataset_name="Test-DataSet", df=df, column="activity")
-
- # When
- with caplog.at_level(logging.INFO):
- print()
- log_counts_per_label()
-
- # Then
- assert (
- (len(caplog.records) == 3)
- and (getattr(caplog.records[0], "message") == '{"message": "METRIC", "dataset": "Test-DataSet", "column": "activity-discharge", "metric": "CountsPerLabel", "value": 5.0}')
- and (getattr(caplog.records[1], "message") == '{"message": "METRIC", "dataset": "Test-DataSet", "column": "activity-load", "metric": "CountsPerLabel", "value": 2.0}')
- and (
- getattr(caplog.records[2], "message") == '{"message": "METRIC", "dataset": "Test-DataSet", "column": "activity-pass_through", "metric": "CountsPerLabel", "value": 3.0}'
- )
+from unittest import mock
+from unittest.mock import call
+
+from pandera import Field, SchemaModel
+from pandera.typing import Series
+
+from dynamicio import ParquetResource
+from dynamicio.metrics import Metric
+from tests.constants import TEST_RESOURCES
+
+
+class ParquetSampleSchema(SchemaModel):
+ """Schema for sample parquet file."""
+
+ id: Series[int]
+ foo_name: Series[str] = Field(log_statistics={"metrics": [Metric.COUNTS_PER_LABEL]})
+ bar: Series[int] = Field(
+ log_statistics={
+ "metrics": [
+ Metric.MIN,
+ Metric.MAX,
+ Metric.MEAN,
+ Metric.STD,
+ Metric.VARIANCE,
+ Metric.COUNTS,
+ Metric.UNIQUE_COUNTS,
+ ]
+ }
+ )
+
+
+def test_metrics_logged_successfully():
+ test_path = TEST_RESOURCES / "data/input/parquet_sample.parquet"
+
+ resource = ParquetResource(path=test_path, pa_schema=ParquetSampleSchema)
+
+ with mock.patch("dynamicio.metrics.log_metric") as log_metric:
+ _ = resource.read()
+ log_metric.assert_has_calls(
+ [
+ call(column="foo_name", metric=Metric.COUNTS_PER_LABEL, value=8),
+ call(column="foo_name", metric=Metric.COUNTS_PER_LABEL, value=7),
+ call(column="bar", metric=Metric.MIN, value=1),
+ call(column="bar", metric=Metric.MAX, value=15),
+ call(column="bar", metric=Metric.MEAN, value=8),
+ call(column="bar", metric=Metric.STD, value=4.47213595499958),
+ call(column="bar", metric=Metric.VARIANCE, value=20),
+ call(column="bar", metric=Metric.COUNTS, value=15),
+ call(column="bar", metric=Metric.UNIQUE_COUNTS, value=15),
+ ]
)
diff --git a/tests/test_mixins/test_kafka_mixins.py b/tests/test_mixins/test_kafka_mixins.py
deleted file mode 100644
index 9fcb8fe..0000000
--- a/tests/test_mixins/test_kafka_mixins.py
+++ /dev/null
@@ -1,310 +0,0 @@
-# pylint: disable=no-member, missing-module-docstring, missing-class-docstring, missing-function-docstring, too-many-public-methods, too-few-public-methods, protected-access, C0103, C0302, R0801
-import os
-from unittest.mock import MagicMock, patch
-
-import pandas as pd
-import pytest
-from kafka import KafkaProducer
-
-import dynamicio.mixins.with_kafka
-
-from dynamicio.config import IOConfig
-from dynamicio.mixins import WithKafka
-from tests import constants
-from tests.mocking.io import (
- MockKafkaProducer,
- WriteKafkaIO,
-)
-
-
-class TestKafkaIO:
- @pytest.mark.unit
- @patch.object(dynamicio.mixins.with_kafka, "KafkaProducer")
- @patch.object(MockKafkaProducer, "send")
- def test_write_to_kafka_is_called_for_writing_an_iterable_of_dicts_with_env_as_cloud_kafka(self, mock__kafka_producer, mock__kafka_producer_send, input_messages_df):
- # Given
- def rows_generator(_df, chunk_size):
- _chunk = []
- for _, row in df.iterrows():
- _chunk.append(row.to_dict())
- if len(_chunk) == chunk_size:
- yield pd.DataFrame(_chunk)
- _chunk.clear()
-
- df = input_messages_df
-
- mock__kafka_producer.return_value = MockKafkaProducer()
- mock__kafka_producer_send.return_value = MagicMock()
-
- kafka_cloud_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")),
- env_identifier="CLOUD",
- dynamic_vars=constants,
- ).get(source_key="WRITE_TO_KAFKA_JSON")
-
- # When
- for chunk in rows_generator(_df=df, chunk_size=2):
- WriteKafkaIO(kafka_cloud_config).write(chunk)
- # Then
- assert mock__kafka_producer_send.call_count == 1
-
- @pytest.mark.unit
- @patch.object(dynamicio.mixins.with_kafka, "KafkaProducer")
- @patch.object(MockKafkaProducer, "send")
- def test_write_to_kafka_is_called_with_document_transformer_if_provided_for_writing_an_iterable_of_dicts_with_env_as_cloud_kafka(
- self, mock__kafka_producer, mock__kafka_producer_send, input_messages_df
- ):
- # Given
- def rows_generator(_df, chunk_size):
- _chunk = []
- for _, row in df.iterrows():
- _chunk.append(row.to_dict())
- if len(_chunk) == chunk_size:
- yield pd.DataFrame(_chunk)
- _chunk.clear()
-
- df = input_messages_df.iloc[[0]]
-
- mock__kafka_producer.return_value = MockKafkaProducer()
- mock__kafka_producer_send.return_value = MagicMock()
-
- kafka_cloud_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")),
- env_identifier="CLOUD",
- dynamic_vars=constants,
- ).get(source_key="WRITE_TO_KAFKA_JSON")
-
- # When
- for chunk in rows_generator(_df=df, chunk_size=2):
- WriteKafkaIO(kafka_cloud_config, document_transformer=lambda v: dict(**v, worked=True)).write(chunk)
- # Then
- mock__kafka_producer_send.assert_called_once_with(
- {
- "id": "message01",
- "foo": "xxxxxxxx",
- "bar": 0,
- "baz": ["a", "b", "c"],
- "worked": True,
- }
- )
-
- @pytest.mark.unit
- def test_kafka_producer_default_value_serialiser_is_used_unless_alternative_is_given(self, test_df):
- # Given
- kafka_cloud_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")),
- env_identifier="CLOUD",
- dynamic_vars=constants,
- ).get(source_key="WRITE_TO_KAFKA_JSON")
- write_kafka_io = WriteKafkaIO(kafka_cloud_config)
-
- # When
- with patch.object(dynamicio.mixins.with_kafka, "KafkaProducer") as mock__kafka_producer, patch.object(MockKafkaProducer, "send") as mock__kafka_producer_send:
- mock__kafka_producer.DEFAULT_CONFIG = KafkaProducer.DEFAULT_CONFIG
- mock__kafka_producer.return_value = MockKafkaProducer()
- mock__kafka_producer_send.return_value = MagicMock()
- write_kafka_io.write(test_df)
-
- # Then
- value_serializer = write_kafka_io._WithKafka__kafka_config.pop("value_serializer")
- assert "WithKafka._default_value_serializer" in str(value_serializer)
-
- @pytest.mark.unit
- def test_kafka_producer_default_key_serialiser_is_used_unless_alternative_is_given(self, test_df):
- # Given
- kafka_cloud_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")),
- env_identifier="CLOUD",
- dynamic_vars=constants,
- ).get(source_key="WRITE_TO_KAFKA_JSON")
- write_kafka_io = WriteKafkaIO(kafka_cloud_config)
-
- # When
- with patch.object(dynamicio.mixins.with_kafka, "KafkaProducer") as mock__kafka_producer, patch.object(MockKafkaProducer, "send") as mock__kafka_producer_send:
- mock__kafka_producer.DEFAULT_CONFIG = KafkaProducer.DEFAULT_CONFIG
- mock__kafka_producer.return_value = MockKafkaProducer()
- mock__kafka_producer_send.return_value = MagicMock()
- write_kafka_io.write(test_df)
-
- # Then
- key_serializer = write_kafka_io._WithKafka__kafka_config.pop("key_serializer")
- assert "WithKafka._default_key_serializer" in str(key_serializer)
-
- @pytest.mark.unit
- @patch.object(MockKafkaProducer, "send")
- @patch.object(dynamicio.mixins.with_kafka, "KafkaProducer")
- def test_kafka_producer_default_compression_type_is_snappy(self, mock__kafka_producer, mock__kafka_producer_send, test_df):
- # Given
- mock__kafka_producer.DEFAULT_CONFIG = KafkaProducer.DEFAULT_CONFIG
- mock__kafka_producer.return_value = MockKafkaProducer()
- mock__kafka_producer_send.return_value = MagicMock()
- kafka_cloud_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")),
- env_identifier="CLOUD",
- dynamic_vars=constants,
- ).get(source_key="WRITE_TO_KAFKA_JSON")
- write_kafka_io = WriteKafkaIO(kafka_cloud_config)
-
- # When
- write_kafka_io.write(test_df)
-
- # Then
- write_kafka_io._WithKafka__kafka_config.pop("value_serializer") # Removed as it returns a unique function identifier
- write_kafka_io._WithKafka__kafka_config.pop("key_serializer") # Removed as it returns a unique function identifier
- assert write_kafka_io._WithKafka__kafka_config == {"bootstrap_servers": "mock-kafka-server", "compression_type": "snappy"}
-
- @pytest.mark.unit
- @patch.object(MockKafkaProducer, "send")
- @patch.object(dynamicio.mixins.with_kafka, "KafkaProducer")
- def test_kafka_producer_options_are_replaced_by_the_user_options(self, mock__kafka_producer, mock__kafka_producer_send, test_df):
- # Given
- mock__kafka_producer.DEFAULT_CONFIG = KafkaProducer.DEFAULT_CONFIG
- mock__kafka_producer.return_value = MockKafkaProducer()
- mock__kafka_producer_send.return_value = MagicMock()
- kafka_cloud_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")),
- env_identifier="CLOUD",
- dynamic_vars=constants,
- ).get(source_key="WRITE_TO_KAFKA_JSON")
- write_kafka_io = WriteKafkaIO(kafka_cloud_config, compression_type="lz4", acks=2)
-
- # When
- write_kafka_io.write(test_df)
-
- # Then
- value_serializer = write_kafka_io._WithKafka__kafka_config.pop("value_serializer") # Removed as it returns a unique function identifier
- write_kafka_io._WithKafka__kafka_config.pop("key_serializer") # Removed as it returns a unique function identifier
- assert write_kafka_io._WithKafka__kafka_config == {
- "acks": 2,
- "bootstrap_servers": "mock-kafka-server",
- "compression_type": "lz4",
- } and "WithKafka._default_value_serializer" in str(value_serializer)
-
- @pytest.mark.unit
- def test_producer_send_method_sends_messages_with_index_as_key_by_default_if_a_keygen_is_not_provided(self, test_df):
- # Given
- kafka_cloud_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")),
- env_identifier="CLOUD",
- dynamic_vars=constants,
- ).get(source_key="WRITE_TO_KAFKA_JSON")
- write_kafka_io = WriteKafkaIO(kafka_cloud_config)
-
- # When
- with patch.object(dynamicio.mixins.with_kafka, "KafkaProducer") as mock__kafka_producer:
- mock__kafka_producer.DEFAULT_CONFIG = KafkaProducer.DEFAULT_CONFIG
- mock_producer = MockKafkaProducer()
- mock__kafka_producer.return_value = mock_producer
- write_kafka_io.write(test_df)
-
- # Then
- assert mock_producer.my_stream == [
- {"key": 0, "value": {"bar": 1000, "baz": "ABC", "foo": "id_1", "id": "cm_1"}},
- {"key": 1, "value": {"bar": 1000, "baz": "ABC", "foo": "id_2", "id": "cm_2"}},
- {"key": 2, "value": {"bar": 1000, "baz": "ABC", "foo": "id_3", "id": "cm_3"}},
- ]
-
- @pytest.mark.unit
- def test_producer_send_method_can_send_keyed_messages_using_a_custom_key_generator(self, test_df):
- # Given
- kafka_cloud_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")),
- env_identifier="CLOUD",
- dynamic_vars=constants,
- ).get(source_key="WRITE_TO_KAFKA_JSON")
- write_kafka_io = WriteKafkaIO(kafka_cloud_config, key_generator=lambda _, message: "XXX")
-
- # When
- with patch.object(dynamicio.mixins.with_kafka, "KafkaProducer") as mock__kafka_producer:
- mock__kafka_producer.DEFAULT_CONFIG = KafkaProducer.DEFAULT_CONFIG
- mock_producer = MockKafkaProducer()
- mock__kafka_producer.return_value = mock_producer
- write_kafka_io.write(test_df)
-
- # Then
- assert mock_producer.my_stream == [
- {"key": "XXX", "value": {"bar": 1000, "baz": "ABC", "foo": "id_1", "id": "cm_1"}},
- {"key": "XXX", "value": {"bar": 1000, "baz": "ABC", "foo": "id_2", "id": "cm_2"}},
- {"key": "XXX", "value": {"bar": 1000, "baz": "ABC", "foo": "id_3", "id": "cm_3"}},
- ]
-
- @pytest.mark.unit
- @pytest.mark.parametrize(
- "key, encoded_key",
- [
- (None, None),
- ("cacik", b"cacik"),
- ],
- )
- def test_default_key_serialiser_returns_none_if_key_is_not_provided_and_an_encoded_string_otherwise(self, key, encoded_key):
- # Given/When/Then
- assert encoded_key == WithKafka._default_key_serializer(key)
-
- @pytest.mark.unit
- @pytest.mark.parametrize(
- "value, encoded_value",
- [
- (None, b"null"),
- ({"a": 1, "b": "cacik"}, b'{"a": 1, "b": "cacik"}'),
- ({"a": 1, "b": None}, b'{"a": 1, "b": null}'),
- ],
- )
- def test_default_value_serialiser_returns_encoded_mapping_if_key_is_not_provided_and_an_encoded_string_otherwise(self, value, encoded_value):
- # Given/When/Then
- assert encoded_value == WithKafka._default_value_serializer(value)
-
- @pytest.mark.unit
- def test_default_key_generator_and_transformer_are_used_if_none_are_provided_by_the_user(self):
- # Given
- keyed_test_df = pd.DataFrame.from_records(
- [
- ["key-01", "cm_1", "id_1", 1000, "ABC"],
- ["key-01", "cm_2", "id_2", 1000, "ABC"], # <-- index is non-unique
- ["key-02", "cm_3", "id_3", 1000, "ABC"],
- ],
- columns=["key", "id", "foo", "bar", "baz"],
- ).set_index("key")
- kafka_cloud_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")),
- env_identifier="CLOUD",
- dynamic_vars=constants,
- ).get(source_key="WRITE_TO_KAFKA_JSON")
- write_kafka_io = WriteKafkaIO(kafka_cloud_config)
-
- # When
- with patch.object(dynamicio.mixins.with_kafka, "KafkaProducer") as mock__kafka_producer:
- mock__kafka_producer.DEFAULT_CONFIG = KafkaProducer.DEFAULT_CONFIG
- mock_producer = MockKafkaProducer()
- mock__kafka_producer.return_value = mock_producer
-
- # When
- write_kafka_io.write(keyed_test_df)
- assert (write_kafka_io._WithKafka__key_generator("idx", "value") == "idx") and (write_kafka_io._WithKafka__document_transformer("value") == "value")
-
- @pytest.mark.unit
- def test_custom_key_generator_and_transformer_are_used_if_they_are_provided_by_the_user(self):
- # Given
- keyed_test_df = pd.DataFrame.from_records(
- [
- ["key-01", "cm_1", "id_1", 1000, "ABC"],
- ["key-01", "cm_2", "id_2", 1000, "ABC"],
- ["key-02", "cm_3", "id_3", 1000, "ABC"],
- ],
- columns=["key", "id", "foo", "bar", "baz"],
- ).set_index("key")
- kafka_cloud_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")),
- env_identifier="CLOUD",
- dynamic_vars=constants,
- ).get(source_key="WRITE_TO_KAFKA_JSON")
- write_kafka_io = WriteKafkaIO(kafka_cloud_config, key_generator=lambda idx, _: "xxx", document_transformer=lambda _: "xxx")
-
- # When
- with patch.object(dynamicio.mixins.with_kafka, "KafkaProducer") as mock__kafka_producer:
- mock__kafka_producer.DEFAULT_CONFIG = KafkaProducer.DEFAULT_CONFIG
- mock_producer = MockKafkaProducer()
- mock__kafka_producer.return_value = mock_producer
-
- # When
- write_kafka_io.write(keyed_test_df)
- assert (write_kafka_io._WithKafka__key_generator("idx", "value") == "xxx") and (write_kafka_io._WithKafka__document_transformer("value") == "xxx")
diff --git a/tests/test_mixins/test_local_mixins.py b/tests/test_mixins/test_local_mixins.py
deleted file mode 100644
index 39732b6..0000000
--- a/tests/test_mixins/test_local_mixins.py
+++ /dev/null
@@ -1,795 +0,0 @@
-# pylint: disable=no-member, missing-module-docstring, missing-class-docstring, missing-function-docstring, too-many-public-methods, too-few-public-methods, protected-access, C0103, C0302, R0801
-import asyncio
-import os
-import time
-from typing import Mapping, Tuple
-from unittest.mock import patch
-
-import numpy as np
-import pandas as pd
-import pytest
-
-import dynamicio.mixins.utils
-import dynamicio.mixins.with_local
-
-from dynamicio.config import IOConfig
-from tests import constants
-from tests.conftest import max_pklproto_hdf
-from tests.constants import TEST_RESOURCES
-from tests.mocking.io import (
- AsyncReadS3HdfIO,
- ReadFromBatchLocalHdf,
- ReadFromBatchLocalParquet,
- ReadPostgresIO,
- ReadS3CsvIO,
- ReadS3DataWithLessColumnsAndMessedOrderOfColumnsIO,
- ReadS3DataWithLessColumnsIO,
- ReadS3HdfIO,
- ReadS3JsonIO,
- ReadS3ParquetIO,
- TemplatedFile,
- WriteKafkaIO,
- WritePostgresIO,
- WriteS3CsvIO,
- WriteS3HdfIO,
- WriteS3ParquetIO,
-)
-from tests.mocking.models import ERModel
-
-
-class TestLocalIO:
- @pytest.mark.unit
- def test_read_parquet_pandas_reader_will_only_load_columns_in_schema(self, expected_df_with_less_columns):
- # Given
- # source data read from: "[[ TEST_RESOURCES ]]/data/input/some_parquet_to_read.parquet"
- s3_parquet_local_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_S3_PARQUET")
-
- # When
- s3_parquet_df = ReadS3DataWithLessColumnsIO(source_config=s3_parquet_local_config).read()
-
- # Then
- assert expected_df_with_less_columns.equals(s3_parquet_df)
-
- @pytest.mark.unit
- def test_read_json_pandas_reader_will_maintain_columns_order_of_the_original_dataset_when_filtering_out_columns(
- self,
- ):
- # Given
- # source data read from: "[[ TEST_RESOURCES ]]/data/definitions/external.yaml/json_with_more_columns.json"
- s3_json_local_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/external.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_S3_JSON")
-
- # When
- s3_json_df = ReadS3DataWithLessColumnsAndMessedOrderOfColumnsIO(source_config=s3_json_local_config).read()
-
- # Then
- assert s3_json_df.columns.to_list() == ["foo_name", "bar", "bar_type", "a_number", "b_number"]
-
- @pytest.mark.unit
- def test_read_hdf_pandas_reader_will_maintain_columns_order_of_the_original_dataset_when_filtering_out_columns(
- self,
- ):
- # Given
- # source data read from: "[[ TEST_RESOURCES ]]/data/definitions/external.yaml/h5_with_more_columns.h5"
- s3_hdf_local_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/external.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_S3_HDF")
-
- # When
- s3_hdf_df = ReadS3DataWithLessColumnsAndMessedOrderOfColumnsIO(source_config=s3_hdf_local_config).read()
-
- # Then
- assert s3_hdf_df.columns.to_list() == ["foo_name", "bar", "bar_type", "a_number", "b_number"]
-
- @pytest.mark.unit
- def test_read_csv_pandas_reader_will_only_load_columns_in_schema(self, expected_df_with_less_columns):
- # Given
- # source data read from: "[[ TEST_RESOURCES ]]/data/input/some_csv_to_read.csv"
- s3_csv_local_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_S3_CSV_ALT")
-
- # When
- s3_csv_df = ReadS3DataWithLessColumnsIO(source_config=s3_csv_local_config).read()
-
- # Then
- assert expected_df_with_less_columns.equals(s3_csv_df)
-
- @pytest.mark.unit
- def test_read_h5_pandas_reader_will_only_load_columns_in_schema(self, expected_df_with_less_columns):
- # Given
- # source data read from: "[[ TEST_RESOURCES ]]/data/input/some_hdf_to_read.h5"
- s3_parquet_local_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_S3_HDF")
-
- # When
- s3_hdf_df = ReadS3DataWithLessColumnsIO(source_config=s3_parquet_local_config).read()
-
- # Then
- assert expected_df_with_less_columns.equals(s3_hdf_df)
-
- @pytest.mark.unit
- def test_read_json_pandas_reader_will_only_load_columns_in_schema(self, expected_df_with_less_columns):
- # Given
- # source data read from: "[[ TEST_RESOURCES ]]/data/input/some_json_to_read.json"
- s3_json_local_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_S3_JSON")
-
- # When
- s3_json_df = ReadS3DataWithLessColumnsIO(source_config=s3_json_local_config).read()
-
- # Then
- assert expected_df_with_less_columns.equals(s3_json_df)
-
- @pytest.mark.unit
- def test_read_json_pandas_reader_will_only_filter_out_columns_not_in_schema(self, expected_df_with_less_columns):
- # Given
- s3_json_local_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_S3_JSON")
-
- # When
- s3_json_df = ReadS3DataWithLessColumnsIO(source_config=s3_json_local_config).read()
-
- # Then
- assert expected_df_with_less_columns.equals(s3_json_df)
-
- @pytest.mark.unit
- def test_read_hdf_pandas_reader_will_only_filter_out_columns_not_in_schema(self, expected_df_with_less_columns):
- # Given
- s3_hdf_local_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_S3_HDF")
-
- # When
- s3_hdf_df = ReadS3DataWithLessColumnsIO(source_config=s3_hdf_local_config).read()
-
- # Then
- assert expected_df_with_less_columns.equals(s3_hdf_df)
-
- @pytest.mark.unit
- @patch.object(dynamicio.mixins.with_local.WithLocal, "_read_from_local")
- def test_local_reader_is_called_for_loading_any_file_when_env_is_set_to_local(self, mock__read_from_local, expected_s3_csv_df):
- # Given
- s3_csv_local_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_S3_CSV")
- mock__read_from_local.return_value = expected_s3_csv_df
-
- # When
- ReadS3CsvIO(source_config=s3_csv_local_config).read()
-
- # Then
- mock__read_from_local.assert_called()
-
- @pytest.mark.unit
- def test_a_local_parquet_file_is_loaded_when_io_config_is_initialised_with_local_env_and_parquet_file_type(self, test_df):
- # Given
- pg_parquet_local_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_POSTGRES")
-
- # When
- pg_parquet_df = ReadPostgresIO(source_config=pg_parquet_local_config).read()
-
- # Then
- assert test_df.equals(pg_parquet_df)
-
- @pytest.mark.unit
- def test_a_local_h5_file_is_loaded_when_io_config_is_initialised_with_local_env_and_hdf_file_type(self, expected_s3_hdf_df):
- # Given
- s3_hdf_local_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_S3_HDF")
-
- # When
- s3_hdf_df = ReadS3HdfIO(source_config=s3_hdf_local_config).read()
-
- # Then
- assert expected_s3_hdf_df.equals(s3_hdf_df)
-
- @pytest.mark.unit
- def test_a_local_json_file_is_loaded_when_io_config_is_initialised_with_local_env_and_json_file_type(self, expected_s3_json_df):
- # Given
- s3_json_local_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_S3_JSON")
-
- # When
- options = {"orient": "columns"}
- s3_json_df = ReadS3JsonIO(source_config=s3_json_local_config, **options).read()
-
- # Then
- assert expected_s3_json_df.equals(s3_json_df)
-
- @pytest.mark.unit
- def test_a_local_csv_file_is_loaded_when_io_config_is_initialised_with_local_env_and_csv_file_type(self, expected_s3_csv_df):
- # Given
- s3_csv_local_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_S3_CSV")
-
- # When
- s3_csv_df = ReadS3CsvIO(source_config=s3_csv_local_config).read()
-
- # Then
- assert expected_s3_csv_df.equals(s3_csv_df)
-
- @pytest.mark.unit
- def test_a_local_parquet_file_is_loaded_when_io_config_is_set_with_local_env_a_parquet_file_type_for_postgres(self, test_df):
- # Given
- pg_local_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_POSTGRES")
-
- # When
- pg_df = ReadPostgresIO(source_config=pg_local_config, model=ERModel).read()
-
- # Then
- assert test_df.equals(pg_df)
-
- @pytest.mark.unit
- @patch.object(dynamicio.mixins.with_local.WithLocal, "_write_to_local")
- def test_local_writer_is_called_for_writing_any_file_when_env_is_set_to_local(self, mock__write_to_local):
- # Given
- df = pd.DataFrame.from_dict({"id": [3, 2, 1, 0], "foo_name": ["a", "b", "c", "d"], "bar": [1, 2, 3, 4]})
-
- s3_csv_local_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="WRITE_TO_S3_CSV")
-
- # When
- WriteS3CsvIO(source_config=s3_csv_local_config).write(df)
-
- # Then
- mock__write_to_local.assert_called()
-
- @pytest.mark.unit
- def test_a_df_is_written_locally_as_parquet_when_io_config_is_initialised_with_local_env_value_and_parquet_file_type(
- self,
- test_df,
- ):
- # Given
- df = test_df
-
- pg_parquet_local_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="WRITE_TO_PG_PARQUET")
-
- # When
- WritePostgresIO(source_config=pg_parquet_local_config).write(df)
-
- # Then
- try:
- assert os.path.isfile(pg_parquet_local_config.local.file_path)
- finally:
- os.remove(pg_parquet_local_config.local.file_path)
-
- @pytest.mark.unit
- def test_a_df_is_written_locally_as_csv_when_io_config_is_initialised_with_local_env_value_and_csv_file_type(
- self,
- ):
- # Given
- df = pd.DataFrame.from_dict({"id": [3, 2, 1, 0], "foo_name": ["a", "b", "c", "d"], "bar": [1, 2, 3, 4]})
-
- s3_csv_local_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="WRITE_TO_S3_CSV")
-
- # When
- WriteS3CsvIO(source_config=s3_csv_local_config).write(df)
-
- # Then
- try:
- assert os.path.isfile(s3_csv_local_config.local.file_path)
- finally:
- os.remove(s3_csv_local_config.local.file_path)
-
- @pytest.mark.unit
- def test_a_df_is_written_locally_as_json_when_io_config_is_initialised_with_local_env_value_and_json_file_type(self, input_messages_df):
- # Given
- df = input_messages_df
-
- kafka_json_local_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="WRITE_TO_KAFKA_JSON")
-
- # When
- WriteKafkaIO(source_config=kafka_json_local_config).write(df)
-
- # Then
- try:
- assert os.path.isfile(kafka_json_local_config.local.file_path)
- finally:
- os.remove(kafka_json_local_config.local.file_path)
-
- @pytest.mark.unit
- def test_a_df_is_written_locally_as_h5_when_io_config_is_initialised_with_local_env_value_and_hdf_file_type(
- self,
- ):
- # Given
- df = pd.DataFrame.from_dict({"col_1": [3, 2, 1, 0], "col_2": ["a", "b", "c", "d"]})
-
- s3_hdf_local_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="WRITE_TO_S3_HDF")
-
- # When
- WriteS3HdfIO(source_config=s3_hdf_local_config).write(df)
-
- # Then
- try:
- assert os.path.isfile(s3_hdf_local_config.local.file_path)
- finally:
- os.remove(s3_hdf_local_config.local.file_path)
-
- @pytest.mark.unit
- def test_dynamicio_default_pickle_protocol_is_4(
- self,
- ):
- # Given
- df = pd.DataFrame.from_dict({"col_1": [3, 2, 1, 0], "col_2": ["a", "b", "c", "d"]})
-
- s3_hdf_local_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="WRITE_TO_S3_HDF")
-
- # When
- WriteS3HdfIO(source_config=s3_hdf_local_config).write(df)
-
- # Then
- try:
- assert max_pklproto_hdf(s3_hdf_local_config.local.file_path) == 4
- finally:
- os.remove(s3_hdf_local_config.local.file_path)
-
- @pytest.mark.unit
- def test_dynamicio_default_pickle_protocol_is_bypassed_by_user_input(
- self,
- ):
- # Given
- df = pd.DataFrame.from_dict({"col_1": [3, 2, 1, 0], "col_2": ["a", "b", "c", "d"]})
-
- s3_hdf_local_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="WRITE_TO_S3_HDF")
-
- # When
- WriteS3HdfIO(source_config=s3_hdf_local_config, protocol=5).write(df)
-
- # Then
- try:
- assert max_pklproto_hdf(s3_hdf_local_config.local.file_path) == 5
- finally:
- os.remove(s3_hdf_local_config.local.file_path)
-
- @pytest.mark.unit
- def test_read_resolves_file_path_if_templated_for_some_input_data(self):
- # source data read from: "[[ TEST_RESOURCES ]]/data/input/some_csv_to_read.parquet"
- config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="TEMPLATED_FILE_PATH")
-
- io_object = TemplatedFile(source_config=config, file_name_to_replace="some_csv_to_read")
-
- with patch.object(io_object, "_read_csv_file") as mocked__read_csv_file:
- mocked__read_csv_file.return_value = pd.read_csv(os.path.join(TEST_RESOURCES, "data/input/some_csv_to_read.csv"))
- io_object.read()
-
- mocked__read_csv_file.assert_called_once_with(
- config.local.file_path.format(file_name_to_replace="some_csv_to_read"),
- io_object.schema,
- )
-
- @pytest.mark.unit
- def test_write_resolves_file_path_if_templated_for_some_output_data(self):
- # source data read from: "[[ TEST_RESOURCES ]]/data/input/some_csv_to_read.parquet"
- config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="TEMPLATED_FILE_PATH")
-
- io_object = TemplatedFile(source_config=config, file_name_to_replace="some_csv_to_read")
-
- df = pd.read_csv(os.path.join(TEST_RESOURCES, "data/input/some_csv_to_read.csv"))
- with patch.object(io_object, "_write_csv_file") as mocked__write_csv_file:
- io_object.write(df)
-
- mocked__write_csv_file.assert_called_once()
- (called_with_df, called_with_file_path) = mocked__write_csv_file.call_args[0]
- pd.testing.assert_frame_equal(df, called_with_df)
- assert called_with_file_path == config.local.file_path.format(file_name_to_replace="some_csv_to_read")
-
- @pytest.mark.integration
- def test_local_writers_only_write_out_castable_columns_according_to_the_io_schema_case_float64_to_int64_id(
- self,
- ):
-
- # Given
- # Note col_1 will be interpreted with type float64
- input_df = pd.DataFrame.from_dict({"col_1": [3.0, 2.0, 1.0], "col_2": ["a", "b", "c"], "col_3": ["a", "b", "c"]})
-
- s3_parquet_local_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="WRITE_TO_S3_PARQUET")
-
- # When
- # class WriteS3ParquetIO(DynamicDataIO):
- # schema = {"col_1": "int64", "col_2": "object"}
- #
- # @staticmethod
- # def validate(df: pd.DataFrame):
- # pass
- write_s3_io = WriteS3ParquetIO(source_config=s3_parquet_local_config)
- write_s3_io.write(input_df)
-
- # # Then
- try:
- output_df = pd.read_parquet(s3_parquet_local_config.local.file_path)
- assert list(output_df.dtypes) == [
- np.dtype("int64"),
- np.dtype("O"),
- ] # order of the list matters
- finally:
- os.remove(s3_parquet_local_config.local.file_path)
-
- @pytest.mark.integration
- def test_local_writers_only_write_out_columns_in_a_provided_io_schema(self):
-
- # Given
- input_df = pd.DataFrame.from_dict({"col_1": [3, 2, 1], "col_2": ["a", "b", "c"], "col_3": ["a", "b", "c"]})
-
- s3_parquet_local_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="WRITE_TO_S3_PARQUET")
-
- # When
- # class WriteS3ParquetIO(DynamicDataIO):
- # schema = {"col_1": "int64", "col_2": "object"}
- #
- # @staticmethod
- # def validate(df: pd.DataFrame):
- # pass
- write_s3_io = WriteS3ParquetIO(source_config=s3_parquet_local_config)
- write_s3_io.write(input_df)
-
- # Then
- try:
- output_df = pd.read_parquet(s3_parquet_local_config.local.file_path)
- no_of_columns_of_output_df = len(list(output_df.columns))
- no_of_columns_of_input_df = len(list(input_df.columns))
- assert (no_of_columns_of_input_df - no_of_columns_of_output_df == 1) and (set(output_df.columns) == {*write_s3_io.schema.columns.keys()}) # pylint: disable=no-member
- finally:
- os.remove(s3_parquet_local_config.local.file_path)
-
- @pytest.mark.unit
- def test_pyarrow_is_used_as_backend_parquet(self):
-
- # When
- implementation = dynamicio.mixins.with_local.pd.io.parquet.get_engine("auto")
-
- # Then
- assert implementation.__class__.__name__ == "PyArrowImpl"
-
- @pytest.mark.integration
- def test_write_parquet_file_is_called_with_additional_pyarrow_args(self):
-
- # Given
- input_df = pd.DataFrame.from_dict({"col_1": [3, 2, 1], "col_2": ["a", "b", "c"], "col_3": ["a", "b", "c"]})
-
- s3_parquet_local_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="WRITE_TO_S3_PARQUET")
-
- to_parquet_kwargs = {
- "use_deprecated_int96_timestamps": False,
- "coerce_timestamps": "ms",
- "allow_truncated_timestamps": True,
- "row_group_size": 1000000,
- }
-
- # When
- with patch.object(dynamicio.mixins.with_local.pd.DataFrame, "to_parquet") as mocked__to_parquet:
- write_s3_io = WriteS3ParquetIO(source_config=s3_parquet_local_config, **to_parquet_kwargs)
- write_s3_io.write(input_df)
-
- # Then
- mocked__to_parquet.assert_called_once_with(os.path.join(constants.TEST_RESOURCES, "data/processed/write_some_parquet.parquet"), **to_parquet_kwargs)
-
- @pytest.mark.integration
- @patch.object(dynamicio.mixins.with_local.pd, "read_parquet")
- def test_read_parquet_file_is_called_with_additional_pyarrow_args(self, mock__read_parquet):
-
- # Given
- config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="S3_PARQUET_WITH_OPTIONS_IN_CODE")
-
- read_parquet_kwargs = {"filters": [("a", "<", "2")]}
-
- # When
- ReadFromBatchLocalParquet(config, **read_parquet_kwargs).read()
- # Then
- mock__read_parquet.assert_called_once_with(config.local.file_path, columns=["id", "foo_name", "bar"], **read_parquet_kwargs)
-
- @pytest.mark.unit
- def test_read_with_pyarrow_is_called_as_default_when_no_engine_option_is_provided(self):
- # Given
- config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_S3_PARQUET")
-
- # When
- with patch.object(dynamicio.mixins.with_local.WithLocal, "_WithLocal__read_with_pyarrow") as mocked__read_with_pyarrow:
- ReadS3ParquetIO(config).read()
-
- # Then
- mocked__read_with_pyarrow.assert_called_once_with(config.local.file_path, columns=["id", "foo_name", "bar"])
-
- @pytest.mark.unit
- def test_read_with_pyarrow_is_called_when_engine_option_is_set_to_pyarrow(self):
- # Given
- config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_S3_PARQUET")
-
- # When
- with patch.object(dynamicio.mixins.with_local.WithLocal, "_WithLocal__read_with_pyarrow") as mocked__read_with_pyarrow:
- ReadS3ParquetIO(config, engine="pyarrow").read()
-
- # Then
- mocked__read_with_pyarrow.assert_called_once_with(config.local.file_path, engine="pyarrow", columns=["id", "foo_name", "bar"])
-
- @pytest.mark.unit
- def test_read_with_fastparquet_is_called_when_engine_option_is_set_to_fastparquet(self):
- # Given
- config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_S3_PARQUET")
-
- # When
- with patch.object(dynamicio.mixins.with_local.WithLocal, "_WithLocal__read_with_fastparquet") as mocked__read_with_fastparquet:
- ReadS3ParquetIO(config, engine="fastparquet").read()
-
- # Then
- mocked__read_with_fastparquet.assert_called_once_with(config.local.file_path, engine="fastparquet", columns=["id", "foo_name", "bar"])
-
- @pytest.mark.unit
- def test_write_with_pyarrow_is_called_as_default_when_no_engine_option_is_provided(self):
- # Given
- input_df = pd.DataFrame.from_dict({"col_1": [3, 2, 1], "col_2": ["a", "b", "c"], "col_3": ["a", "b", "c"]})
-
- config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="WRITE_TO_S3_PARQUET")
-
- # When
- with patch.object(dynamicio.mixins.with_local.WithLocal, "_WithLocal__write_with_pyarrow") as mocked__write_with_pyarrow:
- WriteS3ParquetIO(config).write(input_df)
-
- # Then
- mocked__write_with_pyarrow.assert_called()
-
- @pytest.mark.unit
- def test_write_with_pyarrow_is_called_when_engine_option_is_set_to_pyarrow(self):
- # Given
- input_df = pd.DataFrame.from_dict({"col_1": [3, 2, 1], "col_2": ["a", "b", "c"], "col_3": ["a", "b", "c"]})
-
- config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="WRITE_TO_S3_PARQUET")
-
- # When
- with patch.object(dynamicio.mixins.with_local.WithLocal, "_WithLocal__write_with_pyarrow") as mocked__write_with_pyarrow:
- WriteS3ParquetIO(config, engine="pyarrow").write(input_df)
-
- # Then
- mocked__write_with_pyarrow.assert_called()
-
- @pytest.mark.unit
- def test_write_with_fastparquet_is_called_when_engine_option_is_set_to_fastparquet(self):
- # Given
- input_df = pd.DataFrame.from_dict({"col_1": [3, 2, 1], "col_2": ["a", "b", "c"], "col_3": ["a", "b", "c"]})
-
- config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="WRITE_TO_S3_PARQUET")
-
- # When
- with patch.object(dynamicio.mixins.with_local.WithLocal, "_WithLocal__write_with_fastparquet") as mocked__write_with_fastparquet:
- WriteS3ParquetIO(config, engine="fastparquet").write(input_df)
-
- # Then
- mocked__write_with_fastparquet.assert_called()
-
- @pytest.mark.unit
- def test_async_read_does_not_operate_in_parallel_for_hdf_files(self):
-
- # Given
- s3_hdf_cloud_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_S3_HDF")
-
- async def multi_read(config: Mapping[str, str]) -> Tuple:
- return await asyncio.gather(
- AsyncReadS3HdfIO(source_config=config).async_read(),
- AsyncReadS3HdfIO(source_config=config).async_read(),
- )
-
- def dummy_read_hdf(*args, **kwargs) -> pd.DataFrame: # pylint: disable=unused-argument
- time.sleep(0.1)
- return pd.DataFrame.from_dict({"col_1": [3, 2, 1, 0], "col_2": ["a", "b", "c", "d"]})
-
- # When
- with patch.object(dynamicio.mixins.with_local.pd, "read_hdf", new=dummy_read_hdf):
- start_time = time.time()
- asyncio.run(multi_read(s3_hdf_cloud_config))
- duration = time.time() - start_time
-
- # Then
- assert duration >= 0.2
-
- @pytest.mark.unit
- def test_async_write_does_not_operate_in_parallel_for_hdf_files(self):
-
- # Given
- df = pd.DataFrame.from_dict({"col_1": [3, 2, 1, 0], "col_2": ["a", "b", "c", "d"]})
- s3_hdf_local_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="WRITE_TO_S3_HDF")
-
- async def multi_write(config: Mapping[str, str], _df: pd.DataFrame) -> Tuple:
- return await asyncio.gather(WriteS3HdfIO(source_config=config).async_write(_df), WriteS3HdfIO(source_config=config).async_write(_df))
-
- @dynamicio.mixins.utils.allow_options([*dynamicio.mixins.utils.args_of(pd.DataFrame.to_hdf), *["protocol"]])
- def dummy_to_hdf(*args, **kwargs): # pylint: disable=unused-argument
- time.sleep(0.1)
-
- # When
- with patch.object(dynamicio.mixins.with_local.pd.DataFrame, "to_hdf", new=dummy_to_hdf):
- start_time = time.time()
- asyncio.run(multi_write(s3_hdf_local_config, df))
- duration = time.time() - start_time
-
- # Then
- assert duration >= 0.2
-
-
-class TestBatchLocal:
- @pytest.mark.unit
- def test_multiple_files_are_loaded_when_batch_local_type_is_used_for_parquet(self, expected_s3_parquet_df):
- # Given
- parquet_local_batch_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_BATCH_LOCAL_PARQUET")
- expected_concatenated_df = expected_s3_parquet_df
-
- # When
- concatenated_df = ReadFromBatchLocalParquet(source_config=parquet_local_batch_config).read()
-
- # Then
- pd.testing.assert_frame_equal(expected_concatenated_df, concatenated_df)
-
- @pytest.mark.unit
- def test_files_that_dont_comply_to_the_provided_file_type_are_ignored(self, expected_s3_parquet_df):
- # Given
- parquet_local_batch_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_BATCH_LOCAL_NOT_JUST_PARQUET")
- expected_concatenated_df = expected_s3_parquet_df
-
- # When
- concatenated_df = ReadFromBatchLocalParquet(source_config=parquet_local_batch_config).read()
-
- # Then
- pd.testing.assert_frame_equal(expected_concatenated_df, concatenated_df)
-
- @pytest.mark.unit
- def test_if_hdf_file_is_chosen_then_file_type_is_converted_to_h5_for_filtering(self, expected_s3_parquet_df):
- # Given
- parquet_local_batch_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_BATCH_LOCAL_NOT_JUST_PARQUET")
- expected_concatenated_df = expected_s3_parquet_df
-
- # When
- concatenated_df = ReadFromBatchLocalParquet(source_config=parquet_local_batch_config).read()
-
- # Then
- pd.testing.assert_frame_equal(expected_concatenated_df, concatenated_df)
-
- @pytest.mark.unit
- def test_multiple_files_are_loaded_when_batch_local_type_is_used_for_hdf(self, expected_s3_hdf_df):
- # Given
- parquet_local_batch_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_BATCH_LOCAL_HDF")
- expected_concatenated_df = expected_s3_hdf_df
-
- # When
- concatenated_df = ReadFromBatchLocalHdf(source_config=parquet_local_batch_config).read()
-
- # Then
- pd.testing.assert_frame_equal(expected_concatenated_df, concatenated_df.sort_values(by="id").reset_index(drop=True))
diff --git a/tests/test_mixins/test_mixin_utils.py b/tests/test_mixins/test_mixin_utils.py
deleted file mode 100644
index c79c224..0000000
--- a/tests/test_mixins/test_mixin_utils.py
+++ /dev/null
@@ -1,159 +0,0 @@
-# pylint: disable=no-member, missing-module-docstring, missing-class-docstring, missing-function-docstring, too-many-public-methods, too-few-public-methods, protected-access, C0103, C0302, R0801
-import os
-from typing import Any
-
-import pytest
-
-from dynamicio.config import IOConfig
-from dynamicio.mixins.utils import allow_options, args_of, get_string_template_field_names, resolve_template
-from tests import constants
-from tests.mocking.io import (
- ReadS3CsvIO,
-)
-
-
-class TestGetStringTemplateFieldNames:
- @pytest.mark.unit
- @pytest.mark.parametrize(
- ["s", "expected_result"],
- [
- ("", []),
- ("abc", []),
- ("{abc}", ["abc"]),
- ("a{abc}d{def}", ["abc", "def"]),
- ("a{0}b{1}", ["0", "1"]),
- ("{abc:.2f}", ["abc"]),
- ],
- )
- def test_returns_correct_result(self, s, expected_result):
- result = get_string_template_field_names(s)
- assert result == expected_result
-
-
-class TestResolveTemplate:
- @pytest.mark.unit
- @pytest.mark.parametrize(
- ["s", "options", "expected_result"],
- [
- ("{abc}d{def}", {"abc": "100", "def": "hello"}, "100dhello"),
- ("{hello}", {"world": "100", "hello": "world"}, "world"),
- ],
- )
- def test_returns_correct_result(self, s, options, expected_result):
- result = resolve_template(s, options)
- assert result == expected_result
-
- @pytest.mark.unit
- @pytest.mark.parametrize(["s"], [("abc{0}",), ("{1def}def",)])
- def test_raises_value_error_if_s_has_fields_which_are_not_valid_identifiers(self, s):
- with pytest.raises(ValueError):
- resolve_template(s, None)
-
- @pytest.mark.unit
- @pytest.mark.parametrize(
- ["s", "options"],
- [("{abc}", {}), ("{abc}", {"def": "something"}), ("{abc}{def}", {"def": "700"})],
- )
- def test_raises_value_error_if_template_field_cannot_be_resolved_to_options(self, s, options):
- with pytest.raises(ValueError):
- resolve_template(s, options)
-
-
-class TestAllowedOptions:
- @pytest.fixture(autouse=True)
- def _pass_fixtures(self, capsys):
- self.capsys = capsys # pylint: disable=attribute-defined-outside-init
-
- @pytest.mark.unit
- def test_args_of_returns_valid_set_of_allowed_kwargs_for_a_given_function(self):
- # Given
- def magic_function(arg_a: str, arg_b: int, arg_c: bool) -> bool:
- print(f"{arg_a}: {arg_b}")
- return arg_c
-
- func = magic_function
-
- # When
- options = args_of(func)
-
- # Then
- assert options == {"arg_a", "arg_b", "arg_c"}
-
- @pytest.mark.integration
- def test_allow_options_can_use_iterable_returned_from_args_of_to_filter_out_invalid_options(
- self,
- ):
- # Given
- def magic_function(arg_a: str, arg_b: int, arg_c: bool) -> bool:
- print(f"{arg_a}: {arg_b}")
- return arg_c
-
- func = magic_function
-
- @allow_options(args_of(func))
- def mock_method(**options: Any):
- return [*options]
-
- # When
- options = mock_method(arg_a="A", arg_b=1, arg_c=True, invalid_option="I SHOULDN'T BE HERE")
-
- # Then
- assert options == ["arg_a", "arg_b", "arg_c"]
-
- @pytest.mark.integration
- def test_allow_options_does_not_filter_out_valid_args_when_they_are_passed_as_args_and_not_as_kwargs(
- self,
- ):
- # Given
- def magic_function(arg_a: str, arg_b: int, arg_c: bool) -> bool:
- return [arg_a, arg_b, arg_c]
-
- func = magic_function
-
- @allow_options(args_of(func))
- def mock_method(schema: "str", **options: Any):
- print(schema)
- return magic_function(**options)
-
- # When
- # options = mock_method(schema="schema", **{"arg_a": "A", "arg_b": 1, "arg_c": True, "invalid_option": "I SHOULDN'T BE HERE"}) # THIS WOULD FAIL!
- options = mock_method(
- "schema",
- **{"arg_a": "A", "arg_b": 1, "arg_c": True, "invalid_option": "I SHOULDN'T BE HERE"},
- )
-
- # Then
- captured = self.capsys.readouterr()
- assert (captured.out == "schema\n") and (options == ["A", 1, True])
-
- @pytest.mark.integration # This is an integration test as it uses `allow_options()` after `args_of()`
- def test_when_reading_locally_or_from_s3_invalid_options_are_ignored(self, expected_s3_csv_df):
- # Given
- invalid_option = "INVALID_OPTION"
- s3_csv_local_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_S3_CSV")
-
- # When
- s3_csv_df = ReadS3CsvIO(source_config=s3_csv_local_config, foo=invalid_option).read()
-
- # Then
- assert expected_s3_csv_df.equals(s3_csv_df)
-
- @pytest.mark.integration
- def test_when_reading_locally_or_from_s3_valid_options_are_considered(self, expected_s3_csv_df):
- # Given
- # VALID OPTION: dtype=None
- s3_csv_local_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="LOCAL",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_S3_CSV")
-
- # When
- s3_csv_df = ReadS3CsvIO(source_config=s3_csv_local_config, dtype=None).read()
-
- # Then
- assert expected_s3_csv_df.equals(s3_csv_df)
diff --git a/tests/test_mixins/test_postgres_mixins.py b/tests/test_mixins/test_postgres_mixins.py
deleted file mode 100644
index 62447db..0000000
--- a/tests/test_mixins/test_postgres_mixins.py
+++ /dev/null
@@ -1,214 +0,0 @@
-# pylint: disable=no-member, missing-module-docstring, missing-class-docstring, missing-function-docstring, too-many-public-methods, too-few-public-methods, protected-access, C0103, C0302, R0801
-import os
-from unittest.mock import ANY, patch
-
-import pandas as pd
-import pytest
-from sqlalchemy.sql.base import ImmutableColumnCollection
-
-from dynamicio import WithPostgres
-from dynamicio.config import IOConfig
-from tests import constants
-from tests.mocking.io import (
- ReadPostgresIO,
- WriteExtendedPostgresIO,
- WritePostgresIO,
-)
-from tests.mocking.models import ERModel, PgModel
-
-
-class TestPostgresIO:
- @pytest.mark.unit
- def test_when_reading_from_postgres_with_env_as_cloud_get_table_columns_returns_valid_list_of_columns_for_a_model(self, expected_columns):
- # Given
- pg_cloud_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="CLOUD",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_POSTGRES")
-
- # When
- columns = ReadPostgresIO(source_config=pg_cloud_config)._get_table_columns(ERModel) # pylint: disable=protected-access
- # Then
- assert columns == expected_columns
-
- @pytest.mark.unit
- @patch.object(WithPostgres, "_read_from_postgres")
- def test_read_from_postgres_is_called_for_loading_a_table_with_columns_with_env_as_cloud_and_type_as_postgres(self, mock__read_from_postgres, test_df):
- # Given
- mock__read_from_postgres.return_value = test_df
- postgres_cloud_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="CLOUD",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_POSTGRES")
-
- # When
- ReadPostgresIO(source_config=postgres_cloud_config).read()
-
- # Then
- mock__read_from_postgres.assert_called()
-
- @pytest.mark.unit
- @patch.object(WithPostgres, "_write_to_postgres")
- def test_write_to_postgres_is_called_for_uploading_a_table_with_columns_with_env_as_cloud_and_type_as_postgres(self, mock__write_to_postgres, test_df):
- # Given
- df = test_df
- postgres_cloud_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")),
- env_identifier="CLOUD",
- dynamic_vars=constants,
- ).get(source_key="WRITE_TO_PG_PARQUET")
-
- # When
- WritePostgresIO(source_config=postgres_cloud_config).write(df)
-
- # Then
- mock__write_to_postgres.assert_called()
-
- @pytest.mark.unit
- @patch.object(WithPostgres, "_write_to_postgres")
- def test_write_to_postgres_is_called_with_truncate_and_append_option(self, mock__write_to_postgres, test_df):
- # Given
- df = test_df
- postgres_cloud_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")),
- env_identifier="CLOUD",
- dynamic_vars=constants,
- ).get(
- source_key="WRITE_TO_PG_PARQUET",
- )
-
- # When
- write_config = WritePostgresIO(source_config=postgres_cloud_config, truncate_and_append=True)
-
- write_config.write(df)
-
- # Then
- mock__write_to_postgres.assert_called_once()
- (called_with_df,) = mock__write_to_postgres.call_args[0]
- pd.testing.assert_frame_equal(test_df, called_with_df)
- assert "truncate_and_append" in write_config.options
-
- @pytest.mark.unit
- @patch.object(WithPostgres, "_read_from_postgres")
- def test_read_from_postgres_by_implicitly_generating_datamodel_from_schema(self, mock__read_from_postgres, test_df):
- # Given
- mock__read_from_postgres.return_value = test_df
- postgres_cloud_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="CLOUD",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_POSTGRES")
-
- # When / Then
- ReadPostgresIO(source_config=postgres_cloud_config).read()
- mock__read_from_postgres.assert_called()
-
- @pytest.mark.unit
- @patch.object(WithPostgres, "_read_database")
- def test_read_from_postgres_with_query(self, mock__read_database):
- # Given
- postgres_cloud_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="CLOUD",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_POSTGRES")
-
- # When
- ReadPostgresIO(source_config=postgres_cloud_config, sql_query="SELECT * FROM example").read()
-
- # Then
- mock__read_database.assert_called_with(ANY, "SELECT * FROM example")
-
- @pytest.mark.unit
- @patch.object(WithPostgres, "_read_database")
- def test_read_from_postgres_with_query_in_options(self, mock__read_database):
- # Given
- postgres_cloud_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="CLOUD",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_POSTGRES_WITH_QUERY_IN_OPTIONS")
-
- # When
- ReadPostgresIO(source_config=postgres_cloud_config).read()
-
- # Then
- mock__read_database.assert_called_with(ANY, "SELECT * FROM table_name_from_yaml_options")
-
- @pytest.mark.unit
- @patch.object(pd, "read_sql")
- def test_read_from_postgres_with_query_and_options(self, mock__read_sql):
- # Given
- postgres_cloud_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="CLOUD",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_POSTGRES")
-
- # When
- ReadPostgresIO(source_config=postgres_cloud_config, sql_query="SELECT * FROM example", parse_dates=["date"], wrong_arg="whatever").read()
-
- # Then
- mock__read_sql.assert_called_with(sql="SELECT * FROM example", con=ANY, parse_dates=["date"])
-
- @pytest.mark.unit
- def test_generate_model_from_schema_returns_model(self):
- # Given
- postgres_cloud_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="CLOUD",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_POSTGRES")
-
- # When
- schema = postgres_cloud_config.dynamicio_schema
- schema_name = postgres_cloud_config.dynamicio_schema.name
- model = ReadPostgresIO(source_config=postgres_cloud_config)._generate_model_from_schema(schema)
-
- # Then
- assert len(model.__table__.columns) == len(schema.columns) and model.__tablename__ == schema_name
-
- @pytest.mark.unit
- def test_get_table_columns_from_generated_model_returns_valid_list_of_columns(self):
- # Given
- pg_cloud_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="CLOUD",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_POSTGRES")
-
- # When
- schema = pg_cloud_config.dynamicio_schema
- model = ReadPostgresIO(source_config=pg_cloud_config)._generate_model_from_schema(schema) # pylint: disable=protected-access
- columns = ReadPostgresIO(source_config=pg_cloud_config)._get_table_columns(model) # pylint: disable=protected-access
-
- # Then
- assert isinstance(model.__table__.columns, ImmutableColumnCollection)
- for x, y in zip(columns, [PgModel.id, PgModel.foo, PgModel.bar, PgModel.baz]):
- assert str(x) == str(y)
-
- @pytest.mark.unit
- def test_to_check_if_dataframe_has_valid_data_types(self):
- # Given
- postgres_cloud_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")),
- env_identifier="CLOUD",
- dynamic_vars=constants,
- ).get(source_key="WRITE_TO_PG_PARQUET")
-
- df = pd.DataFrame.from_records(
- [
- ["cm_1", "id_1", 1000, "12/12/2000", True, 12.76],
- ["cm_2", "id_2", 1000, "01/02/1990", False, 199.76],
- ["cm_3", "id_3", 1000, "01/05/1990", False, 12.76],
- ],
- columns=["id", "foo", "bar", "start_date", "active", "net"],
- )
-
- # When
- is_valid = WriteExtendedPostgresIO(source_config=postgres_cloud_config, show_casting_warnings=True)._has_valid_dtypes(df)
-
- # Then
- assert is_valid is True
diff --git a/tests/test_mixins/test_s3_mixins.py b/tests/test_mixins/test_s3_mixins.py
deleted file mode 100644
index 4a98897..0000000
--- a/tests/test_mixins/test_s3_mixins.py
+++ /dev/null
@@ -1,831 +0,0 @@
-# pylint: disable=no-member, missing-module-docstring, missing-class-docstring, missing-function-docstring, too-many-public-methods, too-few-public-methods, protected-access, C0103, C0302, R0801
-import os
-import shutil
-from tempfile import NamedTemporaryFile
-from unittest import mock
-from unittest.mock import patch
-
-import pandas as pd
-import pydantic
-import pytest
-import yaml
-
-
-import dynamicio.mixins.with_local
-import dynamicio.mixins.with_s3
-
-from dynamicio.config import IOConfig
-from dynamicio.errors import ColumnsDataTypeError
-from tests import constants
-from tests.constants import TEST_RESOURCES
-from tests.mocking.io import (
- ReadS3CsvIO,
- ReadS3HdfIO,
- ReadS3JsonIO,
- ReadS3ParquetIO,
- ReadS3ParquetWEmptyFilesIO,
- ReadS3ParquetWithDifferentCastableDTypeIO,
- ReadS3ParquetWithDifferentNonCastableDTypeIO,
- ReadS3ParquetWithLessColumnsIO,
- TemplatedFile,
- WriteS3CsvIO,
- WriteS3HdfIO,
- WriteS3JsonIO,
- WriteS3ParquetIO,
-)
-
-
-class TestS3FileIO:
- @pytest.mark.unit
- def test_read_resolves_file_path_if_templated(self):
- # source data read from: "[[ TEST_RESOURCES ]]/data/input/some_csv_to_read.parquet"
- config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="CLOUD",
- dynamic_vars=constants,
- ).get(source_key="TEMPLATED_FILE_PATH")
-
- file_path = f"{constants.TEST_RESOURCES}/data/input/some_csv_to_read.csv"
-
- # When
- with patch.object(dynamicio.mixins.with_local.WithLocal, "_read_csv_file") as mock__read_csv_file, patch.object(
- dynamicio.mixins.with_s3.WithS3File, "_s3_named_file_reader"
- ) as mock_s3_reader:
- with open(file_path, "r") as file: # pylint: disable=unspecified-encoding
- mock_s3_reader.return_value = file
- io_obj = TemplatedFile(source_config=config, file_name_to_replace="some_csv_to_read")
- final_schema = io_obj.schema
- io_obj.read()
-
- mock__read_csv_file.assert_called_once_with(file_path, final_schema)
-
- @pytest.mark.unit
- def test_write_resolves_file_path_if_templated(self):
- # Given
- # source data read from: "[[ TEST_RESOURCES ]]/data/input/some_csv_to_read.parquet"
- config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="CLOUD",
- dynamic_vars=constants,
- ).get(source_key="TEMPLATED_FILE_PATH")
-
- # When
- with patch.object(dynamicio.mixins.with_local.WithLocal, "_write_csv_file") as mock__write_csv_file:
- df = pd.read_csv(os.path.join(TEST_RESOURCES, "data/input/some_csv_to_read.csv"))
- TemplatedFile(source_config=config, file_name_to_replace="some_csv_to_read").write(df)
-
- # Then
- args, _ = mock__write_csv_file.call_args
- assert "s3://mock-bucket/path/to/some_csv_to_read.csv" == args[1]
-
- @pytest.mark.unit
- @patch.object(dynamicio.mixins.with_s3.WithS3File, "_read_from_s3_file")
- def test_read_from_s3_file_is_called_for_loading_a_file_with_env_as_cloud_s3(self, mock__read_from_s3_file, expected_s3_csv_df):
- # Given
- mock__read_from_s3_file.return_value = expected_s3_csv_df
- s3_csv_cloud_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="CLOUD",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_S3_CSV")
-
- # When
- ReadS3CsvIO(source_config=s3_csv_cloud_config).read()
-
- # Then
- mock__read_from_s3_file.assert_called()
-
- @pytest.mark.unit
- def test_s3_reader_is_not_called_for_loading_a_parquet_with_env_as_cloud_s3_and_type_as_parquet_and_no_disk_space_flag(self):
- # Given
- s3_parquet_cloud_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="CLOUD",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_S3_PARQUET")
-
- file_path = f"{constants.TEST_RESOURCES}/data/input/some_csv_to_read.csv"
-
- # When
- with patch.object(dynamicio.mixins.with_s3.WithS3File, "_s3_reader") as mock_s3_reader, patch.object(
- dynamicio.mixins.with_s3.WithS3File, "_read_parquet_file"
- ) as mock_read_parquet_file:
- with open(file_path, "r") as file: # pylint: disable=unspecified-encoding
- mock_s3_reader.return_value = file
- ReadS3ParquetIO(source_config=s3_parquet_cloud_config, no_disk_space=True).read()
-
- # Then
- mock_s3_reader.assert_not_called()
- mock_read_parquet_file.assert_called()
-
- @pytest.mark.unit
- def test_s3_reader_is_called_for_loading_a_hdf_with_env_as_cloud_s3_and_type_as_hdf(self, expected_s3_hdf_file_path, expected_s3_hdf_df):
- # Given
- s3_hdf_cloud_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="CLOUD",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_S3_HDF")
-
- # When
- with patch.object(dynamicio.mixins.with_s3.WithS3File, "boto3_client") as mock__boto3_client:
-
- def mock_download_fobj(s3_bucket, s3_key, target_file):
- with open(expected_s3_hdf_file_path, "rb") as fin:
- shutil.copyfileobj(fin, target_file)
-
- mock__boto3_client.download_fileobj.side_effect = mock_download_fobj
- loaded_hdf_pd = ReadS3HdfIO(source_config=s3_hdf_cloud_config, no_disk_space=True).read()
-
- # Then
- pd.testing.assert_frame_equal(loaded_hdf_pd, expected_s3_hdf_df)
-
- @pytest.mark.unit
- def test_s3_reader_is_not_called_for_loading_a_json_with_env_as_cloud_s3_and_type_as_json_and_no_disk_space_flag(self):
- # Given
- s3_json_cloud_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="CLOUD",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_S3_JSON")
-
- # When
- with patch.object(dynamicio.mixins.with_s3.WithS3File, "_s3_reader") as mock__s3_reader, patch.object(
- dynamicio.mixins.with_s3.WithS3File, "_read_json_file"
- ) as mock__read_json_file:
- ReadS3JsonIO(source_config=s3_json_cloud_config, no_disk_space=True).read()
-
- # Then
- mock__s3_reader.assert_not_called()
- mock__read_json_file.assert_called()
-
- @pytest.mark.unit
- def test_s3_reader_is_not_called_for_loading_a_csv_with_env_as_cloud_s3_and_type_as_csv_and_no_disk_space_flag(self):
- # Given
- s3_csv_cloud_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="CLOUD",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_S3_CSV")
-
- # When
- with patch.object(dynamicio.mixins.with_s3.WithS3File, "_s3_reader") as mock__s3_reader, patch.object(
- dynamicio.mixins.with_s3.WithS3File, "_read_csv_file"
- ) as mock__read_csv_file:
- ReadS3CsvIO(source_config=s3_csv_cloud_config, no_disk_space=True).read()
-
- # Then
- mock__s3_reader.assert_not_called()
- mock__read_csv_file.assert_called()
-
- @pytest.mark.unit
- def test_ValueError_is_raised_if_file_path_missing_from_config(self, tmp_path):
- tmp_yaml = tmp_path / "test.yaml"
- with open(tmp_yaml, "w") as fout:
- yaml.safe_dump(
- {
- "READ_FROM_S3_MISSING_FILE_PATH": {
- "LOCAL": {
- "type": "local",
- "local": {
- "file_path": "[[ TEST_RESOURCES ]]/data/input/some_csv_to_read.csv",
- "file_type": "csv",
- },
- },
- "CLOUD": {
- "type": "s3_file",
- "s3": {"bucket": "[[ MOCK_BUCKET ]]", "file_type": "csv"},
- },
- "schema": {"file_path": "[[ TEST_RESOURCES ]]/schemas/read_from_s3_csv.yaml"},
- }
- },
- fout,
- )
-
- with pytest.raises(pydantic.ValidationError):
- IOConfig(
- path_to_source_yaml=str(tmp_yaml),
- env_identifier="CLOUD",
- dynamic_vars=constants,
- )
-
- @pytest.mark.unit
- def test_s3_writers_only_validate_schema_prior_writing_out_the_dataframe(self):
- # Given
- input_df = pd.DataFrame.from_dict({"col_1": [3, 2, 1], "col_2": ["a", "b", "c"], "col_3": ["a", "b", "c"]})
-
- s3_parquet_cloud_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")),
- env_identifier="CLOUD",
- dynamic_vars=constants,
- ).get(source_key="WRITE_TO_S3_PARQUET")
-
- # When
- # class WriteS3ParquetIO(DynamicDataIO):
- # schema = {"col_1": "int64", "col_2": "object"}
- #
- # @staticmethod
- # def validate(df: pd.DataFrame):
- # pass
- with patch.object(dynamicio.mixins.with_s3.WithS3File, "_s3_writer") as mock__s3_writer, patch.object(WriteS3ParquetIO, "_apply_schema") as mock__apply_schema, patch.object(
- WriteS3ParquetIO, "_write_parquet_file"
- ) as mock__write_parquet_file:
- with NamedTemporaryFile(delete=False) as temp_file:
- mock__s3_writer.return_value = temp_file
- WriteS3ParquetIO(source_config=s3_parquet_cloud_config).write(input_df)
-
- # Then
- mock__apply_schema.assert_called()
- mock__write_parquet_file.assert_called()
-
- @pytest.mark.unit
- def test_columns_data_type_error_exception_is_not_generated_if_column_dtypes_can_be_casted_to_the_expected_dtypes(self, expected_s3_parquet_df):
- # Given
- s3_parquet_cloud_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="CLOUD",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_S3_PARQUET")
-
- # When
- with patch.object(dynamicio.mixins.with_s3.WithS3File, "_read_parquet_file") as mock__read_parquet_file, patch.object(
- dynamicio.mixins.with_s3.WithS3File, "_s3_named_file_reader"
- ):
- mock__read_parquet_file.return_value = expected_s3_parquet_df
- ReadS3ParquetWithDifferentCastableDTypeIO(source_config=s3_parquet_cloud_config).read()
-
- assert True, "No exception was raised"
-
- @pytest.mark.unit
- @patch.object(dynamicio.mixins.with_s3.WithS3File, "_s3_named_file_reader")
- @patch.object(dynamicio.mixins.with_s3.WithS3File, "_read_parquet_file")
- def test_columns_data_type_error_exception_is_generated_if_column_dtypes_dont_map_to_the_expected_dtypes(self, mock__s3_reader, moc__read_parquet_file, expected_s3_parquet_df):
- """
- ------------------------------ Captured log call -------------------------------
-
- WARNING ...:dataio.py:273 Expected: 'float64' dtype for column: 'id', found: 'int64' instead.
- WARNING ...:dataio.py:273 Expected: 'int64' dtype for column: 'foo_name', found: 'object' instead.
- ERROR ...:dataio.py:277 Tried casting column: 'foo_name' to 'int64' from 'object', but failed.
-
- =========================== short test summary info ============================
-
- FAILED ...:test_columns_data_type_error_exception_is_generated_if_column_dtypes_dont_map_to_the_expected_dtypes
-
- ============================== 1 failed in 0.48s ===============================
-
- """
- # Given
- dataframe_returned = expected_s3_parquet_df
- mock__s3_reader.return_value = dataframe_returned
-
- s3_parquet_cloud_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="CLOUD",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_S3_PARQUET")
-
- # When/Then
- with pytest.raises(ColumnsDataTypeError):
- ReadS3ParquetWithDifferentNonCastableDTypeIO(source_config=s3_parquet_cloud_config).read()
- moc__read_parquet_file.assert_called()
-
- @pytest.mark.unit
- def test_read_parquet_file_is_called_while_s3_reader_is_not_for_loading_a_parquet_with_env_as_cloud_s3_and_type_as_parquet_with_no_disk_space_option(
- self,
- ):
- # Given
- s3_parquet_cloud_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="CLOUD",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_S3_PARQUET")
-
- # When
- with patch.object(dynamicio.mixins.with_s3.WithS3File, "_s3_reader") as mock__s3_reader, patch.object(
- dynamicio.mixins.with_local.WithLocal, "_read_parquet_file"
- ) as mock__read_parquet_file:
- ReadS3ParquetIO(source_config=s3_parquet_cloud_config, no_disk_space=True).read()
-
- # Then
- mock__s3_reader.assert_not_called()
- mock__read_parquet_file.assert_called()
-
- @pytest.mark.unit
- @patch.object(dynamicio.mixins.with_s3.WithS3File, "_write_to_s3_file")
- def test_s3_writer_is_called_for_writing_a_file_with_env_is_set_to_cloud_s3(self, mock__write_to_s3_file):
- # Given
- df = pd.DataFrame.from_dict({"id": [3, 2, 1, 0], "foo_name": ["a", "b", "c", "d"], "bar": [1, 2, 3, 4]})
-
- s3_json_local_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")),
- env_identifier="CLOUD",
- dynamic_vars=constants,
- ).get(source_key="WRITE_TO_S3_JSON")
-
- # When
- ReadS3HdfIO(source_config=s3_json_local_config).write(df)
-
- # Then
- mock__write_to_s3_file.assert_called()
-
- @pytest.mark.unit
- def test_write_parquet_file_is_called_for_writing_a_parquet_with_env_as_cloud_s3_and_type_as_s3(self):
- # Given
- df = pd.DataFrame.from_dict({"col_1": [3, 2, 1, 0], "col_2": ["a", "b", "c", "d"]})
-
- s3_parquet_local_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")),
- env_identifier="CLOUD",
- dynamic_vars=constants,
- ).get(source_key="WRITE_TO_S3_PARQUET")
-
- # When
- with patch.object(dynamicio.mixins.with_s3.WithS3File, "_s3_writer") as mock__s3_writer, patch.object(
- dynamicio.mixins.with_local.WithLocal, "_write_parquet_file"
- ) as mock__write_parquet_file:
- with NamedTemporaryFile(delete=False) as temp_file:
- mock__s3_writer.return_value = temp_file
- WriteS3ParquetIO(source_config=s3_parquet_local_config).write(df)
-
- # Then
- mock__write_parquet_file.assert_called()
-
- @pytest.mark.unit
- def test_write_csv_file_is_called_for_writing_a_parquet_with_env_as_cloud_s3_and_type_as_csv(self):
- # Given
- df = pd.DataFrame.from_dict({"id": [3, 2, 1, 0], "foo_name": ["a", "b", "c", "d"], "bar": [1, 2, 3, 4]})
-
- s3_csv_local_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")),
- env_identifier="CLOUD",
- dynamic_vars=constants,
- ).get(source_key="WRITE_TO_S3_CSV")
-
- # When
- with patch.object(dynamicio.mixins.with_s3.WithS3File, "_s3_writer") as mock__s3_writer, patch.object(
- dynamicio.mixins.with_local.WithLocal, "_write_csv_file"
- ) as mock__write_csv_file:
- with NamedTemporaryFile(delete=False) as temp_file:
- mock__s3_writer.return_value = temp_file
- WriteS3CsvIO(source_config=s3_csv_local_config).write(df)
-
- # Then
- mock__write_csv_file.assert_called()
-
- @pytest.mark.unit
- def test_write_json_file_is_called_for_writing_a_parquet_with_env_as_cloud_s3_and_type_as_json(self):
- # Given
- df = pd.DataFrame.from_dict({"col_1": [3, 2, 1, 0], "col_2": ["a", "b", "c", "d"]})
-
- s3_json_local_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")),
- env_identifier="CLOUD",
- dynamic_vars=constants,
- ).get(source_key="WRITE_TO_S3_JSON")
-
- # When
- with patch.object(dynamicio.mixins.with_s3.WithS3File, "_s3_writer") as mock__s3_writer, patch.object(
- dynamicio.mixins.with_local.WithLocal, "_write_json_file"
- ) as mock__write_json_file:
- with NamedTemporaryFile(delete=False) as temp_file:
- mock__s3_writer.return_value = temp_file
- WriteS3JsonIO(source_config=s3_json_local_config).write(df)
-
- # Then
- mock__write_json_file.assert_called()
-
- @pytest.mark.unit
- def test_write_hdf_file_is_called_for_writing_a_parquet_with_env_as_cloud_s3_and_type_as_hdf(self):
- # Given
- df = pd.DataFrame.from_dict({"col_1": [3, 2, 1, 0], "col_2": ["a", "b", "c", "d"]})
- s3_hdf_local_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")),
- env_identifier="CLOUD",
- dynamic_vars=constants,
- ).get(source_key="WRITE_TO_S3_HDF")
-
- # When
- with patch.object(dynamicio.mixins.with_s3.WithS3File, "_s3_writer") as mock__s3_writer:
- with NamedTemporaryFile(delete=False) as temp_file:
- mock__s3_writer.return_value = temp_file
- WriteS3HdfIO(source_config=s3_hdf_local_config).write(df)
-
- # Then
- assert os.stat(temp_file.name).st_size == 1064192, "Confirm that the output file size did not change"
-
-
-class TestS3PathPrefixIO:
- @pytest.mark.unit
- def test_error_is_raised_if_path_prefix_missing_from_config(self, tmp_path):
-
- tmp_yaml = tmp_path / "test.yaml"
- with open(tmp_yaml, "w") as fout:
- yaml.safe_dump(
- {
- "READ_FROM_S3_MISSING_PATH_PREFIX": {
- "LOCAL": {
- "type": "local",
- "local": {
- "file_path": "[[ TEST_RESOURCES ]]/data/input/some_csv_to_read.csv",
- "file_type": "csv",
- },
- },
- "CLOUD": {
- "type": "s3_path_prefix",
- "s3": {"bucket": "[[ MOCK_BUCKET ]]", "file_type": "csv"},
- },
- "schema": {"file_path": "[[ TEST_RESOURCES ]]/schemas/read_from_s3_csv.yaml"},
- }
- },
- fout,
- )
-
- with pytest.raises(pydantic.ValidationError):
- IOConfig(
- path_to_source_yaml=str(tmp_yaml),
- env_identifier="CLOUD",
- dynamic_vars=constants,
- )
-
- @pytest.mark.unit
- def test_ValueError_is_raised_if_partition_cols_missing_from_options_when_uploading(self):
- # Given
- input_df = pd.DataFrame.from_dict({"col_1": [3, 2, 1], "col_2": ["a", "b", "c"], "col_3": ["a", "b", "c"]})
- s3_parquet_cloud_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="CLOUD",
- dynamic_vars=constants,
- ).get(source_key="WRITE_TO_S3_PATH_PREFIX_PARQUET")
-
- # When / Then
- with pytest.raises(ValueError):
- WriteS3ParquetIO(source_config=s3_parquet_cloud_config).write(input_df)
-
- @pytest.mark.unit
- def test_error_is_raised_if_file_type_not_parquet_when_uploading(self, tmp_path):
-
- tmp_yaml = tmp_path / "test.yaml"
- with open(tmp_yaml, "w") as fout:
- yaml.safe_dump(
- {
- "WRITE_TO_S3_PATH_PREFIX_NOT_PARQUET": {
- "CLOUD": {
- "type": "s3_path_prefix",
- "s3": {
- "bucket": "[[ MOCK_BUCKET ]]",
- "path_prefix": "[[ MOCK_KEY ]]",
- "file_type": "not_parquet",
- },
- }
- }
- },
- fout,
- )
-
- with pytest.raises(pydantic.ValidationError):
- IOConfig(
- path_to_source_yaml=str(tmp_yaml),
- env_identifier="CLOUD",
- dynamic_vars=constants,
- )
-
- @pytest.mark.unit
- @patch.object(dynamicio.mixins.with_s3.WithS3PathPrefix, "_read_from_s3_path_prefix")
- def test_read_from_s3_path_prefix_is_called_for_loading_a_path_prefix_with_env_as_cloud_s3(self, mock__read_from_s3_path_prefix, expected_s3_csv_df):
- # Given
- mock__read_from_s3_path_prefix.return_value = expected_s3_csv_df
- s3_csv_cloud_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="CLOUD",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_S3_PATH_PREFIX_CSV")
-
- # When
- ReadS3CsvIO(source_config=s3_csv_cloud_config).read()
-
- # Then
- mock__read_from_s3_path_prefix.assert_called()
-
- @pytest.mark.unit
- @patch.object(dynamicio.mixins.with_s3.WithS3PathPrefix, "_write_to_s3_path_prefix")
- def test_write_to_s3_path_prefix_is_called_for_uploading_to_a_path_prefix_with_env_as_cloud_s3(self, mock__write_to_s3_path_prefix):
- # Given
- input_df = pd.DataFrame.from_dict({"col_1": [3, 2, 1], "col_2": ["a", "b", "c"], "col_3": ["a", "b", "c"]})
-
- s3_parquet_cloud_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="CLOUD",
- dynamic_vars=constants,
- ).get(source_key="WRITE_TO_S3_PATH_PREFIX_PARQUET")
-
- # When
- WriteS3ParquetIO(source_config=s3_parquet_cloud_config).write(input_df)
-
- # Then
- mock__write_to_s3_path_prefix.assert_called()
-
- @pytest.mark.unit
- @patch.object(WriteS3ParquetIO, "_write_parquet_file")
- # pylint: disable=unused-argument
- def test_awscli_runner_is_called_with_correct_s3_path_and_aws_command_when_uploading_a_path_prefix_with_env_as_cloud_s3(self, mock__write_parquet_file, mock_temporary_directory):
- # Given
- input_df = pd.DataFrame.from_dict({"col_1": [3, 2, 1], "col_2": ["a", "b", "c"], "col_3": ["a", "b", "c"]})
- s3_parquet_cloud_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="CLOUD",
- dynamic_vars=constants,
- ).get(source_key="WRITE_TO_S3_PATH_PREFIX_PARQUET")
-
- # When
- with patch.object(dynamicio.mixins.with_s3, "awscli_runner") as mocked__awscli_runner:
- WriteS3ParquetIO(source_config=s3_parquet_cloud_config, partition_cols="col_2").write(input_df)
-
- # Then
- mocked__awscli_runner.assert_called_with("s3", "sync", "temp", "s3://mock-bucket/mock-key", "--acl", "bucket-owner-full-control", "--only-show-errors", "--exact-timestamps")
-
- @pytest.mark.unit
- # pylint: disable=unused-argument
- def test_awscli_runner_is_called_with_correct_s3_path_and_aws_command_when_loading_a_path_prefix_with_env_as_cloud_s3(
- self, mock_listdir, mock_temporary_directory, mock__read_hdf_file
- ):
- # Given
- s3_hdf_cloud_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="CLOUD",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_S3_PATH_PREFIX_HDF")
-
- # When
- with patch.object(dynamicio.mixins.with_s3, "awscli_runner") as mocked__awscli_runner:
- ReadS3HdfIO(source_config=s3_hdf_cloud_config).read()
-
- # Then
- mocked__awscli_runner.assert_called_with("s3", "sync", "s3://mock-bucket/mock-key", "temp", "--acl", "bucket-owner-full-control", "--only-show-errors", "--exact-timestamps")
-
- @pytest.mark.unit
- # pylint: disable=unused-argument
- def test__read_hdf_file_is_called_with_correct_local_file_path_when_loading_a_path_prefix_with_env_as_cloud_s3_and_type_as_hdf(
- self, mock_listdir, mock_temporary_directory, mock__read_hdf_file
- ):
- # Given
- s3_hdf_cloud_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="CLOUD",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_S3_PATH_PREFIX_HDF")
-
- # When
- with patch.object(dynamicio.mixins.with_s3, "awscli_runner") as mocked__awscli_runner:
- mocked__awscli_runner.return_value = True
- read_obj = ReadS3HdfIO(source_config=s3_hdf_cloud_config)
- actual_schema = read_obj.schema
- read_obj.read()
-
- # Then
- assert len(mock__read_hdf_file.mock_calls) == 3
- mock__read_hdf_file.assert_has_calls(
- [
- mock.call("temp/obj_1.h5", actual_schema),
- mock.call("temp/obj_2.h5", actual_schema),
- mock.call("temp/obj_3.h5", actual_schema),
- ]
- )
-
- @pytest.mark.unit
- # pylint: disable=unused-argument
- def test__read_parquet_file_is_called_with_correct_local_file_path_when_loading_a_path_prefix_with_env_as_cloud_s3_and_type_as_parquet(
- self, mock_listdir, mock_temporary_directory, mock__read_parquet_file
- ):
- # Given
- s3_parquet_cloud_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="CLOUD",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_S3_PATH_PREFIX_PARQUET")
-
- # When
- with patch.object(dynamicio.mixins.with_s3, "awscli_runner") as mocked__awscli_runner:
- mocked__awscli_runner.return_value = True
- read_obj = ReadS3ParquetIO(source_config=s3_parquet_cloud_config)
- actual_schema = read_obj.schema
- read_obj.read()
-
- # Then
- assert len(mock__read_parquet_file.mock_calls) == 3
- mock__read_parquet_file.assert_has_calls(
- [
- mock.call("temp/obj_1.h5", actual_schema),
- mock.call("temp/obj_2.h5", actual_schema),
- mock.call("temp/obj_3.h5", actual_schema),
- ]
- )
-
- @pytest.mark.unit
- def test_read_parquet_file_is_called_while_awscli_runner_is_not_for_loading_a_parquet_with_env_as_cloud_s3_and_type_as_parquet_with_no_disk_space_option(
- self,
- ):
- # Given
- s3_parquet_cloud_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="CLOUD",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_S3_PATH_PREFIX_PARQUET")
-
- # When
- with patch.object(dynamicio.mixins.with_s3, "awscli_runner") as mock__awscli_runner, patch.object(
- dynamicio.mixins.with_local.WithLocal, "_read_parquet_file"
- ) as mock__read_parquet_file:
- ReadS3ParquetIO(source_config=s3_parquet_cloud_config, no_disk_space=True).read()
-
- # Then
- mock__read_parquet_file.assert_called()
- mock__awscli_runner.assert_not_called()
-
- @pytest.mark.unit
- # pylint: disable=unused-argument
- def test__read_parquet_file_can_read_directory_of_parquet_files_loading_only_necessary_columns(self, mock_parquet_temporary_directory):
- # Given
- s3_parquet_cloud_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="CLOUD",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_S3_PATH_PREFIX_PARQUET")
-
- # When
- with patch.object(dynamicio.mixins.with_s3, "awscli_runner") as mocked__awscli_runner:
- mocked__awscli_runner.return_value = True
- df = ReadS3ParquetWithLessColumnsIO(source_config=s3_parquet_cloud_config).read()
-
- # Then
- assert df.shape == (15, 2) and df.columns.tolist() == ["id", "foo_name"]
-
- @pytest.mark.unit
- # pylint: disable=unused-argument
- def test__read_parquet_file_can_filter_out_rows_using_appropriate_options(self, mock_parquet_temporary_directory):
- # Given
- s3_parquet_cloud_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="CLOUD",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_S3_PATH_PREFIX_PARQUET")
-
- # When
- with patch.object(dynamicio.mixins.with_s3, "awscli_runner") as mocked__awscli_runner:
- mocked__awscli_runner.return_value = True
- df = ReadS3ParquetIO(source_config=s3_parquet_cloud_config, filters=[[("foo_name", "==", "name_a")]]).read()
-
- # Then
- assert df.shape == (8, 3) and df.columns.tolist() == ["id", "foo_name", "bar"] and df.foo_name.unique() == ["name_a"]
-
- @pytest.mark.unit
- # pylint: disable=unused-argument
- def test__read_csv_file_is_called_with_correct_local_file_path_when_loading_a_path_prefix_with_env_as_cloud_s3_and_type_as_csv(
- self,
- mock_listdir,
- mock_temporary_directory,
- mock__read_csv_file,
- ):
- # Given
- s3_csv_cloud_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="CLOUD",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_S3_PATH_PREFIX_CSV")
-
- # When
- with patch.object(dynamicio.mixins.with_s3, "awscli_runner") as mocked__awscli_runner:
- mocked__awscli_runner.return_value = True
- read_obj = ReadS3ParquetIO(source_config=s3_csv_cloud_config)
- actual_schema = read_obj.schema
- read_obj.read()
-
- # Then
- assert len(mock__read_csv_file.mock_calls) == 3
- mock__read_csv_file.assert_has_calls(
- [
- mock.call("temp/obj_1.h5", actual_schema),
- mock.call("temp/obj_2.h5", actual_schema),
- mock.call("temp/obj_3.h5", actual_schema),
- ]
- )
-
- @pytest.mark.unit
- # pylint: disable=unused-argument
- def test__read_json_file_is_called_with_correct_local_file_path_when_loading_a_path_prefix_with_env_as_cloud_s3_and_type_as_json(
- self,
- mock_listdir,
- mock_temporary_directory,
- mock__read_json_file,
- ):
- # Given
- s3_csv_cloud_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="CLOUD",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_S3_PATH_PREFIX_JSON")
-
- # When
- with patch.object(dynamicio.mixins.with_s3, "awscli_runner") as mocked__awscli_runner:
- mocked__awscli_runner.return_value = True
- read_obj = ReadS3ParquetIO(source_config=s3_csv_cloud_config)
- actual_schema = read_obj.schema
- read_obj.read()
-
- # Then
- assert len(mock__read_json_file.mock_calls) == 3
- mock__read_json_file.assert_has_calls(
- [
- mock.call("temp/obj_1.h5", actual_schema),
- mock.call("temp/obj_2.h5", actual_schema),
- mock.call("temp/obj_3.h5", actual_schema),
- ]
- )
-
- @pytest.mark.unit
- # pylint: disable=unused-argument
- def test_a_concatenated_hdf_file_is_returned_with_schema_columns_when_loading_a_path_prefix_with_env_as_cloud_s3_and_type_as_hdf(
- self,
- mock_listdir,
- mock_temporary_directory,
- mock__read_hdf_file,
- ):
- # Given
- s3_hdf_cloud_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="CLOUD",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_S3_PATH_PREFIX_HDF")
-
- # When
- with patch.object(dynamicio.mixins.with_s3, "awscli_runner") as mocked__awscli_runner:
- mocked__awscli_runner.return_Value = True
- h5_df = ReadS3HdfIO(source_config=s3_hdf_cloud_config).read()
-
- # Then
- pd.testing.assert_frame_equal(
- h5_df,
- pd.DataFrame(
- {
- "id": [1, 2, 3],
- "foo_name": ["class_a", "class_a", "class_a"],
- "bar": [1001, 1001, 1001],
- }
- ),
- )
-
- @pytest.mark.unit
- # pylint: disable=unused-argument
- def test_a_ValueError_is_raised_if_file_type_is_not_supported_when_loading_a_path_prefix_with_env_as_cloud_s3(
- self,
- tmp_path,
- mock_listdir,
- mock_temporary_directory,
- mock__read_hdf_file,
- ):
- # Given
- test_yaml_file = tmp_path / "mytest.yml"
- with open(test_yaml_file, "w") as fout:
- yaml.dump(
- {
- "READ_FROM_S3_PATH_PREFIX_TXT": {
- "CLOUD": {
- "type": "s3_path_prefix",
- "s3": {
- "bucket": "test-bucket",
- "path_prefix": "[[ MOCK_KEY ]]",
- "file_type": "txt",
- },
- }
- }
- },
- fout,
- )
-
- # When & Then
- with pytest.raises(pydantic.ValidationError):
- IOConfig(
- path_to_source_yaml=test_yaml_file,
- env_identifier="CLOUD",
- dynamic_vars=constants,
- )
-
- @pytest.mark.unit
- # pylint: disable=unused-argument
- def test__read_parquet_file_can_read_directory_of_parquet_files_containing_empty_files(
- self,
- mock_parquet_temporary_directory_w_empty_files,
- ):
- # Given
- s3_parquet_cloud_config = IOConfig(
- path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")),
- env_identifier="CLOUD",
- dynamic_vars=constants,
- ).get(source_key="READ_FROM_S3_PATH_PREFIX_PARQUET")
-
- # When
- with patch.object(dynamicio.mixins.with_s3, "awscli_runner") as mocked__awscli_runner:
- mocked__awscli_runner.return_value = True
- df = ReadS3ParquetWEmptyFilesIO(source_config=s3_parquet_cloud_config).read()
-
- # Then
- assert df.shape == (10, 2) and df.columns.tolist() == ["id", "bar"]
diff --git a/tests/test_regressions/conftest.py b/tests/test_regressions/conftest.py
deleted file mode 100644
index 8c192b4..0000000
--- a/tests/test_regressions/conftest.py
+++ /dev/null
@@ -1,26 +0,0 @@
-import imp
-import pathlib
-
-import pytest
-
-
-@pytest.fixture
-def regressions_resources_dir() -> pathlib.Path:
- return (pathlib.Path(__file__).parent / "resources").resolve()
-
-
-@pytest.fixture
-def tests_resources_dir(regressions_resources_dir):
- return regressions_resources_dir.parent.parent / "resources"
-
-
-@pytest.fixture
-def regressions_constants_module(regressions_resources_dir, tests_resources_dir):
- mod = imp.new_module("regressions_constants_module")
- mod.__dict__.update(
- {
- "REGRESSIONS_RESOURCES_DIR": str(regressions_resources_dir),
- "TEST_RESOURCES_DIR": str(tests_resources_dir),
- }
- )
- return mod
diff --git a/tests/test_regressions/resources/missing_v430_validations.yaml b/tests/test_regressions/resources/missing_v430_validations.yaml
deleted file mode 100644
index 0669e30..0000000
--- a/tests/test_regressions/resources/missing_v430_validations.yaml
+++ /dev/null
@@ -1,13 +0,0 @@
-PRODUCTS:
- LOCAL:
- type: "local"
- local:
- file_path: "[[ TEST_RESOURCES_DIR ]]/data/input/some_csv_to_read.csv"
- file_type: "csv"
- schema:
- name: products
- columns:
- id:
- type: "object"
- validations: {}
- metrics: []
\ No newline at end of file
diff --git a/tests/test_regressions/test_v430.py b/tests/test_regressions/test_v430.py
deleted file mode 100644
index e0d1d11..0000000
--- a/tests/test_regressions/test_v430.py
+++ /dev/null
@@ -1,26 +0,0 @@
-"""Test regressions discovered in v4.3.0 release"""
-
-from dynamicio import UnifiedIO
-from dynamicio.config import IOConfig
-from dynamicio.core import SCHEMA_FROM_FILE
-
-
-class IO(UnifiedIO):
- schema = SCHEMA_FROM_FILE
-
-
-def test_missing_validations_and_metrics(regressions_resources_dir, regressions_constants_module):
- """Dynamicio was refusing to work with schemas that did not have any validations specified."""
- # Given
- input_config = IOConfig(
- path_to_source_yaml=regressions_resources_dir / "missing_v430_validations.yaml",
- env_identifier="LOCAL",
- dynamic_vars=regressions_constants_module,
- )
- io_instance = IO(source_config=input_config.get(source_key="PRODUCTS"), apply_schema_validations=True, log_schema_metrics=True)
-
- # When
- data = io_instance.read()
-
- # Then
- assert data.to_dict() == {"id": {0: 1, 1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 12, 12: 13, 13: 14, 14: 15}}
diff --git a/tests/test_resource_inject.py b/tests/test_resource_inject.py
new file mode 100644
index 0000000..c4052c5
--- /dev/null
+++ b/tests/test_resource_inject.py
@@ -0,0 +1,21 @@
+from pathlib import Path
+
+import pandas as pd
+
+from dynamicio import LocalFileResource
+
+
+def test_file_resource_inject_success(injectable_string, passing_injections, test_df, tmpdir, file_name):
+    """The resource returned by ``inject`` can write a frame and read it back unchanged.
+
+    NOTE(review): ``injectable_string`` presumably holds the placeholders that
+    ``passing_injections`` fills in -- confirm against the fixtures in conftest.
+    """
+    templated_resource = LocalFileResource(
+        path=Path(tmpdir / injectable_string) / file_name,
+    )
+    # ``inject`` returns the resource to use from here on; that result is what gets exercised.
+    file_resource = templated_resource.inject(**passing_injections)
+    file_resource.write(test_df)
+    # Compare the read-back frame with the input; otherwise the test only proves "no exception".
+    pd.testing.assert_frame_equal(file_resource.read(), test_df)
diff --git a/tests/test_resource_test_path_inject.py b/tests/test_resource_test_path_inject.py
new file mode 100644
index 0000000..9c52376
--- /dev/null
+++ b/tests/test_resource_test_path_inject.py
@@ -0,0 +1,22 @@
+"""Checks that ``inject`` fills the placeholder in a resource's ``test_path``."""
+import pytest
+
+from dynamicio import LocalFileResource, S3Resource
+
+
+@pytest.fixture(params=[LocalFileResource, S3Resource])
+def resource_instance(request, file_name):
+    """Build the parametrised resource class with a ``test_path`` of ``"{var1}"``.
+
+    NOTE(review): ``file_name`` is requested but never used in the body -- confirm it is needed.
+    """
+    resource_cls = request.param
+    # Every class receives the same keyword arguments, ``bucket`` included.
+    return resource_cls(bucket="bucket", path="some_file.extension", test_path="{var1}")
+
+
+def test_resource_test_path_inject(resource_instance):
+    """``test_path`` keeps its placeholder until ``inject`` is given ``var1``."""
+    assert str(resource_instance.test_path) == "{var1}"
+    injected = resource_instance.inject(var1="aoeu")
+    assert str(injected.test_path) == "aoeu"
diff --git a/tests/test_serde.py b/tests/test_serde.py
new file mode 100644
index 0000000..48c6de8
--- /dev/null
+++ b/tests/test_serde.py
@@ -0,0 +1,40 @@
+"""Round-trip and validation-callback tests for the file serdes."""
+from unittest.mock import MagicMock
+
+import pandas as pd
+import pytest
+
+from dynamicio.io.serde import CsvSerde, HdfSerde, JsonSerde, ParquetSerde
+
+
+@pytest.fixture(params=[CsvSerde, JsonSerde, ParquetSerde, HdfSerde])
+def serde_class(request):
+    """Return the serde class for the current parametrisation."""
+    return request.param
+
+
+@pytest.fixture
+def serde_instance(serde_class):
+    """A serde built with no constructor arguments."""
+    return serde_class()
+
+
+def test_serde_read_write(serde_instance, test_df, tmp_path):
+    """Writing a frame and reading it back yields an equal frame."""
+    target = tmp_path / "file"
+    serde_instance.write_to_file(target, test_df)
+    pd.testing.assert_frame_equal(serde_instance.read_from_file(target), test_df)
+
+
+def test_serde_validation_callback_called(serde_class, tmp_path, test_df):
+    """The validation callback is not called by a write and is called once by a read."""
+    target = tmp_path / "file"
+    # The callback hands back ``test_df`` whenever it is invoked.
+    callback = MagicMock(return_value=test_df)
+    serde = serde_class(validations=[callback])
+
+    serde.write_to_file(target, test_df)
+    callback.assert_not_called()
+
+    serde.read_from_file(target)
+    callback.assert_called_once()
diff --git a/tests/test_uhura.py b/tests/test_uhura.py
new file mode 100644
index 0000000..872e964
--- /dev/null
+++ b/tests/test_uhura.py
@@ -0,0 +1,101 @@
+"""Tests running dynamicio resources under uhura's ``fixture_builder_mode`` and ``task_test_mode``."""
+from pathlib import Path
+
+import pandas as pd
+import pytest
+from uhura.modes import fixture_builder_mode, task_test_mode
+
+from dynamicio import KafkaResource, LocalFileResource, PostgresResource, S3Resource
+
+
+@pytest.fixture
+def resources(file_name, tmpdir):
+    """Return a ``(LocalFileResource, S3Resource)`` pair built from ``file_name``."""
+    local = LocalFileResource(path=tmpdir / "actual" / file_name)
+    remote = S3Resource(bucket="bucket", path=file_name)
+    return local, remote
+
+
+@pytest.fixture
+def file_resource(resources):
+    """The first (local file) member of ``resources``."""
+    local, _ = resources
+    return local
+
+
+@pytest.fixture
+def s3_resource(resources):
+    """The second (S3) member of ``resources``."""
+    _, remote = resources
+    return remote
+
+
+def test_uhura_file(test_df, tmpdir, file_name):
+    """A local file resource round-trips directly, then is exercised under both uhura modes."""
+    input_path = tmpdir / "uhura" / "input"
+    known_good_path = tmpdir / "uhura" / "output"
+    file_resource = LocalFileResource(path=tmpdir / "actual" / file_name)
+
+    # Outside any uhura mode: a plain write followed by a read.
+    file_resource.write(test_df)
+    pd.testing.assert_frame_equal(file_resource.read(), test_df)
+
+    # NOTE(review): presumably this run stores the fixtures that the replay below relies on -- confirm.
+    with fixture_builder_mode(input_path=input_path, known_good_path=known_good_path):
+        file_resource.read()
+        file_resource.write(test_df)
+
+    with task_test_mode(input_path=input_path, known_good_path=known_good_path):
+        df = file_resource.read()
+        file_resource.write(df)
+        # A frame missing column "a" must be rejected.
+        with pytest.raises(AssertionError):
+            file_resource.write(df.drop("a", axis=1))
+
+
+@pytest.fixture
+def s3_fixtures(file_name, tmpdir, test_df):
+    """Write ``test_df`` to ``s3/bucket/<file_name>`` under both the uhura input and output directories."""
+    # One resource is created at the "actual" location and re-pointed before each write.
+    writer = LocalFileResource(path=tmpdir / "actual" / file_name)
+    for stage in ("input", "output"):
+        writer.path = Path(tmpdir / "uhura" / stage / "s3" / "bucket" / file_name)
+        writer.write(test_df)
+
+
+def test_uhura_s3(test_df, tmpdir, file_name, s3_fixtures):
+    """With ``s3_fixtures`` in place, an S3 resource reads and writes inside ``task_test_mode``."""
+    s3_resource = S3Resource(bucket="bucket", path=file_name)
+    uhura_root = tmpdir / "uhura"
+    with task_test_mode(input_path=uhura_root / "input", known_good_path=uhura_root / "output"):
+        df = s3_resource.read()
+        s3_resource.write(df)
+
+        # Check that, in test mode, the dfs are being compared and if not the same -> fail.
+        with pytest.raises(AssertionError):
+            s3_resource.write(df.drop("a", axis=1))
+
+
+def test_postgres_uhura(tmpdir, test_df):
+    """Parquet files are written under ``postgres/`` before a Postgres resource runs in ``task_test_mode``."""
+    postgres_resource = PostgresResource(db_user="asdf", db_host="asdf", db_name="asdf", table_name="tabular_table")
+    uhura_root = tmpdir / "uhura"
+    for stage in ("input", "output"):
+        LocalFileResource(path=uhura_root / stage / "postgres" / "public.tabular_table.parquet").write(test_df)
+
+    with task_test_mode(input_path=uhura_root / "input", known_good_path=uhura_root / "output"):
+        postgres_resource.read()
+        postgres_resource.write(test_df)
+        with pytest.raises(AssertionError):
+            postgres_resource.write(test_df.drop("a", axis=1))
+
+
+def test_kafka_uhura(tmpdir, test_df):
+    """A ``kafka/tropico.json`` file is written under the output directory before a Kafka resource writes in ``task_test_mode``."""
+    kafka_resource = KafkaResource(topic="tropico", server="asdf")
+    uhura_root = tmpdir / "uhura"
+    LocalFileResource(path=uhura_root / "output" / "kafka" / "tropico.json").write(test_df)
+    with task_test_mode(input_path=uhura_root / "input", known_good_path=uhura_root / "output"):
+        kafka_resource.write(test_df)
+        with pytest.raises(AssertionError):
+            kafka_resource.write(test_df.drop("a", axis=1))
diff --git a/tests/test_validations.py b/tests/test_validations.py
index a7137b1..e05907e 100644
--- a/tests/test_validations.py
+++ b/tests/test_validations.py
@@ -1,450 +1,52 @@
-# pylint: disable=missing-module-docstring, missing-class-docstring, missing-function-docstring, too-many-public-methods
+import pandas as pd
import pytest
+from pandera import SchemaModel
+from pandera.errors import SchemaError
+from pandera.typing import Series
-from dynamicio.validations import (
- has_acceptable_percentage_of_nulls,
- has_no_null_values,
- has_unique_values,
- is_between,
- is_greater_than,
- is_greater_than_or_equal,
- is_in,
- is_lower_than,
- is_lower_than_or_equal,
-)
+from dynamicio import LocalFileResource
+import tests.constants as constants
+from tests.fixtures.schemas import SampleSchema
+file_path = constants.TEST_FIXTURES / "sample.parquet"
-class TestHasUniqueValues:
- @pytest.mark.unit
- def test_returns_true_if_column_has_no_duplicate_values(self, input_df):
- # Given
- df = input_df
- # When
- validation = has_unique_values("TEST", df, column="id")
+def test_parquet_resource_read_with_schema():
+ resource = LocalFileResource(path=file_path, pa_schema=SampleSchema)
+ df = resource.read()
- # Then
- assert validation.valid is True and validation.value == 0 and validation.message == "TEST[id] has unique values"
+ target_df = pd.read_parquet(file_path)
+ pd.testing.assert_frame_equal(df, target_df)
- @pytest.mark.unit
- def test_returns_false_if_column_has_duplicate_values(self, input_df):
- # Given
- df = input_df
- # When
- validation = has_unique_values("TEST", df, column="activity")
+def test_parquet_resource_write_with_schema(tmpdir):
+ output_path = tmpdir / "test_parquet_resource_write.parquet"
+ in_memory_df = pd.read_parquet(file_path)
- # Then
- assert not validation.valid and validation.value == 3 and validation.message == "Values ['discharge', 'pass_through', 'load'] for TEST[activity] are duplicated!"
+ resource = LocalFileResource(path=output_path, pa_schema=SampleSchema)
+ resource.write(in_memory_df)
+ target_df = pd.read_parquet(output_path)
+ pd.testing.assert_frame_equal(in_memory_df, target_df)
-class TestHasNoNullValues:
- @pytest.mark.unit
- def test_returns_true_if_column_in_df_has_no_nulls(self, input_df):
- # Given
- df = input_df
- # When
- validation = has_no_null_values("TEST", df, column="activity")
+def test_parquet_resource_read_with_schema_fails_validation():
+ class FailingSchema(SchemaModel):
+ z: Series[int]
- # Then
- assert validation.valid is True and validation.value == 0 and validation.message == "TEST[activity] has 0 nulls"
+ resource = LocalFileResource(path=file_path, pa_schema=FailingSchema)
+ with pytest.raises(SchemaError):
+ resource.read()
- @pytest.mark.unit
- def test_returns_false_if_column_in_df_has_none_values(self, input_df):
- # Given
- df = input_df
- # When
- validation = has_no_null_values("TEST", df, column="duration_a")
+def test_parquet_resource_read_with_schema_pandera_config_is_applied():
+ class FailingSchema(SchemaModel):
+ a: Series[int]
+ b: Series[str]
- # Then
- assert not validation.valid and validation.value == 1 and validation.message == "TEST[duration_a] has 1 nulls"
+ class Config:
+ strict = True
- @pytest.mark.unit
- def test_returns_false_if_column_in_df_has_nat_values(self, input_df):
- # Given
- df = input_df
-
- # When
- validation = has_no_null_values("TEST", df, column="start_time")
-
- # Then
- assert not validation.valid and validation.value == 1 and validation.message == "TEST[start_time] has 1 nulls"
-
-
-class TestHasAcceptablePercentageOfNulls:
- @pytest.mark.unit
- def test_throws_exception_if_threshold_is_greater_than_1(self, input_df):
- # Given
- df = input_df
-
- # When/Then
- with pytest.raises(ValueError):
- has_acceptable_percentage_of_nulls("TEST", df, column="duration_a", threshold=1.2)
-
- @pytest.mark.unit
- def test_throws_exception_if_threshold_is_lower_than_0(self, input_df):
- # Given
- df = input_df
-
- # When/Then
- with pytest.raises(ValueError):
- has_acceptable_percentage_of_nulls("TEST", df, column="duration_a", threshold=-0.1)
-
- @pytest.mark.unit
- def test_returns_true_if_percentage_threshold_is_not_exceeded(self, input_df):
- # Given
- df = input_df
-
- # When
- validation = has_acceptable_percentage_of_nulls("TEST", df, column="duration_a", threshold=0.11)
-
- # Then
- assert validation.valid is True and validation.value == 0.1 and validation.message == "Percentage of nulls of for TEST[duration_a] is 0.1"
-
- @pytest.mark.unit
- def test_returns_true_if_inpu_df_is_empty(self, empty_df):
- # Given
- df = empty_df
-
- # When
- validation = has_acceptable_percentage_of_nulls("TEST", df, column="duration_a", threshold=0.11)
-
- # Then
- assert validation.valid is True and validation.value == 0 and validation.message == "Percentage of nulls of for TEST[duration_a] is 0"
-
- @pytest.mark.unit
- def test_returns_true_if_threshold_is_not_exceeded_for_any_null_type_value(self, input_df):
- # Given
- df = input_df
-
- # When
- validation = has_acceptable_percentage_of_nulls("TEST", df, column="duration_b", threshold=0.2)
-
- # Then
- assert not validation.valid and validation.value == 0.3 and validation.message == "Percentage of nulls of for TEST[duration_b] is 0.3 which exceeds threshold: 0.2"
-
- @pytest.mark.unit
- def test_returns_false_if_threshold_is_exceeded(self, input_df):
- # Given
- df = input_df
-
- # When
- validation = has_acceptable_percentage_of_nulls("TEST", df, column="duration_a", threshold=0.09)
-
- # Then
- assert not validation.valid and validation.value == 0.1 and validation.message == "Percentage of nulls of for TEST[duration_a] is 0.1 which exceeds threshold: 0.09"
-
-
-class TestHasAcceptableCategoricalValues:
- @pytest.mark.unit
- def test_returns_true_if_columns_unique_values_are_a_subset_of_input_set(self, input_df):
- # Given
- df = input_df
-
- # When
- validation = is_in(
- "TEST",
- df,
- column="activity",
- categorical_values={"load", "discharge", "pass_through", "one_more"},
- )
-
- # Then
- assert validation.valid is True and validation.value == 0 and validation.message == "Categorical values for TEST[activity] are acceptable"
-
- @pytest.mark.unit
- def test_returns_true_only_if_columns_unique_vals_are_an_exact_match_of_the_input_set_when_match_all_is_set_to_false(self, input_df):
- # Given
- df = input_df
-
- # When
- validation = is_in("TEST", df, column="activity", categorical_values={"load", "discharge", "pass_through"}, match_all=False)
-
- # Then
- assert validation.valid is True and validation.value == 0 and validation.message == "All acceptable categorical values for TEST[activity] are present"
-
- @pytest.mark.unit
- def test_returns_false_if_columns_unique_vals_are_less_than_the_acceptable_categoricals_when_match_all_is_set_to_false(self, input_df):
- # Given
- df = input_df
-
- # When
- validation = is_in("TEST", df, column="activity", categorical_values={"load", "discharge", "pass_through", "one_more"}, match_all=False)
-
- # Then
- assert validation.valid is False and validation.value == 1 and validation.message == "Missing categorical values for TEST[activity]: {'one_more'}"
-
- @pytest.mark.unit
- def test_returns_false_if_columns_unique_vals_are_more_than_the_acceptable_categoricals_when_match_all_is_set_to_false(self, input_df):
- # Given
- df = input_df
-
- # When
- validation = is_in("TEST", df, column="activity", categorical_values={"load", "discharge"}, match_all=False)
-
- # Then
- assert validation.valid is False and validation.value == 3 and validation.message == "Values {'pass_through'} for TEST[activity] are not acceptable for 3 cells"
-
- @pytest.mark.unit
- def test_returns_true_if_columns_unique_vals_are_an_exact_match_of_the_input_set(self, input_df):
- # Given
- df = input_df
-
- # When/Then
- validation = is_in("TEST", df, column="activity", categorical_values={"load", "discharge", "pass_through"})
-
- # Then
- assert validation.valid is True and validation.value == 0 and validation.message == "Categorical values for TEST[activity] are acceptable"
-
- @pytest.mark.unit
- def test_returns_false_if_columns_unique_values_are_not_a_subset_of_input_set(self, input_df):
- # Given
- df = input_df
-
- # When/Then
- validation = is_in("TEST", df, column="activity", categorical_values={"load", "pass_through"})
-
- # Then
- assert not validation.valid and validation.value == 5 and validation.message == "Values {'discharge'} for TEST[activity] are not acceptable for 5 cells"
-
- @pytest.mark.unit
- def test_returns_true_if_nulls_are_an_allowed_categorical_value(self, input_df):
- # Given
- df = input_df
-
- # When
- validation = is_in("TEST", df, column="category_a", categorical_values={"A", "B", "C", None})
-
- # Then
- assert validation.valid is True and validation.value == 0 and validation.message == "Categorical values for TEST[category_a] are acceptable"
-
- @pytest.mark.unit
- def test_ignores_the_existence_of_null_values(self, input_df):
- # Given
- df = input_df
-
- # When
- validation = is_in("TEST", df, column="category_a", categorical_values={"A", "B", "C"})
-
- # Then
- assert validation.valid is True and validation.value == 0 and validation.message == "Categorical values for TEST[category_a] are acceptable"
-
- @pytest.mark.unit
- def test_treats_nan_and_na_values_as_nulls_and_returns_true_if_null_is_acceptable(self, input_df):
- # Given
- df = input_df # where category_b has None, pd.NA and np.nan values
-
- # When
- validation = is_in("TEST", df, column="category_b", categorical_values={"A", "B", "C", None})
-
- # Then
- assert validation.valid is True and validation.value == 0 and validation.message == "Categorical values for TEST[category_b] are acceptable"
-
- @pytest.mark.unit
- def test_ignores_nan_and_na_values_as_it_does_with_nulls(self, input_df):
- # Given
- df = input_df # where category_b has None, pd.NA and np.nan values
-
- # When
- validation = is_in("TEST", df, column="category_b", categorical_values={"A", "B", "C"})
-
- # Then
- assert validation.valid is True and validation.value == 0 and validation.message == "Categorical values for TEST[category_b] are acceptable"
-
-
-class TestIsGreaterThan:
- @pytest.mark.unit
- def test_returns_true_if_all_column_values_are_above_threshold(self, input_df):
- # Given
- df = input_df
-
- # When
- validation = is_greater_than("TEST", df, column="weight_a", threshold=4)
-
- # Then
- assert validation.valid is True and validation.value == 0 and validation.message == "All values of TEST[weight_a] are above 4"
-
- @pytest.mark.unit
- def test_returns_false_if_any_column_values_are_below_threshold(self, input_df):
- # Given
- df = input_df
-
- # When
- validation = is_greater_than("TEST", df, column="weight_a", threshold=6)
-
- # Then
- assert not validation.valid and validation.value == 0.5 and validation.message == "5 cell values for TEST[weight_a] are below 6"
-
- @pytest.mark.unit
- def test_returns_false_if_any_column_values_are_below_or_equal_to_threshold(self, input_df):
- # Given
- df = input_df
-
- # When/Then
- validation = is_greater_than("TEST", df, column="weight_a", threshold=5)
-
- # Then
- assert not validation.valid and validation.value == 0.3 and validation.message == "3 cell values for TEST[weight_a] are below 5"
-
- @pytest.mark.unit
- def test_is_greater_than_returns_true_if_all_column_values_are_below_threshold_irrespective_of_nulls(self, input_df):
- # Given
- df = input_df
-
- # When
- validation = is_greater_than("TEST", df, column="weight_b", threshold=4)
-
- # Then
- assert validation.valid is True and validation.value == 0 and validation.message == "All values of TEST[weight_b] are above 4"
-
-
-class TestIsGreaterThanOrEqual:
- @pytest.mark.unit
- def test_returns_true_if_all_column_values_are_above_or_equal_to_threshold(self, input_df):
- # Given
- df = input_df
-
- # When/Then
- validation = is_greater_than_or_equal("TEST", df, column="weight_a", threshold=5)
-
- # Then
- assert validation.valid is True and validation.value == 0 and validation.message == "All values of TEST[weight_a] are above 5"
-
- def test_returns_false_if_any_column_values_are_below_the_threshold(self, input_df):
- # Given
- df = input_df
-
- # When/Then
- validation = is_greater_than_or_equal("TEST", df, column="weight_a", threshold=6)
-
- # Then
- assert validation.valid is False and validation.value == 0.3 and validation.message == "3 cell values for TEST[weight_a] are below 6"
-
-
-class TestIsLowerThan:
- @pytest.mark.unit
- def test_returns_true_if_all_column_values_are_below_threshold(self, input_df):
- # Given
- df = input_df
-
- # When
- validation = is_lower_than("TEST", df, column="weight_a", threshold=10)
-
- # Then
- assert validation.valid is True and validation.value == 0 and validation.message == "All values of TEST[weight_a] are below 10"
-
- @pytest.mark.unit
- def test_returns_false_if_any_column_values_are_above_threshold(self, input_df):
- # Given
- df = input_df
-
- # When/Then
- validation = is_lower_than("TEST", df, column="weight_a", threshold=8)
-
- # Then
- assert not validation.valid and validation.value == 0.3 and validation.message == "3 cell values for TEST[weight_a] are above 8"
-
- @pytest.mark.unit
- def test_is_lower_than_returns_false_if_any_column_values_are_below_or_equal_to_threshold(self, input_df):
- # Given
- df = input_df
-
- # When/Then
- validation = is_lower_than("TEST", df, column="weight_a", threshold=9)
-
- # Then
- assert not validation.valid and validation.value == 0.1 and validation.message == "1 cell values for TEST[weight_a] are above 9"
-
- @pytest.mark.unit
- def test_is_lower_than_returns_true_if_all_columns_values_are_below_threshold_irrespective_of_nulls(self, input_df):
- # Given
- df = input_df
-
- # When/Then
- validation = is_lower_than("TEST", df, column="weight_b", threshold=10)
-
- # Then
- assert validation.valid is True and validation.value == 0 and validation.message == "All values of TEST[weight_b] are below 10"
-
-
-class TestIsLowerThanOrEqual:
- @pytest.mark.unit
- def test_returns_true_if_all_column_values_are_below_or_equal_to_threshold(self, input_df):
- # Given
- df = input_df
-
- # When/Then
- validation = is_lower_than_or_equal("TEST", df, column="weight_a", threshold=9)
-
- # Then
- assert validation.valid is True and validation.value == 0 and validation.message == "All values of TEST[weight_a] are below 9"
-
- @pytest.mark.unit
- def test_returns_false_if_any_column_values_are_above_the_threshold(self, input_df):
- # Given
- df = input_df
-
- # When/Then
- validation = is_lower_than_or_equal("TEST", df, column="weight_a", threshold=8)
-
- # Then
- assert not validation.valid and validation.value == 0.1 and validation.message == "1 cell values for TEST[weight_a] are above 8"
-
-
-class TestIsBetween:
- @pytest.mark.integration
- def test_returns_true_if_all_column_values_are_between_upper_and_lower_bounds(self, input_df):
- # Given
- df = input_df
-
- # When
- validation = is_between("TEST", df, column="weight_a", lower=4, upper=10)
-
- # Then
- assert validation.valid is True and validation.value == 0 and validation.message == "All values of TEST[weight_a] is between 4 and 10 thresholds"
-
- @pytest.mark.integration
- def test_returns_false_if_any_column_values_are_below_the_lower_bound(self, input_df):
- # Given
- df = input_df
-
- # When
- validation = is_between("TEST", df, column="weight_a", lower=6, upper=10)
-
- # Then
- assert not validation.valid and validation.value == 0.5 and validation.message == "5 cell values for TEST[weight_a] are either below 6 or above 10"
-
- @pytest.mark.integration
- def test_returns_false_if_any_column_values_are_above_the_upper_bound(self, input_df):
- # Given
- df = input_df
-
- # When
- validation = is_between("TEST", df, column="weight_a", lower=4, upper=8)
-
- # Then
- assert not validation.valid and validation.value == 0.3 and validation.message == "3 cell values for TEST[weight_a] are either below 4 or above 8"
-
- @pytest.mark.integration
- def test_returns_true_if_all_column_values_are_within_bounds_bounds_included(self, input_df):
- # Given
- df = input_df
-
- # When
- validation = is_between("TEST", df, column="weight_a", lower=5, upper=9, include_left=True, include_right=True)
-
- # Then
- assert validation.valid is True and validation.value == 0 and validation.message == "All values of TEST[weight_a] is between 5 and 9 thresholds"
-
- @pytest.mark.integration
- def test_returns_true_if_all_column_values_are_between_upper_and_lower_bounds_irrespective_of_nulls(self, input_df):
- # Given
- df = input_df
-
- # When
- validation = is_between("TEST", df, column="weight_b", lower=4, upper=10)
-
- # Then
- assert validation.valid is True and validation.value == 0 and validation.message == "All values of TEST[weight_b] is between 4 and 10 thresholds"
+ resource = LocalFileResource(path=file_path, pa_schema=FailingSchema)
+ with pytest.raises(SchemaError):
+ resource.read()