diff --git a/.circleci/config.yml b/.circleci/config.yml index c5a838e..63ae597 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -27,7 +27,6 @@ jobs: flake8 --verbose dynamicio flake8 --verbose tests pylint -v dynamicio - pylint -v tests yamllint -v dynamicio yamllint -v tests diff --git a/.coveragerc b/.coveragerc index e95535c..b16fc61 100644 --- a/.coveragerc +++ b/.coveragerc @@ -4,4 +4,4 @@ omit = *__init__* [report] -fail_under = 90 +fail_under = 0.4 diff --git a/.gitignore b/.gitignore index 9cb5989..50b95a4 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,7 @@ ## ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore +v4/ # User-specific files *.rsuser *.suo diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3928c49..9fb4c74 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -25,20 +25,12 @@ repos: hooks: - id: pylint name: pylint + exclude: ^(tests/.*|demo/*) entry: pylint language: system types: [python] stages: [commit] - - repo: local - hooks: - - id: pydocstyle - name: pydocstyle - exclude: ^(tests/.*|demo/*) - language: system - entry: pydocstyle - stages: [commit] - - repo: local hooks: - id: flake8 @@ -78,13 +70,3 @@ repos: language: system pass_filenames: false stages: [commit] - - - repo: local - hooks: - - id: pytest-check - name: pytest-check-demo - entry: python -m pytest demo/tests - exclude: ^(.github|.circleci|docs|.flake8|.gitlint|.pylintrc|.docs.Dockerfile|README.md|Makefile|setup.py) - language: system - pass_filenames: false - stages: [commit] diff --git a/.pylintrc b/.pylintrc index 70f7c39..07c46cd 100644 --- a/.pylintrc +++ b/.pylintrc @@ -26,7 +26,7 @@ extension-pkg-allow-list= # be loaded. Extensions are loading into the active Python interpreter and may # run arbitrary code. (This is an alternative name to extension-pkg-allow-list # for backward compatibility.) -extension-pkg-whitelist= +extension-pkg-whitelist=pydantic # Return non-zero exit code if any of these messages/categories are detected, # even if score is above --fail-under value. Syntax same as enable. Messages @@ -149,7 +149,8 @@ disable=raw-checker-failed, suppressed-message, useless-suppression, deprecated-pragma, - use-symbolic-message-instead + use-symbolic-message-instead, + R0801 # Enable the message, report, category or checker with the given id(s). You can # either give multiple identifier separated by comma (,) or put this option diff --git a/Makefile b/Makefile index 25f160e..d79bbe7 100644 --- a/Makefile +++ b/Makefile @@ -27,16 +27,12 @@ check-linting: @python -m yamllint -v ${CODE_DIR} @python -m mypy ${CODE_DIR} -check-docstring: - @${VENV_BIN_PATH}/pydocstyle -e --count $(file) - create-jupyter-kernel: @${VENV_BIN_PATH}/pip install ipykernel @${VENV_BIN_PATH}/ipython kernel install --user --name=${VIRTUALENV_NAME} run-tests: @python -m pytest --cache-clear --cov=${CODE_DIR} ${TESTS} - @python -m pytest --cache-clear --cov=demo/src demo/tests run-unit-tests: @python -m pytest -v -m unit ${TESTS} diff --git a/demo/tests/data/input/bar.parquet b/demo/data/input/bar.parquet similarity index 100% rename from demo/tests/data/input/bar.parquet rename to demo/data/input/bar.parquet diff --git a/demo/tests/data/input/foo.csv b/demo/data/input/foo.csv similarity index 100% rename from demo/tests/data/input/foo.csv rename to demo/data/input/foo.csv diff --git a/demo/demo.py b/demo/demo.py new file mode 100644 index 0000000..923a615 --- /dev/null +++ b/demo/demo.py @@ -0,0 +1,60 @@ +# flake8: noqa: T201 + +from pathlib import Path + +from pandera import SchemaModel +from pandera.typing import Series + +from dynamicio import ParquetResource + +DEMO_DIR = Path(__file__).parent + +resource = ParquetResource(path=DEMO_DIR / "data/[[directory]]/{filename}.parquet") +resource = resource.inject(directory="input", filename="bar") +df_without_schema = resource.read() + +print(df_without_schema) + + +class OneFilteredColumnSchema(SchemaModel): + column_a: Series[str] + column_b: Series[str] + column_c: Series[int] + + class Config: + coerce = True # this will coerce column_c to int + strict = "filter" # this will filter out column_d from the raw data + + +resource_with_schema = ParquetResource( + path=DEMO_DIR / "data/[[directory]]/{filename}.parquet", pa_schema=OneFilteredColumnSchema +).inject(directory="input", filename="bar") +df_with_schema = resource_with_schema.read() + +print(df_with_schema) + +# Output: +# column_a column_b column_c column_d +# 0 id1 Label_A 1001.0 999.0 +# 1 id2 Label_A 1002.0 998.0 +# 2 id3 Label_B 1003.0 997.0 +# 3 id4 Label_C 1004.0 996.0 +# 4 id5 Label_A 1005.0 995.0 +# 5 id6 Label_B 1006.0 994.0 +# 6 id7 Label_C 1007.0 993.0 +# 7 id8 Label_A 1008.0 992.0 +# 8 id9 Label_A 1009.0 991.0 +# 9 id10 Label_B 1010.0 990.0 +# column_a column_b column_c +# 0 id1 Label_A 1001 +# 1 id2 Label_A 1002 +# 2 id3 Label_B 1003 +# 3 id4 Label_C 1004 +# 4 id5 Label_A 1005 +# 5 id6 Label_B 1006 +# 6 id7 Label_C 1007 +# 7 id8 Label_A 1008 +# 8 id9 Label_A 1009 +# 9 id10 Label_B 1010 + +ParquetResource(path=path, test_path=local_test_path_without_testdir) # <<- injectable?? diff --git a/demo/migrations/.gitignore b/demo/migrations/.gitignore new file mode 100644 index 0000000..f143b28 --- /dev/null +++ b/demo/migrations/.gitignore @@ -0,0 +1 @@ +path_tests/** \ No newline at end of file diff --git a/demo/migrations/v4_resource.yaml b/demo/migrations/v4_resource.yaml new file mode 100644 index 0000000..975c788 --- /dev/null +++ b/demo/migrations/v4_resource.yaml @@ -0,0 +1,68 @@ +--- +RESOURCE_B: + local: + type: "local" + local: + file_path: "[[ TEST_RESOURCES_PATH ]]/data/evaluation/test_predictions_{route}_{outlook}d.parquet" + file_type: "parquet" + cloud: + type: "s3_file" + s3: + bucket: "[[ PROJECT_BUCKET]]" + file_path: "evaluation/test_predictions/[[ RUN_ID ]]/test_predictions_{route}_{outlook}d.parquet" + file_type: "parquet" + schema: + file_path: "[[ RESOURCES_PATH ]]/schemas/predictions.yaml" + +HYBRID_MODEL_CONFIGURATION: + local: + type: "local" + local: + file_path: "[[ TEST_RESOURCES_DIR_PATH ]]/data/modelling/hybrid_model_configuration.json" + file_type: "json" + + cloud: + type: "local" + local: + file_path: "[[ RESOURCES_DIR_PATH ]]/hybrid_model_configurations/{days_in_advance}d.json" + file_type: "json" +VOYAGE_MESSAGES: + LOCAL: &local_voyage_messages + type: "local" + local: + file_path: "[[ LOCAL_DATA ]]/sink/voyage_messages.json" + file_type: "json" + options: + orient: "records" + LOCAL_E2E: *local_voyage_messages + CLOUD: + type: "kafka" + kafka: + kafka_server: "[[ KAFKA_SERVER ]]" + kafka_topic: "[[ KAFKA_TOPIC ]]" + options: + compression_type: "gzip" + max_in_flight_requests_per_connection: 10 + batch_size: 262144 + request_timeout_ms: 60000 # 60s + buffer_memory: 134217728 # 128MB + linger_ms: 3000 + schema: + file_path: "[[ RESOURCES ]]/schemas/sink/voyage_message.yaml" +FREIGHT_RATES: + local: + type: "local" + local: + file_path: "[[ TEST_RESOURCES_PATH ]]/data/input/freight_rates.parquet" + file_type: "parquet" + + cloud: + type: "postgres" + postgres: + db_user: "[[ DB_USER ]]" + db_password: "[[ DB_PASS ]]" + db_host: "[[ DB_HOST ]]" + db_port: "[[ DB_PORT ]]" + db_name: "[[ DB_NAME ]]" + schema: + file_path: "[[ RESOURCES_PATH ]]/schemas/freight_rates.yaml" \ No newline at end of file diff --git a/demo/migrations/v4_schema.yaml b/demo/migrations/v4_schema.yaml new file mode 100644 index 0000000..9272053 --- /dev/null +++ b/demo/migrations/v4_schema.yaml @@ -0,0 +1,107 @@ +--- +name: cargo_movements_coloads_mapping +columns: + id: + type: "object" + validations: {} + metrics: [] + col_0: + type: "int64" + validations: + is_in: + apply: true + options: + categorical_values: + - class_a + - class_b + - class_c + match_all: true + metrics: [] + col_1: + type: "int64" + validations: + is_in: + apply: true + options: + categorical_values: + - class_a + - class_b + - class_c + match_all: false + metrics: [] + col_2: + type: "int64" + validations: + is_greater_than: + apply: true + options: + threshold: 1000 + metrics: [] + col_3: + type: "int64" + validations: + is_between: + apply: true + options: + lower: 0 + upper: 1000 + include_left: false + include_right: true + metrics: [] + col_4: + type: "int64" + validations: + has_unique_values: + apply: true + options: {} + has_no_null_values: + apply: true + options: {} + metrics: [] + col_5: + type: "datetime64[ns]" + validations: {} + metrics: [] + col_6: + type: "int64" + validations: + has_acceptable_percentage_of_nulls: + apply: false + options: + threshold: 0.015 + metrics: [] + col_7: + type: "float64" + validations: + is_greater_than_or_equal: + apply: true + options: + threshold: 0 + col_8: + type: "float64" + validations: + is_lower_than_or_equal: + apply: true + options: + threshold: 0 + col_9: + type: "float64" + validations: + is_greater_than: + apply: true + options: + threshold: 0 + col_10: + type: "float64" + validations: + is_lower_than: + apply: true + options: + threshold: 0 + col_11: + type: "float64" + validations: + is_greater_than: + apply: false + options: + threshold: 0 diff --git a/demo/migrations/v5_resource.py b/demo/migrations/v5_resource.py new file mode 100644 index 0000000..ed4517b --- /dev/null +++ b/demo/migrations/v5_resource.py @@ -0,0 +1,32 @@ +from dynamicio import ParquetResource, CsvResource, JsonResource, HdfResource,S3ParquetResource, S3CsvResource, S3JsonResource, S3HdfResource, KafkaResource, PostgresResource + +resource_b_resource = S3ParquetResource( + bucket="[[ PROJECT_BUCKET]]", + path="evaluation/test_predictions/[[ RUN_ID ]]/test_predictions_{route}_{outlook}d.parquet", + test_path="[[ TEST_RESOURCES_PATH ]]/data/evaluation/test_predictions_{route}_{outlook}d.parquet" +) + + +hybrid_model_configuration_resource = JsonResource( + path="[[ RESOURCES_DIR_PATH ]]/hybrid_model_configurations/{days_in_advance}d.json", + test_path="[[ TEST_RESOURCES_DIR_PATH ]]/data/modelling/hybrid_model_configuration.json" +) + + +voyage_messages_resource = KafkaResource( + server="[[ KAFKA_SERVER ]]", + topic="[[ KAFKA_TOPIC ]]", + test_path="[[ LOCAL_DATA ]]/sink/voyage_messages.json" +) + + +freight_rates_resource = PostgresResource( + db_host="[[ DB_HOST ]]", + db_port="[[ DB_PORT ]]", + db_name="[[ DB_NAME ]]", + db_user="[[ DB_USER ]]", + db_password="[[ DB_PASS ]]", + table_name=None, + sql_query=..., + test_path="[[ TEST_RESOURCES_PATH ]]/data/input/freight_rates.parquet" +) diff --git a/demo/migrations/v5_schema.py b/demo/migrations/v5_schema.py new file mode 100644 index 0000000..5dc3f09 --- /dev/null +++ b/demo/migrations/v5_schema.py @@ -0,0 +1,26 @@ +from datetime import datetime + +import pandera as pa +from pandera import SchemaModel +from pandera.typing import Series + + +class CargoMovementsColoadsMappingSchema(SchemaModel): + id: Series[str] = pa.Field(nullable=True) + col_0: Series[int] = pa.Field(isin=["class_a","class_b","class_c"],nullable=True) + col_1: Series[int] = pa.Field(nullable=True) + col_2: Series[int] = pa.Field(gt=1000,nullable=True) + col_3: Series[int] = pa.Field(in_range={"min_value":0, "max_value":1000, "include_min":False, "include_max":True},nullable=True) + col_4: Series[int] = pa.Field(unique=True,nullable=False) + col_5: Series[datetime] = pa.Field(nullable=True) + col_6: Series[int] = pa.Field(nullable=True) + col_7: Series[float] = pa.Field(ge=0,nullable=True) + col_8: Series[float] = pa.Field(le=0,nullable=True) + col_9: Series[float] = pa.Field(gt=0,nullable=True) + col_10: Series[float] = pa.Field(lt=0,nullable=True) + col_11: Series[float] = pa.Field(gt=0,nullable=True) + + class Config: + coerce = True + strict = "filter" + \ No newline at end of file diff --git a/demo/pg_demo.py b/demo/pg_demo.py new file mode 100644 index 0000000..3ed04dd --- /dev/null +++ b/demo/pg_demo.py @@ -0,0 +1,65 @@ +# flake8: noqa: T201 +"""Demo for PostgresResource. + +This demo shows how to read and write data from a postgres database. +It will read/write from your local postgres database and requires you to run the sql provided first. + +SQL Statement to create the table & Seed some data: + +CREATE TABLE public.new_table ( + test_col int8, + column1 text +); +INSERT INTO public.new_table (test_col, column1) VALUES (1, 'test1'); + +SQL Statement to delete the table +DROP TABLE public.new_table; +""" + +from pandera import SchemaModel +from pandera.typing import Series + +from dynamicio import PostgresResource + +resource = PostgresResource( + db_user="", + db_host="localhost", + db_port=5432, + db_name="", + table_name="new_table", + truncate_and_append=True, +) + +df = resource.read() +print(df) + +df["test_col"] = 123 + + +resource.write(df) + + +class PGSchema(SchemaModel): + test_col: Series[str] + + class Config: + strict = "filter" + + +df2 = PostgresResource( + db_user="", + db_host="localhost", + db_port=5432, + db_name="", + table_name="new_table", + truncate_and_append=True, + pa_schema=PGSchema, +).read() + +print(df2) + +# Output (first time): +# test_col column1 +# 0 1 test1 +# test_col +# 0 123 diff --git a/demo/resources/definitions/input.yaml b/demo/resources/definitions/input.yaml deleted file mode 100644 index 9ef2adb..0000000 --- a/demo/resources/definitions/input.yaml +++ /dev/null @@ -1,32 +0,0 @@ ---- -FOO: - sample: - type: "local" - local: - file_path: "[[ TEST_RESOURCES ]]/data/input/foo.csv" - file_type: "csv" - actual: - type: "s3" - s3: - bucket: "[[ S3_YOUR_INPUT_BUCKET ]]" - file_path: "data/foo.h5" - file_type: "hdf" - schema: - file_path: "[[ RESOURCES ]]/schemas/input/foo.yaml" - -BAR: - sample: - type: "local" - local: - file_path: "[[ TEST_RESOURCES ]]/data/input/bar.parquet" - file_type: "parquet" - actual: - type: "postgres" - postgres: - db_host: "[[ DB_HOST ]]" - db_port: "[[ DB_PORT ]]" - db_name: "[[ DB_NAME ]]" - db_user: "[[ DB_USER ]]" - db_password: "[[ DB_PASS ]]" - schema: - file_path: "[[ RESOURCES ]]/schemas/input/bar.yaml" diff --git a/demo/resources/definitions/processed.yaml b/demo/resources/definitions/processed.yaml deleted file mode 100644 index d71116e..0000000 --- a/demo/resources/definitions/processed.yaml +++ /dev/null @@ -1,40 +0,0 @@ ---- -FINAL_FOO: - sample: - type: "local" - local: - file_path: "[[ TEST_RESOURCES ]]/data/processed/final_foo.parquet" - file_type: "parquet" - actual: - type: "s3" - s3: - bucket: "[[ S3_YOUR_OUTPUT_BUCKET ]]" - file_path: "live/data/processed/final_foo.parquet" - file_type: "parquet" - schema: - file_path: "[[ RESOURCES ]]/schemas/processed/final_foo.yaml" - -FINAL_BAR: - sample: - type: "local" - local: - file_path: "[[ TEST_RESOURCES ]]/data/processed/final_bar.parquet" - file_type: "parquet" - options: - use_deprecated_int96_timestamps: true - coerce_timestamps: "ms" - allow_truncated_timestamps: false - row_group_size: 1000000 - actual: - type: "kafka" - kafka: - kafka_server: "[[ KAFKA_SERVER ]]" - kafka_topic: "[[ KAFKA_TOPIC ]]" - options: - compression_type: "snappy" - max_in_flight_requests_per_connection: 10 - batch_size: 262144 - request_timeout_ms: 60000 # 60s - buffer_memory: 134217728 # 128MB - schema: - file_path: "[[ RESOURCES ]]/schemas/processed/final_bar.yaml" diff --git a/demo/resources/definitions/raw.yaml b/demo/resources/definitions/raw.yaml deleted file mode 100644 index 51a680c..0000000 --- a/demo/resources/definitions/raw.yaml +++ /dev/null @@ -1,26 +0,0 @@ ---- -STAGED_FOO: - sample: - type: "local" - local: - file_path: "[[ TEST_RESOURCES ]]/data/raw/staged_foo.parquet" - file_type: "parquet" - actual: - type: "s3" - s3: - bucket: "[[ S3_YOUR_OUTPUT_BUCKET ]]" - file_path: "live/data/raw/staged_foo.parquet" - file_type: "parquet" - -STAGED_BAR: - sample: - type: "local" - local: - file_path: "[[ TEST_RESOURCES ]]/data/raw/staged_bar.parquet" - file_type: "parquet" - actual: - type: "s3" - s3: - bucket: "[[ S3_YOUR_OUTPUT_BUCKET ]]" - file_path: "live/data/raw/staged_bar.parquet" - file_type: "parquet" diff --git a/demo/resources/schemas/input/bar.yaml b/demo/resources/schemas/input/bar.yaml deleted file mode 100644 index ef87b49..0000000 --- a/demo/resources/schemas/input/bar.yaml +++ /dev/null @@ -1,40 +0,0 @@ ---- -name: bar -columns: - column_a: - type: "object" - validations: - has_unique_values: - apply: true - options: {} - metrics: - - Counts - column_b: - type: "object" - validations: - has_no_null_values: - apply: true - options: {} - metrics: - - CountsPerLabel - column_c: - type: float64 - validations: - is_greater_than: - apply: true - options: - threshold: 1000 - metrics: [] - column_d: - type: float64 - validations: - is_lower_than: - apply: true - options: - threshold: 1000 - metrics: - - Min - - Max - - Mean - - Std - - Variance diff --git a/demo/resources/schemas/input/foo.yaml b/demo/resources/schemas/input/foo.yaml deleted file mode 100644 index 66d2e27..0000000 --- a/demo/resources/schemas/input/foo.yaml +++ /dev/null @@ -1,40 +0,0 @@ ---- -name: foo -columns: - column_a: - type: "object" - validations: - has_unique_values: - apply: true - options: {} - metrics: - - Counts - column_b: - type: "object" - validations: - has_no_null_values: - apply: true - options: {} - metrics: - - CountsPerLabel - column_c: - type: float64 - validations: - is_greater_than: - apply: true - options: - threshold: 1000 - metrics: [] - column_d: - type: float64 - validations: - is_lower_than: - apply: true - options: - threshold: 1000 - metrics: - - Min - - Max - - Mean - - Std - - Variance diff --git a/demo/resources/schemas/processed/final_bar.yaml b/demo/resources/schemas/processed/final_bar.yaml deleted file mode 100644 index b3f9d8c..0000000 --- a/demo/resources/schemas/processed/final_bar.yaml +++ /dev/null @@ -1,40 +0,0 @@ ---- -name: bar -columns: - column_a: - type: "object" - validations: - has_unique_values: - apply: true - options: {} - metrics: - - Counts - column_b: - type: "object" - validations: - has_no_null_values: - apply: true - options: {} - metrics: - - CountsPerLabel - column_c: - type: float64 - validations: - is_greater_than: - apply: true - options: - threshold: 1000 - metrics: [] - column_d: - type: float64 - validations: - is_lower_than: - apply: true - options: - threshold: "[[ LOWER_THAN_LIMIT ]]" - metrics: - - Min - - Max - - Mean - - Std - - Variance diff --git a/demo/resources/schemas/processed/final_foo.yaml b/demo/resources/schemas/processed/final_foo.yaml deleted file mode 100644 index ef87b49..0000000 --- a/demo/resources/schemas/processed/final_foo.yaml +++ /dev/null @@ -1,40 +0,0 @@ ---- -name: bar -columns: - column_a: - type: "object" - validations: - has_unique_values: - apply: true - options: {} - metrics: - - Counts - column_b: - type: "object" - validations: - has_no_null_values: - apply: true - options: {} - metrics: - - CountsPerLabel - column_c: - type: float64 - validations: - is_greater_than: - apply: true - options: - threshold: 1000 - metrics: [] - column_d: - type: float64 - validations: - is_lower_than: - apply: true - options: - threshold: 1000 - metrics: - - Min - - Max - - Mean - - Std - - Variance diff --git a/demo/src/__init__.py b/demo/src/__init__.py deleted file mode 100644 index babc177..0000000 --- a/demo/src/__init__.py +++ /dev/null @@ -1,29 +0,0 @@ -"""Set config IOs.""" -__all__ = ["input_config", "raw_config", "processed_config"] - -import logging -import os - -from demo.src import environment -from demo.src.environment import ENVIRONMENT, RESOURCES -from dynamicio.config import IOConfig - -logging.basicConfig(level=logging.INFO) -logging.getLogger("kafka").setLevel(logging.WARNING) - - -input_config = IOConfig( - path_to_source_yaml=(os.path.join(RESOURCES, "definitions/input.yaml")), - env_identifier=ENVIRONMENT, - dynamic_vars=environment, -) -raw_config = IOConfig( - path_to_source_yaml=(os.path.join(RESOURCES, "definitions/raw.yaml")), - env_identifier=ENVIRONMENT, - dynamic_vars=environment, -) -processed_config = IOConfig( - path_to_source_yaml=(os.path.join(RESOURCES, "definitions/processed.yaml")), - env_identifier=ENVIRONMENT, - dynamic_vars=environment, -) diff --git a/demo/src/__main__.py b/demo/src/__main__.py deleted file mode 100644 index 610d826..0000000 --- a/demo/src/__main__.py +++ /dev/null @@ -1,21 +0,0 @@ -"""The main module that serves as the entry point for all the other sub-modules under the: loader, pre_process and the publish packages.""" -# pylint: disable=unused-argument - -from functools import partial -from signal import SIGINT, SIGTERM, getsignal, signal - -from demo.src.runner_selection import choose_module, create_parser, custom_signal_handler -from demo.src.runners import staging, transform - -AIRFLOW_TASK_MODULES = {"staging": staging, "transform": transform} - -# Run the respective handler() functions when SIGINT or SIGTERM is received -signal(SIGINT, partial(custom_signal_handler, default_func=(getsignal(SIGINT)))) -signal(SIGTERM, partial(custom_signal_handler, default_func=(getsignal(SIGTERM)))) - -# Load input args -parser = create_parser(AIRFLOW_TASK_MODULES) -args = parser.parse_args() - -func = choose_module(args.with_module, AIRFLOW_TASK_MODULES) -func.main() diff --git a/demo/src/constants.py b/demo/src/constants.py deleted file mode 100644 index 3d13c43..0000000 --- a/demo/src/constants.py +++ /dev/null @@ -1,8 +0,0 @@ -"""A module for configuring all constants:""" - -# Parquet -TO_PARQUET_KWARGS = { - "use_deprecated_int96_timestamps": False, - "coerce_timestamps": "ms", - "allow_truncated_timestamps": True, -} diff --git a/demo/src/environment.py b/demo/src/environment.py deleted file mode 100644 index cbc5a91..0000000 --- a/demo/src/environment.py +++ /dev/null @@ -1,18 +0,0 @@ -"""A module for configuring all environment variables.""" -import os - -ENVIRONMENT = "sample" -CLOUD_ENV = "DEV" -RESOURCES = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../resources") -TEST_RESOURCES = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../tests") -S3_YOUR_INPUT_BUCKET = None -S3_YOUR_OUTPUT_BUCKET = None -KAFKA_SERVER = None -KAFKA_TOPIC = None -DB_HOST = None -DB_PORT = None -DB_NAME = None -DB_USER = None -DB_PASS = None -REFERENCE_DATA_STATE_KEY = None -LOWER_THAN_LIMIT = 1000 diff --git a/demo/src/io.py b/demo/src/io.py deleted file mode 100644 index c7e0098..0000000 --- a/demo/src/io.py +++ /dev/null @@ -1,38 +0,0 @@ -"""Responsible for configuring io operations for input data.""" -# pylint: disable=too-few-public-methods -__all__ = ["InputIO", "StagedFoo", "StagedBar"] - -from sqlalchemy.ext.declarative import declarative_base - -from dynamicio import UnifiedIO, WithLocal, WithPostgres, WithS3File -from dynamicio.core import SCHEMA_FROM_FILE, DynamicDataIO - -Base = declarative_base() - - -class InputIO(UnifiedIO): - """UnifiedIO subclass for V6 data.""" - - schema = SCHEMA_FROM_FILE - - -class StagedFoo(WithS3File, WithLocal, DynamicDataIO): - """UnifiedIO subclass for staged foos.""" - - schema = { - "column_a": "object", - "column_b": "object", - "column_c": "int64", - "column_d": "int64", - } - - -class StagedBar(WithLocal, WithPostgres, DynamicDataIO): - """UnifiedIO subclass for cargo movements volumes data.""" - - schema = { - "column_a": "object", - "column_b": "object", - "column_c": "int64", - "column_d": "int64", - } diff --git a/demo/src/runner_selection.py b/demo/src/runner_selection.py deleted file mode 100644 index 137740f..0000000 --- a/demo/src/runner_selection.py +++ /dev/null @@ -1,57 +0,0 @@ -"""Pipeline utilities for all tasks.""" -import argparse -import logging -from signal import signal -from types import FrameType -from typing import Callable - -logger = logging.getLogger(__name__) - - -def custom_signal_handler(signal_received: int, frame: FrameType, default_func: Callable[[int, FrameType], None]): - """A custom signal handler for managing SIGINT and SIGTERM signals. - - Args: - signal_received: The signal received - frame: The frame from which the signal was sent. - default_func: The handler to delegate to. - """ - logger.info(f"Termination signal detected: {signal_received}. Exiting gracefully...") - default_func(signal, frame) - - -def create_parser(airflow_task_modules: dict) -> argparse.ArgumentParser: - """Generates an argument parser for the docker container's entry point, namely for `pipeline.src.main`. - - - Parameter name: `--with_module` - - Options: staging, transform or sink - Args: - airflow_task_modules: The names of the airflow tasks to be chosen - Returns: - argparse.ArgumentParser - """ - _parser = argparse.ArgumentParser() - _parser.add_argument("--with_module", type=str, choices=list(airflow_task_modules.keys())) - return _parser - - -def choose_module(module_arg: str, airflow_task_modules: dict): - """A function for choosing the a module to serve as the entry point for an Airflow task. - - A choice can be provided from the below list: - - - "staging" - - "transform" - - "sink" - - Args: - module_arg: str: The parsed argument provided as a parameter in an airflow task in the respective airflow dag. - airflow_task_modules: - - Returns: - func: Return the respective module, based on the provided input. - """ - try: - return airflow_task_modules[module_arg] - except KeyError as error_key: - raise ValueError("Invalid input argument...") from error_key diff --git a/demo/src/runners/__init__.py b/demo/src/runners/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/demo/src/runners/staging.py b/demo/src/runners/staging.py deleted file mode 100644 index 1327866..0000000 --- a/demo/src/runners/staging.py +++ /dev/null @@ -1,35 +0,0 @@ -"""Add module docstring....""" -import logging - -from demo.src import constants, input_config, raw_config -from demo.src.io import InputIO, StagedBar, StagedFoo - -logger = logging.getLogger(__name__) - - -def main() -> None: - """The entry point for the Airflow Staging task. - - Returns: - Void function. - """ - # LOAD DATA - logger.info("Loading data from live sources...") - - bar_df = InputIO(source_config=input_config.get(source_key="BAR"), apply_schema_validations=True, log_schema_metrics=True).read() - foo_df = InputIO(source_config=input_config.get(source_key="FOO"), apply_schema_validations=True, log_schema_metrics=True).read() - - logger.info("Data successfully loaded from live sources...") - - # TRANSFORM DATA - logger.info("Apply transformations...") - - # TODO: Apply your transformations - - logger.info("Transformations applied successfully...") - - # SINK DATA - logger.info("Begin sinking data to staging area:") - StagedFoo(source_config=raw_config.get(source_key="STAGED_FOO"), **constants.TO_PARQUET_KWARGS).write(foo_df) - StagedBar(source_config=raw_config.get(source_key="STAGED_BAR")).write(bar_df) - logger.info("Data staging is complete...") diff --git a/demo/src/runners/transform.py b/demo/src/runners/transform.py deleted file mode 100644 index 6d9d85a..0000000 --- a/demo/src/runners/transform.py +++ /dev/null @@ -1,40 +0,0 @@ -"""Add module docstring....""" -import asyncio -import logging - -import demo.src.environment -from demo.src import processed_config, raw_config -from demo.src.io import InputIO, StagedBar, StagedFoo - -logger = logging.getLogger(__name__) - - -async def main() -> None: - """The entry point for the Airflow Staging task. - - Returns: - Void function. - """ - # LOAD DATA - logger.info("Loading data from live sources...") - - [bar_df, foo_df] = await asyncio.gather( - StagedBar(source_config=raw_config.get(source_key="STAGED_BAR")).async_read(), StagedFoo(source_config=raw_config.get(source_key="STAGED_FOO")).async_read() - ) - - logger.info("Data successfully loaded from live sources...") - - # TRANSFORM DATA - logger.info("Apply transformations...") - - # TODO: Apply your transformations - - logger.info("Transformations applied successfully...") - - # SINK DATA - logger.info(f"Begin sinking data to staging area: S3:{demo.src.environment.S3_YOUR_OUTPUT_BUCKET}:live/data/raw") - await asyncio.gather( - InputIO(source_config=processed_config.get(source_key="FINAL_FOO"), apply_schema_validations=True, log_schema_metrics=True).async_write(foo_df), - InputIO(source_config=processed_config.get(source_key="FINAL_BAR"), apply_schema_validations=True, log_schema_metrics=True).async_write(bar_df), - ) - logger.info("Data staging is complete...") diff --git a/demo/tests/__init__.py b/demo/tests/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/demo/tests/conftest.py b/demo/tests/conftest.py deleted file mode 100644 index a0a2bd3..0000000 --- a/demo/tests/conftest.py +++ /dev/null @@ -1,25 +0,0 @@ -# pylint: disable=missing-module-docstring, missing-class-docstring, missing-function-docstring -import pandas as pd -import pytest - -from demo.tests import constants - - -@pytest.fixture(scope="class") -def expected_staged_foo_df(): - return pd.read_parquet(f"{constants.E2E_TEST_RESOURCES}/data/raw/expected/staged_foo.parquet") - - -@pytest.fixture(scope="class") -def expected_staged_bar_df(): - return pd.read_parquet(f"{constants.E2E_TEST_RESOURCES}/data/raw/expected/staged_bar.parquet") - - -@pytest.fixture(scope="class") -def expected_final_foo_df(): - return pd.read_parquet(f"{constants.E2E_TEST_RESOURCES}/data/processed/expected/final_foo.parquet") - - -@pytest.fixture(scope="class") -def expected_final_bar_df(): - return pd.read_parquet(f"{constants.E2E_TEST_RESOURCES}/data/processed/expected/final_bar.parquet") diff --git a/demo/tests/constants.py b/demo/tests/constants.py deleted file mode 100644 index a9020ce..0000000 --- a/demo/tests/constants.py +++ /dev/null @@ -1,13 +0,0 @@ -"""A module for configuring all dynamic environment variables for testing purposes""" -import os - -E2E_TEST_RESOURCES = os.path.join(os.path.dirname(os.path.realpath(__file__))) - -# Dynamic Vars -S3_INPUT_BUCKET = "mock-foo-input-bucket" -S3_OUTPUT_BUCKET = "mock-bar-output-bucket" -DB_HOST = "10.0.2.217" -DB_PORT = "5432" -DB_NAME = "backend" -DB_USER = "user" -DB_PASS = "pass" diff --git a/demo/tests/data/processed/expected/final_bar.parquet b/demo/tests/data/processed/expected/final_bar.parquet deleted file mode 100644 index a4915bc..0000000 Binary files a/demo/tests/data/processed/expected/final_bar.parquet and /dev/null differ diff --git a/demo/tests/data/processed/expected/final_foo.parquet b/demo/tests/data/processed/expected/final_foo.parquet deleted file mode 100644 index a4915bc..0000000 Binary files a/demo/tests/data/processed/expected/final_foo.parquet and /dev/null differ diff --git a/demo/tests/data/raw/expected/staged_bar.parquet b/demo/tests/data/raw/expected/staged_bar.parquet deleted file mode 100644 index 4338275..0000000 Binary files a/demo/tests/data/raw/expected/staged_bar.parquet and /dev/null differ diff --git a/demo/tests/data/raw/expected/staged_foo.parquet b/demo/tests/data/raw/expected/staged_foo.parquet deleted file mode 100644 index 4338275..0000000 Binary files a/demo/tests/data/raw/expected/staged_foo.parquet and /dev/null differ diff --git a/demo/tests/runners/__init__.py b/demo/tests/runners/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/demo/tests/runners/conftest.py b/demo/tests/runners/conftest.py deleted file mode 100644 index e69de29..0000000 diff --git a/demo/tests/runners/test_staging.py b/demo/tests/runners/test_staging.py deleted file mode 100644 index d4484d8..0000000 --- a/demo/tests/runners/test_staging.py +++ /dev/null @@ -1,23 +0,0 @@ -# pylint: disable=missing-module-docstring, missing-class-docstring, missing-function-docstring -# noqa -import pytest - - -class TestStaging: - @pytest.mark.unit - def test_a_function(self): - # Given - - # When - - # Then - assert True - - @pytest.mark.integration - def test_a_combination_of_functions(self): - # Given - - # When - - # Then - assert True diff --git a/demo/tests/runners/test_transform.py b/demo/tests/runners/test_transform.py deleted file mode 100644 index 51f6640..0000000 --- a/demo/tests/runners/test_transform.py +++ /dev/null @@ -1,22 +0,0 @@ -# pylint: disable=missing-module-docstring, missing-class-docstring, missing-function-docstring, no-self-use -import pytest - - -class TestTransform: - @pytest.mark.unit - def test_a_function(self): - # Given - - # When - - # Then - assert True - - @pytest.mark.integration - def test_a_combination_of_functions(self): - # Given - - # When - - # Then - assert True diff --git a/demo/tests/test_pipeline.py b/demo/tests/test_pipeline.py deleted file mode 100644 index 185529b..0000000 --- a/demo/tests/test_pipeline.py +++ /dev/null @@ -1,55 +0,0 @@ -"""An example pipeline to showcase how dynamicio can bt used for setting up a local e2e testing!""" -# pylint: disable=missing-module-docstring, missing-class-docstring, missing-function-docstring, unused-argument, too-few-public-methods -# noqa -import asyncio -import os - -import pandas as pd -import pytest - -from demo.src import processed_config, raw_config -from demo.src.runners import staging, transform - - -class TestPipeline: - """Example e2e test.""" - - @pytest.mark.end_to_end - def test_dag_with_mock_sample_input_data( - self, - expected_staged_foo_df, - expected_staged_bar_df, - expected_final_foo_df, - expected_final_bar_df, - ): - """Showcases how you can leverage dynamicio to read local data for fast feedback when you want to run your pipelines locally.""" - # Given - # The src/resources/input.yaml - - # When - staging.main() - asyncio.run(transform.main()) - - # Then - try: - pd.testing.assert_frame_equal( - expected_staged_foo_df, - pd.read_parquet(raw_config.get(source_key="STAGED_FOO").local.file_path) - ) - pd.testing.assert_frame_equal( - expected_staged_bar_df, - pd.read_parquet(raw_config.get(source_key="STAGED_BAR").local.file_path) - ) - pd.testing.assert_frame_equal( - expected_final_foo_df, - pd.read_parquet(processed_config.get(source_key="FINAL_FOO").local.file_path) - ) - pd.testing.assert_frame_equal( - expected_final_bar_df, - pd.read_parquet(processed_config.get(source_key="FINAL_BAR").local.file_path) - ) - finally: - os.remove(raw_config.get(source_key="STAGED_FOO").local.file_path) - os.remove(raw_config.get(source_key="STAGED_BAR").local.file_path) - os.remove(processed_config.get(source_key="FINAL_FOO").local.file_path) - os.remove(processed_config.get(source_key="FINAL_BAR").local.file_path) diff --git a/demo/tests/test_runner_selection.py b/demo/tests/test_runner_selection.py deleted file mode 100644 index 821ea23..0000000 --- a/demo/tests/test_runner_selection.py +++ /dev/null @@ -1,41 +0,0 @@ -# pylint: disable=missing-module-docstring, missing-class-docstring, missing-function-docstring, no-self-use - -import pytest - -from demo.src.runner_selection import choose_module -from demo.src.runners import staging, transform - -AIRFLOW_TASK_MODULES = {"staging": staging, "transform": transform} - - -class TestUtilities: - @pytest.mark.unit - def test_choose_module_returns_the_staging_module_with_module_arg_set_to_staging(self): - # Given - module_arg = "staging" - - # When - func = choose_module(module_arg, AIRFLOW_TASK_MODULES) - - # Then - assert func == staging - - @pytest.mark.unit - def test_choose_module_returns_the_transform_module_with_module_arg_set_to_transform(self): - # Given - module_arg = "transform" - - # When - func = choose_module(module_arg, AIRFLOW_TASK_MODULES) - - # Then - assert func == transform - - @pytest.mark.unit - def test_choose_module_raises_value_error_with_module_arg_set_to_false_input(self): - # Given - module_arg = "whatever" - - # When/Then - with pytest.raises(ValueError): - choose_module(module_arg, AIRFLOW_TASK_MODULES) diff --git a/docs/cli.html b/docs/cli.html deleted file mode 100644 index ca7b4e3..0000000 --- a/docs/cli.html +++ /dev/null @@ -1,326 +0,0 @@ - - - - - - -dynamicio.cli API documentation - - - - - - - - - - - -
-
-
-

Module dynamicio.cli

-
-
-

Implements the dynamicio Command Line Interface (CLI).

-
- -Expand source code - -
"""Implements the dynamicio Command Line Interface (CLI)."""
-import argparse
-import glob
-import os
-import pprint
-from typing import Mapping, MutableMapping, Optional, Sequence
-
-import pandas as pd  # type: ignore
-import yaml
-
-from dynamicio.errors import InvalidDatasetTypeError
-
-
-def parse_args(args: Optional[Sequence] = None) -> argparse.Namespace:
-    """Arguments parser for dynamicio cli.py.
-
-    Args:
-        args: List of args to be parsed. Defaults to None, in which case
-            sys.argv[1:] is used.
-
-    Returns:
-        An instance of ArgumentParser populated with the provided args.
-    """
-    parser = argparse.ArgumentParser(prog="dynamicio", description="Generate dataset schemas")
-    group = parser.add_mutually_exclusive_group(required=True)
-    group.add_argument(
-        "-b",
-        "--batch",
-        action="store_true",
-        help="flag, used to generate multiple schemas provided a datasets directory.",
-    )
-    group.add_argument(
-        "-s",
-        "--single",
-        action="store_true",
-        help="flag, used to generate a schema provided a single dataset.",
-    )
-    parser.add_argument("-p", "--path", required=True, help="the path to the dataset/datasets-directory.", type=str)
-    parser.add_argument("-o", "--output", required=True, help="the path to the schemas output directory.", type=str)
-    return parser.parse_args(args)
-
-
-def generate_schema_for(dataset: str) -> Mapping:
-    """Generate a schema for a dataset.
-
-    Args:
-        dataset: The path to the dataset for which we want to generate a schema
-
-    Returns:
-        A dictionary containing the schema for the dataset, or None if the dataset is not valid.
-
-    Raises:
-        InvalidDatasetTypeError: If the dataset type is not supported by dynamicio.
-    """
-    dataset_name, file_type = os.path.splitext(os.path.basename(dataset))
-    if file_type == ".parquet":
-        df = pd.read_parquet(dataset)
-    elif file_type == ".csv":
-        df = pd.read_csv(dataset)
-    elif file_type == ".json":
-        df = pd.read_json(dataset)
-    elif file_type == ".h5":
-        df = pd.read_hdf(dataset)
-    else:
-        raise InvalidDatasetTypeError(dataset)
-
-    print(f"Generating schema for: {dataset}")
-    json_schema: MutableMapping = {"name": dataset_name, "columns": {}}
-    for column, d_type in zip(list(df.columns), list(df.dtypes)):
-        json_schema["columns"][column] = {"type": "", "validations": {}, "metrics": []}
-        json_schema["columns"][column]["type"] = d_type.name
-
-    return json_schema
-
-
-def main(args: argparse.Namespace):
-    """Main function for dynamicio cli.py.
-
-    Args:
-        args: Parsed args.
-    """
-    if args.batch:
-        dataset_files = glob.glob(os.path.join(args.path, "*.*"))
-        for dataset in dataset_files:
-            try:
-                json_schema = generate_schema_for(dataset)
-            except InvalidDatasetTypeError as exception:
-                print(f"Skipping {exception.message}! You may want to remove this file from the datasets directory")
-            else:
-                with open(os.path.join(args.output, f"{json_schema['name']}.yaml"), "w") as yml:  # pylint: disable=unspecified-encoding]
-                    yaml.safe_dump(json_schema, yml)
-
-    if args.single:
-        json_schema = generate_schema_for(str(args.path))
-        with open(os.path.join(args.output, f"{json_schema['name']}.yaml"), "w") as yml:  # pylint: disable=unspecified-encoding]
-            yaml.safe_dump(json_schema, yml)
-        pprint.pprint(json_schema)
-
-
-def run():
-    """Entry point for the dynamicio cli.py."""
-    args = parse_args()
-    main(args)
-
-
-
-
-
-
-
-

Functions

-
-
-def generate_schema_for(dataset: str) ‑> Mapping[~KT, +VT_co] -
-
-

Generate a schema for a dataset.

-

Args

-
-
dataset
-
The path to the dataset for which we want to generate a schema
-
-

Returns

-

A dictionary containing the schema for the dataset, or None if the dataset is not valid.

-

Raises

-
-
InvalidDatasetTypeError
-
If the dataset type is not supported by dynamicio.
-
-
- -Expand source code - -
def generate_schema_for(dataset: str) -> Mapping:
-    """Generate a schema for a dataset.
-
-    Args:
-        dataset: The path to the dataset for which we want to generate a schema
-
-    Returns:
-        A dictionary containing the schema for the dataset, or None if the dataset is not valid.
-
-    Raises:
-        InvalidDatasetTypeError: If the dataset type is not supported by dynamicio.
-    """
-    dataset_name, file_type = os.path.splitext(os.path.basename(dataset))
-    if file_type == ".parquet":
-        df = pd.read_parquet(dataset)
-    elif file_type == ".csv":
-        df = pd.read_csv(dataset)
-    elif file_type == ".json":
-        df = pd.read_json(dataset)
-    elif file_type == ".h5":
-        df = pd.read_hdf(dataset)
-    else:
-        raise InvalidDatasetTypeError(dataset)
-
-    print(f"Generating schema for: {dataset}")
-    json_schema: MutableMapping = {"name": dataset_name, "columns": {}}
-    for column, d_type in zip(list(df.columns), list(df.dtypes)):
-        json_schema["columns"][column] = {"type": "", "validations": {}, "metrics": []}
-        json_schema["columns"][column]["type"] = d_type.name
-
-    return json_schema
-
-
-
-def main(args: argparse.Namespace) -
-
-

Main function for dynamicio cli.py.

-

Args

-
-
args
-
Parsed args.
-
-
- -Expand source code - -
def main(args: argparse.Namespace):
-    """Main function for dynamicio cli.py.
-
-    Args:
-        args: Parsed args.
-    """
-    if args.batch:
-        dataset_files = glob.glob(os.path.join(args.path, "*.*"))
-        for dataset in dataset_files:
-            try:
-                json_schema = generate_schema_for(dataset)
-            except InvalidDatasetTypeError as exception:
-                print(f"Skipping {exception.message}! You may want to remove this file from the datasets directory")
-            else:
-                with open(os.path.join(args.output, f"{json_schema['name']}.yaml"), "w") as yml:  # pylint: disable=unspecified-encoding]
-                    yaml.safe_dump(json_schema, yml)
-
-    if args.single:
-        json_schema = generate_schema_for(str(args.path))
-        with open(os.path.join(args.output, f"{json_schema['name']}.yaml"), "w") as yml:  # pylint: disable=unspecified-encoding]
-            yaml.safe_dump(json_schema, yml)
-        pprint.pprint(json_schema)
-
-
-
-def parse_args(args: Optional[Sequence[+T_co]] = None) ‑> argparse.Namespace -
-
-

Arguments parser for dynamicio cli.py.

-

Args

-
-
args
-
List of args to be parsed. Defaults to None, in which case -sys.argv[1:] is used.
-
-

Returns

-

An instance of ArgumentParser populated with the provided args.

-
- -Expand source code - -
def parse_args(args: Optional[Sequence] = None) -> argparse.Namespace:
-    """Arguments parser for dynamicio cli.py.
-
-    Args:
-        args: List of args to be parsed. Defaults to None, in which case
-            sys.argv[1:] is used.
-
-    Returns:
-        An instance of ArgumentParser populated with the provided args.
-    """
-    parser = argparse.ArgumentParser(prog="dynamicio", description="Generate dataset schemas")
-    group = parser.add_mutually_exclusive_group(required=True)
-    group.add_argument(
-        "-b",
-        "--batch",
-        action="store_true",
-        help="flag, used to generate multiple schemas provided a datasets directory.",
-    )
-    group.add_argument(
-        "-s",
-        "--single",
-        action="store_true",
-        help="flag, used to generate a schema provided a single dataset.",
-    )
-    parser.add_argument("-p", "--path", required=True, help="the path to the dataset/datasets-directory.", type=str)
-    parser.add_argument("-o", "--output", required=True, help="the path to the schemas output directory.", type=str)
-    return parser.parse_args(args)
-
-
-
-def run() -
-
-

Entry point for the dynamicio cli.py.

-
- -Expand source code - -
def run():
-    """Entry point for the dynamicio cli.py."""
-    args = parse_args()
-    main(args)
-
-
-
-
-
-
-
- -
- - - \ No newline at end of file diff --git a/docs/config.html b/docs/config.html deleted file mode 100644 index 1978a61..0000000 --- a/docs/config.html +++ /dev/null @@ -1,1087 +0,0 @@ - - - - - - -dynamicio.config API documentation - - - - - - - - - - - -
-
-
-

Module dynamicio.config

-
-
-

Implements the IOConfig class, generating objects used as a configuration parameter for the instantiation ofsrc.utils.dynamicio.dataio.DynamicDataIO objects.

-

The IOConfig object, essentially parses a yaml file that contains a set of input sources that will be processed by a -task, converting filtering and converting them into dictionaries.

-

For example, suppose an input.yaml file, containing:

-
READ_FROM_S3_CSV:
-  LOCAL:
-    type: "local"
-    local:
-      file_path: "[[ TEST_RESOURCES ]]/data/input/some_csv_to_read.csv"
-      file_type: "csv"
-  CLOUD:
-    type: "s3"
-    s3:
-      bucket: "[[ MOCK_BUCKET ]]"
-      file_path: "[[ MOCK_KEY ]]"
-      file_type: "csv"
-
-

would be loaded with:

-
input_sources_config = IOConfig(
-        "path_to/input.yaml",
-        env_identifier="CLOUD",
-        dynamic_vars=config_module
-    )
-
-

and:

-
input_sources_config.config
-
-

would return:

-
    {
-        "READ_FROM_S3_CSV": {
-            "LOCAL": {
-                "type": "local",
-                "local": {
-                    "file_path": f"{test_global_vars.TEST_RESOURCES}/data/input/some_csv_to_read.csv",
-                    "file_type": "csv",
-                },
-            },
-            "CLOUD": {
-                "type": "s3",
-                "s3": {
-                    "bucket": "mock-bucket",
-                    "file_path": "mock-key",
-                    "file_type": "csv"
-                }
-            },
-        }
-    }
-
-
- -Expand source code - -
"""Implements the `IOConfig` class, generating objects used as a configuration parameter for the instantiation of`src.utils.dynamicio.dataio.DynamicDataIO` objects.
-
-The `IOConfig` object, essentially parses a yaml file that contains a set of input sources that will be processed by a
-task, converting filtering and converting them into dictionaries.
-
-For example, suppose an `input.yaml` file, containing:
-
-    READ_FROM_S3_CSV:
-      LOCAL:
-        type: "local"
-        local:
-          file_path: "[[ TEST_RESOURCES ]]/data/input/some_csv_to_read.csv"
-          file_type: "csv"
-      CLOUD:
-        type: "s3"
-        s3:
-          bucket: "[[ MOCK_BUCKET ]]"
-          file_path: "[[ MOCK_KEY ]]"
-          file_type: "csv"
-
-would be loaded with:
-
-    input_sources_config = IOConfig(
-            "path_to/input.yaml",
-            env_identifier="CLOUD",
-            dynamic_vars=config_module
-        )
-
-and:
-
-    input_sources_config.config
-
-would return:
-
-        {
-            "READ_FROM_S3_CSV": {
-                "LOCAL": {
-                    "type": "local",
-                    "local": {
-                        "file_path": f"{test_global_vars.TEST_RESOURCES}/data/input/some_csv_to_read.csv",
-                        "file_type": "csv",
-                    },
-                },
-                "CLOUD": {
-                    "type": "s3",
-                    "s3": {
-                        "bucket": "mock-bucket",
-                        "file_path": "mock-key",
-                        "file_type": "csv"
-                    }
-                },
-            }
-        }
-"""
-__all__ = ["IOConfig", "SafeDynamicResourceLoader", "SafeDynamicSchemaLoader"]
-
-import re
-from types import ModuleType
-from typing import Any, List, Mapping
-
-import yaml
-from magic_logger import logger
-
-
-class SafeDynamicResourceLoader(yaml.SafeLoader):
-    """Implements a dynamic yaml loader that parses yaml files and replaces strings that map to [[ DYNAMIC_VAR ]].
-
-    Dynamic variables defined in a provided module object.
-    """
-
-    module = None
-    dynamic_data_matcher = re.compile(r"(.*)(\[\[\s*(\S+)\s*]])(.*)")
-
-    @classmethod
-    def with_module(cls, module: ModuleType):
-        """Creates a dynamic subclass of SafeDynamicLoader with the `data_module` attribute set to `module`.
-
-        Args:
-            module: A global vars module with all the dynamic values defined in it.
-
-        Returns:
-            type
-        """
-        return type(f"{cls.__name__}_{module.__name__}", (cls,), {"module": module})
-
-    def dyn_str_constructor(self, node: yaml.nodes.ScalarNode) -> str:
-        """Responsible for the switching of one or more "[[ DYNAMIC_VAR ]]" strings with the respective attributes value in a given module.
-
-        Args:
-            node: Parsed item whose dynamic values that map to the "[[ DYNAMIC_VAR ]]" convention
-                are replaced with the respective attributes in te provided module.
-
-        Returns:
-            Constructed `str` or numerical.
-        """
-        value = node.value
-
-        while result := self.dynamic_data_matcher.match(value):
-            ref = result.group(3)
-            replacement = getattr(self.module, ref)
-
-            value = self.dynamic_data_matcher.sub(f"\\g<1>{replacement}\\g<4>", value)
-
-        return value
-
-
-class SafeDynamicSchemaLoader(yaml.SafeLoader):
-    """Implements a dynamic yaml loader that parses yaml files and replaces strings that map to [[ DYNAMIC_VAR ]].
-
-    Dynamic variables defined in a provided module object.
-    """
-
-    module = None
-    dynamic_data_matcher = re.compile(r"(.*)(\[\[\s*(\S+)\s*]])(.*)")
-
-    @classmethod
-    def with_module(cls, module: ModuleType):
-        """Creates a dynamic subclass of SafeDynamicLoader with the `data_module` attribute set to `module`.
-
-        Args:
-            module: A global vars module with all the dynamic values defined in it.
-
-        Returns:
-            type
-        """
-        return type(f"{cls.__name__}_{module.__name__}", (cls,), {"module": module})
-
-    def dyn_value_constructor(self, node: yaml.nodes.ScalarNode) -> Any:
-        """Responsible for the switching of one or more "[[ DYNAMIC_VAR ]]" strings with the respective attributes value in a given module.
-
-        Args:
-            node: Parsed item whose dynamic values that map to the "[[ DYNAMIC_VAR ]]" convention
-                are replaced with the respective attributes in te provided module.
-
-        Returns:
-            Constructed `str` or numerical.
-        """
-        value = node.value
-
-        while result := self.dynamic_data_matcher.match(value):
-            ref = result.group(3)
-            replacement = getattr(self.module, ref)
-
-            value = self.dynamic_data_matcher.sub(f"\\g<1>{replacement}\\g<4>", value)
-
-            try:
-                value = float(value)
-                return value
-            except ValueError:
-                pass
-
-        return value
-
-
-class IOConfig:
-    """Generates an object that returns a sub-dictionary of the elements of that yaml file.
-
-    The file serves as a config for setting up DynamicDataIO objects. Requires a resources yaml file,
-    an ENVIRONMENT value {CLOUD or LOCAL} and a vars module.
-
-    Example:
-        input_sources_config = IOConfig(
-            "path_to/input.yaml",
-            env_identifier="CLOUD",
-            dynamic_vars=config_module
-        )
-    """
-
-    YAML_TAG = "tag:yaml.org,2002:str"
-    SafeDynamicResourceLoader.add_constructor(YAML_TAG, SafeDynamicResourceLoader.dyn_str_constructor)
-    SafeDynamicSchemaLoader.add_constructor(YAML_TAG, SafeDynamicSchemaLoader.dyn_value_constructor)
-
-    def __init__(self, path_to_source_yaml: str, env_identifier: str, dynamic_vars: ModuleType):
-        """Class constructor.
-
-        Args:
-            path_to_source_yaml: Absolute file path to yaml file containing source definitions
-            env_identifier: "LOCAL" or "CLOUD".
-            dynamic_vars: module containing values for dynamic values that the source yaml
-                may reference.
-        """
-        self.path_to_source_yaml = path_to_source_yaml
-        self.env_identifier = env_identifier
-        self.dynamic_vars = dynamic_vars
-        self.config = self._parse_sources_config()
-
-    def _parse_sources_config(self) -> Mapping:
-        """Parses the yaml input and return a dictionary.
-
-        Returns:
-            A dictionary with the list of all file paths pointing to various input sources as those
-            are defined in their respective data/*.yaml files.
-        """
-        with open(self.path_to_source_yaml, "r") as stream:  # pylint: disable=unspecified-encoding]
-            logger.debug(f"Parsing {self.path_to_source_yaml}...")
-            return yaml.load(stream, SafeDynamicResourceLoader.with_module(self.dynamic_vars))
-
-    @property
-    def sources(self) -> List[str]:
-        """Class property for easy access to a list of sources.
-
-        Returns:
-            All top level names of the available resources for the used resources yaml config.
-        """
-        return list(self.config.keys())
-
-    def get(self, source_key: str) -> Mapping:
-        """A getter.
-
-        Args:
-            source_key: The name of the resource for which we want to create a config.
-
-        Returns:
-            A dictionary with the necessary fields for loading the data from a source.
-
-        Example:
-
-            Given:
-
-                VOYAGE_DATA:
-                  LOCAL:
-                    type: "local"
-                    local:
-                      file_path: "[[ TEST_RESOURCES ]]/data/processed/voyage_data.parquet"
-                      file_type: "parquet"
-                  CLOUD:
-                    type: "kafka"
-                    KAFKA:
-                      KAFKA_SERVER: "[[ KAFKA_SERVER ]]"
-                      KAFKA_TOPIC: "[[ KAFKA_TOPIC ]]"
-
-            If you do:
-
-                input_sources_config = IOConfig(
-                    "path_to/input.yaml",
-                    env_identifier="CLOUD",
-                    dynamic_vars=globals
-                )
-                voyage_data_cloud_mapping = input_config.get(source_key="VOYAGE_DATA")
-
-            then `voyage_data_cloud_mapping` is:
-
-                "KAFKA": {
-                    "KAFKA_SERVER": "mock-kafka-server",
-                    "KAFKA_TOPIC": "mock-kafka-topic"
-                }
-        """
-        source_config = self.config[source_key][self.env_identifier]
-        if self.config[source_key].get("schema"):
-            schema_definition = self._get_schema_definition(source_key)
-            source_config["name"] = schema_definition["name"]
-            source_config["schema"] = self._get_schema(schema_definition)
-            source_config["validations"] = self._get_validations(schema_definition)
-            source_config["metrics"] = self._get_metrics(schema_definition)
-        return source_config
-
-    def _get_schema_definition(self, source_key: str) -> Mapping:
-        """Retrieves the schema definition from a resource definition.
-
-        Returns:
-            The schema definition provided for a resource definition.
-        """
-        schema_file_path = self.config[source_key].get("schema")["file_path"]
-        with open(schema_file_path, "r") as stream:  # pylint: disable=unspecified-encoding]
-            logger.debug(f"Parsing schema: {schema_file_path}...")
-            return yaml.load(stream, SafeDynamicSchemaLoader.with_module(self.dynamic_vars))
-
-    @staticmethod
-    def _get_schema(schema_definition: Mapping) -> Mapping:
-        """Retrieve the schema from a schema definition.
-
-        Args:
-            schema_definition:
-
-        Returns:
-            The column types in the schema definition.
-        """
-        _schema = {}
-        for column in schema_definition["columns"].keys():
-            _schema[column] = schema_definition["columns"][column]["type"]
-        return _schema
-
-    @staticmethod
-    def _get_validations(schema_definition: Mapping) -> Mapping:
-        """Returns all validations for each column in a schema definition.
-
-        Args:
-            schema_definition: A dictionary with all columns in a dataset characterised by validations and metrics
-
-        Returns:
-            The validations applied to each column in the schema definition.
-        """
-        _validations = {}
-        for column in schema_definition["columns"].keys():
-            _validations[column] = schema_definition["columns"][column]["validations"]
-        return _validations
-
-    @staticmethod
-    def _get_metrics(schema_definition):
-        """Returns all metrics for each column in a schema definition.
-
-        Args:
-            schema_definition: A dictionary with all columns in a dataset characterised by validations and metrics
-
-        Returns:
-            The metrics applied to each column in the schema definition.
-        """
-        _metrics = {}
-        for column in schema_definition["columns"].keys():
-            _metrics[column] = schema_definition["columns"][column]["metrics"]
-        return _metrics
-
-
-
-
-
-
-
-
-
-

Classes

-
-
-class IOConfig -(path_to_source_yaml: str, env_identifier: str, dynamic_vars: module) -
-
-

Generates an object that returns a sub-dictionary of the elements of that yaml file.

-

The file serves as a config for setting up DynamicDataIO objects. Requires a resources yaml file, -an ENVIRONMENT value {CLOUD or LOCAL} and a vars module.

-

Example

-

input_sources_config = IOConfig( -"path_to/input.yaml", -env_identifier="CLOUD", -dynamic_vars=config_module -)

-

Class constructor.

-

Args

-
-
path_to_source_yaml
-
Absolute file path to yaml file containing source definitions
-
env_identifier
-
"LOCAL" or "CLOUD".
-
dynamic_vars
-
module containing values for dynamic values that the source yaml -may reference.
-
-
- -Expand source code - -
class IOConfig:
-    """Generates an object that returns a sub-dictionary of the elements of that yaml file.
-
-    The file serves as a config for setting up DynamicDataIO objects. Requires a resources yaml file,
-    an ENVIRONMENT value {CLOUD or LOCAL} and a vars module.
-
-    Example:
-        input_sources_config = IOConfig(
-            "path_to/input.yaml",
-            env_identifier="CLOUD",
-            dynamic_vars=config_module
-        )
-    """
-
-    YAML_TAG = "tag:yaml.org,2002:str"
-    SafeDynamicResourceLoader.add_constructor(YAML_TAG, SafeDynamicResourceLoader.dyn_str_constructor)
-    SafeDynamicSchemaLoader.add_constructor(YAML_TAG, SafeDynamicSchemaLoader.dyn_value_constructor)
-
-    def __init__(self, path_to_source_yaml: str, env_identifier: str, dynamic_vars: ModuleType):
-        """Class constructor.
-
-        Args:
-            path_to_source_yaml: Absolute file path to yaml file containing source definitions
-            env_identifier: "LOCAL" or "CLOUD".
-            dynamic_vars: module containing values for dynamic values that the source yaml
-                may reference.
-        """
-        self.path_to_source_yaml = path_to_source_yaml
-        self.env_identifier = env_identifier
-        self.dynamic_vars = dynamic_vars
-        self.config = self._parse_sources_config()
-
-    def _parse_sources_config(self) -> Mapping:
-        """Parses the yaml input and return a dictionary.
-
-        Returns:
-            A dictionary with the list of all file paths pointing to various input sources as those
-            are defined in their respective data/*.yaml files.
-        """
-        with open(self.path_to_source_yaml, "r") as stream:  # pylint: disable=unspecified-encoding]
-            logger.debug(f"Parsing {self.path_to_source_yaml}...")
-            return yaml.load(stream, SafeDynamicResourceLoader.with_module(self.dynamic_vars))
-
-    @property
-    def sources(self) -> List[str]:
-        """Class property for easy access to a list of sources.
-
-        Returns:
-            All top level names of the available resources for the used resources yaml config.
-        """
-        return list(self.config.keys())
-
-    def get(self, source_key: str) -> Mapping:
-        """A getter.
-
-        Args:
-            source_key: The name of the resource for which we want to create a config.
-
-        Returns:
-            A dictionary with the necessary fields for loading the data from a source.
-
-        Example:
-
-            Given:
-
-                VOYAGE_DATA:
-                  LOCAL:
-                    type: "local"
-                    local:
-                      file_path: "[[ TEST_RESOURCES ]]/data/processed/voyage_data.parquet"
-                      file_type: "parquet"
-                  CLOUD:
-                    type: "kafka"
-                    KAFKA:
-                      KAFKA_SERVER: "[[ KAFKA_SERVER ]]"
-                      KAFKA_TOPIC: "[[ KAFKA_TOPIC ]]"
-
-            If you do:
-
-                input_sources_config = IOConfig(
-                    "path_to/input.yaml",
-                    env_identifier="CLOUD",
-                    dynamic_vars=globals
-                )
-                voyage_data_cloud_mapping = input_config.get(source_key="VOYAGE_DATA")
-
-            then `voyage_data_cloud_mapping` is:
-
-                "KAFKA": {
-                    "KAFKA_SERVER": "mock-kafka-server",
-                    "KAFKA_TOPIC": "mock-kafka-topic"
-                }
-        """
-        source_config = self.config[source_key][self.env_identifier]
-        if self.config[source_key].get("schema"):
-            schema_definition = self._get_schema_definition(source_key)
-            source_config["name"] = schema_definition["name"]
-            source_config["schema"] = self._get_schema(schema_definition)
-            source_config["validations"] = self._get_validations(schema_definition)
-            source_config["metrics"] = self._get_metrics(schema_definition)
-        return source_config
-
-    def _get_schema_definition(self, source_key: str) -> Mapping:
-        """Retrieves the schema definition from a resource definition.
-
-        Returns:
-            The schema definition provided for a resource definition.
-        """
-        schema_file_path = self.config[source_key].get("schema")["file_path"]
-        with open(schema_file_path, "r") as stream:  # pylint: disable=unspecified-encoding]
-            logger.debug(f"Parsing schema: {schema_file_path}...")
-            return yaml.load(stream, SafeDynamicSchemaLoader.with_module(self.dynamic_vars))
-
-    @staticmethod
-    def _get_schema(schema_definition: Mapping) -> Mapping:
-        """Retrieve the schema from a schema definition.
-
-        Args:
-            schema_definition:
-
-        Returns:
-            The column types in the schema definition.
-        """
-        _schema = {}
-        for column in schema_definition["columns"].keys():
-            _schema[column] = schema_definition["columns"][column]["type"]
-        return _schema
-
-    @staticmethod
-    def _get_validations(schema_definition: Mapping) -> Mapping:
-        """Returns all validations for each column in a schema definition.
-
-        Args:
-            schema_definition: A dictionary with all columns in a dataset characterised by validations and metrics
-
-        Returns:
-            The validations applied to each column in the schema definition.
-        """
-        _validations = {}
-        for column in schema_definition["columns"].keys():
-            _validations[column] = schema_definition["columns"][column]["validations"]
-        return _validations
-
-    @staticmethod
-    def _get_metrics(schema_definition):
-        """Returns all metrics for each column in a schema definition.
-
-        Args:
-            schema_definition: A dictionary with all columns in a dataset characterised by validations and metrics
-
-        Returns:
-            The metrics applied to each column in the schema definition.
-        """
-        _metrics = {}
-        for column in schema_definition["columns"].keys():
-            _metrics[column] = schema_definition["columns"][column]["metrics"]
-        return _metrics
-
-

Class variables

-
-
var YAML_TAG
-
-
-
-
-

Instance variables

-
-
var sources : List[str]
-
-

Class property for easy access to a list of sources.

-

Returns

-

All top level names of the available resources for the used resources yaml config.

-
- -Expand source code - -
@property
-def sources(self) -> List[str]:
-    """Class property for easy access to a list of sources.
-
-    Returns:
-        All top level names of the available resources for the used resources yaml config.
-    """
-    return list(self.config.keys())
-
-
-
-

Methods

-
-
-def get(self, source_key: str) ‑> Mapping[~KT, +VT_co] -
-
-

A getter.

-

Args

-
-
source_key
-
The name of the resource for which we want to create a config.
-
-

Returns

-

A dictionary with the necessary fields for loading the data from a source.

-

Example

-

Given:

-
VOYAGE_DATA:
-  LOCAL:
-    type: "local"
-    local:
-      file_path: "[[ TEST_RESOURCES ]]/data/processed/voyage_data.parquet"
-      file_type: "parquet"
-  CLOUD:
-    type: "kafka"
-    KAFKA:
-      KAFKA_SERVER: "[[ KAFKA_SERVER ]]"
-      KAFKA_TOPIC: "[[ KAFKA_TOPIC ]]"
-
-

If you do:

-
input_sources_config = IOConfig(
-    "path_to/input.yaml",
-    env_identifier="CLOUD",
-    dynamic_vars=globals
-)
-voyage_data_cloud_mapping = input_config.get(source_key="VOYAGE_DATA")
-
-

then voyage_data_cloud_mapping is:

-
"KAFKA": {
-    "KAFKA_SERVER": "mock-kafka-server",
-    "KAFKA_TOPIC": "mock-kafka-topic"
-}
-
-
- -Expand source code - -
def get(self, source_key: str) -> Mapping:
-    """A getter.
-
-    Args:
-        source_key: The name of the resource for which we want to create a config.
-
-    Returns:
-        A dictionary with the necessary fields for loading the data from a source.
-
-    Example:
-
-        Given:
-
-            VOYAGE_DATA:
-              LOCAL:
-                type: "local"
-                local:
-                  file_path: "[[ TEST_RESOURCES ]]/data/processed/voyage_data.parquet"
-                  file_type: "parquet"
-              CLOUD:
-                type: "kafka"
-                KAFKA:
-                  KAFKA_SERVER: "[[ KAFKA_SERVER ]]"
-                  KAFKA_TOPIC: "[[ KAFKA_TOPIC ]]"
-
-        If you do:
-
-            input_sources_config = IOConfig(
-                "path_to/input.yaml",
-                env_identifier="CLOUD",
-                dynamic_vars=globals
-            )
-            voyage_data_cloud_mapping = input_config.get(source_key="VOYAGE_DATA")
-
-        then `voyage_data_cloud_mapping` is:
-
-            "KAFKA": {
-                "KAFKA_SERVER": "mock-kafka-server",
-                "KAFKA_TOPIC": "mock-kafka-topic"
-            }
-    """
-    source_config = self.config[source_key][self.env_identifier]
-    if self.config[source_key].get("schema"):
-        schema_definition = self._get_schema_definition(source_key)
-        source_config["name"] = schema_definition["name"]
-        source_config["schema"] = self._get_schema(schema_definition)
-        source_config["validations"] = self._get_validations(schema_definition)
-        source_config["metrics"] = self._get_metrics(schema_definition)
-    return source_config
-
-
-
-
-
-class SafeDynamicResourceLoader -(stream) -
-
-

Implements a dynamic yaml loader that parses yaml files and replaces strings that map to [[ DYNAMIC_VAR ]].

-

Dynamic variables defined in a provided module object.

-

Initialize the scanner.

-
- -Expand source code - -
class SafeDynamicResourceLoader(yaml.SafeLoader):
-    """Implements a dynamic yaml loader that parses yaml files and replaces strings that map to [[ DYNAMIC_VAR ]].
-
-    Dynamic variables defined in a provided module object.
-    """
-
-    module = None
-    dynamic_data_matcher = re.compile(r"(.*)(\[\[\s*(\S+)\s*]])(.*)")
-
-    @classmethod
-    def with_module(cls, module: ModuleType):
-        """Creates a dynamic subclass of SafeDynamicLoader with the `data_module` attribute set to `module`.
-
-        Args:
-            module: A global vars module with all the dynamic values defined in it.
-
-        Returns:
-            type
-        """
-        return type(f"{cls.__name__}_{module.__name__}", (cls,), {"module": module})
-
-    def dyn_str_constructor(self, node: yaml.nodes.ScalarNode) -> str:
-        """Responsible for the switching of one or more "[[ DYNAMIC_VAR ]]" strings with the respective attributes value in a given module.
-
-        Args:
-            node: Parsed item whose dynamic values that map to the "[[ DYNAMIC_VAR ]]" convention
-                are replaced with the respective attributes in te provided module.
-
-        Returns:
-            Constructed `str` or numerical.
-        """
-        value = node.value
-
-        while result := self.dynamic_data_matcher.match(value):
-            ref = result.group(3)
-            replacement = getattr(self.module, ref)
-
-            value = self.dynamic_data_matcher.sub(f"\\g<1>{replacement}\\g<4>", value)
-
-        return value
-
-

Ancestors

-
    -
  • yaml.loader.SafeLoader
  • -
  • yaml.reader.Reader
  • -
  • yaml.scanner.Scanner
  • -
  • yaml.parser.Parser
  • -
  • yaml.composer.Composer
  • -
  • yaml.constructor.SafeConstructor
  • -
  • yaml.constructor.BaseConstructor
  • -
  • yaml.resolver.Resolver
  • -
  • yaml.resolver.BaseResolver
  • -
-

Class variables

-
-
var dynamic_data_matcher
-
-
-
-
var module
-
-
-
-
var yaml_constructors
-
-
-
-
-

Static methods

-
-
-def with_module(module: module) -
-
-

Creates a dynamic subclass of SafeDynamicLoader with the data_module attribute set to module.

-

Args

-
-
module
-
A global vars module with all the dynamic values defined in it.
-
-

Returns

-

type

-
- -Expand source code - -
@classmethod
-def with_module(cls, module: ModuleType):
-    """Creates a dynamic subclass of SafeDynamicLoader with the `data_module` attribute set to `module`.
-
-    Args:
-        module: A global vars module with all the dynamic values defined in it.
-
-    Returns:
-        type
-    """
-    return type(f"{cls.__name__}_{module.__name__}", (cls,), {"module": module})
-
-
-
-

Methods

-
-
-def dyn_str_constructor(self, node: yaml.nodes.ScalarNode) ‑> str -
-
-

Responsible for the switching of one or more "[[ DYNAMIC_VAR ]]" strings with the respective attributes value in a given module.

-

Args

-
-
node
-
Parsed item whose dynamic values that map to the "[[ DYNAMIC_VAR ]]" convention -are replaced with the respective attributes in te provided module.
-
-

Returns

-

Constructed str or numerical.

-
- -Expand source code - -
def dyn_str_constructor(self, node: yaml.nodes.ScalarNode) -> str:
-    """Responsible for the switching of one or more "[[ DYNAMIC_VAR ]]" strings with the respective attributes value in a given module.
-
-    Args:
-        node: Parsed item whose dynamic values that map to the "[[ DYNAMIC_VAR ]]" convention
-            are replaced with the respective attributes in te provided module.
-
-    Returns:
-        Constructed `str` or numerical.
-    """
-    value = node.value
-
-    while result := self.dynamic_data_matcher.match(value):
-        ref = result.group(3)
-        replacement = getattr(self.module, ref)
-
-        value = self.dynamic_data_matcher.sub(f"\\g<1>{replacement}\\g<4>", value)
-
-    return value
-
-
-
-
-
-class SafeDynamicSchemaLoader -(stream) -
-
-

Implements a dynamic yaml loader that parses yaml files and replaces strings that map to [[ DYNAMIC_VAR ]].

-

Dynamic variables defined in a provided module object.

-

Initialize the scanner.

-
- -Expand source code - -
class SafeDynamicSchemaLoader(yaml.SafeLoader):
-    """Implements a dynamic yaml loader that parses yaml files and replaces strings that map to [[ DYNAMIC_VAR ]].
-
-    Dynamic variables defined in a provided module object.
-    """
-
-    module = None
-    dynamic_data_matcher = re.compile(r"(.*)(\[\[\s*(\S+)\s*]])(.*)")
-
-    @classmethod
-    def with_module(cls, module: ModuleType):
-        """Creates a dynamic subclass of SafeDynamicLoader with the `data_module` attribute set to `module`.
-
-        Args:
-            module: A global vars module with all the dynamic values defined in it.
-
-        Returns:
-            type
-        """
-        return type(f"{cls.__name__}_{module.__name__}", (cls,), {"module": module})
-
-    def dyn_value_constructor(self, node: yaml.nodes.ScalarNode) -> Any:
-        """Responsible for the switching of one or more "[[ DYNAMIC_VAR ]]" strings with the respective attributes value in a given module.
-
-        Args:
-            node: Parsed item whose dynamic values that map to the "[[ DYNAMIC_VAR ]]" convention
-                are replaced with the respective attributes in te provided module.
-
-        Returns:
-            Constructed `str` or numerical.
-        """
-        value = node.value
-
-        while result := self.dynamic_data_matcher.match(value):
-            ref = result.group(3)
-            replacement = getattr(self.module, ref)
-
-            value = self.dynamic_data_matcher.sub(f"\\g<1>{replacement}\\g<4>", value)
-
-            try:
-                value = float(value)
-                return value
-            except ValueError:
-                pass
-
-        return value
-
-

Ancestors

-
    -
  • yaml.loader.SafeLoader
  • -
  • yaml.reader.Reader
  • -
  • yaml.scanner.Scanner
  • -
  • yaml.parser.Parser
  • -
  • yaml.composer.Composer
  • -
  • yaml.constructor.SafeConstructor
  • -
  • yaml.constructor.BaseConstructor
  • -
  • yaml.resolver.Resolver
  • -
  • yaml.resolver.BaseResolver
  • -
-

Class variables

-
-
var dynamic_data_matcher
-
-
-
-
var module
-
-
-
-
var yaml_constructors
-
-
-
-
-

Static methods

-
-
-def with_module(module: module) -
-
-

Creates a dynamic subclass of SafeDynamicLoader with the data_module attribute set to module.

-

Args

-
-
module
-
A global vars module with all the dynamic values defined in it.
-
-

Returns

-

type

-
- -Expand source code - -
@classmethod
-def with_module(cls, module: ModuleType):
-    """Creates a dynamic subclass of SafeDynamicLoader with the `data_module` attribute set to `module`.
-
-    Args:
-        module: A global vars module with all the dynamic values defined in it.
-
-    Returns:
-        type
-    """
-    return type(f"{cls.__name__}_{module.__name__}", (cls,), {"module": module})
-
-
-
-

Methods

-
-
-def dyn_value_constructor(self, node: yaml.nodes.ScalarNode) ‑> Any -
-
-

Responsible for the switching of one or more "[[ DYNAMIC_VAR ]]" strings with the respective attributes value in a given module.

-

Args

-
-
node
-
Parsed item whose dynamic values that map to the "[[ DYNAMIC_VAR ]]" convention -are replaced with the respective attributes in te provided module.
-
-

Returns

-

Constructed str or numerical.

-
- -Expand source code - -
def dyn_value_constructor(self, node: yaml.nodes.ScalarNode) -> Any:
-    """Responsible for the switching of one or more "[[ DYNAMIC_VAR ]]" strings with the respective attributes value in a given module.
-
-    Args:
-        node: Parsed item whose dynamic values that map to the "[[ DYNAMIC_VAR ]]" convention
-            are replaced with the respective attributes in te provided module.
-
-    Returns:
-        Constructed `str` or numerical.
-    """
-    value = node.value
-
-    while result := self.dynamic_data_matcher.match(value):
-        ref = result.group(3)
-        replacement = getattr(self.module, ref)
-
-        value = self.dynamic_data_matcher.sub(f"\\g<1>{replacement}\\g<4>", value)
-
-        try:
-            value = float(value)
-            return value
-        except ValueError:
-            pass
-
-    return value
-
-
-
-
-
-
-
- -
- - - \ No newline at end of file diff --git a/docs/core.html b/docs/core.html deleted file mode 100644 index 2de5292..0000000 --- a/docs/core.html +++ /dev/null @@ -1,855 +0,0 @@ - - - - - - -dynamicio.core API documentation - - - - - - - - - - - -
-
-
-

Module dynamicio.core

-
-
-

Implements the DynamicDataIO class which provides functionality for data: loading; sinking, and; schema validation.

-
- -Expand source code - -
"""Implements the DynamicDataIO class which provides functionality for data: loading; sinking, and; schema validation."""
-# pylint: disable=no-member
-__all__ = ["DynamicDataIO", "SCHEMA_FROM_FILE"]
-
-import asyncio
-import inspect
-import re
-from concurrent.futures import ThreadPoolExecutor
-from typing import Any, Mapping, MutableMapping, Optional
-
-import pandas as pd  # type: ignore
-from magic_logger import logger
-
-from dynamicio import validations
-from dynamicio.errors import CASTING_WARNING_MSG, ColumnsDataTypeError, MissingSchemaDefinition, NOTICE_MSG, SchemaNotFoundError, SchemaValidationError
-from dynamicio.metrics import get_metric
-
-SCHEMA_FROM_FILE = {"schema": object()}
-
-pool = ThreadPoolExecutor()
-
-
-class DynamicDataIO:
-    """Given a `src.utils.dynamicio.config.IOConfig` object, it generates an object with access to a series of methods for cloud I/O operations and data validations.
-
-    Example:
-       >>> input_sources_config = IOConfig(
-       >>>     "path_to/input.yaml",
-       >>>     os.getenv("ENVIRONMENT",default="LOCAL")
-       >>> )
-       >>>
-       >>> class IO(WithS3File, WithLocal, DynamicDataIO):
-       >>>     schema = S
-       >>>
-       >>> my_dataset_local_mapping = input_config.get(source_key="MY_DATASET")
-       >>> my_dataset_io = IO(my_dataset_local_mapping)
-       >>> my_dataset_df = my_dataset_io.read()
-    """
-
-    schema: Mapping
-
-    def __init__(
-        self,
-        source_config: Mapping,
-        apply_schema_validations: bool = False,
-        log_schema_metrics: bool = False,
-        show_casting_warnings: bool = False,
-        **options: MutableMapping[str, Any],
-    ):
-        """Class constructor.
-
-        Args:
-            source_config: Configuration to use when reading/writing data from/to a source
-            apply_schema_validations: Applies schema validations on either read() or write()
-            log_schema_metrics: Logs schema metrics on either read() or write()
-            show_casting_warnings: Logs casting warnings on either read() or write() if set to True
-            options: Any additional kwargs that may be used throughout the lifecycle of the object
-        """
-        if type(self) is DynamicDataIO:  # pylint: disable=unidiomatic-typecheck
-            raise TypeError("Abstract class DynamicDataIO cannot be used to instantiate an object...")
-
-        self.sources_config = source_config
-        self.name = self._transform_class_name_to_dataset_name(self.__class__.__name__)
-        self.apply_schema_validations = apply_schema_validations
-        self.log_schema_metrics = log_schema_metrics
-        self.show_casting_warnings = show_casting_warnings
-        self.options = self._get_options(options, source_config.get("options"))
-        source_name = self.sources_config.get("type")
-        if self.schema is SCHEMA_FROM_FILE:
-            try:
-                self.schema = self.sources_config["schema"]
-                self.name = self.sources_config["name"].upper()
-                self.schema_validations = self.sources_config["validations"]
-                self.schema_metrics = self.sources_config["metrics"]
-            except KeyError as _error:
-                raise SchemaNotFoundError() from _error
-
-        assert hasattr(self, f"_read_from_{source_name}") or hasattr(
-            self, f"_write_to_{source_name}"
-        ), f"No method '_read_from_{source_name}' or '_write_to_{source_name}'. Have you registered a mixin for {source_name}?"
-
-    def __init_subclass__(cls):
-        """Ensure that all subclasses have a `schema` attribute and a `validate` method.
-
-        Raises:
-            AssertionError: If either of the attributes is not implemented
-        """
-        if not inspect.getmodule(cls).__name__.startswith("dynamicio"):
-            assert "schema" in cls.__dict__
-
-            if cls.schema is None or (cls.schema is not SCHEMA_FROM_FILE and len(cls.schema) == 0):
-                raise ValueError(f"schema for class {cls} cannot be None or empty...")
-
-    async def async_read(self):
-        """Allows the use of asyncio to concurrently read files in memory.
-
-        Returns:
-            A pandas dataframe or an iterable.
-        """
-        loop = asyncio.get_running_loop()
-        return await loop.run_in_executor(pool, self.read)
-
-    def read(self) -> pd.DataFrame:
-        """Reads data source and returns a schema validated dataframe (by means of _apply_schema).
-
-        Returns:
-            A pandas dataframe or an iterable.
-        """
-        source_name = self.sources_config.get("type")
-        df = getattr(self, f"_read_from_{source_name}")()
-
-        df = self._apply_schema(df)
-        if self.apply_schema_validations:
-            self.validate_from_schema(df)
-        if self.log_schema_metrics:
-            self.log_metrics_from_schema(df)
-
-        return df
-
-    async def async_write(self, df: pd.DataFrame):
-        """Allows the use of asyncio to concurrently write files out.
-
-        Args:
-            df: The data to be written
-        """
-        loop = asyncio.get_running_loop()
-        return await loop.run_in_executor(pool, self.write, df)
-
-    def write(self, df: pd.DataFrame):
-        """Sink data to a given source based on the sources_config.
-
-        Args:
-            df: The data to be written
-        """
-        source_name = self.sources_config.get("type")
-        if set(df.columns) != set(self.schema.keys()):  # pylint: disable=E1101
-            columns = [column for column in df.columns.to_list() if column in self.schema.keys()]
-            df = df[columns]
-
-        if self.apply_schema_validations:
-            self.validate_from_schema(df)
-        if self.log_schema_metrics:
-            self.log_metrics_from_schema(df)
-
-        getattr(self, f"_write_to_{source_name}")(self._apply_schema(df))
-
-    def validate_from_schema(self, df: pd.DataFrame) -> "DynamicDataIO":
-        """Validates a dataframe based on the validations present in its schema definition.
-
-        All validations are checked and if any of them fails, a `SchemaValidationError` is raised.
-
-        Args:
-            df:
-
-        Returns:
-             self (to allow for method chaining).
-
-        Raises:
-            SchemaValidationError: if any of the validations failed. The `message` attribute of
-                the exception object is a `List[str]`, where each element is the name of a
-                validation that failed.
-        """
-        if not hasattr(self, "schema_validations"):
-            raise MissingSchemaDefinition(self.__class__)
-
-        failed_validations = {}
-        for column in self.schema_validations.keys():
-            for validation in self.schema_validations[column].keys():
-                if self.schema_validations[column][validation]["apply"] is True:
-                    validation_result = getattr(validations, validation)(self.name, df, column, **self.schema_validations[column][validation]["options"])
-                    if not validation_result.valid:
-                        failed_validations[validation] = validation_result.message
-
-        if len(failed_validations) > 0:
-            raise SchemaValidationError(failed_validations)
-
-        return self
-
-    def log_metrics_from_schema(self, df: pd.DataFrame) -> "DynamicDataIO":
-        """Calculates and logs metrics based on the metrics present in its schema definition.
-
-        Args:
-            df: A dataframe for which metrics are generated and logged
-
-        Returns:
-             self (to allow for method chaining).
-        """
-        if not hasattr(self, "schema_metrics"):
-            raise MissingSchemaDefinition(self.__class__)
-
-        for column in self.schema_metrics.keys():
-            for metric in self.schema_metrics[column]:
-                get_metric(metric)(self.name, df, column)()  # type: ignore
-
-        return self
-
-    def _apply_schema(self, df: pd.DataFrame) -> pd.DataFrame:
-        """Called by the `self.read()` and the `self._write_to_local()` methods.
-
-        Contrasts a dataframe's read from a given source against the class's schema dictionary,
-        checking that columns are the same (by means of _has_columns and _has_valid_dtypes). Then,
-        check if the columns are fine, it further validates if the types of columns conform to the
-        expected schema. Finally, if schema types are different, then it attempts to apply schema;
-        if possible then the schema validation is successful.
-
-        Args:
-            df: A pandas dataframe.
-
-        Returns:
-            A schema validated dataframe.
-        """
-        if not self._has_valid_dtypes(df):
-            raise ColumnsDataTypeError()
-        return df
-
-    @staticmethod
-    def _transform_class_name_to_dataset_name(string_to_transform: str) -> str:
-        """Called by the init function to fetch dataset names from class name.
-
-        Used to create dataset name from class name, turns camel case into upper snake case.
-        For example: 'ThisNameABC' -> 'THIS_NAME_ABC'.
-        """
-        words = re.findall(r"\d[A-Z]+|[A-Z]?[a-z\d]+|[A-Z]{2,}(?=[A-Z][a-z]|\d|\W|$)|\d+|[A-Z]{2,}|[A-Z]", string_to_transform)
-        return "_".join(map(str.lower, words)).upper()
-
-    def _has_valid_dtypes(self, df: pd.DataFrame) -> bool:
-        """Checks if `df` has the expected dtypes defined in `schema`.
-
-        Schema is a dictionary object where keys are column names and values are dtypes in string format as returned by e.g.
-        `df[column].dtype.name`.
-
-        This function issues `error` level logs describing the first column that caused the check to fail.
-
-        It is assumed that `df` only has the columns defined in `schema`.
-
-        Args:
-            df:
-
-        Returns:
-            bool - `True` if `df` has the given dtypes, `False` otherwise
-        """
-        dtypes = df.dtypes
-
-        for column_name, expected_dtype in self.schema.items():
-            found_dtype = dtypes[column_name].name
-            if found_dtype != expected_dtype:
-                if self.show_casting_warnings:
-                    logger.info(f"Expected: '{expected_dtype}' dtype for {self.name}['{column_name}]', found '{found_dtype}'")
-                try:
-                    if len(set(type(v) for v in df[column_name].values)) > 1:  # pylint: disable=consider-using-set-comprehension
-                        logger.warning(CASTING_WARNING_MSG.format(column_name, expected_dtype, found_dtype))  # pylint: disable=logging-format-interpolation
-                        logger.info(NOTICE_MSG.format(column_name))  # pylint: disable=logging-format-interpolation
-                    df[column_name] = df[column_name].astype(self.schema[column_name])
-                except (ValueError, TypeError):
-                    logger.error(f"ValueError: Tried casting column {self.name}['{column_name}]' to '{expected_dtype}' " f"from '{found_dtype}', but failed")
-                    return False
-        return True
-
-    @staticmethod
-    def _get_options(options_from_code: MutableMapping[str, Any], options_from_resource_definition: Optional[Mapping[str, Any]]) -> MutableMapping[str, Any]:
-        """Retrieves options either from code or from a resource-definition.
-
-        Options are merged if they are provided by both sources, while in the case of conflicts, the options from the code
-        take precedence.
-
-        Args:
-            options_from_code (Optional[Mapping])
-            options_from_resource_definition (Optional[Mapping])
-
-        Returns:
-            [Optional[Mapping]]: options that are going to be used
-        """
-        if options_from_resource_definition:
-            return {**options_from_resource_definition, **options_from_code}
-        return options_from_code
-
-
-
-
-
-
-
-
-
-

Classes

-
-
-class DynamicDataIO -(source_config: Mapping[~KT, +VT_co], apply_schema_validations: bool = False, log_schema_metrics: bool = False, show_casting_warnings: bool = False, **options: MutableMapping[str, Any]) -
-
-

Given a src.utils.dynamicio.config.IOConfig object, it generates an object with access to a series of methods for cloud I/O operations and data validations.

-

Example

-
>>> input_sources_config = IOConfig(
->>>     "path_to/input.yaml",
->>>     os.getenv("ENVIRONMENT",default="LOCAL")
->>> )
->>>
->>> class IO(WithS3File, WithLocal, DynamicDataIO):
->>>     schema = S
->>>
->>> my_dataset_local_mapping = input_config.get(source_key="MY_DATASET")
->>> my_dataset_io = IO(my_dataset_local_mapping)
->>> my_dataset_df = my_dataset_io.read()
-
-

Class constructor.

-

Args

-
-
source_config
-
Configuration to use when reading/writing data from/to a source
-
apply_schema_validations
-
Applies schema validations on either read() or write()
-
log_schema_metrics
-
Logs schema metrics on either read() or write()
-
show_casting_warnings
-
Logs casting warnings on either read() or write() if set to True
-
options
-
Any additional kwargs that may be used throughout the lifecycle of the object
-
-
- -Expand source code - -
class DynamicDataIO:
-    """Given a `src.utils.dynamicio.config.IOConfig` object, it generates an object with access to a series of methods for cloud I/O operations and data validations.
-
-    Example:
-       >>> input_sources_config = IOConfig(
-       >>>     "path_to/input.yaml",
-       >>>     os.getenv("ENVIRONMENT",default="LOCAL")
-       >>> )
-       >>>
-       >>> class IO(WithS3File, WithLocal, DynamicDataIO):
-       >>>     schema = S
-       >>>
-       >>> my_dataset_local_mapping = input_config.get(source_key="MY_DATASET")
-       >>> my_dataset_io = IO(my_dataset_local_mapping)
-       >>> my_dataset_df = my_dataset_io.read()
-    """
-
-    schema: Mapping
-
-    def __init__(
-        self,
-        source_config: Mapping,
-        apply_schema_validations: bool = False,
-        log_schema_metrics: bool = False,
-        show_casting_warnings: bool = False,
-        **options: MutableMapping[str, Any],
-    ):
-        """Class constructor.
-
-        Args:
-            source_config: Configuration to use when reading/writing data from/to a source
-            apply_schema_validations: Applies schema validations on either read() or write()
-            log_schema_metrics: Logs schema metrics on either read() or write()
-            show_casting_warnings: Logs casting warnings on either read() or write() if set to True
-            options: Any additional kwargs that may be used throughout the lifecycle of the object
-        """
-        if type(self) is DynamicDataIO:  # pylint: disable=unidiomatic-typecheck
-            raise TypeError("Abstract class DynamicDataIO cannot be used to instantiate an object...")
-
-        self.sources_config = source_config
-        self.name = self._transform_class_name_to_dataset_name(self.__class__.__name__)
-        self.apply_schema_validations = apply_schema_validations
-        self.log_schema_metrics = log_schema_metrics
-        self.show_casting_warnings = show_casting_warnings
-        self.options = self._get_options(options, source_config.get("options"))
-        source_name = self.sources_config.get("type")
-        if self.schema is SCHEMA_FROM_FILE:
-            try:
-                self.schema = self.sources_config["schema"]
-                self.name = self.sources_config["name"].upper()
-                self.schema_validations = self.sources_config["validations"]
-                self.schema_metrics = self.sources_config["metrics"]
-            except KeyError as _error:
-                raise SchemaNotFoundError() from _error
-
-        assert hasattr(self, f"_read_from_{source_name}") or hasattr(
-            self, f"_write_to_{source_name}"
-        ), f"No method '_read_from_{source_name}' or '_write_to_{source_name}'. Have you registered a mixin for {source_name}?"
-
-    def __init_subclass__(cls):
-        """Ensure that all subclasses have a `schema` attribute and a `validate` method.
-
-        Raises:
-            AssertionError: If either of the attributes is not implemented
-        """
-        if not inspect.getmodule(cls).__name__.startswith("dynamicio"):
-            assert "schema" in cls.__dict__
-
-            if cls.schema is None or (cls.schema is not SCHEMA_FROM_FILE and len(cls.schema) == 0):
-                raise ValueError(f"schema for class {cls} cannot be None or empty...")
-
-    async def async_read(self):
-        """Allows the use of asyncio to concurrently read files in memory.
-
-        Returns:
-            A pandas dataframe or an iterable.
-        """
-        loop = asyncio.get_running_loop()
-        return await loop.run_in_executor(pool, self.read)
-
-    def read(self) -> pd.DataFrame:
-        """Reads data source and returns a schema validated dataframe (by means of _apply_schema).
-
-        Returns:
-            A pandas dataframe or an iterable.
-        """
-        source_name = self.sources_config.get("type")
-        df = getattr(self, f"_read_from_{source_name}")()
-
-        df = self._apply_schema(df)
-        if self.apply_schema_validations:
-            self.validate_from_schema(df)
-        if self.log_schema_metrics:
-            self.log_metrics_from_schema(df)
-
-        return df
-
-    async def async_write(self, df: pd.DataFrame):
-        """Allows the use of asyncio to concurrently write files out.
-
-        Args:
-            df: The data to be written
-        """
-        loop = asyncio.get_running_loop()
-        return await loop.run_in_executor(pool, self.write, df)
-
-    def write(self, df: pd.DataFrame):
-        """Sink data to a given source based on the sources_config.
-
-        Args:
-            df: The data to be written
-        """
-        source_name = self.sources_config.get("type")
-        if set(df.columns) != set(self.schema.keys()):  # pylint: disable=E1101
-            columns = [column for column in df.columns.to_list() if column in self.schema.keys()]
-            df = df[columns]
-
-        if self.apply_schema_validations:
-            self.validate_from_schema(df)
-        if self.log_schema_metrics:
-            self.log_metrics_from_schema(df)
-
-        getattr(self, f"_write_to_{source_name}")(self._apply_schema(df))
-
-    def validate_from_schema(self, df: pd.DataFrame) -> "DynamicDataIO":
-        """Validates a dataframe based on the validations present in its schema definition.
-
-        All validations are checked and if any of them fails, a `SchemaValidationError` is raised.
-
-        Args:
-            df:
-
-        Returns:
-             self (to allow for method chaining).
-
-        Raises:
-            SchemaValidationError: if any of the validations failed. The `message` attribute of
-                the exception object is a `List[str]`, where each element is the name of a
-                validation that failed.
-        """
-        if not hasattr(self, "schema_validations"):
-            raise MissingSchemaDefinition(self.__class__)
-
-        failed_validations = {}
-        for column in self.schema_validations.keys():
-            for validation in self.schema_validations[column].keys():
-                if self.schema_validations[column][validation]["apply"] is True:
-                    validation_result = getattr(validations, validation)(self.name, df, column, **self.schema_validations[column][validation]["options"])
-                    if not validation_result.valid:
-                        failed_validations[validation] = validation_result.message
-
-        if len(failed_validations) > 0:
-            raise SchemaValidationError(failed_validations)
-
-        return self
-
-    def log_metrics_from_schema(self, df: pd.DataFrame) -> "DynamicDataIO":
-        """Calculates and logs metrics based on the metrics present in its schema definition.
-
-        Args:
-            df: A dataframe for which metrics are generated and logged
-
-        Returns:
-             self (to allow for method chaining).
-        """
-        if not hasattr(self, "schema_metrics"):
-            raise MissingSchemaDefinition(self.__class__)
-
-        for column in self.schema_metrics.keys():
-            for metric in self.schema_metrics[column]:
-                get_metric(metric)(self.name, df, column)()  # type: ignore
-
-        return self
-
-    def _apply_schema(self, df: pd.DataFrame) -> pd.DataFrame:
-        """Called by the `self.read()` and the `self._write_to_local()` methods.
-
-        Contrasts a dataframe's read from a given source against the class's schema dictionary,
-        checking that columns are the same (by means of _has_columns and _has_valid_dtypes). Then,
-        check if the columns are fine, it further validates if the types of columns conform to the
-        expected schema. Finally, if schema types are different, then it attempts to apply schema;
-        if possible then the schema validation is successful.
-
-        Args:
-            df: A pandas dataframe.
-
-        Returns:
-            A schema validated dataframe.
-        """
-        if not self._has_valid_dtypes(df):
-            raise ColumnsDataTypeError()
-        return df
-
-    @staticmethod
-    def _transform_class_name_to_dataset_name(string_to_transform: str) -> str:
-        """Called by the init function to fetch dataset names from class name.
-
-        Used to create dataset name from class name, turns camel case into upper snake case.
-        For example: 'ThisNameABC' -> 'THIS_NAME_ABC'.
-        """
-        words = re.findall(r"\d[A-Z]+|[A-Z]?[a-z\d]+|[A-Z]{2,}(?=[A-Z][a-z]|\d|\W|$)|\d+|[A-Z]{2,}|[A-Z]", string_to_transform)
-        return "_".join(map(str.lower, words)).upper()
-
-    def _has_valid_dtypes(self, df: pd.DataFrame) -> bool:
-        """Checks if `df` has the expected dtypes defined in `schema`.
-
-        Schema is a dictionary object where keys are column names and values are dtypes in string format as returned by e.g.
-        `df[column].dtype.name`.
-
-        This function issues `error` level logs describing the first column that caused the check to fail.
-
-        It is assumed that `df` only has the columns defined in `schema`.
-
-        Args:
-            df:
-
-        Returns:
-            bool - `True` if `df` has the given dtypes, `False` otherwise
-        """
-        dtypes = df.dtypes
-
-        for column_name, expected_dtype in self.schema.items():
-            found_dtype = dtypes[column_name].name
-            if found_dtype != expected_dtype:
-                if self.show_casting_warnings:
-                    logger.info(f"Expected: '{expected_dtype}' dtype for {self.name}['{column_name}]', found '{found_dtype}'")
-                try:
-                    if len(set(type(v) for v in df[column_name].values)) > 1:  # pylint: disable=consider-using-set-comprehension
-                        logger.warning(CASTING_WARNING_MSG.format(column_name, expected_dtype, found_dtype))  # pylint: disable=logging-format-interpolation
-                        logger.info(NOTICE_MSG.format(column_name))  # pylint: disable=logging-format-interpolation
-                    df[column_name] = df[column_name].astype(self.schema[column_name])
-                except (ValueError, TypeError):
-                    logger.error(f"ValueError: Tried casting column {self.name}['{column_name}]' to '{expected_dtype}' " f"from '{found_dtype}', but failed")
-                    return False
-        return True
-
-    @staticmethod
-    def _get_options(options_from_code: MutableMapping[str, Any], options_from_resource_definition: Optional[Mapping[str, Any]]) -> MutableMapping[str, Any]:
-        """Retrieves options either from code or from a resource-definition.
-
-        Options are merged if they are provided by both sources, while in the case of conflicts, the options from the code
-        take precedence.
-
-        Args:
-            options_from_code (Optional[Mapping])
-            options_from_resource_definition (Optional[Mapping])
-
-        Returns:
-            [Optional[Mapping]]: options that are going to be used
-        """
-        if options_from_resource_definition:
-            return {**options_from_resource_definition, **options_from_code}
-        return options_from_code
-
-

Subclasses

- -

Class variables

-
-
var schema : Mapping[~KT, +VT_co]
-
-
-
-
-

Methods

-
-
-async def async_read(self) -
-
-

Allows the use of asyncio to concurrently read files in memory.

-

Returns

-

A pandas dataframe or an iterable.

-
- -Expand source code - -
async def async_read(self):
-    """Allows the use of asyncio to concurrently read files in memory.
-
-    Returns:
-        A pandas dataframe or an iterable.
-    """
-    loop = asyncio.get_running_loop()
-    return await loop.run_in_executor(pool, self.read)
-
-
-
-async def async_write(self, df: pandas.core.frame.DataFrame) -
-
-

Allows the use of asyncio to concurrently write files out.

-

Args

-
-
df
-
The data to be written
-
-
- -Expand source code - -
async def async_write(self, df: pd.DataFrame):
-    """Allows the use of asyncio to concurrently write files out.
-
-    Args:
-        df: The data to be written
-    """
-    loop = asyncio.get_running_loop()
-    return await loop.run_in_executor(pool, self.write, df)
-
-
-
-def log_metrics_from_schema(self, df: pandas.core.frame.DataFrame) ‑> DynamicDataIO -
-
-

Calculates and logs metrics based on the metrics present in its schema definition.

-

Args

-
-
df
-
A dataframe for which metrics are generated and logged
-
-

Returns

-

self (to allow for method chaining).

-
- -Expand source code - -
def log_metrics_from_schema(self, df: pd.DataFrame) -> "DynamicDataIO":
-    """Calculates and logs metrics based on the metrics present in its schema definition.
-
-    Args:
-        df: A dataframe for which metrics are generated and logged
-
-    Returns:
-         self (to allow for method chaining).
-    """
-    if not hasattr(self, "schema_metrics"):
-        raise MissingSchemaDefinition(self.__class__)
-
-    for column in self.schema_metrics.keys():
-        for metric in self.schema_metrics[column]:
-            get_metric(metric)(self.name, df, column)()  # type: ignore
-
-    return self
-
-
-
-def read(self) ‑> pandas.core.frame.DataFrame -
-
-

Reads data source and returns a schema validated dataframe (by means of _apply_schema).

-

Returns

-

A pandas dataframe or an iterable.

-
- -Expand source code - -
def read(self) -> pd.DataFrame:
-    """Reads data source and returns a schema validated dataframe (by means of _apply_schema).
-
-    Returns:
-        A pandas dataframe or an iterable.
-    """
-    source_name = self.sources_config.get("type")
-    df = getattr(self, f"_read_from_{source_name}")()
-
-    df = self._apply_schema(df)
-    if self.apply_schema_validations:
-        self.validate_from_schema(df)
-    if self.log_schema_metrics:
-        self.log_metrics_from_schema(df)
-
-    return df
-
-
-
-def validate_from_schema(self, df: pandas.core.frame.DataFrame) ‑> DynamicDataIO -
-
-

Validates a dataframe based on the validations present in its schema definition.

-

All validations are checked and if any of them fails, a SchemaValidationError is raised.

-

Args

-

df:

-

Returns

-

self (to allow for method chaining).

-

Raises

-
-
SchemaValidationError
-
if any of the validations failed. The message attribute of -the exception object is a List[str], where each element is the name of a -validation that failed.
-
-
- -Expand source code - -
def validate_from_schema(self, df: pd.DataFrame) -> "DynamicDataIO":
-    """Validates a dataframe based on the validations present in its schema definition.
-
-    All validations are checked and if any of them fails, a `SchemaValidationError` is raised.
-
-    Args:
-        df:
-
-    Returns:
-         self (to allow for method chaining).
-
-    Raises:
-        SchemaValidationError: if any of the validations failed. The `message` attribute of
-            the exception object is a `List[str]`, where each element is the name of a
-            validation that failed.
-    """
-    if not hasattr(self, "schema_validations"):
-        raise MissingSchemaDefinition(self.__class__)
-
-    failed_validations = {}
-    for column in self.schema_validations.keys():
-        for validation in self.schema_validations[column].keys():
-            if self.schema_validations[column][validation]["apply"] is True:
-                validation_result = getattr(validations, validation)(self.name, df, column, **self.schema_validations[column][validation]["options"])
-                if not validation_result.valid:
-                    failed_validations[validation] = validation_result.message
-
-    if len(failed_validations) > 0:
-        raise SchemaValidationError(failed_validations)
-
-    return self
-
-
-
-def write(self, df: pandas.core.frame.DataFrame) -
-
-

Sink data to a given source based on the sources_config.

-

Args

-
-
df
-
The data to be written
-
-
- -Expand source code - -
def write(self, df: pd.DataFrame):
-    """Sink data to a given source based on the sources_config.
-
-    Args:
-        df: The data to be written
-    """
-    source_name = self.sources_config.get("type")
-    if set(df.columns) != set(self.schema.keys()):  # pylint: disable=E1101
-        columns = [column for column in df.columns.to_list() if column in self.schema.keys()]
-        df = df[columns]
-
-    if self.apply_schema_validations:
-        self.validate_from_schema(df)
-    if self.log_schema_metrics:
-        self.log_metrics_from_schema(df)
-
-    getattr(self, f"_write_to_{source_name}")(self._apply_schema(df))
-
-
-
-
-
-
-
- -
- - - \ No newline at end of file diff --git a/docs/coverage_report/coverage-badge.svg b/docs/coverage_report/coverage-badge.svg deleted file mode 100644 index fe06143..0000000 --- a/docs/coverage_report/coverage-badge.svg +++ /dev/null @@ -1,21 +0,0 @@ - - - - - - - - - - - - - - - - coverage - coverage - 94% - 94% - - diff --git a/docs/errors.html b/docs/errors.html deleted file mode 100644 index 755daa6..0000000 --- a/docs/errors.html +++ /dev/null @@ -1,684 +0,0 @@ - - - - - - -dynamicio.errors API documentation - - - - - - - - - - - -
-
-
-

Module dynamicio.errors

-
-
-

Hosts exception implementations for different errors.

-
- -Expand source code - -
"""Hosts exception implementations for different errors."""
-# pylint: disable=missing-module-docstring, missing-class-docstring, missing-function-docstring, super-init-not-called
-__all__ = [
-    "DynamicIOError",
-    "DataSourceError",
-    "ColumnsDataTypeError",
-    "NonUniqueIdColumnError",
-    "NullValueInColumnError",
-    "NotExpectedCategoricalValue",
-    "MissingSchemaDefinition",
-    "SchemaNotFoundError",
-    "SchemaValidationError",
-    "InvalidDatasetTypeError",
-    "CASTING_WARNING_MSG",
-    "NOTICE_MSG",
-]
-
-from typing import Any, Optional
-
-
-class DynamicIOError(Exception):
-    """Base class for DynamicIO errors."""
-
-    ERROR_STR: str = ""
-    ERROR_STR_DETAILED: str = "{0}"
-
-    @property
-    def message(self) -> Optional[Any]:
-        """Easy access for optional message argument.
-
-        Returns:
-            Message or `None` if not set
-        """
-        try:
-            return self.args[0]
-        except IndexError:
-            return None
-
-    def __str__(self):
-        """Enrich and return error message."""
-        message = self.message
-
-        if message is None:
-            return self.ERROR_STR
-
-        return self.ERROR_STR_DETAILED.format(message)
-
-
-class SchemaNotFoundError(DynamicIOError):
-    """Error raised when schema is not specified in the provided source."""
-
-    ERROR_STR = "Schema not specified in the provided source"
-    ERROR_STR_DETAILED = "Schema not specified in the provided source: {0} "
-
-
-class SchemaValidationError(DynamicIOError):
-    """Error raised when schema validation fails."""
-
-
-class MissingSchemaDefinition(DynamicIOError):
-    """Error raised when schema is not specified in the provided source."""
-
-    ERROR_STR = "The resource definition for this class is missing a schema definition"
-    ERROR_STR_DETAILED = "The resource definition for this class is missing a schema definition: {0}"
-
-
-class DataSourceError(DynamicIOError):
-    """Error raised when the data source fails to load."""
-
-
-class ColumnsDataTypeError(DynamicIOError):
-    """Error raised when the validated data does not have the expected data types."""
-
-
-class NonUniqueIdColumnError(DynamicIOError):
-    """Error raised when the data source fails to load."""
-
-
-class NullValueInColumnError(DynamicIOError):
-    """Error raised when the data source fails to load."""
-
-
-class NotExpectedCategoricalValue(DynamicIOError):
-    """Error raised when the data source fails to load."""
-
-
-class InvalidDatasetTypeError(DynamicIOError):
-    """Error raised when dataset type is not one of [parquet, json, csv, h5]."""
-
-    ERROR_STR = "The dataset provided is not amongst the supported types (parquet, json, csv, h5) handled by dynamicio."
-    ERROR_STR_DETAILED = "Dataset: {0} provided is not amongst the supported types (parquet, json, csv, h5) handled by dynamicio."
-
-
-# Warning messages
-CASTING_WARNING_MSG = "Applying casting column: '{0}' to: 'type:{1}' from 'type:{2}' though not advised, as `dtypes`>1 for {0}, which may lead to data corruption!"
-NOTICE_MSG = "Keeping the {0} as is, may anyway cause I/O errors or data corruption issues especially when using `pandas.DataFrame.to_parquet` or `pandas.DataFrame.to_json`."
-
-
-
-
-
-
-
-
-
-

Classes

-
-
-class ColumnsDataTypeError -(*args, **kwargs) -
-
-

Error raised when the validated data does not have the expected data types.

-
- -Expand source code - -
class ColumnsDataTypeError(DynamicIOError):
-    """Error raised when the validated data does not have the expected data types."""
-
-

Ancestors

- -

Class variables

-
-
var ERROR_STR : str
-
-
-
-
var ERROR_STR_DETAILED : str
-
-
-
-
-

Inherited members

- -
-
-class DataSourceError -(*args, **kwargs) -
-
-

Error raised when the data source fails to load.

-
- -Expand source code - -
class DataSourceError(DynamicIOError):
-    """Error raised when the data source fails to load."""
-
-

Ancestors

- -

Class variables

-
-
var ERROR_STR : str
-
-
-
-
var ERROR_STR_DETAILED : str
-
-
-
-
-

Inherited members

- -
-
-class DynamicIOError -(*args, **kwargs) -
-
-

Base class for DynamicIO errors.

-
- -Expand source code - -
class DynamicIOError(Exception):
-    """Base class for DynamicIO errors."""
-
-    ERROR_STR: str = ""
-    ERROR_STR_DETAILED: str = "{0}"
-
-    @property
-    def message(self) -> Optional[Any]:
-        """Easy access for optional message argument.
-
-        Returns:
-            Message or `None` if not set
-        """
-        try:
-            return self.args[0]
-        except IndexError:
-            return None
-
-    def __str__(self):
-        """Enrich and return error message."""
-        message = self.message
-
-        if message is None:
-            return self.ERROR_STR
-
-        return self.ERROR_STR_DETAILED.format(message)
-
-

Ancestors

-
    -
  • builtins.Exception
  • -
  • builtins.BaseException
  • -
-

Subclasses

- -

Class variables

-
-
var ERROR_STR : str
-
-
-
-
var ERROR_STR_DETAILED : str
-
-
-
-
-

Instance variables

-
-
var message : Optional[Any]
-
-

Easy access for optional message argument.

-

Returns

-

Message or None if not set

-
- -Expand source code - -
@property
-def message(self) -> Optional[Any]:
-    """Easy access for optional message argument.
-
-    Returns:
-        Message or `None` if not set
-    """
-    try:
-        return self.args[0]
-    except IndexError:
-        return None
-
-
-
-
-
-class InvalidDatasetTypeError -(*args, **kwargs) -
-
-

Error raised when dataset type is not one of [parquet, json, csv, h5].

-
- -Expand source code - -
class InvalidDatasetTypeError(DynamicIOError):
-    """Error raised when dataset type is not one of [parquet, json, csv, h5]."""
-
-    ERROR_STR = "The dataset provided is not amongst the supported types (parquet, json, csv, h5) handled by dynamicio."
-    ERROR_STR_DETAILED = "Dataset: {0} provided is not amongst the supported types (parquet, json, csv, h5) handled by dynamicio."
-
-

Ancestors

- -

Class variables

-
-
var ERROR_STR : str
-
-
-
-
var ERROR_STR_DETAILED : str
-
-
-
-
-

Inherited members

- -
-
-class MissingSchemaDefinition -(*args, **kwargs) -
-
-

Error raised when schema is not specified in the provided source.

-
- -Expand source code - -
class MissingSchemaDefinition(DynamicIOError):
-    """Error raised when schema is not specified in the provided source."""
-
-    ERROR_STR = "The resource definition for this class is missing a schema definition"
-    ERROR_STR_DETAILED = "The resource definition for this class is missing a schema definition: {0}"
-
-

Ancestors

- -

Class variables

-
-
var ERROR_STR : str
-
-
-
-
var ERROR_STR_DETAILED : str
-
-
-
-
-

Inherited members

- -
-
-class NonUniqueIdColumnError -(*args, **kwargs) -
-
-

Error raised when the data source fails to load.

-
- -Expand source code - -
class NonUniqueIdColumnError(DynamicIOError):
-    """Error raised when the data source fails to load."""
-
-

Ancestors

- -

Class variables

-
-
var ERROR_STR : str
-
-
-
-
var ERROR_STR_DETAILED : str
-
-
-
-
-

Inherited members

- -
-
-class NotExpectedCategoricalValue -(*args, **kwargs) -
-
-

Error raised when the data source fails to load.

-
- -Expand source code - -
class NotExpectedCategoricalValue(DynamicIOError):
-    """Error raised when the data source fails to load."""
-
-

Ancestors

- -

Class variables

-
-
var ERROR_STR : str
-
-
-
-
var ERROR_STR_DETAILED : str
-
-
-
-
-

Inherited members

- -
-
-class NullValueInColumnError -(*args, **kwargs) -
-
-

Error raised when the data source fails to load.

-
- -Expand source code - -
class NullValueInColumnError(DynamicIOError):
-    """Error raised when the data source fails to load."""
-
-

Ancestors

- -

Class variables

-
-
var ERROR_STR : str
-
-
-
-
var ERROR_STR_DETAILED : str
-
-
-
-
-

Inherited members

- -
-
-class SchemaNotFoundError -(*args, **kwargs) -
-
-

Error raised when schema is not specified in the provided source.

-
- -Expand source code - -
class SchemaNotFoundError(DynamicIOError):
-    """Error raised when schema is not specified in the provided source."""
-
-    ERROR_STR = "Schema not specified in the provided source"
-    ERROR_STR_DETAILED = "Schema not specified in the provided source: {0} "
-
-

Ancestors

- -

Class variables

-
-
var ERROR_STR : str
-
-
-
-
var ERROR_STR_DETAILED : str
-
-
-
-
-

Inherited members

- -
-
-class SchemaValidationError -(*args, **kwargs) -
-
-

Error raised when schema validation fails.

-
- -Expand source code - -
class SchemaValidationError(DynamicIOError):
-    """Error raised when schema validation fails."""
-
-

Ancestors

- -

Class variables

-
-
var ERROR_STR : str
-
-
-
-
var ERROR_STR_DETAILED : str
-
-
-
-
-

Inherited members

- -
-
-
-
- -
- - - \ No newline at end of file diff --git a/docs/images/data-types.png b/docs/images/data-types.png deleted file mode 100644 index 106a2ef..0000000 Binary files a/docs/images/data-types.png and /dev/null differ diff --git a/docs/images/logo-original.png b/docs/images/logo-original.png deleted file mode 100644 index e9ae423..0000000 Binary files a/docs/images/logo-original.png and /dev/null differ diff --git a/docs/images/logo-transparent.png b/docs/images/logo-transparent.png deleted file mode 100644 index 28605a1..0000000 Binary files a/docs/images/logo-transparent.png and /dev/null differ diff --git a/docs/images/sample-pipeline.png b/docs/images/sample-pipeline.png deleted file mode 100644 index 6e30bc3..0000000 Binary files a/docs/images/sample-pipeline.png and /dev/null differ diff --git a/docs/images/supported_sources.png b/docs/images/supported_sources.png deleted file mode 100644 index 190f2ae..0000000 Binary files a/docs/images/supported_sources.png and /dev/null differ diff --git a/docs/images/wrapped-panda.png b/docs/images/wrapped-panda.png deleted file mode 100644 index b234826..0000000 Binary files a/docs/images/wrapped-panda.png and /dev/null differ diff --git a/docs/index.html b/docs/index.html deleted file mode 100644 index c631f77..0000000 --- a/docs/index.html +++ /dev/null @@ -1,230 +0,0 @@ - - - - - - -dynamicio API documentation - - - - - - - - - - - -
-
-
-

Package dynamicio

-
-
-

A package for wrapping your I/O operations.

-
- -Expand source code - -
"""A package for wrapping your I/O operations."""
-import os
-from contextlib import suppress
-
-import pkg_resources
-from magic_logger import logger
-
-with suppress(Exception):
-    __version__ = pkg_resources.get_distribution("dynamicio").version
-
-from dynamicio.core import DynamicDataIO
-from dynamicio.mixins import WithKafka, WithLocal, WithLocalBatch, WithPostgres, WithS3File, WithS3PathPrefix
-
-os.environ["LC_CTYPE"] = "en_US.UTF"  # Set your locale to a unicode-compatible one
-
-
-class UnifiedIO(WithS3File, WithS3PathPrefix, WithLocalBatch, WithLocal, WithKafka, WithPostgres, DynamicDataIO):
-    """A unified io composed of dynamicio.mixins."""
-
-
-logging_config = {
-    "version": 1,
-    "disable_existing_loggers": True,
-    "formatters": {
-        "standard": {"format": "%(asctime)s [%(levelname)s] %(name)s: %(message)s"},
-        "generic-metrics": {"format": "%(message)s"},
-    },
-    "handlers": {
-        "default": {
-            "level": "INFO",
-            "formatter": "standard",
-            "class": "logging.StreamHandler",
-            "stream": "ext://sys.stdout",  # Default is stderr
-        },
-        "metrics": {
-            "level": "INFO",
-            "formatter": "generic-metrics",
-            "class": "logging.StreamHandler",
-            "stream": "ext://sys.stdout",  # Default is stderr
-        },
-    },
-    "loggers": {
-        "": {"handlers": ["default"], "level": "INFO", "propagate": False},
-        "dynamicio.metrics": {"handlers": ["metrics"], "level": "INFO", "propagate": False},
-        "awscli": {
-            "handlers": ["default"],
-            "level": "INFO",
-            "propagate": False,
-        },
-    },
-}
-
-logger.dict_config(logging_config)
-
-
-
-

Sub-modules

-
-
dynamicio.cli
-
-

Implements the dynamicio Command Line Interface (CLI).

-
-
dynamicio.config
-
-

Implements the IOConfig class, generating objects used as a configuration parameter for the instantiation …

-
-
dynamicio.core
-
-

Implements the DynamicDataIO class which provides functionality for data: loading; sinking, and; schema validation.

-
-
dynamicio.errors
-
-

Hosts exception implementations for different errors.

-
-
dynamicio.metrics
-
-

A module responsible for metrics generation and logging.

-
-
dynamicio.mixins
-
-

Default dynamicio mixins module

-
-
dynamicio.validations
-
-

Implements the Validator class responsible for various generic data validations and metrics generation.

-
-
-
-
-
-
-
-
-

Classes

-
-
-class UnifiedIO -(source_config: Mapping[~KT, +VT_co], apply_schema_validations: bool = False, log_schema_metrics: bool = False, show_casting_warnings: bool = False, **options: MutableMapping[str, Any]) -
-
-

A unified io composed of dynamicio.mixins.

-

Class constructor.

-

Args

-
-
source_config
-
Configuration to use when reading/writing data from/to a source
-
apply_schema_validations
-
Applies schema validations on either read() or write()
-
log_schema_metrics
-
Logs schema metrics on either read() or write()
-
show_casting_warnings
-
Logs casting warnings on either read() or write() if set to True
-
options
-
Any additional kwargs that may be used throughout the lifecycle of the object
-
-
- -Expand source code - -
class UnifiedIO(WithS3File, WithS3PathPrefix, WithLocalBatch, WithLocal, WithKafka, WithPostgres, DynamicDataIO):
-    """A unified io composed of dynamicio.mixins."""
-
-

Ancestors

- -

Class variables

-
-
var options : MutableMapping[str, Any]
-
-
-
-
var schema : Mapping[~KT, +VT_co]
-
-
-
-
var sources_config : Mapping[~KT, +VT_co]
-
-
-
-
-

Inherited members

- -
-
-
-
- -
- - - \ No newline at end of file diff --git a/docs/metrics.html b/docs/metrics.html deleted file mode 100644 index 30702b1..0000000 --- a/docs/metrics.html +++ /dev/null @@ -1,929 +0,0 @@ - - - - - - -dynamicio.metrics API documentation - - - - - - - - - - - -
-
-
-

Module dynamicio.metrics

-
-
-

A module responsible for metrics generation and logging.

-
- -Expand source code - -
"""A module responsible for metrics generation and logging."""
-# pylint: disable=missing-function-docstring,missing-class-docstring
-import json
-import logging
-import sys
-from numbers import Number
-from typing import Any, Dict, Mapping, Type
-
-import pandas as pd  # type: ignore
-from magic_logger import logger
-from pythonjsonlogger import jsonlogger  # type: ignore
-
-logHandler = logging.StreamHandler(sys.stdout)
-formatter = jsonlogger.JsonFormatter()
-logHandler.setFormatter(formatter)
-logger.addHandler(logHandler)
-
-
-__metrics__: Dict[str, Type["Metric"]] = {}
-
-
-def get_metric(name: str) -> Type["Metric"]:
-    return __metrics__[name]
-
-
-def log_metric(dataset: str, column: str, metric: str, value: float):
-    """Logs a metric in a structured way for a given dataset column.
-
-    Args:
-        dataset: The dataset for which the metric is logged
-        column: Column for which the metric is logged
-        metric: name fo the metric, e.g. "unique_vals"
-        value: The metric's value, e.g. "10000"
-    """
-    logger.info(json.dumps({"message": "METRIC", "dataset": dataset, "column": column, "metric": metric, "value": float(value)}))
-
-
-class Metric:
-    """A base class for implementing metrics classes."""
-
-    def __init__(self, dataset_name: str, df: pd.DataFrame, column: str):  # noqa
-        self.dataset_name = dataset_name
-        self.df = df
-        self.column = column
-
-    def __init_subclass__(cls):  # noqa
-        __metrics__[cls.__name__] = cls
-        assert "calculate_metric" in cls.__dict__
-
-    def __call__(self) -> Any:  # noqa
-        metric_value = self.calculate_metric()
-
-        if isinstance(metric_value, Mapping):
-            for entity in sorted(metric_value.keys()):  # pylint: disable=no-member
-                column = metric_value[entity]  # pylint: disable=unsubscriptable-object
-                log_metric(self.dataset_name, entity, self.metric_name, column)
-        else:
-            log_metric(dataset=self.dataset_name, column=self.column, metric=self.metric_name, value=metric_value)
-        return metric_value
-
-    @property
-    def metric_name(self) -> str:
-        """Retrieves the name of the metric from the class name.
-
-        Returns:
-            The name of the metric, e.g. "Min or Mean".
-        """
-        return self.__class__.__name__
-
-    def calculate_metric(self) -> Any:
-        """Dictates that subclasses need to implement this method.
-
-        Returns:
-            NotImplemented is returned if the method is not implemented, by the subclass
-            inevitably pointing to the parent implementation.
-        """
-        return NotImplemented
-
-
-class Min(Metric):
-    """A metric instance that enables generating and returning the minimum value of a column."""
-
-    def calculate_metric(self) -> Number:
-        """Generate and return the minimum value of a column.
-
-        Returns:
-             The minimum value of a column.
-        """
-        return self.df[self.column].min()
-
-
-class Max(Metric):
-    """A metric instance that enables generating and returning the maximum value of a column."""
-
-    def calculate_metric(self) -> Number:
-        """Generate and return the maximum value of a column.
-
-        Returns:
-            The maximum value of a column.
-        """
-        return self.df[self.column].max()
-
-
-class Mean(Metric):
-    """A metric instance that enables generating and returning the mean value of a column."""
-
-    def calculate_metric(self) -> Number:
-        """Generate and return the mean value of a column.
-
-        Returns:
-            The mean value of a column.
-        """
-        return self.df[self.column].mean()
-
-
-class Std(Metric):
-    """A metric instance that enables generating and returning the standard deviation of a column."""
-
-    def calculate_metric(self) -> Number:
-        """Generate and return the standard deviation of a column.
-
-        Returns:
-            The standard deviation of a column.
-        """
-        return self.df[self.column].std()
-
-
-class Variance(Metric):
-    """A metric instance that generated and returns the variance of a column."""
-
-    def calculate_metric(self) -> Number:
-        """Generate and return the variance of a column.
-
-        Returns:
-            The variance of a column.
-        """
-        return self.df[self.column].var()
-
-
-class Counts(Metric):
-    """A metric instance that enables generating and returning the length of a column."""
-
-    def calculate_metric(self) -> int:
-        """Generate and return the length of a column.
-
-        Returns:
-            The length of a column.
-        """
-        return len(self.df[self.column])
-
-
-class UniqueCounts(Metric):
-    """A metric instance that enables generating and returning the unique values of a column."""
-
-    def calculate_metric(self) -> int:
-        """Generate and return the unique values of a column.
-
-        Returns:
-            The unique values of a column.
-        """
-        return len(self.df[self.column].unique())
-
-
-class CountsPerLabel(Metric):
-    """A metric instance that enables generating and returning the counts per label in a categorical column."""
-
-    def calculate_metric(self) -> Mapping:
-        """Generate and return the counts per label in a categorical column.
-
-        Returns:
-            The counts per label in a categorical column
-        """
-        column_vs_metric_value = self.df[self.column].value_counts().to_dict()
-        label_vs_metric_value_with_column_prefix = {}
-        for key in column_vs_metric_value.keys():
-            new_key = self.column + "-" + key
-            label_vs_metric_value_with_column_prefix[new_key] = column_vs_metric_value[key]
-        return label_vs_metric_value_with_column_prefix
-
-
-
-
-
-
-
-

Functions

-
-
-def get_metric(name: str) ‑> Type[Metric] -
-
-
-
- -Expand source code - -
def get_metric(name: str) -> Type["Metric"]:
-    return __metrics__[name]
-
-
-
-def log_metric(dataset: str, column: str, metric: str, value: float) -
-
-

Logs a metric in a structured way for a given dataset column.

-

Args

-
-
dataset
-
The dataset for which the metric is logged
-
column
-
Column for which the metric is logged
-
metric
-
name fo the metric, e.g. "unique_vals"
-
value
-
The metric's value, e.g. "10000"
-
-
- -Expand source code - -
def log_metric(dataset: str, column: str, metric: str, value: float):
-    """Logs a metric in a structured way for a given dataset column.
-
-    Args:
-        dataset: The dataset for which the metric is logged
-        column: Column for which the metric is logged
-        metric: name fo the metric, e.g. "unique_vals"
-        value: The metric's value, e.g. "10000"
-    """
-    logger.info(json.dumps({"message": "METRIC", "dataset": dataset, "column": column, "metric": metric, "value": float(value)}))
-
-
-
-
-
-

Classes

-
-
-class Counts -(dataset_name: str, df: pandas.core.frame.DataFrame, column: str) -
-
-

A metric instance that enables generating and returning the length of a column.

-
- -Expand source code - -
class Counts(Metric):
-    """A metric instance that enables generating and returning the length of a column."""
-
-    def calculate_metric(self) -> int:
-        """Generate and return the length of a column.
-
-        Returns:
-            The length of a column.
-        """
-        return len(self.df[self.column])
-
-

Ancestors

- -

Methods

-
-
-def calculate_metric(self) ‑> int -
-
-

Generate and return the length of a column.

-

Returns

-

The length of a column.

-
- -Expand source code - -
def calculate_metric(self) -> int:
-    """Generate and return the length of a column.
-
-    Returns:
-        The length of a column.
-    """
-    return len(self.df[self.column])
-
-
-
-

Inherited members

- -
-
-class CountsPerLabel -(dataset_name: str, df: pandas.core.frame.DataFrame, column: str) -
-
-

A metric instance that enables generating and returning the counts per label in a categorical column.

-
- -Expand source code - -
class CountsPerLabel(Metric):
-    """A metric instance that enables generating and returning the counts per label in a categorical column."""
-
-    def calculate_metric(self) -> Mapping:
-        """Generate and return the counts per label in a categorical column.
-
-        Returns:
-            The counts per label in a categorical column
-        """
-        column_vs_metric_value = self.df[self.column].value_counts().to_dict()
-        label_vs_metric_value_with_column_prefix = {}
-        for key in column_vs_metric_value.keys():
-            new_key = self.column + "-" + key
-            label_vs_metric_value_with_column_prefix[new_key] = column_vs_metric_value[key]
-        return label_vs_metric_value_with_column_prefix
-
-

Ancestors

- -

Methods

-
-
-def calculate_metric(self) ‑> Mapping[~KT, +VT_co] -
-
-

Generate and return the counts per label in a categorical column.

-

Returns

-

The counts per label in a categorical column

-
- -Expand source code - -
def calculate_metric(self) -> Mapping:
-    """Generate and return the counts per label in a categorical column.
-
-    Returns:
-        The counts per label in a categorical column
-    """
-    column_vs_metric_value = self.df[self.column].value_counts().to_dict()
-    label_vs_metric_value_with_column_prefix = {}
-    for key in column_vs_metric_value.keys():
-        new_key = self.column + "-" + key
-        label_vs_metric_value_with_column_prefix[new_key] = column_vs_metric_value[key]
-    return label_vs_metric_value_with_column_prefix
-
-
-
-

Inherited members

- -
-
-class Max -(dataset_name: str, df: pandas.core.frame.DataFrame, column: str) -
-
-

A metric instance that enables generating and returning the maximum value of a column.

-
- -Expand source code - -
class Max(Metric):
-    """A metric instance that enables generating and returning the maximum value of a column."""
-
-    def calculate_metric(self) -> Number:
-        """Generate and return the maximum value of a column.
-
-        Returns:
-            The maximum value of a column.
-        """
-        return self.df[self.column].max()
-
-

Ancestors

- -

Methods

-
-
-def calculate_metric(self) ‑> numbers.Number -
-
-

Generate and return the maximum value of a column.

-

Returns

-

The maximum value of a column.

-
- -Expand source code - -
def calculate_metric(self) -> Number:
-    """Generate and return the maximum value of a column.
-
-    Returns:
-        The maximum value of a column.
-    """
-    return self.df[self.column].max()
-
-
-
-

Inherited members

- -
-
-class Mean -(dataset_name: str, df: pandas.core.frame.DataFrame, column: str) -
-
-

A metric instance that enables generating and returning the mean value of a column.

-
- -Expand source code - -
class Mean(Metric):
-    """A metric instance that enables generating and returning the mean value of a column."""
-
-    def calculate_metric(self) -> Number:
-        """Generate and return the mean value of a column.
-
-        Returns:
-            The mean value of a column.
-        """
-        return self.df[self.column].mean()
-
-

Ancestors

- -

Methods

-
-
-def calculate_metric(self) ‑> numbers.Number -
-
-

Generate and return the mean value of a column.

-

Returns

-

The mean value of a column.

-
- -Expand source code - -
def calculate_metric(self) -> Number:
-    """Generate and return the mean value of a column.
-
-    Returns:
-        The mean value of a column.
-    """
-    return self.df[self.column].mean()
-
-
-
-

Inherited members

- -
-
-class Metric -(dataset_name: str, df: pandas.core.frame.DataFrame, column: str) -
-
-

A base class for implementing metrics classes.

-
- -Expand source code - -
class Metric:
-    """A base class for implementing metrics classes."""
-
-    def __init__(self, dataset_name: str, df: pd.DataFrame, column: str):  # noqa
-        self.dataset_name = dataset_name
-        self.df = df
-        self.column = column
-
-    def __init_subclass__(cls):  # noqa
-        __metrics__[cls.__name__] = cls
-        assert "calculate_metric" in cls.__dict__
-
-    def __call__(self) -> Any:  # noqa
-        metric_value = self.calculate_metric()
-
-        if isinstance(metric_value, Mapping):
-            for entity in sorted(metric_value.keys()):  # pylint: disable=no-member
-                column = metric_value[entity]  # pylint: disable=unsubscriptable-object
-                log_metric(self.dataset_name, entity, self.metric_name, column)
-        else:
-            log_metric(dataset=self.dataset_name, column=self.column, metric=self.metric_name, value=metric_value)
-        return metric_value
-
-    @property
-    def metric_name(self) -> str:
-        """Retrieves the name of the metric from the class name.
-
-        Returns:
-            The name of the metric, e.g. "Min or Mean".
-        """
-        return self.__class__.__name__
-
-    def calculate_metric(self) -> Any:
-        """Dictates that subclasses need to implement this method.
-
-        Returns:
-            NotImplemented is returned if the method is not implemented, by the subclass
-            inevitably pointing to the parent implementation.
-        """
-        return NotImplemented
-
-

Subclasses

- -

Instance variables

-
-
var metric_name : str
-
-

Retrieves the name of the metric from the class name.

-

Returns

-

The name of the metric, e.g. "Min or Mean".

-
- -Expand source code - -
@property
-def metric_name(self) -> str:
-    """Retrieves the name of the metric from the class name.
-
-    Returns:
-        The name of the metric, e.g. "Min or Mean".
-    """
-    return self.__class__.__name__
-
-
-
-

Methods

-
-
-def calculate_metric(self) ‑> Any -
-
-

Dictates that subclasses need to implement this method.

-

Returns

-

NotImplemented is returned if the method is not implemented, by the subclass -inevitably pointing to the parent implementation.

-
- -Expand source code - -
def calculate_metric(self) -> Any:
-    """Dictates that subclasses need to implement this method.
-
-    Returns:
-        NotImplemented is returned if the method is not implemented, by the subclass
-        inevitably pointing to the parent implementation.
-    """
-    return NotImplemented
-
-
-
-
-
-class Min -(dataset_name: str, df: pandas.core.frame.DataFrame, column: str) -
-
-

A metric instance that enables generating and returning the minimum value of a column.

-
- -Expand source code - -
class Min(Metric):
-    """A metric instance that enables generating and returning the minimum value of a column."""
-
-    def calculate_metric(self) -> Number:
-        """Generate and return the minimum value of a column.
-
-        Returns:
-             The minimum value of a column.
-        """
-        return self.df[self.column].min()
-
-

Ancestors

- -

Methods

-
-
-def calculate_metric(self) ‑> numbers.Number -
-
-

Generate and return the minimum value of a column.

-

Returns

-

The minimum value of a column.

-
- -Expand source code - -
def calculate_metric(self) -> Number:
-    """Generate and return the minimum value of a column.
-
-    Returns:
-         The minimum value of a column.
-    """
-    return self.df[self.column].min()
-
-
-
-

Inherited members

- -
-
-class Std -(dataset_name: str, df: pandas.core.frame.DataFrame, column: str) -
-
-

A metric instance that enables generating and returning the standard deviation of a column.

-
- -Expand source code - -
class Std(Metric):
-    """A metric instance that enables generating and returning the standard deviation of a column."""
-
-    def calculate_metric(self) -> Number:
-        """Generate and return the standard deviation of a column.
-
-        Returns:
-            The standard deviation of a column.
-        """
-        return self.df[self.column].std()
-
-

Ancestors

- -

Methods

-
-
-def calculate_metric(self) ‑> numbers.Number -
-
-

Generate and return the standard deviation of a column.

-

Returns

-

The standard deviation of a column.

-
- -Expand source code - -
def calculate_metric(self) -> Number:
-    """Generate and return the standard deviation of a column.
-
-    Returns:
-        The standard deviation of a column.
-    """
-    return self.df[self.column].std()
-
-
-
-

Inherited members

- -
-
-class UniqueCounts -(dataset_name: str, df: pandas.core.frame.DataFrame, column: str) -
-
-

A metric instance that enables generating and returning the unique values of a column.

-
- -Expand source code - -
class UniqueCounts(Metric):
-    """A metric instance that enables generating and returning the unique values of a column."""
-
-    def calculate_metric(self) -> int:
-        """Generate and return the unique values of a column.
-
-        Returns:
-            The unique values of a column.
-        """
-        return len(self.df[self.column].unique())
-
-

Ancestors

- -

Methods

-
-
-def calculate_metric(self) ‑> int -
-
-

Generate and return the unique values of a column.

-

Returns

-

The unique values of a column.

-
- -Expand source code - -
def calculate_metric(self) -> int:
-    """Generate and return the unique values of a column.
-
-    Returns:
-        The unique values of a column.
-    """
-    return len(self.df[self.column].unique())
-
-
-
-

Inherited members

- -
-
-class Variance -(dataset_name: str, df: pandas.core.frame.DataFrame, column: str) -
-
-

A metric instance that generated and returns the variance of a column.

-
- -Expand source code - -
class Variance(Metric):
-    """A metric instance that generated and returns the variance of a column."""
-
-    def calculate_metric(self) -> Number:
-        """Generate and return the variance of a column.
-
-        Returns:
-            The variance of a column.
-        """
-        return self.df[self.column].var()
-
-

Ancestors

- -

Methods

-
-
-def calculate_metric(self) ‑> numbers.Number -
-
-

Generate and return the variance of a column.

-

Returns

-

The variance of a column.

-
- -Expand source code - -
def calculate_metric(self) -> Number:
-    """Generate and return the variance of a column.
-
-    Returns:
-        The variance of a column.
-    """
-    return self.df[self.column].var()
-
-
-
-

Inherited members

- -
-
-
-
- -
- - - \ No newline at end of file diff --git a/docs/mixins/index.html b/docs/mixins/index.html deleted file mode 100644 index 27c1684..0000000 --- a/docs/mixins/index.html +++ /dev/null @@ -1,107 +0,0 @@ - - - - - - -dynamicio.mixins API documentation - - - - - - - - - - - -
-
-
-

Module dynamicio.mixins

-
-
-

Default dynamicio mixins module

-
- -Expand source code - -
"""Default dynamicio mixins module"""
-
-from .with_kafka import (
-    WithKafka,
-)
-from .with_local import (
-    WithLocal,
-    WithLocalBatch,
-)
-from .with_postgres import (
-    WithPostgres,
-)
-from .with_s3 import (
-    WithS3File,
-    WithS3PathPrefix,
-)
-
-
-
-

Sub-modules

-
-
dynamicio.mixins.utils
-
-

Mixin utility functions

-
-
dynamicio.mixins.with_kafka
-
-

This module provides mixins that are providing Kafka I/O support.

-
-
dynamicio.mixins.with_local
-
-

This module provides mixins that are providing Local FS I/O support.

-
-
dynamicio.mixins.with_postgres
-
-

This module provides mixins that are providing Postgres I/O support.

-
-
dynamicio.mixins.with_s3
-
-

This module provides mixins that are providing S3 I/O support.

-
-
-
-
-
-
-
-
-
-
- -
- - - \ No newline at end of file diff --git a/docs/mixins/utils.html b/docs/mixins/utils.html deleted file mode 100644 index 20d170c..0000000 --- a/docs/mixins/utils.html +++ /dev/null @@ -1,388 +0,0 @@ - - - - - - -dynamicio.mixins.utils API documentation - - - - - - - - - - - -
-
-
-

Module dynamicio.mixins.utils

-
-
-

Mixin utility functions

-
- -Expand source code - -
"""Mixin utility functions"""
-# pylint: disable=no-member, protected-access, too-few-public-methods
-
-import inspect
-import string
-from functools import wraps
-from types import FunctionType, MethodType
-from typing import Any, Collection, Iterable, Mapping, MutableMapping, Union
-
-from magic_logger import logger
-
-
-def allow_options(options: Union[Iterable, FunctionType, MethodType]):
-    """Validate **options for a decorated reader function.
-
-    Args:
-        options: A set of valid options for a reader (e.g. `pandas.read_parquet` or `pandas.read_csv`)
-
-    Returns:
-        read_with_valid_options: The input function called with modified options.
-    """
-
-    def _filter_out_irrelevant_options(kwargs: Mapping, valid_options: Iterable):
-        filtered_options = {}
-        invalid_options = {}
-        for key_arg in kwargs.keys():
-            if key_arg in valid_options:
-                filtered_options[key_arg] = kwargs[key_arg]
-            else:
-                invalid_options[key_arg] = kwargs[key_arg]
-        if len(invalid_options) > 0:
-            logger.warning(
-                f"Options {invalid_options} were not used because they were not supported by the read or write method configured for this source. "
-                "Check if you expected any of those to have been used by the operation!"
-            )
-        return filtered_options
-
-    def read_with_valid_options(func):
-        @wraps(func)
-        def _(*args, **kwargs):
-            if callable(options):
-                return func(*args, **_filter_out_irrelevant_options(kwargs, args_of(options)))
-            return func(*args, **_filter_out_irrelevant_options(kwargs, options))
-
-        return _
-
-    return read_with_valid_options
-
-
-def args_of(func):
-    """Retrieve allowed options for a given function.
-
-    Args:
-        func: A function like, e.g., pd.read_csv
-
-    Returns:
-        A set of allowed options
-    """
-    return set(inspect.signature(func).parameters.keys())
-
-
-def get_string_template_field_names(s: str) -> Collection[str]:  # pylint: disable=C0103
-    """Given a string `s`, it parses the string to identify any template fields and returns the names of those fields.
-
-     If `s` is not a string template, the returned `Collection` is empty.
-
-    Args:
-        s:
-
-    Returns:
-        Collection[str]
-
-    Example:
-
-        >>> get_string_template_field_names("abc{def}{efg}")
-        ["def", "efg"]
-        >>> get_string_template_field_names("{0}-{1}")
-        ["0", "1"]
-        >>> get_string_template_field_names("hello world")
-        []
-    """
-    # string.Formatter.parse returns a 4-tuple of:
-    # `literal_text`, `field_name`, `form_at_spec`, `conversion`
-    # More info here https://docs.python.org/3.8/library/string.html#string.Formatter.parse
-    field_names = [group[1] for group in string.Formatter().parse(s) if group[1] is not None]
-
-    return field_names
-
-
-def resolve_template(path: str, options: MutableMapping[str, Any]) -> str:  # pylint: disable=C0103
-    """Given a string `path`, it attempts to replace all templates fields with values provided in `options`.
-
-    If `path` is not a string template, `path` is returned.
-
-    Args:
-        path: A string which is either a template, e.g. /path/to/file/{replace_me}.h5 or just a path /path/to/file/dont_replace_me.h5
-        options: A dynamic name for the "replace_me" field in the templated string. e.g. {"replace_me": "name_of_file"}
-
-    Returns:
-        str: Returns a static path replaced with the value in the options mapping.
-
-    Raises:
-        ValueError: if any template fields in s are not named using valid Python identifiers
-        ValueError: if a given template field cannot be resolved in `options`
-    """
-    fields = get_string_template_field_names(path)
-
-    if len(fields) == 0:
-        return path
-
-    if not all(field.isidentifier() for field in fields):
-        raise ValueError(f"Expected valid Python identifiers, found {fields}")
-
-    if not all(field in options for field in fields):
-        raise ValueError(f"Expected values for all fields in {fields}, found {list(options.keys())}")
-
-    path = path.format(**{field: options[field] for field in fields})
-    for field in fields:
-        options.pop(field)
-
-    return path
-
-
-
-
-
-
-
-

Functions

-
-
-def allow_options(options: Union[Iterable[+T_co], function, method]) -
-
-

Validate **options for a decorated reader function.

-

Args

-
-
options
-
A set of valid options for a reader (e.g. pandas.read_parquet or pandas.read_csv)
-
-

Returns

-
-
read_with_valid_options
-
The input function called with modified options.
-
-
- -Expand source code - -
def allow_options(options: Union[Iterable, FunctionType, MethodType]):
-    """Validate **options for a decorated reader function.
-
-    Args:
-        options: A set of valid options for a reader (e.g. `pandas.read_parquet` or `pandas.read_csv`)
-
-    Returns:
-        read_with_valid_options: The input function called with modified options.
-    """
-
-    def _filter_out_irrelevant_options(kwargs: Mapping, valid_options: Iterable):
-        filtered_options = {}
-        invalid_options = {}
-        for key_arg in kwargs.keys():
-            if key_arg in valid_options:
-                filtered_options[key_arg] = kwargs[key_arg]
-            else:
-                invalid_options[key_arg] = kwargs[key_arg]
-        if len(invalid_options) > 0:
-            logger.warning(
-                f"Options {invalid_options} were not used because they were not supported by the read or write method configured for this source. "
-                "Check if you expected any of those to have been used by the operation!"
-            )
-        return filtered_options
-
-    def read_with_valid_options(func):
-        @wraps(func)
-        def _(*args, **kwargs):
-            if callable(options):
-                return func(*args, **_filter_out_irrelevant_options(kwargs, args_of(options)))
-            return func(*args, **_filter_out_irrelevant_options(kwargs, options))
-
-        return _
-
-    return read_with_valid_options
-
-
-
-def args_of(func) -
-
-

Retrieve allowed options for a given function.

-

Args

-
-
func
-
A function like, e.g., pd.read_csv
-
-

Returns

-

A set of allowed options

-
- -Expand source code - -
def args_of(func):
-    """Retrieve allowed options for a given function.
-
-    Args:
-        func: A function like, e.g., pd.read_csv
-
-    Returns:
-        A set of allowed options
-    """
-    return set(inspect.signature(func).parameters.keys())
-
-
-
-def get_string_template_field_names(s: str) ‑> Collection[str] -
-
-

Given a string s, it parses the string to identify any template fields and returns the names of those fields.

-

If s is not a string template, the returned Collection is empty.

-

Args

-

s:

-

Returns

-

Collection[str]

-

Example

-
>>> get_string_template_field_names("abc{def}{efg}")
-["def", "efg"]
->>> get_string_template_field_names("{0}-{1}")
-["0", "1"]
->>> get_string_template_field_names("hello world")
-[]
-
-
- -Expand source code - -
def get_string_template_field_names(s: str) -> Collection[str]:  # pylint: disable=C0103
-    """Given a string `s`, it parses the string to identify any template fields and returns the names of those fields.
-
-     If `s` is not a string template, the returned `Collection` is empty.
-
-    Args:
-        s:
-
-    Returns:
-        Collection[str]
-
-    Example:
-
-        >>> get_string_template_field_names("abc{def}{efg}")
-        ["def", "efg"]
-        >>> get_string_template_field_names("{0}-{1}")
-        ["0", "1"]
-        >>> get_string_template_field_names("hello world")
-        []
-    """
-    # string.Formatter.parse returns a 4-tuple of:
-    # `literal_text`, `field_name`, `form_at_spec`, `conversion`
-    # More info here https://docs.python.org/3.8/library/string.html#string.Formatter.parse
-    field_names = [group[1] for group in string.Formatter().parse(s) if group[1] is not None]
-
-    return field_names
-
-
-
-def resolve_template(path: str, options: MutableMapping[str, Any]) ‑> str -
-
-

Given a string path, it attempts to replace all templates fields with values provided in options.

-

If path is not a string template, path is returned.

-

Args

-
-
path
-
A string which is either a template, e.g. /path/to/file/{replace_me}.h5 or just a path /path/to/file/dont_replace_me.h5
-
options
-
A dynamic name for the "replace_me" field in the templated string. e.g. {"replace_me": "name_of_file"}
-
-

Returns

-
-
str
-
Returns a static path replaced with the value in the options mapping.
-
-

Raises

-
-
ValueError
-
if any template fields in s are not named using valid Python identifiers
-
ValueError
-
if a given template field cannot be resolved in options
-
-
- -Expand source code - -
def resolve_template(path: str, options: MutableMapping[str, Any]) -> str:  # pylint: disable=C0103
-    """Given a string `path`, it attempts to replace all templates fields with values provided in `options`.
-
-    If `path` is not a string template, `path` is returned.
-
-    Args:
-        path: A string which is either a template, e.g. /path/to/file/{replace_me}.h5 or just a path /path/to/file/dont_replace_me.h5
-        options: A dynamic name for the "replace_me" field in the templated string. e.g. {"replace_me": "name_of_file"}
-
-    Returns:
-        str: Returns a static path replaced with the value in the options mapping.
-
-    Raises:
-        ValueError: if any template fields in s are not named using valid Python identifiers
-        ValueError: if a given template field cannot be resolved in `options`
-    """
-    fields = get_string_template_field_names(path)
-
-    if len(fields) == 0:
-        return path
-
-    if not all(field.isidentifier() for field in fields):
-        raise ValueError(f"Expected valid Python identifiers, found {fields}")
-
-    if not all(field in options for field in fields):
-        raise ValueError(f"Expected values for all fields in {fields}, found {list(options.keys())}")
-
-    path = path.format(**{field: options[field] for field in fields})
-    for field in fields:
-        options.pop(field)
-
-    return path
-
-
-
-
-
-
-
- -
- - - \ No newline at end of file diff --git a/docs/mixins/with_kafka.html b/docs/mixins/with_kafka.html deleted file mode 100644 index e708c09..0000000 --- a/docs/mixins/with_kafka.html +++ /dev/null @@ -1,477 +0,0 @@ - - - - - - -dynamicio.mixins.with_kafka API documentation - - - - - - - - - - - -
-
-
-

Module dynamicio.mixins.with_kafka

-
-
-

This module provides mixins that are providing Kafka I/O support.

-
- -Expand source code - -
# pylint: disable=no-member, protected-access, too-few-public-methods
-
-"""This module provides mixins that are providing Kafka I/O support."""
-
-
-from typing import Any, Callable, Iterable, Mapping, MutableMapping, Optional
-
-import pandas as pd  # type: ignore
-import simplejson
-from kafka import KafkaProducer  # type: ignore
-from magic_logger import logger
-
-
-from . import utils
-
-
-class WithKafka:
-    """Handles I/O operations for Kafka.
-
-    Args:
-        - options:
-            - Standard: Keyword-arguments passed to the KafkaProducer constructor (see `KafkaProducer.DEFAULT_CONFIG.keys()`).
-             - Additional Options:
-
-                - `key_generator: Callable[[Any, Mapping], T]`: defines the keying policy to be used for sending keyed-messages to Kafka. It is a `Callable` that takes a
-                `tuple(idx, row)` and returns a string that will serve as the message's key, invoked prior to serialising the key. It defaults to the dataframe's index
-                (which may not be composed of unique values or string type keys). It goes hand in hand with the default `key-serialiser`, which assumes that the keys
-                are strings and encode's them as such.
-
-                - `key_serializer: Callable[T, bytes]`: Custom key serialiser; if not provided, a default key-serializer will be used, applied on a string-key (unless key is None).
-
-                N.B. Providing a custom key-generator that generates a non-string key is best provided alongside a custom key-serializer best suited to handle the custom key-type.
-
-                - `document_transformer: Callable[[Mapping[Any, Any]`: Manipulates the messages/rows sent to Kafka as values. It is  a `Callable` taking a `Mapping` as its only
-                argument and return a `Mapping`, then this callable will be invoked prior to serializing each document. This can be used, for example, to add metadata to each
-                document that will be written to the target  Kafka topic.
-
-                - `value_serializer: Callable[Mapping, bytes]`: Custom value serialiser; if not provided, a default value-serializer will be used applied on a Mapping..
-
-    Example:
-        >>> # Given
-        >>> keyed_test_df = pd.DataFrame.from_records(
-        >>>     [
-        >>>         ["key-01", "cm_1", "id_1", 1000, "ABC"],
-        >>>         ["key-02", "cm_2", "id_2", 1000, "ABC"],
-        >>>         ["key-03", "cm_3", "id_3", 1000, "ABC"],
-        >>>     ],
-        >>>     columns=["key", "id", "foo", "bar", "baz"],
-        >>> ).set_index("key")
-        >>>
-        >>> kafka_cloud_config = IOConfig(
-        >>>     path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "processed.yaml")),
-        >>>     env_identifier="CLOUD",
-        >>>     dynamic_vars=constants,
-        >>> ).get(source_key="WRITE_TO_KAFKA_JSON")
-        >>>
-        >>> write_kafka_io = WriteKafkaIO(kafka_cloud_config, key_generator=lambda key, _: key, document_transformer=lambda doc: doc["new_field"]="new_value")
-        >>>
-        >>> # When
-        >>> with patch.object(mixins, "KafkaProducer") as mock__kafka_producer:
-        >>>     mock__kafka_producer.DEFAULT_CONFIG = KafkaProducer.DEFAULT_CONFIG
-        >>>     mock_producer = MockKafkaProducer()
-        >>>     mock__kafka_producer.return_value = mock_producer
-        >>>     write_kafka_io.write(keyed_test_df)
-        >>>
-        >>> # Then
-        >>> assert mock_producer.my_stream == [
-        >>>     {"key": "key-01", "value": {"bar": 1000, "baz": "ABC", "foo": "id_1", "id": "cm_1", "new_field": "new_value"}},
-        >>>     {"key": "key-02", "value": {"bar": 1000, "baz": "ABC", "foo": "id_2", "id": "cm_2", "new_field": "new_value"}},
-        >>>     {"key": "key-03", "value": {"bar": 1000, "baz": "ABC", "foo": "id_3", "id": "cm_3", "new_field": "new_value"}},
-        >>> ]
-    """
-
-    sources_config: Mapping
-    schema: Mapping
-    options: MutableMapping[str, Any]
-    __kafka_config: Optional[Mapping] = None
-    __producer: Optional[KafkaProducer] = None
-    __key_generator: Optional[Callable[[Any, Mapping[Any, Any]], Optional[str]]] = None
-    __document_transformer: Optional[Callable[[Mapping[Any, Any]], Mapping[Any, Any]]] = None
-
-    def _write_to_kafka(self, df: pd.DataFrame) -> None:
-        """Given a dataframe where each row is a message to be sent to a Kafka Topic, iterate through all rows and send them to a Kafka topic.
-
-         The topic is defined in `self.sources_config["kafka"]` and using a kafka producer, which is flushed at the
-         end of this process.
-
-        Args:
-            df: A dataframe where each row is a message to be sent to a Kafka Topic.
-        """
-        if self.__key_generator is None:
-            self.__key_generator = lambda idx, __: idx  # default key generator uses the dataframe's index
-            if self.options.get("key_generator") is not None:
-                self.__key_generator = self.options.pop("key_generator")
-
-        if self.__document_transformer is None:
-            self.__document_transformer = lambda value: value
-            if self.options.get("document_transformer") is not None:
-                self.__document_transformer = self.options.pop("document_transformer")
-
-        if self.__producer is None:
-            self.__producer = self._get_producer(self.sources_config["kafka"]["kafka_server"], **self.options)
-
-        self._send_messages(df=df, topic=self.sources_config["kafka"]["kafka_topic"])
-
-    @utils.allow_options(KafkaProducer.DEFAULT_CONFIG.keys())
-    def _get_producer(self, server: str, **options: MutableMapping[str, Any]) -> KafkaProducer:
-        """Generate and return a Kafka Producer.
-
-        Default options are used to generate the producer. Specifically:
-            - `bootstrap_servers`: Passed on through the source_config
-            - `value_serializer`: Uses a default_value_serializer defined in this mixin
-
-        More options can be added to the producer by passing them as keyword arguments, through valid options.
-
-        These can also override the default options.
-
-        Args:
-            server: The host name.
-            **options: Keyword arguments to pass to the KafkaProducer.
-
-        Returns:
-            A Kafka producer instance.
-        """
-        self.__kafka_config = {
-            **{
-                "bootstrap_servers": server,
-                "compression_type": "snappy",
-                "key_serializer": self._default_key_serializer,
-                "value_serializer": self._default_value_serializer,
-            },
-            **options,
-        }
-        return KafkaProducer(**self.__kafka_config)
-
-    def _send_messages(self, df: pd.DataFrame, topic: str) -> None:
-        logger.info(f"Sending {len(df)} messages to Kafka topic:{topic}.")
-
-        messages = df.reset_index(drop=True).to_dict("records")
-        for idx, message in zip(df.index.values, messages):
-            self.__producer.send(topic, key=self.__key_generator(idx, message), value=self.__document_transformer(message))  # type: ignore
-
-        self.__producer.flush()  # type: ignore
-
-    @staticmethod
-    def _default_key_serializer(key: Optional[str]) -> Optional[bytes]:
-        if key:
-            return key.encode("utf-8")
-        return None
-
-    @staticmethod
-    def _default_value_serializer(value: Mapping) -> bytes:
-        return simplejson.dumps(value, ignore_nan=True).encode("utf-8")
-
-    def _read_from_kafka(self) -> Iterable[Mapping]:  # type: ignore
-        """Read messages from a Kafka Topic and convert them to separate dataframes.
-
-        Returns:
-            Multiple dataframes, one per message read from the Kafka topic of interest.
-        """
-        # TODO: Implement kafka reader
-
-
-
-
-
-
-
-
-
-

Classes

-
-
-class WithKafka -
-
-

Handles I/O operations for Kafka.

-

Args

-
    -
  • options:
      -
    • Standard: Keyword-arguments passed to the KafkaProducer constructor (see KafkaProducer.DEFAULT_CONFIG.keys()).
    • -
    • -

      Additional Options:

      -
        -
      • -

        key_generator: Callable[[Any, Mapping], T]: defines the keying policy to be used for sending keyed-messages to Kafka. It is a Callable that takes a -tuple(idx, row) and returns a string that will serve as the message's key, invoked prior to serialising the key. It defaults to the dataframe's index -(which may not be composed of unique values or string type keys). It goes hand in hand with the default key-serialiser, which assumes that the keys -are strings and encode's them as such.

        -
      • -
      • -

        key_serializer: Callable[T, bytes]: Custom key serialiser; if not provided, a default key-serializer will be used, applied on a string-key (unless key is None).

        -
      • -
      -

      N.B. Providing a custom key-generator that generates a non-string key is best provided alongside a custom key-serializer best suited to handle the custom key-type.

      -
        -
      • -

        document_transformer: Callable[[Mapping[Any, Any]: Manipulates the messages/rows sent to Kafka as values. It is -a Callable taking a Mapping as its only -argument and return a Mapping, then this callable will be invoked prior to serializing each document. This can be used, for example, to add metadata to each -document that will be written to the target -Kafka topic.

        -
      • -
      • -

        value_serializer: Callable[Mapping, bytes]: Custom value serialiser; if not provided, a default value-serializer will be used applied on a Mapping..

        -
      • -
      -
    • -
    -
  • -
-

Example

-
>>> # Given
->>> keyed_test_df = pd.DataFrame.from_records(
->>>     [
->>>         ["key-01", "cm_1", "id_1", 1000, "ABC"],
->>>         ["key-02", "cm_2", "id_2", 1000, "ABC"],
->>>         ["key-03", "cm_3", "id_3", 1000, "ABC"],
->>>     ],
->>>     columns=["key", "id", "foo", "bar", "baz"],
->>> ).set_index("key")
->>>
->>> kafka_cloud_config = IOConfig(
->>>     path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "processed.yaml")),
->>>     env_identifier="CLOUD",
->>>     dynamic_vars=constants,
->>> ).get(source_key="WRITE_TO_KAFKA_JSON")
->>>
->>> write_kafka_io = WriteKafkaIO(kafka_cloud_config, key_generator=lambda key, _: key, document_transformer=lambda doc: doc["new_field"]="new_value")
->>>
->>> # When
->>> with patch.object(mixins, "KafkaProducer") as mock__kafka_producer:
->>>     mock__kafka_producer.DEFAULT_CONFIG = KafkaProducer.DEFAULT_CONFIG
->>>     mock_producer = MockKafkaProducer()
->>>     mock__kafka_producer.return_value = mock_producer
->>>     write_kafka_io.write(keyed_test_df)
->>>
->>> # Then
->>> assert mock_producer.my_stream == [
->>>     {"key": "key-01", "value": {"bar": 1000, "baz": "ABC", "foo": "id_1", "id": "cm_1", "new_field": "new_value"}},
->>>     {"key": "key-02", "value": {"bar": 1000, "baz": "ABC", "foo": "id_2", "id": "cm_2", "new_field": "new_value"}},
->>>     {"key": "key-03", "value": {"bar": 1000, "baz": "ABC", "foo": "id_3", "id": "cm_3", "new_field": "new_value"}},
->>> ]
-
-
- -Expand source code - -
class WithKafka:
-    """Handles I/O operations for Kafka.
-
-    Args:
-        - options:
-            - Standard: Keyword-arguments passed to the KafkaProducer constructor (see `KafkaProducer.DEFAULT_CONFIG.keys()`).
-             - Additional Options:
-
-                - `key_generator: Callable[[Any, Mapping], T]`: defines the keying policy to be used for sending keyed-messages to Kafka. It is a `Callable` that takes a
-                `tuple(idx, row)` and returns a string that will serve as the message's key, invoked prior to serialising the key. It defaults to the dataframe's index
-                (which may not be composed of unique values or string type keys). It goes hand in hand with the default `key-serialiser`, which assumes that the keys
-                are strings and encode's them as such.
-
-                - `key_serializer: Callable[T, bytes]`: Custom key serialiser; if not provided, a default key-serializer will be used, applied on a string-key (unless key is None).
-
-                N.B. Providing a custom key-generator that generates a non-string key is best provided alongside a custom key-serializer best suited to handle the custom key-type.
-
-                - `document_transformer: Callable[[Mapping[Any, Any]`: Manipulates the messages/rows sent to Kafka as values. It is  a `Callable` taking a `Mapping` as its only
-                argument and return a `Mapping`, then this callable will be invoked prior to serializing each document. This can be used, for example, to add metadata to each
-                document that will be written to the target  Kafka topic.
-
-                - `value_serializer: Callable[Mapping, bytes]`: Custom value serialiser; if not provided, a default value-serializer will be used applied on a Mapping..
-
-    Example:
-        >>> # Given
-        >>> keyed_test_df = pd.DataFrame.from_records(
-        >>>     [
-        >>>         ["key-01", "cm_1", "id_1", 1000, "ABC"],
-        >>>         ["key-02", "cm_2", "id_2", 1000, "ABC"],
-        >>>         ["key-03", "cm_3", "id_3", 1000, "ABC"],
-        >>>     ],
-        >>>     columns=["key", "id", "foo", "bar", "baz"],
-        >>> ).set_index("key")
-        >>>
-        >>> kafka_cloud_config = IOConfig(
-        >>>     path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "processed.yaml")),
-        >>>     env_identifier="CLOUD",
-        >>>     dynamic_vars=constants,
-        >>> ).get(source_key="WRITE_TO_KAFKA_JSON")
-        >>>
-        >>> write_kafka_io = WriteKafkaIO(kafka_cloud_config, key_generator=lambda key, _: key, document_transformer=lambda doc: doc["new_field"]="new_value")
-        >>>
-        >>> # When
-        >>> with patch.object(mixins, "KafkaProducer") as mock__kafka_producer:
-        >>>     mock__kafka_producer.DEFAULT_CONFIG = KafkaProducer.DEFAULT_CONFIG
-        >>>     mock_producer = MockKafkaProducer()
-        >>>     mock__kafka_producer.return_value = mock_producer
-        >>>     write_kafka_io.write(keyed_test_df)
-        >>>
-        >>> # Then
-        >>> assert mock_producer.my_stream == [
-        >>>     {"key": "key-01", "value": {"bar": 1000, "baz": "ABC", "foo": "id_1", "id": "cm_1", "new_field": "new_value"}},
-        >>>     {"key": "key-02", "value": {"bar": 1000, "baz": "ABC", "foo": "id_2", "id": "cm_2", "new_field": "new_value"}},
-        >>>     {"key": "key-03", "value": {"bar": 1000, "baz": "ABC", "foo": "id_3", "id": "cm_3", "new_field": "new_value"}},
-        >>> ]
-    """
-
-    sources_config: Mapping
-    schema: Mapping
-    options: MutableMapping[str, Any]
-    __kafka_config: Optional[Mapping] = None
-    __producer: Optional[KafkaProducer] = None
-    __key_generator: Optional[Callable[[Any, Mapping[Any, Any]], Optional[str]]] = None
-    __document_transformer: Optional[Callable[[Mapping[Any, Any]], Mapping[Any, Any]]] = None
-
-    def _write_to_kafka(self, df: pd.DataFrame) -> None:
-        """Given a dataframe where each row is a message to be sent to a Kafka Topic, iterate through all rows and send them to a Kafka topic.
-
-         The topic is defined in `self.sources_config["kafka"]` and using a kafka producer, which is flushed at the
-         end of this process.
-
-        Args:
-            df: A dataframe where each row is a message to be sent to a Kafka Topic.
-        """
-        if self.__key_generator is None:
-            self.__key_generator = lambda idx, __: idx  # default key generator uses the dataframe's index
-            if self.options.get("key_generator") is not None:
-                self.__key_generator = self.options.pop("key_generator")
-
-        if self.__document_transformer is None:
-            self.__document_transformer = lambda value: value
-            if self.options.get("document_transformer") is not None:
-                self.__document_transformer = self.options.pop("document_transformer")
-
-        if self.__producer is None:
-            self.__producer = self._get_producer(self.sources_config["kafka"]["kafka_server"], **self.options)
-
-        self._send_messages(df=df, topic=self.sources_config["kafka"]["kafka_topic"])
-
-    @utils.allow_options(KafkaProducer.DEFAULT_CONFIG.keys())
-    def _get_producer(self, server: str, **options: MutableMapping[str, Any]) -> KafkaProducer:
-        """Generate and return a Kafka Producer.
-
-        Default options are used to generate the producer. Specifically:
-            - `bootstrap_servers`: Passed on through the source_config
-            - `value_serializer`: Uses a default_value_serializer defined in this mixin
-
-        More options can be added to the producer by passing them as keyword arguments, through valid options.
-
-        These can also override the default options.
-
-        Args:
-            server: The host name.
-            **options: Keyword arguments to pass to the KafkaProducer.
-
-        Returns:
-            A Kafka producer instance.
-        """
-        self.__kafka_config = {
-            **{
-                "bootstrap_servers": server,
-                "compression_type": "snappy",
-                "key_serializer": self._default_key_serializer,
-                "value_serializer": self._default_value_serializer,
-            },
-            **options,
-        }
-        return KafkaProducer(**self.__kafka_config)
-
-    def _send_messages(self, df: pd.DataFrame, topic: str) -> None:
-        logger.info(f"Sending {len(df)} messages to Kafka topic:{topic}.")
-
-        messages = df.reset_index(drop=True).to_dict("records")
-        for idx, message in zip(df.index.values, messages):
-            self.__producer.send(topic, key=self.__key_generator(idx, message), value=self.__document_transformer(message))  # type: ignore
-
-        self.__producer.flush()  # type: ignore
-
-    @staticmethod
-    def _default_key_serializer(key: Optional[str]) -> Optional[bytes]:
-        if key:
-            return key.encode("utf-8")
-        return None
-
-    @staticmethod
-    def _default_value_serializer(value: Mapping) -> bytes:
-        return simplejson.dumps(value, ignore_nan=True).encode("utf-8")
-
-    def _read_from_kafka(self) -> Iterable[Mapping]:  # type: ignore
-        """Read messages from a Kafka Topic and convert them to separate dataframes.
-
-        Returns:
-            Multiple dataframes, one per message read from the Kafka topic of interest.
-        """
-        # TODO: Implement kafka reader
-
-

Subclasses

- -

Class variables

-
-
var options : MutableMapping[str, Any]
-
-
-
-
var schema : Mapping[~KT, +VT_co]
-
-
-
-
var sources_config : Mapping[~KT, +VT_co]
-
-
-
-
-
-
-
-
- -
- - - \ No newline at end of file diff --git a/docs/mixins/with_local.html b/docs/mixins/with_local.html deleted file mode 100644 index 507ecb4..0000000 --- a/docs/mixins/with_local.html +++ /dev/null @@ -1,698 +0,0 @@ - - - - - - -dynamicio.mixins.with_local API documentation - - - - - - - - - - - -
-
-
-

Module dynamicio.mixins.with_local

-
-
-

This module provides mixins that are providing Local FS I/O support.

-
- -Expand source code - -
# pylint: disable=no-member, protected-access, too-few-public-methods
-
-"""This module provides mixins that are providing Local FS I/O support."""
-
-import glob
-import os
-from contextlib import contextmanager
-from threading import Lock
-from typing import Any, Mapping, MutableMapping, Optional
-
-import pandas as pd  # type: ignore
-from fastparquet import ParquetFile, write  # type: ignore
-from pyarrow.parquet import read_table, write_table  # type: ignore # pylint: disable=no-name-in-module
-
-from . import utils
-
-hdf_lock = Lock()
-
-
-@contextmanager
-def pickle_protocol(protocol: Optional[int]):
-    """Downgrade to the provided pickle protocol within the context manager.
-
-    Args:
-        protocol: The number of the protocol HIGHEST_PROTOCOL to downgrade to. Defaults to 4, which covers python 3.4 and higher.
-    """
-    import pickle  # pylint: disable=import-outside-toplevel
-
-    previous = pickle.HIGHEST_PROTOCOL
-    try:
-        pickle.HIGHEST_PROTOCOL = 4
-        if protocol:
-            pickle.HIGHEST_PROTOCOL = protocol
-        yield
-    finally:
-        pickle.HIGHEST_PROTOCOL = previous
-
-
-class WithLocal:
-    """Handles local I/O operations."""
-
-    sources_config: Mapping
-    schema: Mapping
-    options: MutableMapping[str, Any]
-
-    def _read_from_local(self) -> pd.DataFrame:
-        """Read a local file as a `DataFrame`.
-
-        The configuration object is expected to have two keys:
-            - `file_path`
-            - `file_type`
-
-        To actually read the file, a method is dynamically invoked by name, using
-        "_read_{file_type}_file".
-
-        Returns:
-            DataFrame
-        """
-        local_config = self.sources_config["local"]
-        file_path = utils.resolve_template(local_config["file_path"], self.options)
-        file_type = local_config["file_type"]
-
-        return getattr(self, f"_read_{file_type}_file")(file_path, self.schema, **self.options)
-
-    def _write_to_local(self, df: pd.DataFrame):
-        """Write a dataframe locally based on the {file_type} of the config_io configuration.
-
-        The configuration object is expected to have two keys:
-
-            - `file_path`
-            - `file_type`
-
-        To actually write the file, a method is dynamically invoked by name, using
-        "_write_{file_type}_file".
-
-        Args:
-            df: The dataframe to be written out.
-        """
-        local_config = self.sources_config["local"]
-        file_path = utils.resolve_template(local_config["file_path"], self.options)
-        file_type = local_config["file_type"]
-
-        getattr(self, f"_write_{file_type}_file")(df, file_path, **self.options)
-
-    @staticmethod
-    @utils.allow_options(pd.read_hdf)
-    def _read_hdf_file(file_path: str, schema: Mapping[str, str], **options: Any) -> pd.DataFrame:
-        """Read a HDF file as a DataFrame using `pd.read_hdf`.
-
-        All `options` are passed directly to `pd.read_hdf`.
-
-        Caveats: As HDFs are not thread-safe, we use a Lock on this operation. This, practically means
-            that when used with asyncio through `async_read()` HDF files will be read sequentially.
-            For more information see: https://pandas.pydata.org/pandas-docs/dev/user_guide/io.html#caveats
-
-        Args:
-            file_path: The path to the hdf file to be read.
-            options: The pandas `read_hdf` options.
-
-        Returns:
-            DataFrame: The dataframe read from the hdf file.
-        """
-        with hdf_lock:
-            df = pd.read_hdf(file_path, **options)
-
-        columns = [column for column in df.columns.to_list() if column in schema.keys()]
-        df = df[columns]
-        return df
-
-    @staticmethod
-    @utils.allow_options(pd.read_csv)
-    def _read_csv_file(file_path: str, schema: Mapping[str, str], **options: Any) -> pd.DataFrame:
-        """Read a CSV file as a DataFrame using `pd.read_csv`.
-
-        All `options` are passed directly to `pd.read_csv`.
-
-        Args:
-            file_path: The path to the csv file to be read.
-            options: The pandas `read_csv` options.
-
-        Returns:
-            DataFrame: The dataframe read from the csv file.
-        """
-        options["usecols"] = list(schema.keys())
-        return pd.read_csv(file_path, **options)
-
-    @staticmethod
-    @utils.allow_options(pd.read_json)
-    def _read_json_file(file_path: str, schema: Mapping[str, str], **options: Any) -> pd.DataFrame:
-        """Read a json file as a DataFrame using `pd.read_hdf`.
-
-        All `options` are passed directly to `pd.read_hdf`.
-
-        Args:
-            file_path:
-            options:
-
-        Returns:
-            DataFrame
-        """
-        df = pd.read_json(file_path, **options)
-        columns = [column for column in df.columns.to_list() if column in schema.keys()]
-        df = df[columns]
-        return df
-
-    @staticmethod
-    def _read_parquet_file(file_path: str, schema: Mapping[str, str], **options: Any) -> pd.DataFrame:
-        """Read a Parquet file as a DataFrame using `pd.read_parquet`.
-
-        All `options` are passed directly to `pd.read_parquet`.
-
-        Args:
-            file_path: The path to the parquet file to be read.
-            options: The pandas `read_parquet` options.
-
-        Returns:
-            DataFrame: The dataframe read from the parquet file.
-        """
-        options["columns"] = list(schema.keys())
-
-        if options.get("engine") == "fastparquet":
-            return WithLocal.__read_with_fastparquet(file_path, **options)
-        return WithLocal.__read_with_pyarrow(file_path, **options)
-
-    @classmethod
-    @utils.allow_options([*utils.args_of(pd.read_parquet), *utils.args_of(read_table)])
-    def __read_with_pyarrow(cls, file_path: str, **options: Any) -> pd.DataFrame:
-        return pd.read_parquet(file_path, **options)
-
-    @classmethod
-    @utils.allow_options([*utils.args_of(pd.read_parquet), *utils.args_of(ParquetFile)])
-    def __read_with_fastparquet(cls, file_path: str, **options: Any) -> pd.DataFrame:
-        return pd.read_parquet(file_path, **options)
-
-    @staticmethod
-    @utils.allow_options([*utils.args_of(pd.DataFrame.to_hdf), *["protocol"]])
-    def _write_hdf_file(df: pd.DataFrame, file_path: str, **options: Any):
-        """Write a dataframe to hdf using `df.to_hdf`.
-
-        All `options` are passed directly to `df.to_hdf`.
-
-        Caveats: As HDFs are not thread-safe, we use a Lock on this operation. This, practically means
-            that when used with asyncio through `async_read()` HDF files will be written sequentially.
-            For more information see: https://pandas.pydata.org/pandas-docs/dev/user_guide/io.html#caveats
-
-        Args:
-            df: A dataframe write out.
-            file_path: The location where the file needs to be written.
-            options: The pandas `to_hdf` options.
-
-                - The pandas `to_hdf` options, &;
-                - protocol: The pickle protocol to use for writing the hdf file out; a value <=5.
-        """
-        with pickle_protocol(protocol=options.pop("protocol", None)), hdf_lock:
-            df.to_hdf(file_path, key="df", mode="w", **options)
-
-    @staticmethod
-    @utils.allow_options(pd.DataFrame.to_csv)
-    def _write_csv_file(df: pd.DataFrame, file_path: str, **options: Any):
-        """Write a dataframe as a CSV file using `df.to_csv`.
-
-        All `options` are passed directly to `df.to_csv`.
-
-        Args:
-            df: A dataframe write out.
-            file_path: The location where the file needs to be written.
-            options: Options relative to writing a csv file.
-        """
-        df.to_csv(file_path, **options)
-
-    @staticmethod
-    @utils.allow_options(pd.DataFrame.to_json)
-    def _write_json_file(df: pd.DataFrame, file_path: str, **options: Any):
-        """Write a dataframe as a json file using `df.to_json`.
-
-        All `options` are passed directly to `df.to_json`.
-
-        Args:
-            df: A dataframe write out.
-            file_path: The location where the file needs to be written.
-            options: Options relative to writing a json file.
-        """
-        df.to_json(file_path, **options)
-
-    @staticmethod
-    def _write_parquet_file(df: pd.DataFrame, file_path: str, **options: Any):
-        """Write a dataframe as a parquet file using `df.to_parquet`.
-
-        All `options` are passed directly to `df.to_parquet`.
-
-        Args:
-            df: A dataframe write out.
-            file_path: The location where the file needs to be written.
-            options: Options relative to writing a parquet file.
-        """
-        if options.get("engine") == "fastparquet":
-            return WithLocal.__write_with_fastparquet(df, file_path, **options)
-        return WithLocal.__write_with_pyarrow(df, file_path, **options)
-
-    @classmethod
-    @utils.allow_options([*utils.args_of(pd.DataFrame.to_parquet), *utils.args_of(write_table)])
-    def __write_with_pyarrow(cls, df: pd.DataFrame, filepath: str, **options: Any) -> pd.DataFrame:
-        return df.to_parquet(filepath, **options)
-
-    @classmethod
-    @utils.allow_options([*utils.args_of(pd.DataFrame.to_parquet), *utils.args_of(write)])
-    def __write_with_fastparquet(cls, df: pd.DataFrame, filepath: str, **options: Any) -> pd.DataFrame:
-        return df.to_parquet(filepath, **options)
-
-
-class WithLocalBatch(WithLocal):
-    """Responsible for batch reading local files."""
-
-    def _read_from_local_batch(self) -> pd.DataFrame:
-        """Reads a set of files for a specified file type, concatenates them and returns a dataframe.
-
-        Returns:
-            A concatenated dataframe composed of all files read through local_batch.
-        """
-        local_batch_config = self.sources_config["local"]
-
-        file_type = local_batch_config["file_type"]
-        filtering_file_type = file_type
-        if filtering_file_type == "hdf":
-            filtering_file_type = "h5"
-
-        files = glob.glob(f"{local_batch_config['path_prefix']}/*.{filtering_file_type}")
-
-        dfs_to_concatenate = []
-        for file in files:
-            file_to_load = os.path.join(local_batch_config["path_prefix"], file)
-            dfs_to_concatenate.append(getattr(self, f"_read_{file_type}_file")(file_to_load, self.schema, **self.options))  # type: ignore
-
-        return pd.concat(dfs_to_concatenate).reset_index(drop=True)
-
-
-
-
-
-
-
-

Functions

-
-
-def pickle_protocol(protocol: Optional[int]) -
-
-

Downgrade to the provided pickle protocol within the context manager.

-

Args

-
-
protocol
-
The number of the protocol HIGHEST_PROTOCOL to downgrade to. Defaults to 4, which covers python 3.4 and higher.
-
-
- -Expand source code - -
@contextmanager
-def pickle_protocol(protocol: Optional[int]):
-    """Downgrade to the provided pickle protocol within the context manager.
-
-    Args:
-        protocol: The number of the protocol HIGHEST_PROTOCOL to downgrade to. Defaults to 4, which covers python 3.4 and higher.
-    """
-    import pickle  # pylint: disable=import-outside-toplevel
-
-    previous = pickle.HIGHEST_PROTOCOL
-    try:
-        pickle.HIGHEST_PROTOCOL = 4
-        if protocol:
-            pickle.HIGHEST_PROTOCOL = protocol
-        yield
-    finally:
-        pickle.HIGHEST_PROTOCOL = previous
-
-
-
-
-
-

Classes

-
-
-class WithLocal -
-
-

Handles local I/O operations.

-
- -Expand source code - -
class WithLocal:
-    """Handles local I/O operations."""
-
-    sources_config: Mapping
-    schema: Mapping
-    options: MutableMapping[str, Any]
-
-    def _read_from_local(self) -> pd.DataFrame:
-        """Read a local file as a `DataFrame`.
-
-        The configuration object is expected to have two keys:
-            - `file_path`
-            - `file_type`
-
-        To actually read the file, a method is dynamically invoked by name, using
-        "_read_{file_type}_file".
-
-        Returns:
-            DataFrame
-        """
-        local_config = self.sources_config["local"]
-        file_path = utils.resolve_template(local_config["file_path"], self.options)
-        file_type = local_config["file_type"]
-
-        return getattr(self, f"_read_{file_type}_file")(file_path, self.schema, **self.options)
-
-    def _write_to_local(self, df: pd.DataFrame):
-        """Write a dataframe locally based on the {file_type} of the config_io configuration.
-
-        The configuration object is expected to have two keys:
-
-            - `file_path`
-            - `file_type`
-
-        To actually write the file, a method is dynamically invoked by name, using
-        "_write_{file_type}_file".
-
-        Args:
-            df: The dataframe to be written out.
-        """
-        local_config = self.sources_config["local"]
-        file_path = utils.resolve_template(local_config["file_path"], self.options)
-        file_type = local_config["file_type"]
-
-        getattr(self, f"_write_{file_type}_file")(df, file_path, **self.options)
-
-    @staticmethod
-    @utils.allow_options(pd.read_hdf)
-    def _read_hdf_file(file_path: str, schema: Mapping[str, str], **options: Any) -> pd.DataFrame:
-        """Read a HDF file as a DataFrame using `pd.read_hdf`.
-
-        All `options` are passed directly to `pd.read_hdf`.
-
-        Caveats: As HDFs are not thread-safe, we use a Lock on this operation. This, practically means
-            that when used with asyncio through `async_read()` HDF files will be read sequentially.
-            For more information see: https://pandas.pydata.org/pandas-docs/dev/user_guide/io.html#caveats
-
-        Args:
-            file_path: The path to the hdf file to be read.
-            options: The pandas `read_hdf` options.
-
-        Returns:
-            DataFrame: The dataframe read from the hdf file.
-        """
-        with hdf_lock:
-            df = pd.read_hdf(file_path, **options)
-
-        columns = [column for column in df.columns.to_list() if column in schema.keys()]
-        df = df[columns]
-        return df
-
-    @staticmethod
-    @utils.allow_options(pd.read_csv)
-    def _read_csv_file(file_path: str, schema: Mapping[str, str], **options: Any) -> pd.DataFrame:
-        """Read a CSV file as a DataFrame using `pd.read_csv`.
-
-        All `options` are passed directly to `pd.read_csv`.
-
-        Args:
-            file_path: The path to the csv file to be read.
-            options: The pandas `read_csv` options.
-
-        Returns:
-            DataFrame: The dataframe read from the csv file.
-        """
-        options["usecols"] = list(schema.keys())
-        return pd.read_csv(file_path, **options)
-
-    @staticmethod
-    @utils.allow_options(pd.read_json)
-    def _read_json_file(file_path: str, schema: Mapping[str, str], **options: Any) -> pd.DataFrame:
-        """Read a json file as a DataFrame using `pd.read_hdf`.
-
-        All `options` are passed directly to `pd.read_hdf`.
-
-        Args:
-            file_path:
-            options:
-
-        Returns:
-            DataFrame
-        """
-        df = pd.read_json(file_path, **options)
-        columns = [column for column in df.columns.to_list() if column in schema.keys()]
-        df = df[columns]
-        return df
-
-    @staticmethod
-    def _read_parquet_file(file_path: str, schema: Mapping[str, str], **options: Any) -> pd.DataFrame:
-        """Read a Parquet file as a DataFrame using `pd.read_parquet`.
-
-        All `options` are passed directly to `pd.read_parquet`.
-
-        Args:
-            file_path: The path to the parquet file to be read.
-            options: The pandas `read_parquet` options.
-
-        Returns:
-            DataFrame: The dataframe read from the parquet file.
-        """
-        options["columns"] = list(schema.keys())
-
-        if options.get("engine") == "fastparquet":
-            return WithLocal.__read_with_fastparquet(file_path, **options)
-        return WithLocal.__read_with_pyarrow(file_path, **options)
-
-    @classmethod
-    @utils.allow_options([*utils.args_of(pd.read_parquet), *utils.args_of(read_table)])
-    def __read_with_pyarrow(cls, file_path: str, **options: Any) -> pd.DataFrame:
-        return pd.read_parquet(file_path, **options)
-
-    @classmethod
-    @utils.allow_options([*utils.args_of(pd.read_parquet), *utils.args_of(ParquetFile)])
-    def __read_with_fastparquet(cls, file_path: str, **options: Any) -> pd.DataFrame:
-        return pd.read_parquet(file_path, **options)
-
-    @staticmethod
-    @utils.allow_options([*utils.args_of(pd.DataFrame.to_hdf), *["protocol"]])
-    def _write_hdf_file(df: pd.DataFrame, file_path: str, **options: Any):
-        """Write a dataframe to hdf using `df.to_hdf`.
-
-        All `options` are passed directly to `df.to_hdf`.
-
-        Caveats: As HDFs are not thread-safe, we use a Lock on this operation. This, practically means
-            that when used with asyncio through `async_read()` HDF files will be written sequentially.
-            For more information see: https://pandas.pydata.org/pandas-docs/dev/user_guide/io.html#caveats
-
-        Args:
-            df: A dataframe write out.
-            file_path: The location where the file needs to be written.
-            options: The pandas `to_hdf` options.
-
-                - The pandas `to_hdf` options, &;
-                - protocol: The pickle protocol to use for writing the hdf file out; a value <=5.
-        """
-        with pickle_protocol(protocol=options.pop("protocol", None)), hdf_lock:
-            df.to_hdf(file_path, key="df", mode="w", **options)
-
-    @staticmethod
-    @utils.allow_options(pd.DataFrame.to_csv)
-    def _write_csv_file(df: pd.DataFrame, file_path: str, **options: Any):
-        """Write a dataframe as a CSV file using `df.to_csv`.
-
-        All `options` are passed directly to `df.to_csv`.
-
-        Args:
-            df: A dataframe write out.
-            file_path: The location where the file needs to be written.
-            options: Options relative to writing a csv file.
-        """
-        df.to_csv(file_path, **options)
-
-    @staticmethod
-    @utils.allow_options(pd.DataFrame.to_json)
-    def _write_json_file(df: pd.DataFrame, file_path: str, **options: Any):
-        """Write a dataframe as a json file using `df.to_json`.
-
-        All `options` are passed directly to `df.to_json`.
-
-        Args:
-            df: A dataframe write out.
-            file_path: The location where the file needs to be written.
-            options: Options relative to writing a json file.
-        """
-        df.to_json(file_path, **options)
-
-    @staticmethod
-    def _write_parquet_file(df: pd.DataFrame, file_path: str, **options: Any):
-        """Write a dataframe as a parquet file using `df.to_parquet`.
-
-        All `options` are passed directly to `df.to_parquet`.
-
-        Args:
-            df: A dataframe write out.
-            file_path: The location where the file needs to be written.
-            options: Options relative to writing a parquet file.
-        """
-        if options.get("engine") == "fastparquet":
-            return WithLocal.__write_with_fastparquet(df, file_path, **options)
-        return WithLocal.__write_with_pyarrow(df, file_path, **options)
-
-    @classmethod
-    @utils.allow_options([*utils.args_of(pd.DataFrame.to_parquet), *utils.args_of(write_table)])
-    def __write_with_pyarrow(cls, df: pd.DataFrame, filepath: str, **options: Any) -> pd.DataFrame:
-        return df.to_parquet(filepath, **options)
-
-    @classmethod
-    @utils.allow_options([*utils.args_of(pd.DataFrame.to_parquet), *utils.args_of(write)])
-    def __write_with_fastparquet(cls, df: pd.DataFrame, filepath: str, **options: Any) -> pd.DataFrame:
-        return df.to_parquet(filepath, **options)
-
-

Subclasses

- -

Class variables

-
-
var options : MutableMapping[str, Any]
-
-
-
-
var schema : Mapping[~KT, +VT_co]
-
-
-
-
var sources_config : Mapping[~KT, +VT_co]
-
-
-
-
-
-
-class WithLocalBatch -
-
-

Responsible for batch reading local files.

-
- -Expand source code - -
class WithLocalBatch(WithLocal):
-    """Responsible for batch reading local files."""
-
-    def _read_from_local_batch(self) -> pd.DataFrame:
-        """Reads a set of files for a specified file type, concatenates them and returns a dataframe.
-
-        Returns:
-            A concatenated dataframe composed of all files read through local_batch.
-        """
-        local_batch_config = self.sources_config["local"]
-
-        file_type = local_batch_config["file_type"]
-        filtering_file_type = file_type
-        if filtering_file_type == "hdf":
-            filtering_file_type = "h5"
-
-        files = glob.glob(f"{local_batch_config['path_prefix']}/*.{filtering_file_type}")
-
-        dfs_to_concatenate = []
-        for file in files:
-            file_to_load = os.path.join(local_batch_config["path_prefix"], file)
-            dfs_to_concatenate.append(getattr(self, f"_read_{file_type}_file")(file_to_load, self.schema, **self.options))  # type: ignore
-
-        return pd.concat(dfs_to_concatenate).reset_index(drop=True)
-
-

Ancestors

- -

Subclasses

- -

Class variables

-
-
var options : MutableMapping[str, Any]
-
-
-
-
var schema : Mapping[~KT, +VT_co]
-
-
-
-
var sources_config : Mapping[~KT, +VT_co]
-
-
-
-
-
-
-
-
- -
- - - \ No newline at end of file diff --git a/docs/mixins/with_postgres.html b/docs/mixins/with_postgres.html deleted file mode 100644 index 93d0c73..0000000 --- a/docs/mixins/with_postgres.html +++ /dev/null @@ -1,504 +0,0 @@ - - - - - - -dynamicio.mixins.with_postgres API documentation - - - - - - - - - - - -
-
-
-

Module dynamicio.mixins.with_postgres

-
-
-

This module provides mixins that are providing Postgres I/O support.

-
- -Expand source code - -
# pylint: disable=no-member, protected-access, too-few-public-methods
-
-"""This module provides mixins that are providing Postgres I/O support."""
-
-import csv
-import tempfile
-from contextlib import contextmanager
-from typing import Any, Dict, Generator, Mapping, MutableMapping, Union
-
-import pandas as pd  # type: ignore
-from magic_logger import logger
-from sqlalchemy import BigInteger, Boolean, Column, create_engine, Date, DateTime, Float, Integer, String  # type: ignore
-from sqlalchemy.ext.declarative import declarative_base  # type: ignore
-from sqlalchemy.orm import Query  # type: ignore
-from sqlalchemy.orm.decl_api import DeclarativeMeta  # type: ignore
-from sqlalchemy.orm.session import Session as SqlAlchemySession  # type: ignore
-from sqlalchemy.orm.session import sessionmaker  # type: ignore
-
-from . import utils
-
-Session = sessionmaker(autoflush=True)
-
-Base = declarative_base()
-_type_lookup = {
-    "bool": Boolean,
-    "boolean": Boolean,
-    "object": String(64),
-    "int64": Integer,
-    "float64": Float,
-    "int": Integer,
-    "date": Date,
-    "datetime64[ns]": DateTime,
-    "bigint": BigInteger,
-}
-
-
-@contextmanager
-def session_for(connection_string: str) -> Generator[SqlAlchemySession, None, None]:
-    """Connect to a database using `connection_string` and returns an active session to that connection.
-
-    Args:
-        connection_string:
-
-    Yields:
-        Active session
-    """
-    engine = create_engine(connection_string)
-    session = Session(bind=engine)
-
-    try:
-        yield session
-    finally:
-        session.close()  # pylint: disable=no-member
-
-
-class WithPostgres:
-    """Handles I/O operations for Postgres.
-
-    Args:
-       - options:
-           - `truncate_and_append: bool`: If set to `True`, truncates the table and then appends the new rows. Otherwise, it drops the table and recreates it with the new rows.
-    """
-
-    sources_config: Mapping
-    schema: Mapping
-    options: MutableMapping[str, Any]
-
-    def _read_from_postgres(self) -> pd.DataFrame:
-        """Read data from postgres as a `DataFrame`.
-
-        The configuration object is expected to have the following keys:
-            - `db_user`
-            - `db_password`
-            - `db_host`
-            - `db_port`
-            - `db_name`
-
-        Returns:
-            DataFrame
-        """
-        postgres_config = self.sources_config["postgres"]
-        db_user = postgres_config["db_user"]
-        db_password = postgres_config["db_password"]
-        db_host = postgres_config["db_host"]
-        db_port = postgres_config["db_port"]
-        db_name = postgres_config["db_name"]
-
-        connection_string = f"postgresql://{db_user}:{db_password}@{db_host}:{db_port}/{db_name}"
-
-        sql_query = self.options.pop("sql_query", None)
-
-        if "schema" not in self.sources_config:
-            schema_dict = self.schema
-        else:
-            schema_dict = self.sources_config["schema"]
-        schema_name = self.sources_config["name"]
-
-        model = self._generate_model_from_schema(schema_dict, schema_name)
-
-        query = Query(self._get_table_columns(model))
-        if sql_query:
-            query = sql_query
-
-        logger.info(f"[postgres] Started downloading table: {schema_name} from: {db_host}:{db_name}")
-        with session_for(connection_string) as session:
-            return self._read_database(session, query, **self.options)
-
-    @staticmethod
-    def _generate_model_from_schema(schema_dict: Mapping, schema_name: str) -> DeclarativeMeta:
-        json_cls_schema: Dict[str, Any] = {"tablename": schema_name, "columns": []}
-
-        for col, dtype in schema_dict.items():
-            new_col = {"name": col}
-
-            if dtype in _type_lookup:
-                new_col.update({"name": col, "type": _type_lookup[dtype]})
-                json_cls_schema["columns"].append(new_col)
-
-        class_name = "".join(word.capitalize() or "_" for word in schema_name.split("_")) + "Model"
-
-        class_dict = {"clsname": class_name, "__tablename__": schema_name, "__table_args__": {"extend_existing": True}}
-        class_dict.update({column["name"]: Column(column["type"], primary_key=True) if idx == 0 else Column(column["type"]) for idx, column in enumerate(json_cls_schema["columns"])})
-
-        generated_model = type(class_name, (Base,), class_dict)
-        return generated_model
-
-    @staticmethod
-    def _get_table_columns(model):
-        tables_colums = []
-        if model:
-            for col in list(model.__table__.columns):
-                tables_colums.append(getattr(model, col.name))
-        return tables_colums
-
-    @staticmethod
-    @utils.allow_options(pd.read_sql)
-    def _read_database(session: SqlAlchemySession, query: Union[str, Query], **options: Any) -> pd.DataFrame:
-        """Run `query` against active `session` and returns the result as a `DataFrame`.
-
-        Args:
-            session: Active session
-            query: If a `Query` object is given, it should be unbound. If a `str` is given, the
-                value is used as-is.
-
-        Returns:
-            DataFrame
-        """
-        if isinstance(query, Query):
-            query = query.with_session(session).statement
-        return pd.read_sql(sql=query, con=session.get_bind(), **options)
-
-    def _write_to_postgres(self, df: pd.DataFrame):
-        """Write a dataframe to postgres based on the {file_type} of the config_io configuration.
-
-        Args:
-            df: The dataframe to be written
-        """
-        postgres_config = self.sources_config["postgres"]
-        db_user = postgres_config["db_user"]
-        db_password = postgres_config["db_password"]
-        db_host = postgres_config["db_host"]
-        db_port = postgres_config["db_port"]
-        db_name = postgres_config["db_name"]
-
-        connection_string = f"postgresql://{db_user}:{db_password}@{db_host}:{db_port}/{db_name}"
-
-        schema_dict = self.sources_config["schema"]
-        schema_name = self.sources_config["name"]
-        model = self._generate_model_from_schema(schema_dict, schema_name)
-
-        is_truncate_and_append = self.options.get("truncate_and_append", False)
-
-        logger.info(f"[postgres] Started downloading table: {schema_name} from: {db_host}:{db_name}")
-        with session_for(connection_string) as session:
-            self._write_to_database(session, model.__tablename__, df, is_truncate_and_append)  # type: ignore
-
-    @staticmethod
-    def _write_to_database(session: SqlAlchemySession, table_name: str, df: pd.DataFrame, is_truncate_and_append: bool):
-        """Write a dataframe to any database provided a session with a data model and a table name.
-
-        Args:
-            session: Generated from a data model and a table name
-            table_name: The name of the table to read from a DB
-            df: The dataframe to be written out
-            is_truncate_and_append: Supply to truncate the table and append new rows to it; otherwise, delete and replace
-        """
-        if is_truncate_and_append:
-            session.execute(f"TRUNCATE TABLE {table_name};")
-
-            # Below is a speedup hack in place of `df.to_csv` with the multipart option. As of today, even with
-            # `method="multi"`, uploading to Postgres is painfully slow. Hence, we're resorting to dumping the file as
-            # csv and using Postgres's CSV import function.
-            # https://stackoverflow.com/questions/2987433/how-to-import-csv-file-data-into-a-postgresql-table
-            with tempfile.NamedTemporaryFile(mode="r+") as temp_file:
-                df.to_csv(temp_file, index=False, header=False, sep="\t", doublequote=False, escapechar="\\", quoting=csv.QUOTE_NONE)
-                temp_file.flush()
-                temp_file.seek(0)
-
-                cur = session.connection().connection.cursor()
-                cur.copy_from(temp_file, table_name, columns=df.columns, null="")
-        else:
-            df.to_sql(name=table_name, con=session.get_bind(), if_exists="replace", index=False)
-
-        session.commit()
-
-
-
-
-
-
-
-

Functions

-
-
-def session_for(connection_string: str) ‑> Generator[sqlalchemy.orm.session.Session, None, None] -
-
-

Connect to a database using connection_string and returns an active session to that connection.

-

Args

-

connection_string:

-

Yields

-

Active session

-
- -Expand source code - -
@contextmanager
-def session_for(connection_string: str) -> Generator[SqlAlchemySession, None, None]:
-    """Connect to a database using `connection_string` and returns an active session to that connection.
-
-    Args:
-        connection_string:
-
-    Yields:
-        Active session
-    """
-    engine = create_engine(connection_string)
-    session = Session(bind=engine)
-
-    try:
-        yield session
-    finally:
-        session.close()  # pylint: disable=no-member
-
-
-
-
-
-

Classes

-
-
-class WithPostgres -
-
-

Handles I/O operations for Postgres.

-

Args

-
    -
  • options:
      -
    • truncate_and_append: bool: If set to True, truncates the table and then appends the new rows. Otherwise, it drops the table and recreates it with the new rows.
    • -
    -
  • -
-
- -Expand source code - -
class WithPostgres:
-    """Handles I/O operations for Postgres.
-
-    Args:
-       - options:
-           - `truncate_and_append: bool`: If set to `True`, truncates the table and then appends the new rows. Otherwise, it drops the table and recreates it with the new rows.
-    """
-
-    sources_config: Mapping
-    schema: Mapping
-    options: MutableMapping[str, Any]
-
-    def _read_from_postgres(self) -> pd.DataFrame:
-        """Read data from postgres as a `DataFrame`.
-
-        The configuration object is expected to have the following keys:
-            - `db_user`
-            - `db_password`
-            - `db_host`
-            - `db_port`
-            - `db_name`
-
-        Returns:
-            DataFrame
-        """
-        postgres_config = self.sources_config["postgres"]
-        db_user = postgres_config["db_user"]
-        db_password = postgres_config["db_password"]
-        db_host = postgres_config["db_host"]
-        db_port = postgres_config["db_port"]
-        db_name = postgres_config["db_name"]
-
-        connection_string = f"postgresql://{db_user}:{db_password}@{db_host}:{db_port}/{db_name}"
-
-        sql_query = self.options.pop("sql_query", None)
-
-        if "schema" not in self.sources_config:
-            schema_dict = self.schema
-        else:
-            schema_dict = self.sources_config["schema"]
-        schema_name = self.sources_config["name"]
-
-        model = self._generate_model_from_schema(schema_dict, schema_name)
-
-        query = Query(self._get_table_columns(model))
-        if sql_query:
-            query = sql_query
-
-        logger.info(f"[postgres] Started downloading table: {schema_name} from: {db_host}:{db_name}")
-        with session_for(connection_string) as session:
-            return self._read_database(session, query, **self.options)
-
-    @staticmethod
-    def _generate_model_from_schema(schema_dict: Mapping, schema_name: str) -> DeclarativeMeta:
-        json_cls_schema: Dict[str, Any] = {"tablename": schema_name, "columns": []}
-
-        for col, dtype in schema_dict.items():
-            new_col = {"name": col}
-
-            if dtype in _type_lookup:
-                new_col.update({"name": col, "type": _type_lookup[dtype]})
-                json_cls_schema["columns"].append(new_col)
-
-        class_name = "".join(word.capitalize() or "_" for word in schema_name.split("_")) + "Model"
-
-        class_dict = {"clsname": class_name, "__tablename__": schema_name, "__table_args__": {"extend_existing": True}}
-        class_dict.update({column["name"]: Column(column["type"], primary_key=True) if idx == 0 else Column(column["type"]) for idx, column in enumerate(json_cls_schema["columns"])})
-
-        generated_model = type(class_name, (Base,), class_dict)
-        return generated_model
-
-    @staticmethod
-    def _get_table_columns(model):
-        tables_colums = []
-        if model:
-            for col in list(model.__table__.columns):
-                tables_colums.append(getattr(model, col.name))
-        return tables_colums
-
-    @staticmethod
-    @utils.allow_options(pd.read_sql)
-    def _read_database(session: SqlAlchemySession, query: Union[str, Query], **options: Any) -> pd.DataFrame:
-        """Run `query` against active `session` and returns the result as a `DataFrame`.
-
-        Args:
-            session: Active session
-            query: If a `Query` object is given, it should be unbound. If a `str` is given, the
-                value is used as-is.
-
-        Returns:
-            DataFrame
-        """
-        if isinstance(query, Query):
-            query = query.with_session(session).statement
-        return pd.read_sql(sql=query, con=session.get_bind(), **options)
-
-    def _write_to_postgres(self, df: pd.DataFrame):
-        """Write a dataframe to postgres based on the {file_type} of the config_io configuration.
-
-        Args:
-            df: The dataframe to be written
-        """
-        postgres_config = self.sources_config["postgres"]
-        db_user = postgres_config["db_user"]
-        db_password = postgres_config["db_password"]
-        db_host = postgres_config["db_host"]
-        db_port = postgres_config["db_port"]
-        db_name = postgres_config["db_name"]
-
-        connection_string = f"postgresql://{db_user}:{db_password}@{db_host}:{db_port}/{db_name}"
-
-        schema_dict = self.sources_config["schema"]
-        schema_name = self.sources_config["name"]
-        model = self._generate_model_from_schema(schema_dict, schema_name)
-
-        is_truncate_and_append = self.options.get("truncate_and_append", False)
-
-        logger.info(f"[postgres] Started downloading table: {schema_name} from: {db_host}:{db_name}")
-        with session_for(connection_string) as session:
-            self._write_to_database(session, model.__tablename__, df, is_truncate_and_append)  # type: ignore
-
-    @staticmethod
-    def _write_to_database(session: SqlAlchemySession, table_name: str, df: pd.DataFrame, is_truncate_and_append: bool):
-        """Write a dataframe to any database provided a session with a data model and a table name.
-
-        Args:
-            session: Generated from a data model and a table name
-            table_name: The name of the table to read from a DB
-            df: The dataframe to be written out
-            is_truncate_and_append: Supply to truncate the table and append new rows to it; otherwise, delete and replace
-        """
-        if is_truncate_and_append:
-            session.execute(f"TRUNCATE TABLE {table_name};")
-
-            # Below is a speedup hack in place of `df.to_csv` with the multipart option. As of today, even with
-            # `method="multi"`, uploading to Postgres is painfully slow. Hence, we're resorting to dumping the file as
-            # csv and using Postgres's CSV import function.
-            # https://stackoverflow.com/questions/2987433/how-to-import-csv-file-data-into-a-postgresql-table
-            with tempfile.NamedTemporaryFile(mode="r+") as temp_file:
-                df.to_csv(temp_file, index=False, header=False, sep="\t", doublequote=False, escapechar="\\", quoting=csv.QUOTE_NONE)
-                temp_file.flush()
-                temp_file.seek(0)
-
-                cur = session.connection().connection.cursor()
-                cur.copy_from(temp_file, table_name, columns=df.columns, null="")
-        else:
-            df.to_sql(name=table_name, con=session.get_bind(), if_exists="replace", index=False)
-
-        session.commit()
-
-

Subclasses

- -

Class variables

-
-
var options : MutableMapping[str, Any]
-
-
-
-
var schema : Mapping[~KT, +VT_co]
-
-
-
-
var sources_config : Mapping[~KT, +VT_co]
-
-
-
-
-
-
-
-
- -
- - - \ No newline at end of file diff --git a/docs/mixins/with_s3.html b/docs/mixins/with_s3.html deleted file mode 100644 index a7c053c..0000000 --- a/docs/mixins/with_s3.html +++ /dev/null @@ -1,659 +0,0 @@ - - - - - - -dynamicio.mixins.with_s3 API documentation - - - - - - - - - - - -
-
-
-

Module dynamicio.mixins.with_s3

-
-
-

This module provides mixins that are providing S3 I/O support.

-
- -Expand source code - -
# pylint: disable=no-member, protected-access, too-few-public-methods
-
-"""This module provides mixins that are providing S3 I/O support."""
-
-import os
-import tempfile
-from contextlib import contextmanager
-from typing import Generator
-
-import boto3  # type: ignore
-import pandas as pd  # type: ignore
-from awscli.clidriver import create_clidriver  # type: ignore
-from magic_logger import logger
-
-
-from . import (
-    utils,
-    with_local,
-)
-
-
-def awscli_runner(*cmd: str):
-    """Runs the awscli command provided.
-
-    Args:
-        *cmd: A list of args used in the command.
-
-    Raises:
-        A runtime error exception is raised if download fails.
-
-    Example:
-
-        >>> awscli_runner("s3", "sync", "s3://mock-bucket/mock-key", ".")
-    """
-    # Run
-    exit_code = create_clidriver().main(cmd)
-
-    if exit_code > 0:
-        raise RuntimeError(f"AWS CLI exited with code {exit_code}")
-
-
-class WithS3PathPrefix(with_local.WithLocal):
-    """Handles I/O operations for AWS S3; implements read operations only.
-
-    This mixin assumes that the directories it reads from will only contain a single file-type.
-    """
-
-    def _write_to_s3_path_prefix(self, df: pd.DataFrame):
-        """Write a DataFrame to an S3 path prefix.
-
-        The configuration object is expected to have the following keys:
-            - `bucket`
-            - `path_prefix`
-            - `file_type`
-
-        Args:
-            df (pd.DataFrame): the DataFrame to be written to S3
-
-        Raises:
-            ValueError: In case `path_prefix` is missing from config
-            ValueError: In case the `partition_cols` arg is missing while trying to write a parquet file
-        """
-        s3_config = self.sources_config["s3"]
-        if "path_prefix" not in s3_config:
-            raise ValueError("`path_prefix` is required to write multiple files to an S3 key")
-
-        file_type = s3_config["file_type"]
-        if file_type != "parquet":
-            raise ValueError(f"File type not supported: {file_type}, only parquet files can be written to an S3 key")
-        if "partition_cols" not in self.options:
-            raise ValueError("`partition_cols` is required as an option to write partitioned parquet files to S3")
-
-        bucket = s3_config["bucket"]
-        path_prefix = s3_config["path_prefix"]
-        full_path_prefix = utils.resolve_template(f"s3://{bucket}/{path_prefix}", self.options)
-
-        with tempfile.TemporaryDirectory() as temp_dir:
-            self._write_parquet_file(df, temp_dir, **self.options)
-            awscli_runner(
-                "s3",
-                "sync",
-                temp_dir,
-                full_path_prefix,
-                "--acl",
-                "bucket-owner-full-control",
-                "--only-show-errors",
-                "--exact-timestamps",
-            )
-
-    def _read_from_s3_path_prefix(self) -> pd.DataFrame:
-        """Read all files under a path prefix from an S3 bucket as a `DataFrame`.
-
-        The configuration object is expected to have the following keys:
-            - `bucket`
-            - `path_prefix`
-            - `file_type`
-
-        To actually read the file, a method is dynamically invoked by name, using
-        "_read_{file_type}_path_prefix".
-
-        Returns:
-            DataFrame
-        """
-        s3_config = self.sources_config["s3"]
-        if "path_prefix" not in s3_config:
-            raise ValueError("`path_prefix` is required to read multiple files from an S3 source")
-
-        file_type = s3_config["file_type"]
-        if file_type not in {"parquet", "csv", "hdf", "json"}:
-            raise ValueError(f"File type not supported: {file_type}")
-
-        bucket = s3_config["bucket"]
-        path_prefix = s3_config["path_prefix"]
-        full_path_prefix = utils.resolve_template(f"s3://{bucket}/{path_prefix}", self.options)
-
-        # The `no_disk_space` option should be used only when reading a subset of columns from S3
-        if self.options.pop("no_disk_space", False) and file_type == "parquet":
-            return self._read_parquet_file(full_path_prefix, self.schema, **self.options)
-
-        with tempfile.TemporaryDirectory() as temp_dir:
-            # aws-cli is shown to be up to 6 times faster when downloading the complete dataset from S3 than using the boto3
-            # client or pandas directly. This is because aws-cli uses the parallel downloader, which is much faster than the
-            # boto3 client.
-            awscli_runner(
-                "s3",
-                "sync",
-                full_path_prefix,
-                temp_dir,
-                "--acl",
-                "bucket-owner-full-control",
-                "--only-show-errors",
-                "--exact-timestamps",
-            )
-
-            dfs = []
-            for file in os.listdir(temp_dir):
-                df = getattr(self, f"_read_{file_type}_file")(os.path.join(temp_dir, file), self.schema, **self.options)  # type: ignore
-                if len(df) > 0:
-                    dfs.append(df)
-
-            return pd.concat(dfs, ignore_index=True)
-
-
-class WithS3File(with_local.WithLocal):
-    """Handles I/O operations for AWS S3.
-
-    All files are persisted to disk first using boto3 as this has proven to be faster than reading them into memory.
-    Note that reading things into memory is available for csv, json and parquet types only. Unfortunately, until support
-    for generic buffer is added to read_hdf, we need to download and persists the file to disk first anyway.
-
-    Options:
-        no_disk_space: If `True`, then s3fs + fsspec will be used to read data directly into memory.
-    """
-
-    boto3_client = boto3.client("s3")
-
-    @contextmanager
-    def _s3_reader(self, s3_bucket: str, s3_key: str) -> Generator:
-        """Contextmanager to abstract reading different file types in S3.
-
-        Args:
-            s3_bucket: The S3 bucket from where to read the file.
-            s3_key: The file-path to the target file to be read.
-
-        Returns:
-            The local file path from where the file can be read, once it has been downloaded there by the boto3.client.
-
-        """
-        with tempfile.NamedTemporaryFile("wb") as target_file:
-            # Download the file from S3
-            self.boto3_client.download_fileobj(s3_bucket, s3_key, target_file)
-            # Yield local file path to body of `with` statement
-            target_file.flush()
-            yield target_file
-
-    @contextmanager
-    def _s3_writer(self, s3_bucket: str, s3_key: str) -> Generator:
-        """Contextmanager to abstract loading different file types to S3.
-
-        Args:
-            s3_bucket: The S3 bucket to upload the file to.
-            s3_key: The file-path where the target file should be uploaded to.
-
-        Returns:
-            The local file path where to actually write the file, to be read and uploaded by boto3.client.
-        """
-        with tempfile.NamedTemporaryFile("wb") as target_file:
-            # Yield local file path to body of `with` statement
-            yield target_file
-            target_file.flush()
-
-            # Upload the file to S3
-            self.boto3_client.upload_file(target_file.name, s3_bucket, s3_key, ExtraArgs={"ACL": "bucket-owner-full-control"})
-
-    def _read_from_s3_file(self) -> pd.DataFrame:
-        """Read a file from an S3 bucket as a `DataFrame`.
-
-        The configuration object is expected to have the following keys:
-            - `bucket`
-            - `file_path`
-            - `file_type`
-
-        To actually read the file, a method is dynamically invoked by name, using "_read_{file_type}_file".
-
-        Returns:
-            DataFrame
-        """
-        s3_config = self.sources_config["s3"]
-        if "file_path" not in s3_config:
-            raise ValueError("`file_path` is required for reading a file from an S3 source")
-
-        file_type = s3_config["file_type"]
-        file_path = utils.resolve_template(s3_config["file_path"], self.options)
-        bucket = s3_config["bucket"]
-
-        logger.info(f"[s3] Started downloading: s3://{s3_config['bucket']}/{file_path}")
-        if file_type in ["csv", "json", "parquet"] and self.options.pop("no_disk_space", None):
-            return getattr(self, f"_read_{file_type}_file")(f"s3://{s3_config['bucket']}/{file_path}", self.schema, **self.options)  # type: ignore
-        with self._s3_reader(s3_bucket=bucket, s3_key=file_path) as target_file:  # type: ignore
-            return getattr(self, f"_read_{file_type}_file")(target_file.name, self.schema, **self.options)  # type: ignore
-
-    def _write_to_s3_file(self, df: pd.DataFrame):
-        """Write a dataframe to s3 based on the {file_type} of the config_io configuration.
-
-        The configuration object is expected to have two keys:
-
-            - `file_path`
-            - `file_type`
-
-        To actually write the file, a method is dynamically invoked by name, using "_write_{file_type}_file".
-
-        Args:
-            df: The dataframe to be written out
-        """
-        s3_config = self.sources_config["s3"]
-        file_path = utils.resolve_template(s3_config["file_path"], self.options)
-        file_type = s3_config["file_type"]
-
-        logger.info(f"[s3] Started uploading: s3://{s3_config['bucket']}/{file_path}")
-        if file_type in ["csv", "json", "parquet"]:
-            getattr(self, f"_write_{file_type}_file")(df, f"s3://{s3_config['bucket']}/{file_path}", **self.options)  # type: ignore
-        elif file_type == "hdf":
-            with self._s3_writer(s3_bucket=s3_config["bucket"], s3_key=file_path) as target_file:  # type: ignore
-                self._write_hdf_file(df, target_file.name, **self.options)  # type: ignore
-        else:
-            raise ValueError(f"File type: {file_type} not supported!")
-        logger.info(f"[s3] Finished uploading: s3://{s3_config['bucket']}/{file_path}")
-
-
-
-
-
-
-
-

Functions

-
-
-def awscli_runner(*cmd: str) -
-
-

Runs the awscli command provided.

-

Args

-
-
*cmd
-
A list of args used in the command.
-
-

Raises

-

A runtime error exception is raised if download fails.

-

Example

-
>>> awscli_runner("s3", "sync", "s3://mock-bucket/mock-key", ".")
-
-
- -Expand source code - -
def awscli_runner(*cmd: str):
-    """Runs the awscli command provided.
-
-    Args:
-        *cmd: A list of args used in the command.
-
-    Raises:
-        A runtime error exception is raised if download fails.
-
-    Example:
-
-        >>> awscli_runner("s3", "sync", "s3://mock-bucket/mock-key", ".")
-    """
-    # Run
-    exit_code = create_clidriver().main(cmd)
-
-    if exit_code > 0:
-        raise RuntimeError(f"AWS CLI exited with code {exit_code}")
-
-
-
-
-
-

Classes

-
-
-class WithS3File -
-
-

Handles I/O operations for AWS S3.

-

All files are persisted to disk first using boto3 as this has proven to be faster than reading them into memory. -Note that reading things into memory is available for csv, json and parquet types only. Unfortunately, until support -for generic buffer is added to read_hdf, we need to download and persists the file to disk first anyway.

-

Options

-

no_disk_space: If True, then s3fs + fsspec will be used to read data directly into memory.

-
- -Expand source code - -
class WithS3File(with_local.WithLocal):
-    """Handles I/O operations for AWS S3.
-
-    All files are persisted to disk first using boto3 as this has proven to be faster than reading them into memory.
-    Note that reading things into memory is available for csv, json and parquet types only. Unfortunately, until support
-    for generic buffer is added to read_hdf, we need to download and persists the file to disk first anyway.
-
-    Options:
-        no_disk_space: If `True`, then s3fs + fsspec will be used to read data directly into memory.
-    """
-
-    boto3_client = boto3.client("s3")
-
-    @contextmanager
-    def _s3_reader(self, s3_bucket: str, s3_key: str) -> Generator:
-        """Contextmanager to abstract reading different file types in S3.
-
-        Args:
-            s3_bucket: The S3 bucket from where to read the file.
-            s3_key: The file-path to the target file to be read.
-
-        Returns:
-            The local file path from where the file can be read, once it has been downloaded there by the boto3.client.
-
-        """
-        with tempfile.NamedTemporaryFile("wb") as target_file:
-            # Download the file from S3
-            self.boto3_client.download_fileobj(s3_bucket, s3_key, target_file)
-            # Yield local file path to body of `with` statement
-            target_file.flush()
-            yield target_file
-
-    @contextmanager
-    def _s3_writer(self, s3_bucket: str, s3_key: str) -> Generator:
-        """Contextmanager to abstract loading different file types to S3.
-
-        Args:
-            s3_bucket: The S3 bucket to upload the file to.
-            s3_key: The file-path where the target file should be uploaded to.
-
-        Returns:
-            The local file path where to actually write the file, to be read and uploaded by boto3.client.
-        """
-        with tempfile.NamedTemporaryFile("wb") as target_file:
-            # Yield local file path to body of `with` statement
-            yield target_file
-            target_file.flush()
-
-            # Upload the file to S3
-            self.boto3_client.upload_file(target_file.name, s3_bucket, s3_key, ExtraArgs={"ACL": "bucket-owner-full-control"})
-
-    def _read_from_s3_file(self) -> pd.DataFrame:
-        """Read a file from an S3 bucket as a `DataFrame`.
-
-        The configuration object is expected to have the following keys:
-            - `bucket`
-            - `file_path`
-            - `file_type`
-
-        To actually read the file, a method is dynamically invoked by name, using "_read_{file_type}_file".
-
-        Returns:
-            DataFrame
-        """
-        s3_config = self.sources_config["s3"]
-        if "file_path" not in s3_config:
-            raise ValueError("`file_path` is required for reading a file from an S3 source")
-
-        file_type = s3_config["file_type"]
-        file_path = utils.resolve_template(s3_config["file_path"], self.options)
-        bucket = s3_config["bucket"]
-
-        logger.info(f"[s3] Started downloading: s3://{s3_config['bucket']}/{file_path}")
-        if file_type in ["csv", "json", "parquet"] and self.options.pop("no_disk_space", None):
-            return getattr(self, f"_read_{file_type}_file")(f"s3://{s3_config['bucket']}/{file_path}", self.schema, **self.options)  # type: ignore
-        with self._s3_reader(s3_bucket=bucket, s3_key=file_path) as target_file:  # type: ignore
-            return getattr(self, f"_read_{file_type}_file")(target_file.name, self.schema, **self.options)  # type: ignore
-
-    def _write_to_s3_file(self, df: pd.DataFrame):
-        """Write a dataframe to s3 based on the {file_type} of the config_io configuration.
-
-        The configuration object is expected to have two keys:
-
-            - `file_path`
-            - `file_type`
-
-        To actually write the file, a method is dynamically invoked by name, using "_write_{file_type}_file".
-
-        Args:
-            df: The dataframe to be written out
-        """
-        s3_config = self.sources_config["s3"]
-        file_path = utils.resolve_template(s3_config["file_path"], self.options)
-        file_type = s3_config["file_type"]
-
-        logger.info(f"[s3] Started uploading: s3://{s3_config['bucket']}/{file_path}")
-        if file_type in ["csv", "json", "parquet"]:
-            getattr(self, f"_write_{file_type}_file")(df, f"s3://{s3_config['bucket']}/{file_path}", **self.options)  # type: ignore
-        elif file_type == "hdf":
-            with self._s3_writer(s3_bucket=s3_config["bucket"], s3_key=file_path) as target_file:  # type: ignore
-                self._write_hdf_file(df, target_file.name, **self.options)  # type: ignore
-        else:
-            raise ValueError(f"File type: {file_type} not supported!")
-        logger.info(f"[s3] Finished uploading: s3://{s3_config['bucket']}/{file_path}")
-
-

Ancestors

- -

Subclasses

- -

Class variables

-
-
var boto3_client
-
-
-
-
var options : MutableMapping[str, Any]
-
-
-
-
var schema : Mapping[~KT, +VT_co]
-
-
-
-
var sources_config : Mapping[~KT, +VT_co]
-
-
-
-
-
-
-class WithS3PathPrefix -
-
-

Handles I/O operations for AWS S3; implements read operations only.

-

This mixin assumes that the directories it reads from will only contain a single file-type.

-
- -Expand source code - -
class WithS3PathPrefix(with_local.WithLocal):
-    """Handles I/O operations for AWS S3; implements read operations only.
-
-    This mixin assumes that the directories it reads from will only contain a single file-type.
-    """
-
-    def _write_to_s3_path_prefix(self, df: pd.DataFrame):
-        """Write a DataFrame to an S3 path prefix.
-
-        The configuration object is expected to have the following keys:
-            - `bucket`
-            - `path_prefix`
-            - `file_type`
-
-        Args:
-            df (pd.DataFrame): the DataFrame to be written to S3
-
-        Raises:
-            ValueError: In case `path_prefix` is missing from config
-            ValueError: In case the `partition_cols` arg is missing while trying to write a parquet file
-        """
-        s3_config = self.sources_config["s3"]
-        if "path_prefix" not in s3_config:
-            raise ValueError("`path_prefix` is required to write multiple files to an S3 key")
-
-        file_type = s3_config["file_type"]
-        if file_type != "parquet":
-            raise ValueError(f"File type not supported: {file_type}, only parquet files can be written to an S3 key")
-        if "partition_cols" not in self.options:
-            raise ValueError("`partition_cols` is required as an option to write partitioned parquet files to S3")
-
-        bucket = s3_config["bucket"]
-        path_prefix = s3_config["path_prefix"]
-        full_path_prefix = utils.resolve_template(f"s3://{bucket}/{path_prefix}", self.options)
-
-        with tempfile.TemporaryDirectory() as temp_dir:
-            self._write_parquet_file(df, temp_dir, **self.options)
-            awscli_runner(
-                "s3",
-                "sync",
-                temp_dir,
-                full_path_prefix,
-                "--acl",
-                "bucket-owner-full-control",
-                "--only-show-errors",
-                "--exact-timestamps",
-            )
-
-    def _read_from_s3_path_prefix(self) -> pd.DataFrame:
-        """Read all files under a path prefix from an S3 bucket as a `DataFrame`.
-
-        The configuration object is expected to have the following keys:
-            - `bucket`
-            - `path_prefix`
-            - `file_type`
-
-        To actually read the file, a method is dynamically invoked by name, using
-        "_read_{file_type}_path_prefix".
-
-        Returns:
-            DataFrame
-        """
-        s3_config = self.sources_config["s3"]
-        if "path_prefix" not in s3_config:
-            raise ValueError("`path_prefix` is required to read multiple files from an S3 source")
-
-        file_type = s3_config["file_type"]
-        if file_type not in {"parquet", "csv", "hdf", "json"}:
-            raise ValueError(f"File type not supported: {file_type}")
-
-        bucket = s3_config["bucket"]
-        path_prefix = s3_config["path_prefix"]
-        full_path_prefix = utils.resolve_template(f"s3://{bucket}/{path_prefix}", self.options)
-
-        # The `no_disk_space` option should be used only when reading a subset of columns from S3
-        if self.options.pop("no_disk_space", False) and file_type == "parquet":
-            return self._read_parquet_file(full_path_prefix, self.schema, **self.options)
-
-        with tempfile.TemporaryDirectory() as temp_dir:
-            # aws-cli is shown to be up to 6 times faster when downloading the complete dataset from S3 than using the boto3
-            # client or pandas directly. This is because aws-cli uses the parallel downloader, which is much faster than the
-            # boto3 client.
-            awscli_runner(
-                "s3",
-                "sync",
-                full_path_prefix,
-                temp_dir,
-                "--acl",
-                "bucket-owner-full-control",
-                "--only-show-errors",
-                "--exact-timestamps",
-            )
-
-            dfs = []
-            for file in os.listdir(temp_dir):
-                df = getattr(self, f"_read_{file_type}_file")(os.path.join(temp_dir, file), self.schema, **self.options)  # type: ignore
-                if len(df) > 0:
-                    dfs.append(df)
-
-            return pd.concat(dfs, ignore_index=True)
-
-

Ancestors

- -

Subclasses

- -

Class variables

-
-
var options : MutableMapping[str, Any]
-
-
-
-
var schema : Mapping[~KT, +VT_co]
-
-
-
-
var sources_config : Mapping[~KT, +VT_co]
-
-
-
-
-
-
-
-
- -
- - - \ No newline at end of file diff --git a/docs/validations.html b/docs/validations.html deleted file mode 100644 index 4a52e20..0000000 --- a/docs/validations.html +++ /dev/null @@ -1,923 +0,0 @@ - - - - - - -dynamicio.validations API documentation - - - - - - - - - - - -
-
-
-

Module dynamicio.validations

-
-
-

Implements the Validator class responsible for various generic data validations and metrics generation.

-
- -Expand source code - -
"""Implements the Validator class responsible for various generic data validations and metrics generation."""
-__all__ = [
-    "has_unique_values",
-    "has_no_null_values",
-    "has_acceptable_percentage_of_nulls",
-    "is_in",
-    "is_greater_than",
-    "is_greater_than_or_equal",
-    "is_lower_than",
-    "is_lower_than_or_equal",
-    "is_between",
-]
-
-import operator
-from typing import NamedTuple, Set
-
-import pandas as pd  # type: ignore
-
-
-class ValidationResult(NamedTuple):
-    """A NamedTuple for capturing different outputs after a validation."""
-
-    valid: bool
-    message: str
-    value: float
-
-
-def has_unique_values(dataset: str, df: pd.DataFrame, column: str) -> ValidationResult:
-    """Checks if values in column are unique.
-
-    Args:
-        dataset: Name fo the dataset_name
-        df: A pandas DataFrame
-        column: The column to be validated
-
-    Returns:
-        An instance of  ValidationResult where `Validation.Result.valid` is a bool indicate the success of the validation,
-        `Validation.Result.message` is a message (usually used in exceptions), and  `Validation.Result.value` is no_of_duplicated_elements
-    """
-    counts = df[column].value_counts()
-    if not (counts > 1).any():
-        return ValidationResult(valid=True, message=f"{dataset}[{column}] has unique values", value=0)
-
-    duplicates = counts[counts > 1].index.to_list()
-    return ValidationResult(valid=False, message=f"Values {duplicates} for {dataset}[{column}] are duplicated!", value=len(duplicates))
-
-
-def has_no_null_values(dataset: str, df: pd.DataFrame, column: str) -> ValidationResult:
-    """Checks if column has any null values (including NaN and NaT values).
-
-    Args:
-        dataset: Name fo the dataset_name
-        df: A pandas DataFrame
-        column: The column to be validated
-
-    Returns:
-        An instance of  ValidationResult where `Validation.Result.valid` is a bool indicate the success of the validation,
-        `Validation.Result.message` is a message (usually used in exceptions), and  `Validation.Result.value` is no_of_nulls
-    """
-    mask = df[column].isnull()
-    no_of_nulls = mask.sum()
-    return ValidationResult(valid=not mask.any(), message=f"{dataset}[{column}] has {no_of_nulls} nulls", value=no_of_nulls)
-
-
-def has_acceptable_percentage_of_nulls(
-    dataset: str,
-    df: pd.DataFrame,
-    column: str,
-    threshold: float,
-) -> ValidationResult:
-    """Checks if a provided threshold of max nulls has been exceeded.
-
-    Note: For an empty df the validation will always be successful
-
-    Args:
-        dataset: Name fo the dataset_name
-        df: A pandas DataFrame
-        column: The column to be validated
-        threshold: Maximum allowed threshold
-
-    Returns:
-        An instance of ValidationResult where `Validation.Result.valid` is a bool indicate the success of the validation,
-        `Validation.Result.message` is a message (usually used in exceptions), and  `Validation.Result.value` is percentage_of_nulls
-    """
-    if threshold <= 0 or threshold >= 1:
-        raise ValueError(f"Threshold value: {threshold} must be a value between 0 and 1.")
-
-    no_of_nulls = df[column].isnull().sum()
-    if len(df) == 0:
-        percentage_of_nulls = 0
-    else:
-        percentage_of_nulls = no_of_nulls / len(df)
-
-    if percentage_of_nulls < threshold:
-        return ValidationResult(
-            valid=True,
-            message=f"Percentage of nulls of for {dataset}[{column}] is {percentage_of_nulls}",
-            value=percentage_of_nulls,
-        )
-    return ValidationResult(
-        valid=False,
-        message=f"Percentage of nulls of for {dataset}[{column}] is {percentage_of_nulls} which exceeds threshold: {threshold}",
-        value=percentage_of_nulls,
-    )
-
-
-def is_in(dataset: str, df: pd.DataFrame, column: str, categorical_values: Set[str], match_all: bool = True) -> ValidationResult:
-    """Checks if the column only has allowed categorical values as per the set provided.
-
-    Note:
-        Ignores nulls
-
-    Args:
-        dataset: Name fo the dataset_name
-        df: A DataFrame
-        column: The DataFrame column to be validated
-        categorical_values: The allowed set of categorical values
-        match_all: If True, the categorical values must be a subset of the allowed set, otherwise they must be equal
-
-    Returns:
-        An instance of ValidationResult where `Validation.Result.valid` is a bool indicate the success of the validation,
-        `Validation.Result.message` is a message (usually used in exceptions), and `Validation.Result.value` is no_of_not_acceptable
-    """
-    unique_values = set(df[column][df[column].notna()].unique())
-
-    if match_all:
-        return _validate_categoricals_are_a_subset_of_the_acceptable(categorical_values, unique_values, column, dataset, df)
-    return _validate_all_acceptable_categoricals_are_present(categorical_values, unique_values, column, dataset, df)
-
-
-def _validate_all_acceptable_categoricals_are_present(acceptable_categoricals: Set[str], unique_values: Set[str], column: str, dataset: str, df: pd.DataFrame) -> ValidationResult:
-    if unique_values == acceptable_categoricals:
-        validation_result = ValidationResult(valid=True, message=f"All acceptable categorical values for {dataset}[{column}] are present", value=0)
-    elif unique_values < acceptable_categoricals:
-        validation_result = ValidationResult(
-            valid=False,
-            message=f"Missing categorical values for {dataset}[{column}]: {acceptable_categoricals - unique_values}",
-            value=len(acceptable_categoricals - unique_values),
-        )
-    else:
-        count_invalid = (~df[column].isin(acceptable_categoricals)).sum()
-        validation_result = ValidationResult(
-            valid=False,
-            message=f"Values {unique_values - set(acceptable_categoricals)} for {dataset}[{column}] are not acceptable for {count_invalid} cells",
-            value=count_invalid,
-        )
-    return validation_result
-
-
-def _validate_categoricals_are_a_subset_of_the_acceptable(acceptable_categoricals: Set[str], unique_values: Set[str], column: str, dataset: str, df: pd.DataFrame) -> ValidationResult:
-    if unique_values.issubset(acceptable_categoricals):
-        return ValidationResult(valid=True, message=f"Categorical values for {dataset}[{column}] are acceptable", value=0)
-    count_invalid = (~df[column].isin(acceptable_categoricals)).sum()
-    return ValidationResult(
-        valid=False,
-        message=f"Values {unique_values - set(acceptable_categoricals)} for {dataset}[{column}] are not acceptable for {count_invalid} cells",
-        value=count_invalid,
-    )
-
-
-def is_greater_than(
-    dataset: str,
-    df: pd.DataFrame,
-    column: str,
-    threshold: float,
-) -> ValidationResult:
-    """Confirms column values are above a given threshold.
-
-    Args:
-        dataset: Name fo the dataset_name
-        df: A DataFrame
-        column: The DataFrame column to be validated
-        threshold: A lower bound threshold not to be exceeded
-
-    Returns:
-        An instance of ValidationResult where `Validation.Result.valid` is a bool indicate the success of the validation,
-        `Validation.Result.message` is a message (usually used in exceptions), and `Validation.Result.value` is the
-        percentage of invalid values
-    """
-    no_nulls_for_column_df = df[~df[column].isnull()][column]
-    valid = no_nulls_for_column_df > threshold
-
-    if valid.all():
-        return ValidationResult(valid=True, message=f"All values of {dataset}[{column}] are above {threshold}", value=0)
-
-    no_of_invalid = (~valid).sum()
-    return ValidationResult(
-        valid=False,
-        message=f"{no_of_invalid} cell values for {dataset}[{column}] are below {threshold}",
-        value=no_of_invalid / len(no_nulls_for_column_df),
-    )
-
-
-def is_greater_than_or_equal(
-    dataset: str,
-    df: pd.DataFrame,
-    column: str,
-    threshold: float,
-) -> ValidationResult:
-    """Confirms column values are above a given threshold.
-
-    Args:
-        dataset: Name fo the dataset_name
-        df: A DataFrame
-        column: The DataFrame column to be validated
-        threshold: A lower bound threshold not to be exceeded
-
-    Returns:
-        An instance of ValidationResult where `Validation.Result.valid` is a bool indicate the success of the validation,
-        `Validation.Result.message` is a message (usually used in exceptions), and `Validation.Result.value` is the
-        percentage of invalid values
-    """
-    no_nulls_for_column_df = df[~df[column].isnull()][column]
-    valid = no_nulls_for_column_df >= threshold
-
-    if valid.all():
-        return ValidationResult(valid=True, message=f"All values of {dataset}[{column}] are above {threshold}", value=0)
-
-    no_of_invalid = (~valid).sum()
-    return ValidationResult(
-        valid=False,
-        message=f"{no_of_invalid} cell values for {dataset}[{column}] are below {threshold}",
-        value=no_of_invalid / len(no_nulls_for_column_df),
-    )
-
-
-def is_lower_than(
-    dataset: str,
-    df: pd.DataFrame,
-    column: str,
-    threshold: float,
-) -> ValidationResult:
-    """Confirms column values are below a given threshold.
-
-    IMPORTANT NOTE: Ignores nulls!
-
-    Args:
-        dataset: Name fo the dataset_name
-        df: A DataFrame
-        column: The DataFrame column to be validated
-        threshold: A lower bound threshold not to be exceeded
-
-    Returns:
-        An instance of ValidationResult where `Validation.Result.valid` is a bool indicate the success of the validation,
-        `Validation.Result.message` is a message (usually used in exceptions), and `Validation.Result.value` is the percentage of
-        invalid values
-    """
-    no_nulls_for_column_df = df[~df[column].isnull()][column]
-    valid = no_nulls_for_column_df < threshold  # pd.DataFrame
-
-    if valid.all():
-        return ValidationResult(valid=True, message=f"All values of {dataset}[{column}] are below {threshold}", value=0)
-
-    no_of_invalid = (~valid).sum()
-    return ValidationResult(
-        valid=False,
-        message=f"{no_of_invalid} cell values for {dataset}[{column}] are above {threshold}",
-        value=no_of_invalid / len(no_nulls_for_column_df),
-    )
-
-
-def is_lower_than_or_equal(
-    dataset: str,
-    df: pd.DataFrame,
-    column: str,
-    threshold: float,
-) -> ValidationResult:
-    """Confirms column values are below a given threshold.
-
-    IMPORTANT NOTE: Ignores nulls!
-
-    Args:
-        dataset: Name fo the dataset_name
-        df: A DataFrame
-        column: The DataFrame column to be validated
-        threshold: A lower bound threshold not to be exceeded
-
-    Returns:
-        An instance of ValidationResult where `Validation.Result.valid` is a bool indicate the success of the validation,
-        `Validation.Result.message` is a message (usually used in exceptions), and `Validation.Result.value` is the percentage of
-        invalid values
-    """
-    no_nulls_for_column_df = df[~df[column].isnull()][column]
-    valid = no_nulls_for_column_df <= threshold
-
-    if valid.all():
-        return ValidationResult(valid=True, message=f"All values of {dataset}[{column}] are below {threshold}", value=0)
-
-    no_of_invalid = (~valid).sum()
-    return ValidationResult(
-        valid=False,
-        message=f"{no_of_invalid} cell values for {dataset}[{column}] are above {threshold}",
-        value=no_of_invalid / len(no_nulls_for_column_df),
-    )
-
-
-def is_between(
-    dataset: str,
-    df: pd.DataFrame,
-    column: str,
-    lower: float,
-    upper: float,
-    include_left: bool = False,
-    include_right: bool = False,
-) -> ValidationResult:
-    """Confirms column values are between a lower bound and an upper bound thresholds.
-
-    IMPORTANT NOTE: Ignores nulls!
-
-    Args:
-        dataset: Name fo the dataset_name
-        df: A DataFrame
-        column: The DataFrame column to be validated
-        lower: The lower bound (left)
-        upper: The upper bound (right)
-        include_left: `left <= df[column]`
-        include_right: `df[column] <=right`
-
-    Returns:
-        An instance of ValidationResult where `Validation.Result.valid` is a bool indicate the success of the validation,
-        `Validation.Result.message` is a message (usually used in exceptions), and `Validation.Result.value` is the percentage of
-        invalid values
-    """
-    no_nulls_for_column_df = df[~df[column].isnull()][column]
-    lower_bound_operator = operator.ge if include_left else operator.gt
-    upper_bound_operator = operator.le if include_right else operator.lt
-
-    valid = lower_bound_operator(no_nulls_for_column_df, lower) & upper_bound_operator(no_nulls_for_column_df, upper)
-
-    if valid.all():
-        return ValidationResult(valid=True, message=f"All values of {dataset}[{column}] is between {lower} and {upper} thresholds", value=0)
-
-    no_of_invalid = (~valid).sum()
-    return ValidationResult(
-        valid=False,
-        message=f"{no_of_invalid} cell values for {dataset}[{column}] are either below {lower} or above {upper}",
-        value=no_of_invalid / len(no_nulls_for_column_df),
-    )
-
-
-
-
-
-
-
-

Functions

-
-
-def has_acceptable_percentage_of_nulls(dataset: str, df: pandas.core.frame.DataFrame, column: str, threshold: float) ‑> dynamicio.validations.ValidationResult -
-
-

Checks if a provided threshold of max nulls has been exceeded.

-

Note: For an empty df the validation will always be successful

-

Args

-
-
dataset
-
Name fo the dataset_name
-
df
-
A pandas DataFrame
-
column
-
The column to be validated
-
threshold
-
Maximum allowed threshold
-
-

Returns

-

An instance of ValidationResult where Validation.Result.valid is a bool indicate the success of the validation, -Validation.Result.message is a message (usually used in exceptions), and -Validation.Result.value is percentage_of_nulls

-
- -Expand source code - -
def has_acceptable_percentage_of_nulls(
-    dataset: str,
-    df: pd.DataFrame,
-    column: str,
-    threshold: float,
-) -> ValidationResult:
-    """Checks if a provided threshold of max nulls has been exceeded.
-
-    Note: For an empty df the validation will always be successful
-
-    Args:
-        dataset: Name fo the dataset_name
-        df: A pandas DataFrame
-        column: The column to be validated
-        threshold: Maximum allowed threshold
-
-    Returns:
-        An instance of ValidationResult where `Validation.Result.valid` is a bool indicate the success of the validation,
-        `Validation.Result.message` is a message (usually used in exceptions), and  `Validation.Result.value` is percentage_of_nulls
-    """
-    if threshold <= 0 or threshold >= 1:
-        raise ValueError(f"Threshold value: {threshold} must be a value between 0 and 1.")
-
-    no_of_nulls = df[column].isnull().sum()
-    if len(df) == 0:
-        percentage_of_nulls = 0
-    else:
-        percentage_of_nulls = no_of_nulls / len(df)
-
-    if percentage_of_nulls < threshold:
-        return ValidationResult(
-            valid=True,
-            message=f"Percentage of nulls of for {dataset}[{column}] is {percentage_of_nulls}",
-            value=percentage_of_nulls,
-        )
-    return ValidationResult(
-        valid=False,
-        message=f"Percentage of nulls of for {dataset}[{column}] is {percentage_of_nulls} which exceeds threshold: {threshold}",
-        value=percentage_of_nulls,
-    )
-
-
-
-def has_no_null_values(dataset: str, df: pandas.core.frame.DataFrame, column: str) ‑> dynamicio.validations.ValidationResult -
-
-

Checks if column has any null values (including NaN and NaT values).

-

Args

-
-
dataset
-
Name fo the dataset_name
-
df
-
A pandas DataFrame
-
column
-
The column to be validated
-
-

Returns

-

An instance of -ValidationResult where Validation.Result.valid is a bool indicate the success of the validation, -Validation.Result.message is a message (usually used in exceptions), and -Validation.Result.value is no_of_nulls

-
- -Expand source code - -
def has_no_null_values(dataset: str, df: pd.DataFrame, column: str) -> ValidationResult:
-    """Checks if column has any null values (including NaN and NaT values).
-
-    Args:
-        dataset: Name fo the dataset_name
-        df: A pandas DataFrame
-        column: The column to be validated
-
-    Returns:
-        An instance of  ValidationResult where `Validation.Result.valid` is a bool indicate the success of the validation,
-        `Validation.Result.message` is a message (usually used in exceptions), and  `Validation.Result.value` is no_of_nulls
-    """
-    mask = df[column].isnull()
-    no_of_nulls = mask.sum()
-    return ValidationResult(valid=not mask.any(), message=f"{dataset}[{column}] has {no_of_nulls} nulls", value=no_of_nulls)
-
-
-
-def has_unique_values(dataset: str, df: pandas.core.frame.DataFrame, column: str) ‑> dynamicio.validations.ValidationResult -
-
-

Checks if values in column are unique.

-

Args

-
-
dataset
-
Name fo the dataset_name
-
df
-
A pandas DataFrame
-
column
-
The column to be validated
-
-

Returns

-

An instance of -ValidationResult where Validation.Result.valid is a bool indicate the success of the validation, -Validation.Result.message is a message (usually used in exceptions), and -Validation.Result.value is no_of_duplicated_elements

-
- -Expand source code - -
def has_unique_values(dataset: str, df: pd.DataFrame, column: str) -> ValidationResult:
-    """Checks if values in column are unique.
-
-    Args:
-        dataset: Name fo the dataset_name
-        df: A pandas DataFrame
-        column: The column to be validated
-
-    Returns:
-        An instance of  ValidationResult where `Validation.Result.valid` is a bool indicate the success of the validation,
-        `Validation.Result.message` is a message (usually used in exceptions), and  `Validation.Result.value` is no_of_duplicated_elements
-    """
-    counts = df[column].value_counts()
-    if not (counts > 1).any():
-        return ValidationResult(valid=True, message=f"{dataset}[{column}] has unique values", value=0)
-
-    duplicates = counts[counts > 1].index.to_list()
-    return ValidationResult(valid=False, message=f"Values {duplicates} for {dataset}[{column}] are duplicated!", value=len(duplicates))
-
-
-
-def is_between(dataset: str, df: pandas.core.frame.DataFrame, column: str, lower: float, upper: float, include_left: bool = False, include_right: bool = False) ‑> dynamicio.validations.ValidationResult -
-
-

Confirms column values are between a lower bound and an upper bound thresholds.

-

IMPORTANT NOTE: Ignores nulls!

-

Args

-
-
dataset
-
Name fo the dataset_name
-
df
-
A DataFrame
-
column
-
The DataFrame column to be validated
-
lower
-
The lower bound (left)
-
upper
-
The upper bound (right)
-
include_left
-
left <= df[column]
-
include_right
-
df[column] <=right
-
-

Returns

-

An instance of ValidationResult where Validation.Result.valid is a bool indicate the success of the validation, -Validation.Result.message is a message (usually used in exceptions), and Validation.Result.value is the percentage of -invalid values

-
- -Expand source code - -
def is_between(
-    dataset: str,
-    df: pd.DataFrame,
-    column: str,
-    lower: float,
-    upper: float,
-    include_left: bool = False,
-    include_right: bool = False,
-) -> ValidationResult:
-    """Confirms column values are between a lower bound and an upper bound thresholds.
-
-    IMPORTANT NOTE: Ignores nulls!
-
-    Args:
-        dataset: Name fo the dataset_name
-        df: A DataFrame
-        column: The DataFrame column to be validated
-        lower: The lower bound (left)
-        upper: The upper bound (right)
-        include_left: `left <= df[column]`
-        include_right: `df[column] <=right`
-
-    Returns:
-        An instance of ValidationResult where `Validation.Result.valid` is a bool indicate the success of the validation,
-        `Validation.Result.message` is a message (usually used in exceptions), and `Validation.Result.value` is the percentage of
-        invalid values
-    """
-    no_nulls_for_column_df = df[~df[column].isnull()][column]
-    lower_bound_operator = operator.ge if include_left else operator.gt
-    upper_bound_operator = operator.le if include_right else operator.lt
-
-    valid = lower_bound_operator(no_nulls_for_column_df, lower) & upper_bound_operator(no_nulls_for_column_df, upper)
-
-    if valid.all():
-        return ValidationResult(valid=True, message=f"All values of {dataset}[{column}] is between {lower} and {upper} thresholds", value=0)
-
-    no_of_invalid = (~valid).sum()
-    return ValidationResult(
-        valid=False,
-        message=f"{no_of_invalid} cell values for {dataset}[{column}] are either below {lower} or above {upper}",
-        value=no_of_invalid / len(no_nulls_for_column_df),
-    )
-
-
-
-def is_greater_than(dataset: str, df: pandas.core.frame.DataFrame, column: str, threshold: float) ‑> dynamicio.validations.ValidationResult -
-
-

Confirms column values are above a given threshold.

-

Args

-
-
dataset
-
Name fo the dataset_name
-
df
-
A DataFrame
-
column
-
The DataFrame column to be validated
-
threshold
-
A lower bound threshold not to be exceeded
-
-

Returns

-

An instance of ValidationResult where Validation.Result.valid is a bool indicate the success of the validation, -Validation.Result.message is a message (usually used in exceptions), and Validation.Result.value is the -percentage of invalid values

-
- -Expand source code - -
def is_greater_than(
-    dataset: str,
-    df: pd.DataFrame,
-    column: str,
-    threshold: float,
-) -> ValidationResult:
-    """Confirms column values are above a given threshold.
-
-    Args:
-        dataset: Name fo the dataset_name
-        df: A DataFrame
-        column: The DataFrame column to be validated
-        threshold: A lower bound threshold not to be exceeded
-
-    Returns:
-        An instance of ValidationResult where `Validation.Result.valid` is a bool indicate the success of the validation,
-        `Validation.Result.message` is a message (usually used in exceptions), and `Validation.Result.value` is the
-        percentage of invalid values
-    """
-    no_nulls_for_column_df = df[~df[column].isnull()][column]
-    valid = no_nulls_for_column_df > threshold
-
-    if valid.all():
-        return ValidationResult(valid=True, message=f"All values of {dataset}[{column}] are above {threshold}", value=0)
-
-    no_of_invalid = (~valid).sum()
-    return ValidationResult(
-        valid=False,
-        message=f"{no_of_invalid} cell values for {dataset}[{column}] are below {threshold}",
-        value=no_of_invalid / len(no_nulls_for_column_df),
-    )
-
-
-
-def is_greater_than_or_equal(dataset: str, df: pandas.core.frame.DataFrame, column: str, threshold: float) ‑> dynamicio.validations.ValidationResult -
-
-

Confirms column values are above a given threshold.

-

Args

-
-
dataset
-
Name fo the dataset_name
-
df
-
A DataFrame
-
column
-
The DataFrame column to be validated
-
threshold
-
A lower bound threshold not to be exceeded
-
-

Returns

-

An instance of ValidationResult where Validation.Result.valid is a bool indicate the success of the validation, -Validation.Result.message is a message (usually used in exceptions), and Validation.Result.value is the -percentage of invalid values

-
- -Expand source code - -
def is_greater_than_or_equal(
-    dataset: str,
-    df: pd.DataFrame,
-    column: str,
-    threshold: float,
-) -> ValidationResult:
-    """Confirms column values are above a given threshold.
-
-    Args:
-        dataset: Name fo the dataset_name
-        df: A DataFrame
-        column: The DataFrame column to be validated
-        threshold: A lower bound threshold not to be exceeded
-
-    Returns:
-        An instance of ValidationResult where `Validation.Result.valid` is a bool indicate the success of the validation,
-        `Validation.Result.message` is a message (usually used in exceptions), and `Validation.Result.value` is the
-        percentage of invalid values
-    """
-    no_nulls_for_column_df = df[~df[column].isnull()][column]
-    valid = no_nulls_for_column_df >= threshold
-
-    if valid.all():
-        return ValidationResult(valid=True, message=f"All values of {dataset}[{column}] are above {threshold}", value=0)
-
-    no_of_invalid = (~valid).sum()
-    return ValidationResult(
-        valid=False,
-        message=f"{no_of_invalid} cell values for {dataset}[{column}] are below {threshold}",
-        value=no_of_invalid / len(no_nulls_for_column_df),
-    )
-
-
-
-def is_in(dataset: str, df: pandas.core.frame.DataFrame, column: str, categorical_values: Set[str], match_all: bool = True) ‑> dynamicio.validations.ValidationResult -
-
-

Checks if the column only has allowed categorical values as per the set provided.

-

Note

-

Ignores nulls

-

Args

-
-
dataset
-
Name fo the dataset_name
-
df
-
A DataFrame
-
column
-
The DataFrame column to be validated
-
categorical_values
-
The allowed set of categorical values
-
match_all
-
If True, the categorical values must be a subset of the allowed set, otherwise they must be equal
-
-

Returns

-

An instance of ValidationResult where Validation.Result.valid is a bool indicate the success of the validation, -Validation.Result.message is a message (usually used in exceptions), and Validation.Result.value is no_of_not_acceptable

-
- -Expand source code - -
def is_in(dataset: str, df: pd.DataFrame, column: str, categorical_values: Set[str], match_all: bool = True) -> ValidationResult:
-    """Checks if the column only has allowed categorical values as per the set provided.
-
-    Note:
-        Ignores nulls
-
-    Args:
-        dataset: Name fo the dataset_name
-        df: A DataFrame
-        column: The DataFrame column to be validated
-        categorical_values: The allowed set of categorical values
-        match_all: If True, the categorical values must be a subset of the allowed set, otherwise they must be equal
-
-    Returns:
-        An instance of ValidationResult where `Validation.Result.valid` is a bool indicate the success of the validation,
-        `Validation.Result.message` is a message (usually used in exceptions), and `Validation.Result.value` is no_of_not_acceptable
-    """
-    unique_values = set(df[column][df[column].notna()].unique())
-
-    if match_all:
-        return _validate_categoricals_are_a_subset_of_the_acceptable(categorical_values, unique_values, column, dataset, df)
-    return _validate_all_acceptable_categoricals_are_present(categorical_values, unique_values, column, dataset, df)
-
-
-
-def is_lower_than(dataset: str, df: pandas.core.frame.DataFrame, column: str, threshold: float) ‑> dynamicio.validations.ValidationResult -
-
-

Confirms column values are below a given threshold.

-

IMPORTANT NOTE: Ignores nulls!

-

Args

-
-
dataset
-
Name fo the dataset_name
-
df
-
A DataFrame
-
column
-
The DataFrame column to be validated
-
threshold
-
A lower bound threshold not to be exceeded
-
-

Returns

-

An instance of ValidationResult where Validation.Result.valid is a bool indicate the success of the validation, -Validation.Result.message is a message (usually used in exceptions), and Validation.Result.value is the percentage of -invalid values

-
- -Expand source code - -
def is_lower_than(
-    dataset: str,
-    df: pd.DataFrame,
-    column: str,
-    threshold: float,
-) -> ValidationResult:
-    """Confirms column values are below a given threshold.
-
-    IMPORTANT NOTE: Ignores nulls!
-
-    Args:
-        dataset: Name fo the dataset_name
-        df: A DataFrame
-        column: The DataFrame column to be validated
-        threshold: A lower bound threshold not to be exceeded
-
-    Returns:
-        An instance of ValidationResult where `Validation.Result.valid` is a bool indicate the success of the validation,
-        `Validation.Result.message` is a message (usually used in exceptions), and `Validation.Result.value` is the percentage of
-        invalid values
-    """
-    no_nulls_for_column_df = df[~df[column].isnull()][column]
-    valid = no_nulls_for_column_df < threshold  # pd.DataFrame
-
-    if valid.all():
-        return ValidationResult(valid=True, message=f"All values of {dataset}[{column}] are below {threshold}", value=0)
-
-    no_of_invalid = (~valid).sum()
-    return ValidationResult(
-        valid=False,
-        message=f"{no_of_invalid} cell values for {dataset}[{column}] are above {threshold}",
-        value=no_of_invalid / len(no_nulls_for_column_df),
-    )
-
-
-
-def is_lower_than_or_equal(dataset: str, df: pandas.core.frame.DataFrame, column: str, threshold: float) ‑> dynamicio.validations.ValidationResult -
-
-

Confirms column values are below a given threshold.

-

IMPORTANT NOTE: Ignores nulls!

-

Args

-
-
dataset
-
Name fo the dataset_name
-
df
-
A DataFrame
-
column
-
The DataFrame column to be validated
-
threshold
-
A lower bound threshold not to be exceeded
-
-

Returns

-

An instance of ValidationResult where Validation.Result.valid is a bool indicate the success of the validation, -Validation.Result.message is a message (usually used in exceptions), and Validation.Result.value is the percentage of -invalid values

-
- -Expand source code - -
def is_lower_than_or_equal(
-    dataset: str,
-    df: pd.DataFrame,
-    column: str,
-    threshold: float,
-) -> ValidationResult:
-    """Confirms column values are below a given threshold.
-
-    IMPORTANT NOTE: Ignores nulls!
-
-    Args:
-        dataset: Name fo the dataset_name
-        df: A DataFrame
-        column: The DataFrame column to be validated
-        threshold: A lower bound threshold not to be exceeded
-
-    Returns:
-        An instance of ValidationResult where `Validation.Result.valid` is a bool indicate the success of the validation,
-        `Validation.Result.message` is a message (usually used in exceptions), and `Validation.Result.value` is the percentage of
-        invalid values
-    """
-    no_nulls_for_column_df = df[~df[column].isnull()][column]
-    valid = no_nulls_for_column_df <= threshold
-
-    if valid.all():
-        return ValidationResult(valid=True, message=f"All values of {dataset}[{column}] are below {threshold}", value=0)
-
-    no_of_invalid = (~valid).sum()
-    return ValidationResult(
-        valid=False,
-        message=f"{no_of_invalid} cell values for {dataset}[{column}] are above {threshold}",
-        value=no_of_invalid / len(no_nulls_for_column_df),
-    )
-
-
-
-
-
-
-
- -
- - - \ No newline at end of file diff --git a/tests/resources/data/processed/.gitkeep b/dynamicio/.gitkeep similarity index 100% rename from tests/resources/data/processed/.gitkeep rename to dynamicio/.gitkeep diff --git a/dynamicio/__init__.py b/dynamicio/__init__.py index 6b037c0..6ca418f 100644 --- a/dynamicio/__init__.py +++ b/dynamicio/__init__.py @@ -1,53 +1,7 @@ """A package for wrapping your I/O operations.""" -import os -from contextlib import suppress -import pkg_resources -from magic_logger import logger +import logging -with suppress(Exception): - __version__ = pkg_resources.get_distribution("dynamicio").version +from dynamicio.io import LocalFileResource, S3Resource, PostgresResource, KafkaResource -from dynamicio.core import DynamicDataIO -from dynamicio.mixins import WithKafka, WithLocal, WithLocalBatch, WithPostgres, WithS3File, WithS3PathPrefix - -os.environ["LC_CTYPE"] = "en_US.UTF" # Set your locale to a unicode-compatible one - - -class UnifiedIO(WithS3File, WithS3PathPrefix, WithLocalBatch, WithLocal, WithKafka, WithPostgres, DynamicDataIO): # type: ignore - """A unified io composed of dynamicio.mixins.""" - - -logging_config = { - "version": 1, - "disable_existing_loggers": True, - "formatters": { - "standard": {"format": "%(asctime)s [%(levelname)s] %(name)s: %(message)s"}, - "generic-metrics": {"format": "%(message)s"}, - }, - "handlers": { - "default": { - "level": "INFO", - "formatter": "standard", - "class": "logging.StreamHandler", - "stream": "ext://sys.stdout", # Default is stderr - }, - "metrics": { - "level": "INFO", - "formatter": "generic-metrics", - "class": "logging.StreamHandler", - "stream": "ext://sys.stdout", # Default is stderr - }, - }, - "loggers": { - "": {"handlers": ["default"], "level": "INFO", "propagate": False}, - "dynamicio.metrics": {"handlers": ["metrics"], "level": "INFO", "propagate": False}, - "awscli": { - "handlers": ["default"], - "level": "INFO", - "propagate": False, - }, - }, -} - -logger.dict_config(logging_config) +logging.getLogger(__name__).addHandler(logging.NullHandler()) diff --git a/dynamicio/__main__.py b/dynamicio/__main__.py deleted file mode 100644 index ba3addf..0000000 --- a/dynamicio/__main__.py +++ /dev/null @@ -1,4 +0,0 @@ -"""Invokes dynamicio cli.""" -from dynamicio.cli import run - -run() diff --git a/dynamicio/cli.py b/dynamicio/cli.py deleted file mode 100644 index 9dfeb16..0000000 --- a/dynamicio/cli.py +++ /dev/null @@ -1,103 +0,0 @@ -"""Implements the dynamicio Command Line Interface (CLI).""" -import argparse -import glob -import os -import pprint -from typing import Mapping, MutableMapping, Optional, Sequence - -import pandas as pd # type: ignore -import yaml - -from dynamicio.errors import InvalidDatasetTypeError - - -def parse_args(args: Optional[Sequence] = None) -> argparse.Namespace: - """Arguments parser for dynamicio cli.py. - - Args: - args: List of args to be parsed. Defaults to None, in which case - sys.argv[1:] is used. - - Returns: - An instance of ArgumentParser populated with the provided args. - """ - parser = argparse.ArgumentParser(prog="dynamicio", description="Generate dataset schemas") - group = parser.add_mutually_exclusive_group(required=True) - group.add_argument( - "-b", - "--batch", - action="store_true", - help="flag, used to generate multiple schemas provided a datasets directory.", - ) - group.add_argument( - "-s", - "--single", - action="store_true", - help="flag, used to generate a schema provided a single dataset.", - ) - parser.add_argument("-p", "--path", required=True, help="the path to the dataset/datasets-directory.", type=str) - parser.add_argument("-o", "--output", required=True, help="the path to the schemas output directory.", type=str) - return parser.parse_args(args) - - -def generate_schema_for(dataset: str) -> Mapping: - """Generate a schema for a dataset. - - Args: - dataset: The path to the dataset for which we want to generate a schema - - Returns: - A dictionary containing the schema for the dataset, or None if the dataset is not valid. - - Raises: - InvalidDatasetTypeError: If the dataset type is not supported by dynamicio. - """ - dataset_name, file_type = os.path.splitext(os.path.basename(dataset)) - if file_type == ".parquet": - df = pd.read_parquet(dataset) - elif file_type == ".csv": - df = pd.read_csv(dataset) - elif file_type == ".json": - df = pd.read_json(dataset) - elif file_type == ".h5": - df = pd.read_hdf(dataset) - else: - raise InvalidDatasetTypeError(dataset) - - print(f"Generating schema for: {dataset}") - json_schema: MutableMapping = {"name": dataset_name, "columns": {}} - for column, d_type in zip(list(df.columns), list(df.dtypes)): - json_schema["columns"][column] = {"type": "", "validations": {}, "metrics": []} - json_schema["columns"][column]["type"] = d_type.name - - return json_schema - - -def main(args: argparse.Namespace): - """Main function for dynamicio cli.py. - - Args: - args: Parsed args. - """ - if args.batch: - dataset_files = glob.glob(os.path.join(args.path, "*.*")) - for dataset in dataset_files: - try: - json_schema = generate_schema_for(dataset) - except InvalidDatasetTypeError as exception: - print(f"Skipping {exception.message}! You may want to remove this file from the datasets directory") - else: - with open(os.path.join(args.output, f"{json_schema['name']}.yaml"), "w") as yml: # pylint: disable=unspecified-encoding] - yaml.safe_dump(json_schema, yml) - - if args.single: - json_schema = generate_schema_for(str(args.path)) - with open(os.path.join(args.output, f"{json_schema['name']}.yaml"), "w") as yml: # pylint: disable=unspecified-encoding] - yaml.safe_dump(json_schema, yml) - pprint.pprint(json_schema) - - -def run(): - """Entry point for the dynamicio cli.py.""" - args = parse_args() - main(args) diff --git a/dynamicio/config/__init__.py b/dynamicio/config/__init__.py deleted file mode 100644 index 47a7ec6..0000000 --- a/dynamicio/config/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -"""Dynamicio config file handling routines.""" - -from dynamicio.config import pydantic -from dynamicio.config.io_config import IOConfig diff --git a/dynamicio/config/io_config.py b/dynamicio/config/io_config.py deleted file mode 100644 index f1f6033..0000000 --- a/dynamicio/config/io_config.py +++ /dev/null @@ -1,273 +0,0 @@ -"""Implements the `IOConfig` class, generating objects used as a configuration parameter for the instantiation of`src.utils.dynamicio.dataio.DynamicDataIO` objects. - -The `IOConfig` object, essentially parses a yaml file that contains a set of input sources that will be processed by a -task, converting filtering and converting them into dictionaries. - -For example, suppose an `input.yaml` file, containing: - - READ_FROM_S3_CSV: - LOCAL: - type: "local" - local: - file_path: "[[ TEST_RESOURCES ]]/data/input/some_csv_to_read.csv" - file_type: "csv" - CLOUD: - type: "s3" - s3: - bucket: "[[ MOCK_BUCKET ]]" - file_path: "[[ MOCK_KEY ]]" - file_type: "csv" - -would be loaded with: - - input_sources_config = IOConfig( - "path_to/input.yaml", - env_identifier="CLOUD", - dynamic_vars=config_module - ) - -and: - - input_sources_config.config - -would return: - - { - "READ_FROM_S3_CSV": { - "LOCAL": { - "type": "local", - "local": { - "file_path": f"{test_global_vars.TEST_RESOURCES}/data/input/some_csv_to_read.csv", - "file_type": "csv", - }, - }, - "CLOUD": { - "type": "s3", - "s3": { - "bucket": "mock-bucket", - "file_path": "mock-key", - "file_type": "csv" - } - }, - } - } -""" -__all__ = ["IOConfig", "SafeDynamicResourceLoader", "SafeDynamicSchemaLoader"] - -import re -from types import ModuleType -from typing import Any, List, MutableMapping - -import pydantic -import yaml -from magic_logger import logger - -from dynamicio.config.pydantic import BindingsYaml, IOEnvironment - - -class SafeDynamicResourceLoader(yaml.SafeLoader): - """Implements a dynamic yaml loader that parses yaml files and replaces strings that map to [[ DYNAMIC_VAR ]]. - - Dynamic variables defined in a provided module object. - """ - - module = None - dynamic_data_matcher = re.compile(r"(.*)(\[\[\s*(\S+)\s*]])(.*)") - - @classmethod - def with_module(cls, module: ModuleType): - """Creates a dynamic subclass of SafeDynamicLoader with the `data_module` attribute set to `module`. - - Args: - module: A global vars module with all the dynamic values defined in it. - - Returns: - type - """ - return type(f"{cls.__name__}_{module.__name__}", (cls,), {"module": module}) - - def dyn_str_constructor(self, node: yaml.nodes.ScalarNode) -> str: - """Responsible for the switching of one or more "[[ DYNAMIC_VAR ]]" strings with the respective attributes value in a given module. - - Args: - node: Parsed item whose dynamic values that map to the "[[ DYNAMIC_VAR ]]" convention - are replaced with the respective attributes in te provided module. - - Returns: - Constructed `str` or numerical. - """ - value = node.value - - while result := self.dynamic_data_matcher.match(value): - ref = result.group(3) - replacement = getattr(self.module, ref) - - value = self.dynamic_data_matcher.sub(f"\\g<1>{replacement}\\g<4>", value) - - return value - - -class SafeDynamicSchemaLoader(yaml.SafeLoader): - """Implements a dynamic yaml loader that parses yaml files and replaces strings that map to [[ DYNAMIC_VAR ]]. - - Dynamic variables defined in a provided module object. - """ - - module = None - dynamic_data_matcher = re.compile(r"(.*)(\[\[\s*(\S+)\s*]])(.*)") - - @classmethod - def with_module(cls, module: ModuleType): - """Creates a dynamic subclass of SafeDynamicLoader with the `data_module` attribute set to `module`. - - Args: - module: A global vars module with all the dynamic values defined in it. - - Returns: - type - """ - return type(f"{cls.__name__}_{module.__name__}", (cls,), {"module": module}) - - def dyn_value_constructor(self, node: yaml.nodes.ScalarNode) -> Any: - """Responsible for the switching of one or more "[[ DYNAMIC_VAR ]]" strings with the respective attributes value in a given module. - - Args: - node: Parsed item whose dynamic values that map to the "[[ DYNAMIC_VAR ]]" convention - are replaced with the respective attributes in te provided module. - - Returns: - Constructed `str` or numerical. - """ - value = node.value - - while result := self.dynamic_data_matcher.match(value): - ref = result.group(3) - replacement = getattr(self.module, ref) - - value = self.dynamic_data_matcher.sub(f"\\g<1>{replacement}\\g<4>", value) - - try: - value = float(value) - return value - except ValueError: - pass - - return value - - -class IOConfig: - """Generates an object that returns a sub-dictionary of the elements of that yaml file. - - The file serves as a config for setting up DynamicDataIO objects. Requires a resources yaml file, - an ENVIRONMENT value {CLOUD or LOCAL} and a vars module. - - Example: - input_sources_config = IOConfig( - "path_to/input.yaml", - env_identifier="CLOUD", - dynamic_vars=config_module - ) - """ - - YAML_TAG = "tag:yaml.org,2002:str" - SafeDynamicResourceLoader.add_constructor(YAML_TAG, SafeDynamicResourceLoader.dyn_str_constructor) - SafeDynamicSchemaLoader.add_constructor(YAML_TAG, SafeDynamicSchemaLoader.dyn_value_constructor) - - path_to_source_yaml: str - env_identifier: str - config: BindingsYaml - - def __init__(self, path_to_source_yaml: str, env_identifier: str, dynamic_vars: ModuleType): - """Class constructor. - - Args: - path_to_source_yaml: Absolute file path to yaml file containing source definitions - env_identifier: "LOCAL" or "CLOUD". - dynamic_vars: module containing values for dynamic values that the source yaml - may reference. - """ - self.path_to_source_yaml = path_to_source_yaml - self.env_identifier = env_identifier - self.dynamic_vars = dynamic_vars - self.config = self._parse_sources_config() - - def _parse_sources_config(self) -> BindingsYaml: - """Parses the yaml input and return a dictionary. - - Returns: - A dictionary with the list of all file paths pointing to various input sources as those - are defined in their respective data/*.yaml files. - """ - used_file_inputs = [self.path_to_source_yaml] - with open(self.path_to_source_yaml, "r") as stream: # pylint: disable=unspecified-encoding] - logger.debug(f"Parsing {self.path_to_source_yaml}...") - data = yaml.load(stream, SafeDynamicResourceLoader.with_module(self.dynamic_vars)) - - # Load any file_path's found in schema definitions - for io_binding in data.values(): - if isinstance(io_binding, MutableMapping) and io_binding.get("schema", {}).get("file_path"): - file_path = io_binding["schema"]["file_path"] - used_file_inputs.append(file_path) - # schema has `file_path`` in it - with open(file_path, "r", encoding="utf8") as stream: - io_binding["schema"] = yaml.load(stream, SafeDynamicSchemaLoader.with_module(self.dynamic_vars)) - - try: - config = BindingsYaml(bindings=data) - config.update_config_refs() - except pydantic.ValidationError: - logger.exception(f"Error loading {data=!r}, {used_file_inputs=!r}") - raise - return config - - @property - def sources(self) -> List[str]: - """Class property for easy access to a list of sources. - - Returns: - All top level names of the available resources for the used resources yaml config. - """ - return list(self.config.bindings.keys()) - - def get(self, source_key: str) -> IOEnvironment: - """A getter. - - Args: - source_key: The name of the resource for which we want to create a config. - - Returns: - A dictionary with the necessary fields for loading the data from a source. - - Example: - - Given: - - VOYAGE_DATA: - LOCAL: - type: "local" - local: - file_path: "[[ TEST_RESOURCES ]]/data/processed/voyage_data.parquet" - file_type: "parquet" - CLOUD: - type: "kafka" - KAFKA: - KAFKA_SERVER: "[[ KAFKA_SERVER ]]" - KAFKA_TOPIC: "[[ KAFKA_TOPIC ]]" - - If you do: - - input_sources_config = IOConfig( - "path_to/input.yaml", - env_identifier="CLOUD", - dynamic_vars=globals - ) - voyage_data_cloud_mapping = input_config.get(source_key="VOYAGE_DATA") - - then `voyage_data_cloud_mapping` is: - - "KAFKA": { - "KAFKA_SERVER": "mock-kafka-server", - "KAFKA_TOPIC": "mock-kafka-topic" - } - """ - return self.config.bindings[source_key].get_binding_for_environment(self.env_identifier) diff --git a/dynamicio/config/pydantic/__init__.py b/dynamicio/config/pydantic/__init__.py deleted file mode 100644 index 68a6fe8..0000000 --- a/dynamicio/config/pydantic/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -"""Pydantic config models.""" - -from dynamicio.config.pydantic.config import BindingsYaml -from dynamicio.config.pydantic.io_resources import ( - IOEnvironment, - KafkaDataEnvironment, - LocalBatchDataEnvironment, - LocalDataEnvironment, - PostgresDataEnvironment, - S3DataEnvironment, - S3PathPrefixEnvironment, -) -from dynamicio.config.pydantic.table_schema import DataframeSchema diff --git a/dynamicio/config/pydantic/config.py b/dynamicio/config/pydantic/config.py deleted file mode 100644 index acc8772..0000000 --- a/dynamicio/config/pydantic/config.py +++ /dev/null @@ -1,38 +0,0 @@ -# pylint: disable=no-member, no-self-argument, unused-argument -"""Pydantic schema for YAML files""" - -from typing import Mapping, MutableMapping - -import pydantic - -import dynamicio.config.pydantic.io_resources as env_spec - - -class BindingsYaml(pydantic.BaseModel): - """Class controlling structure of the top-level IOConfig yaml file. - - The top-level config is a dictionary of -> - """ - - bindings: Mapping[str, env_spec.IOBinding] - - @pydantic.validator("bindings", pre=True) - def _validate_bindings(cls, value: Mapping): - if not isinstance(value, Mapping): - raise ValueError(f"Bindings must be a mapping. (got {value!r} instead).") - # Tell each binding its name - for (name, sub_config) in value.items(): - if not isinstance(sub_config, MutableMapping): - raise ValueError(f"Each element for the name binding must be a dict. (got {sub_config!r} instead)") - sub_config["__binding_name__"] = name - return value - - def update_config_refs(self) -> "BindingsYaml": - """Updates dynamic parts of the config: - - Configure _parent for all `IOEnvironment`s - - Replace all IOSchemaRef with actual schema objects - """ - for binding in self.bindings.values(): - for io_env in binding.environments.values(): - io_env.set_parent(binding) - return self diff --git a/dynamicio/config/pydantic/io_resources.py b/dynamicio/config/pydantic/io_resources.py deleted file mode 100644 index d799c97..0000000 --- a/dynamicio/config/pydantic/io_resources.py +++ /dev/null @@ -1,220 +0,0 @@ -# pylint: disable=no-member, no-self-argument, unused-argument - -"""This module contains pylint models for physical data sources (places the bytes are being read from)""" - -import enum -import posixpath -from typing import Mapping, Optional, Union - -import pydantic - -import dynamicio.config.pydantic.table_schema as table_spec - - -@enum.unique -class DataBackendType(str, enum.Enum): - """Input file types""" - - # pylint: disable=invalid-name - local = "local" - local_batch = "local_batch" - s3 = "s3" # is there a difference between 's3' and 's3_file' ? - s3_file = "s3_file" - s3_path_prefix = "s3_path_prefix" - postgres = "postgres" - athena = "athena" - kafka = "kafka" - - -@enum.unique -class FileType(str, enum.Enum): - """List of supported file formats.""" - - # pylint: disable=invalid-name - parquet = "parquet" - csv = "csv" - json = "json" - hdf = "hdf" - - -class IOBinding(pydantic.BaseModel): - """A binding for a single i/o object""" - - name: str = pydantic.Field(alias="__binding_name__") - environments: Mapping[str, "IOEnvironment"] - dynamicio_schema: Union[table_spec.DataframeSchema, None] = pydantic.Field(default=None, alias="schema") - - def get_binding_for_environment(self, environment: str) -> "IOEnvironment": - """Fetch the IOEnvironment spec for the name provided.""" - return self.environments[environment] - - @pydantic.validator("environments", pre=True, always=True) - def pick_correct_env_cls(cls, value, values, config, field): - """This pre-validator picks an appropriate IOEnvironment subclass for the `data_backend_type`""" - if not isinstance(value, Mapping): - raise ValueError(f"Environments input should be a dict. Got {value!r} instead.") - config_cls_overrides = { - DataBackendType.local: LocalDataEnvironment, - DataBackendType.local_batch: LocalBatchDataEnvironment, - DataBackendType.s3: S3DataEnvironment, - DataBackendType.s3_file: S3DataEnvironment, - DataBackendType.s3_path_prefix: S3PathPrefixEnvironment, - DataBackendType.kafka: KafkaDataEnvironment, - DataBackendType.postgres: PostgresDataEnvironment, - } - out_dict = {} - for (env_name, env_data) in value.items(): - base_obj: IOEnvironment = field.type_(**env_data) - override_cls = config_cls_overrides.get(base_obj.data_backend_type) - if override_cls: - use_obj = override_cls(**env_data) - else: - use_obj = base_obj - out_dict[env_name] = use_obj - return out_dict - - @pydantic.root_validator(pre=True) - def _preprocess_raw_config(cls, values): - if not isinstance(values, Mapping): - raise ValueError(f"IOBinding must be a dict at the top level. (got {values!r} instead)") - remapped_value = {"environments": {}} - for (key, value) in values.items(): - if key in ("__binding_name__", "schema"): - # Passthrough params - remapped_value[key] = value - else: - # Assuming an environment config - remapped_value["environments"][key] = value - return remapped_value - - -class IOEnvironment(pydantic.BaseModel): - """A section specifiing an data source backed by a particular data backend""" - - _parent: Optional[IOBinding] = None # noqa: F821 - options: Mapping = pydantic.Field(default_factory=dict) - data_backend_type: DataBackendType = pydantic.Field(alias="type", const=None) - - class Config: - """Additional pydantic configuration for the model.""" - - underscore_attrs_are_private = True - - @property - def dynamicio_schema(self) -> Union[table_spec.DataframeSchema, None]: - """Returns tabular data structure definition for the data source (if available)""" - if not self._parent: - raise Exception("Parent field is not set.") - return self._parent.dynamicio_schema - - def set_parent(self, parent: IOBinding): # noqa: F821 - """Helper method to set parent config object.""" - assert self._parent is None - self._parent = parent - - -class LocalDataSubSection(pydantic.BaseModel): - """Config section for local data provider""" - - file_path: str - file_type: FileType - - -class LocalDataEnvironment(IOEnvironment): - """The data is provided by local storage""" - - local: LocalDataSubSection - - -class LocalBatchDataSubSection(pydantic.BaseModel): - """Config section for local batch data (multiple input files)""" - - path_prefix: str - file_type: FileType - - -class LocalBatchDataEnvironment(IOEnvironment): - """Parent section for local batch (multiple files) config.""" - - local: LocalBatchDataSubSection - - -class S3DataSubSection(pydantic.BaseModel): - """Config section for S3 data source""" - - file_path: str - file_type: FileType - bucket: str - - -class S3DataEnvironment(IOEnvironment): - """Parent section for s3 data source config""" - - s3: S3DataSubSection - - -class S3PathPrefixSubSection(pydantic.BaseModel): - """Config section for s3 prefix data source (multiple s3 objects)""" - - path_prefix: str - file_type: FileType - bucket: str - - @pydantic.root_validator(pre=True) - def support_legacy_config_path_prefix(cls, values): - """ - This validator implements support for legacy config format where the - bucket & path_prefix path could've been passed as a single param in 'bucket' field. - - E.g. - bucket: "[[ MOCK_BUCKET ]]/data/input/{file_name_to_replace}.hdf" - """ - bucket = values.get("bucket") - path_prefix = values.get("path_prefix") - if (bucket and isinstance(bucket, str) and posixpath.sep in bucket) and (not path_prefix): - (new_bucket, new_path_prefix) = bucket.split(posixpath.sep, 1) - values.update( - { - "bucket": new_bucket, - "path_prefix": new_path_prefix, - } - ) - return values - - -class S3PathPrefixEnvironment(IOEnvironment): - """Parent section for the multi-object s3 data source""" - - s3: S3PathPrefixSubSection - - -class KafkaDataSubSection(pydantic.BaseModel): - """Kafka configuration section.""" - - kafka_server: str - kafka_topic: str - - -class KafkaDataEnvironment(IOEnvironment): - """Parent section for kafka data source config""" - - kafka: KafkaDataSubSection - - -class PostgresDataSubSection(pydantic.BaseModel): - """Postgres data source configuration.""" - - db_host: str - db_port: str - db_name: str - db_user: str - db_password: str - - -class PostgresDataEnvironment(IOEnvironment): - """Parent section for postgres data source.""" - - postgres: PostgresDataSubSection - - -IOBinding.update_forward_refs() diff --git a/dynamicio/config/pydantic/table_schema.py b/dynamicio/config/pydantic/table_schema.py deleted file mode 100644 index f61f383..0000000 --- a/dynamicio/config/pydantic/table_schema.py +++ /dev/null @@ -1,131 +0,0 @@ -# pylint: disable=no-member, no-self-argument, unused-argument - -"""This module defines Config schema for data source (pandas dataframe)""" - -import enum -from typing import Mapping, Sequence - -import pydantic - - -@enum.unique -class MetricsName(str, enum.Enum): - """The list of valid metrics names.""" - - # pylint: disable=invalid-name - min = "Min" - max = "Max" - mean = "Mean" - stddev = "Std" - variance = "Variance" - counts = "Counts" - counts_per_label = "CountsPerLabel" - unique_counts = "UniqueCounts" - - -@enum.unique -class ColumnType(str, enum.Enum): - """The list of valid column types.""" - - # pylint: disable=invalid-name - object = "object" - string = "string" - array = "array" - number = "number" - - float = "float" - float32 = "float32" - float64 = "float64" - double = "double" - - int = "int" - integer = "integer" - - int8 = "int8" - int32 = "int32" - int64 = "int64" - - Int8 = "Int8" - Int32 = "Int32" - Int64 = "Int64" - - uint8 = "uint8" - uint32 = "uint32" - uint64 = "uint64" - - bool = "bool" - boolean = "boolean" - - datetime64_ns = "datetime64[ns]" - datetime64_ns_utc = "datetime64[ns,UTC]" - datetime64_ms = "datetime64[ms]" - - timedelta64_ns = "timedelta64[ns]" - - -class ColumnValidationBase(pydantic.BaseModel): - """A single column validator.""" - - name: str - apply: bool - options: Mapping[str, object] - - -class SchemaColumn(pydantic.BaseModel): - """Definition os a single data source column.""" - - name: str - data_type: ColumnType = pydantic.Field(alias="type") - validations: Sequence[ColumnValidationBase] = pydantic.Field(default_factory=list) - metrics: Sequence[MetricsName] = () - - @pydantic.validator("validations", pre=True) - def remap_validations(cls, field): - """Remap the yaml structure of {validation_type: } to a list with validation_type as a key""" - if not isinstance(field, dict): - raise ValueError(f"{field!r} should be a dict") - out = [] - for (key, params) in field.items(): - new_el = params.copy() - new_el.update({"name": key}) - out.append(new_el) - return out - - @pydantic.validator("metrics", pre=True, always=True) - def validate_metrics(cls, field): - """Remap any false-ish `metrics` value to an empty list.""" - if field: - out = field - else: - out = [] - return out - - -class DataframeSchema(pydantic.BaseModel): - """Pydantic model describing the tabular data provided by the data source.""" - - name: str - columns: Mapping[str, SchemaColumn] - - @pydantic.validator("columns", pre=True) - def supply_column_names(cls, field): - """Tell each column its name (the key it is listed under)""" - if not isinstance(field, Mapping): - raise ValueError(f"{field!r} shoudl be a dict.") - - return {col_name: {**{"name": col_name}, **col_data} for (col_name, col_data) in field.items()} - - @property - def validations(self) -> Mapping[str, Sequence[ColumnValidationBase]]: - """A short-hand property to access the validators for each column.""" - return {col_name: col.validations for (col_name, col) in self.columns.items()} - - @property - def metrics(self) -> Mapping[str, Sequence[MetricsName]]: - """A short-hand property to access the metrics for each column.""" - return {col_name: col.metrics for (col_name, col) in self.columns.items()} - - @property - def column_names(self) -> Sequence[str]: - """Property providing the list of all column names.""" - return tuple(self.columns.keys()) diff --git a/dynamicio/core.py b/dynamicio/core.py deleted file mode 100644 index 0bee13f..0000000 --- a/dynamicio/core.py +++ /dev/null @@ -1,315 +0,0 @@ -"""Implements the DynamicDataIO class which provides functionality for data: loading; sinking, and; schema validation.""" -# pylint: disable=no-member -__all__ = ["DynamicDataIO", "SCHEMA_FROM_FILE"] - -import asyncio -import inspect -import re -from concurrent.futures import ThreadPoolExecutor -from typing import Any, Mapping, MutableMapping, Optional - -import pandas as pd # type: ignore -import pydantic -from magic_logger import logger - -from dynamicio import validations -from dynamicio.config.pydantic import DataframeSchema, IOEnvironment -from dynamicio.errors import CASTING_WARNING_MSG, ColumnsDataTypeError, NOTICE_MSG, SchemaNotFoundError, SchemaValidationError -from dynamicio.metrics import get_metric - -SCHEMA_FROM_FILE = {"schema": object()} - -pool = ThreadPoolExecutor() - - -class DynamicDataIO: - """Given a `src.utils.dynamicio.config.IOConfig` object, it generates an object with access to a series of methods for cloud I/O operations and data validations. - - Example: - >>> input_sources_config = IOConfig( - >>> "path_to/input.yaml", - >>> os.getenv("ENVIRONMENT",default="LOCAL") - >>> ) - >>> - >>> class IO(WithS3File, WithLocal, DynamicDataIO): - >>> schema = S - >>> - >>> my_dataset_local_mapping = input_config.get(source_key="MY_DATASET") - >>> my_dataset_io = IO(my_dataset_local_mapping) - >>> my_dataset_df = my_dataset_io.read() - """ - - schema: DataframeSchema - sources_config: IOEnvironment - - def __init__( - self, - source_config: IOEnvironment, - apply_schema_validations: bool = False, - log_schema_metrics: bool = False, - show_casting_warnings: bool = False, - **options: MutableMapping[str, Any], - ): - """Class constructor. - - Args: - source_config: Configuration to use when reading/writing data from/to a source - apply_schema_validations: Applies schema validations on either read() or write() - log_schema_metrics: Logs schema metrics on either read() or write() - show_casting_warnings: Logs casting warnings on either read() or write() if set to True - options: Any additional kwargs that may be used throughout the lifecycle of the object - """ - if type(self) is DynamicDataIO: # pylint: disable=unidiomatic-typecheck - raise TypeError("Abstract class DynamicDataIO cannot be used to instantiate an object...") - - self.sources_config = source_config - self.name = self._transform_class_name_to_dataset_name(self.__class__.__name__) - self.apply_schema_validations = apply_schema_validations - self.log_schema_metrics = log_schema_metrics - self.show_casting_warnings = show_casting_warnings - self.options = self._get_options(options, source_config.options) - source_name = self.sources_config.data_backend_type - if self.schema is SCHEMA_FROM_FILE: - active_schema = self.sources_config.dynamicio_schema - else: - active_schema = self._schema_from_obj(self) - - if not active_schema: - raise SchemaNotFoundError() - - assert isinstance(active_schema, DataframeSchema) - self.schema = active_schema - self.name = self.schema.name.upper() - self.schema_validations = self.schema.validations - self.schema_metrics = self.schema.metrics - - assert hasattr(self, f"_read_from_{source_name}") or hasattr( - self, f"_write_to_{source_name}" - ), f"No method '_read_from_{source_name}' or '_write_to_{source_name}'. Have you registered a mixin for {source_name}?" - - @staticmethod - def _schema_from_obj(target) -> DataframeSchema: - """Construct `DataframeSchema` from an object. - - The object: - - MUST have `schema` attribute that is a dictionary specifying columns and datatypes - - CAN have `schema_validations` and `schema_metrics` attributes - """ - col_info = {} - for (col_name, dtype) in target.schema.items(): - col_validations = {} - col_metrics = [] - try: - col_validations = target.schema_validations[col_name] - except (KeyError, AttributeError): - pass - try: - col_metrics = target.schema_metrics[col_name] - except (KeyError, AttributeError): - pass - col_info[col_name] = { - "type": dtype, - "validations": col_validations, - "metrics": col_metrics, - } - try: - out = DataframeSchema(name=target.name, columns=col_info) - except pydantic.ValidationError: - logger.exception(f"Error parsing {target.name=!r} {col_info=!r}") - raise - return out - - def __init_subclass__(cls): - """Ensure that all subclasses have a `schema` attribute and a `validate` method. - - Raises: - AssertionError: If either of the attributes is not implemented - """ - if not inspect.getmodule(cls).__name__.startswith("dynamicio"): - assert "schema" in cls.__dict__ - - if cls.schema is None or (cls.schema is not SCHEMA_FROM_FILE and len(cls.schema) == 0): - raise ValueError(f"schema for class {cls} cannot be None or empty...") - - async def async_read(self): - """Allows the use of asyncio to concurrently read files in memory. - - Returns: - A pandas dataframe or an iterable. - """ - loop = asyncio.get_running_loop() - return await loop.run_in_executor(pool, self.read) - - def read(self) -> pd.DataFrame: - """Reads data source and returns a schema validated dataframe (by means of _apply_schema). - - Returns: - A pandas dataframe or an iterable. - """ - source_name = self.sources_config.data_backend_type - df = getattr(self, f"_read_from_{source_name}")() - - df = self._apply_schema(df) - if self.apply_schema_validations: - self.validate_from_schema(df) - if self.log_schema_metrics: - self.log_metrics_from_schema(df) - - return df - - async def async_write(self, df: pd.DataFrame): - """Allows the use of asyncio to concurrently write files out. - - Args: - df: The data to be written - """ - loop = asyncio.get_running_loop() - return await loop.run_in_executor(pool, self.write, df) - - def write(self, df: pd.DataFrame): - """Sink data to a given source based on the sources_config. - - Args: - df: The data to be written - """ - source_name = self.sources_config.data_backend_type - if set(df.columns) != self.schema.column_names: # pylint: disable=E1101 - columns = [column for column in df.columns.to_list() if column in self.schema.column_names] - df = df[columns] - - if self.apply_schema_validations: - self.validate_from_schema(df) - if self.log_schema_metrics: - self.log_metrics_from_schema(df) - - getattr(self, f"_write_to_{source_name}")(self._apply_schema(df)) - - def validate_from_schema(self, df: pd.DataFrame) -> "DynamicDataIO": - """Validates a dataframe based on the validations present in its schema definition. - - All validations are checked and if any of them fails, a `SchemaValidationError` is raised. - - Args: - df: - - Returns: - self (to allow for method chaining). - - Raises: - SchemaValidationError: if any of the validations failed. The `message` attribute of - the exception object is a `List[str]`, where each element is the name of a - validation that failed. - """ - - failed_validations = {} - for column in self.schema_validations.keys(): - col_validations = self.schema_validations[column] - for validation in col_validations: - if validation.apply: - validator = validations.ALL_VALIDATORS[validation.name] - validation_result = validator(self.name, df, column, **validation.options) - if not validation_result.valid: - failed_validations[validation.name] = validation_result.message - - if len(failed_validations) > 0: - raise SchemaValidationError(failed_validations) - - return self - - def log_metrics_from_schema(self, df: pd.DataFrame) -> "DynamicDataIO": - """Calculates and logs metrics based on the metrics present in its schema definition. - - Args: - df: A dataframe for which metrics are generated and logged - - Returns: - self (to allow for method chaining). - """ - - for column in self.schema_metrics.keys(): - for metric in self.schema_metrics[column]: - get_metric(metric)(self.name, df, column)() # type: ignore - - return self - - def _apply_schema(self, df: pd.DataFrame) -> pd.DataFrame: - """Called by the `self.read()` and the `self._write_to_local()` methods. - - Contrasts a dataframe's read from a given source against the class's schema dictionary, - checking that columns are the same (by means of _has_columns and _has_valid_dtypes). Then, - check if the columns are fine, it further validates if the types of columns conform to the - expected schema. Finally, if schema types are different, then it attempts to apply schema; - if possible then the schema validation is successful. - - Args: - df: A pandas dataframe. - - Returns: - A schema validated dataframe. - """ - if not self._has_valid_dtypes(df): - raise ColumnsDataTypeError() - return df - - @staticmethod - def _transform_class_name_to_dataset_name(string_to_transform: str) -> str: - """Called by the init function to fetch dataset names from class name. - - Used to create dataset name from class name, turns camel case into upper snake case. - For example: 'ThisNameABC' -> 'THIS_NAME_ABC'. - """ - words = re.findall(r"\d[A-Z]+|[A-Z]?[a-z\d]+|[A-Z]{2,}(?=[A-Z][a-z]|\d|\W|$)|\d+|[A-Z]{2,}|[A-Z]", string_to_transform) - return "_".join(map(str.lower, words)).upper() - - def _has_valid_dtypes(self, df: pd.DataFrame) -> bool: - """Checks if `df` has the expected dtypes defined in `schema`. - - Schema is a dictionary object where keys are column names and values are dtypes in string format as returned by e.g. - `df[column].dtype.name`. - - This function issues `error` level logs describing the first column that caused the check to fail. - - It is assumed that `df` only has the columns defined in `schema`. - - Args: - df: - - Returns: - bool - `True` if `df` has the given dtypes, `False` otherwise - """ - dtypes = df.dtypes - - for col_info in self.schema.columns.values(): - column_name = col_info.name - expected_dtype = col_info.data_type - found_dtype = dtypes[column_name].name - if found_dtype != expected_dtype: - if self.show_casting_warnings: - logger.info(f"Expected: '{expected_dtype}' dtype for {self.name}['{column_name}]', found '{found_dtype}'") - try: - if len(set(type(v) for v in df[column_name].values)) > 1: # pylint: disable=consider-using-set-comprehension - logger.warning(CASTING_WARNING_MSG.format(column_name, expected_dtype, found_dtype)) # pylint: disable=logging-format-interpolation - logger.info(NOTICE_MSG.format(column_name)) # pylint: disable=logging-format-interpolation - df[column_name] = df[column_name].astype(self.schema.columns[column_name].data_type) - except (ValueError, TypeError): - logger.exception(f"ValueError: Tried casting column {self.name}['{column_name}'] to '{expected_dtype}' from '{found_dtype}', but failed") - return False - return True - - @staticmethod - def _get_options(options_from_code: MutableMapping[str, Any], options_from_resource_definition: Optional[Mapping[str, Any]]) -> MutableMapping[str, Any]: - """Retrieves options either from code or from a resource-definition. - - Options are merged if they are provided by both sources, while in the case of conflicts, the options from the code - take precedence. - - Args: - options_from_code (Optional[Mapping]) - options_from_resource_definition (Optional[Mapping]) - - Returns: - [Optional[Mapping]]: options that are going to be used - """ - if options_from_resource_definition: - return {**options_from_resource_definition, **options_from_code} - return options_from_code diff --git a/dynamicio/errors.py b/dynamicio/errors.py deleted file mode 100644 index 7c41fe8..0000000 --- a/dynamicio/errors.py +++ /dev/null @@ -1,96 +0,0 @@ -"""Hosts exception implementations for different errors.""" -# pylint: disable=missing-module-docstring, missing-class-docstring, missing-function-docstring, super-init-not-called -__all__ = [ - "DynamicIOError", - "DataSourceError", - "ColumnsDataTypeError", - "NonUniqueIdColumnError", - "NullValueInColumnError", - "NotExpectedCategoricalValue", - "MissingSchemaDefinition", - "SchemaNotFoundError", - "SchemaValidationError", - "InvalidDatasetTypeError", - "CASTING_WARNING_MSG", - "NOTICE_MSG", -] - -from typing import Any, Optional - - -class DynamicIOError(Exception): - """Base class for DynamicIO errors.""" - - ERROR_STR: str = "" - ERROR_STR_DETAILED: str = "{0}" - - @property - def message(self) -> Optional[Any]: - """Easy access for optional message argument. - - Returns: - Message or `None` if not set - """ - try: - return self.args[0] - except IndexError: - return None - - def __str__(self): - """Enrich and return error message.""" - message = self.message - - if message is None: - return self.ERROR_STR - - return self.ERROR_STR_DETAILED.format(message) - - -class SchemaNotFoundError(DynamicIOError): - """Error raised when schema is not specified in the provided source.""" - - ERROR_STR = "Schema not specified in the provided source" - ERROR_STR_DETAILED = "Schema not specified in the provided source: {0} " - - -class SchemaValidationError(DynamicIOError): - """Error raised when schema validation fails.""" - - -class MissingSchemaDefinition(DynamicIOError): - """Error raised when schema is not specified in the provided source.""" - - ERROR_STR = "The resource definition for this class is missing a schema definition" - ERROR_STR_DETAILED = "The resource definition for this class is missing a schema definition: {0}" - - -class DataSourceError(DynamicIOError): - """Error raised when the data source fails to load.""" - - -class ColumnsDataTypeError(DynamicIOError): - """Error raised when the validated data does not have the expected data types.""" - - -class NonUniqueIdColumnError(DynamicIOError): - """Error raised when the data source fails to load.""" - - -class NullValueInColumnError(DynamicIOError): - """Error raised when the data source fails to load.""" - - -class NotExpectedCategoricalValue(DynamicIOError): - """Error raised when the data source fails to load.""" - - -class InvalidDatasetTypeError(DynamicIOError): - """Error raised when dataset type is not one of [parquet, json, csv, h5].""" - - ERROR_STR = "The dataset provided is not amongst the supported types (parquet, json, csv, h5) handled by dynamicio." - ERROR_STR_DETAILED = "Dataset: {0} provided is not amongst the supported types (parquet, json, csv, h5) handled by dynamicio." - - -# Warning messages -CASTING_WARNING_MSG = "Applying casting column: '{0}' to: 'type:{1}' from 'type:{2}' though not advised, as `dtypes`>1 for {0}, which may lead to data corruption!" -NOTICE_MSG = "Keeping the {0} as is, may anyway cause I/O errors or data corruption issues especially when using `pandas.DataFrame.to_parquet` or `pandas.DataFrame.to_json`." diff --git a/dynamicio/inject.py b/dynamicio/inject.py new file mode 100644 index 0000000..8f4baa3 --- /dev/null +++ b/dynamicio/inject.py @@ -0,0 +1,90 @@ +"""Injects dynamic values into a string.""" +from __future__ import annotations + +import re +from pathlib import Path +from typing import Any, Dict, overload + +curly_braces_matcher = re.compile(r"(.*)(\{\s*(\S+)\s*\})(.*)") + + +class InjectionError(ValueError): + """Raised when a string has any dynamic values in the form of "{DYNAMIC_VAR}" or "[[ DYNAMIC_VAR ]]".""" + + +@overload +def inject(value: None, **kwargs: dict[str, Any]) -> None: + ... + + +@overload +def inject(value: Path, **kwargs: dict[str, Any]) -> Path: + ... + + +@overload +def inject(value: str, **kwargs: dict[str, Any]) -> str: + ... + + +def inject(value: str | Path | None, **kwargs: dict[str, Any]) -> str | Path | None: + """Parse a string and replace any "{DYNAMIC_VAR}" and "[[ DYNAMIC_VAR ]]" with the respective values in the kwargs. + + case-insensitive. + Args: + value: An injectable value (str | Path | None) with dynamic values in the form of "{DYNAMIC_VAR}" or "[[ DYNAMIC_VAR ]]". + kwargs: A mapping of values to replace in the path. + + Returns: + str | Path | None: Injectable with all dynamic values replaced. + """ + if value is None: + return value + to_inject = str(value) + injected = _inject_with_matcher(to_inject, curly_braces_matcher, **kwargs) + return type(value)(injected) + + +def check_injections(value: str | Path | None) -> None: + """Raise if a string has any dynamic values in the form of "{DYNAMIC_VAR}" or "[[ DYNAMIC_VAR ]]".""" + if value is None: + return value + to_check: str = str(value) + while _ := curly_braces_matcher.search(to_check): + raise InjectionError(f'Path is not fully injected: "{to_check!r}"') + + +def _inject_with_matcher(value: str, matcher, **kwargs) -> str: + """Replaces any matching dynamic values. + + Args: + path: A string with dynamic values. + matcher: A regex matcher to find the dynamic values. + kwargs: A mapping of values to replace in the path. + + Returns: + str: The path with the dynamic values replaced with the respective values in the kwargs. + """ + kwargs_lower = {k.lower(): v for k, v in kwargs.items()} # case-insensitive + + replacements: Dict[str, Any] = {} + + temp_suffix_value = "" + + while result := matcher.search(value): + str_to_replace = result.group(3).lower() # we want to be case-insensitive + replacement = kwargs_lower.get(str_to_replace, None) + + if replacement is None: + suffix = matcher.sub("\\g<2>\\g<4>", value) + temp_suffix_value = f"{suffix}{temp_suffix_value}" + value = matcher.sub("\\g<1>", value) + else: + replacements[str_to_replace] = replacement + + # finds the first match and replaces it + value = matcher.sub(f"\\g<1>{replacement}\\g<4>", value) + + value = f"{value}{temp_suffix_value}" + + return value diff --git a/dynamicio/io/__init__.py b/dynamicio/io/__init__.py new file mode 100644 index 0000000..06951c0 --- /dev/null +++ b/dynamicio/io/__init__.py @@ -0,0 +1,4 @@ +from .file import LocalFileResource +from .s3 import S3Resource +from .postgres import PostgresResource +from .kafka import KafkaResource diff --git a/dynamicio/io/file.py b/dynamicio/io/file.py new file mode 100644 index 0000000..3eda2bc --- /dev/null +++ b/dynamicio/io/file.py @@ -0,0 +1,60 @@ +from functools import partial +from pathlib import Path +from typing import Any, Dict, List, Literal, Optional + +import pandas as pd + +from dynamicio.io.resource import BaseResource +from dynamicio.io.serde import CsvSerde, HdfSerde, JsonSerde, ParquetSerde, PickleSerde + + +class LocalFileResource(BaseResource): + path: Path + read_kwargs: Dict[str, Any] = {} + write_kwargs: Dict[str, Any] = {} + injectables: List[str] = ["path"] + file_type: Optional[Literal["parquet", "hdf", "csv", "json", "pickle", "h5"]] = None + + def _read(self) -> pd.DataFrame: + return self.get_serde()._read(self.path) + + def _write(self, df: pd.DataFrame) -> None: + self.path.parent.mkdir(parents=True, exist_ok=True) + return self.get_serde()._write(self.path, df) + + def cache_key(self) -> Path: + if self.test_path is not None: + return self.test_path + else: + return self.path + + @property + def serde_class(self): + file_type = self.file_type or (self.path.suffix.replace(".", "") if self.path.suffix else None) + + if file_type == "parquet": + serde_class = ParquetSerde + elif file_type == "hdf" or file_type == "h5": + serde_class = HdfSerde + elif file_type == "csv": + serde_class = CsvSerde + elif file_type == "json": + serde_class = JsonSerde + elif file_type == "pickle": + serde_class = PickleSerde + elif file_type is None: + raise ValueError(f"File type not specified for {self.path}") + else: + raise ValueError(f"Unknown file type {file_type}") + + serde_class_with_kwargs = partial(serde_class, read_kwargs=self.read_kwargs, write_kwargs=self.write_kwargs) + + return serde_class_with_kwargs + + def get_serde(self): + """Return the serde instance, with baked-in validation.""" + validations = [] + if self.pa_schema is not None: + validations.append(self.pa_schema.validate) + + return self.serde_class(validations=validations) diff --git a/dynamicio/io/hdf.py b/dynamicio/io/hdf.py new file mode 100644 index 0000000..d7082c4 --- /dev/null +++ b/dynamicio/io/hdf.py @@ -0,0 +1,78 @@ +"""Hdf ReaderWriter.""" +from __future__ import annotations + +import uuid +from contextlib import contextmanager +from typing import Any, Dict, Generator, IO, Optional + +import boto3 # type: ignore +import pandas as pd # type: ignore +import tables # type: ignore +from pydantic import BaseModel # type: ignore + + +class InMemStore(pd.io.pytables.HDFStore): + """A subclass of pandas HDFStore that does not manage the pytables File object.""" + + _in_mem_table = None + + def __init__(self, path: str, table: tables.File, mode: str = "r"): + """Initialize the store.""" + self._in_mem_table = table + super().__init__(path=path, mode=mode) # type: ignore + + def open(self, *_args, **_kwargs): # noqa: D102 + pd.io.pytables._tables() + self._handle = self._in_mem_table + + def close(self, *_args, **_kwargs): # noqa: D102 + pass + + @property + def is_open(self): # noqa: D102 + return self._handle is not None + + +class HdfIO: # noqa: D102 + """Class providing stream support for HDF tables.""" + + @contextmanager + def create_file(self, label: str, mode: str, data: Optional[bytes] = None) -> Generator[tables.File, None, None]: + """Create an in-memory pytables table.""" + extra_kw = {} + if data: + extra_kw["driver_core_image"] = data + file_handle = tables.File( + f"{label}_{uuid.uuid4()}.h5", + mode, + title=label, + root_uep="/", + filters=None, + driver="H5FD_CORE", + driver_core_backing_store=0, + **extra_kw, + ) + try: + yield file_handle + finally: + file_handle.close() + + def load(self, fobj: IO[bytes], label: str = "unknown_file.h5") -> pd.DataFrame: + """Load the dataframe from a file-like object.""" + with self.create_file(label, mode="r", data=fobj.read()) as file_handle: + return pd.read_hdf(InMemStore(label, file_handle)) # type: ignore + + def save( + self, + df: pd.DataFrame, + fobj: IO[bytes], + label: str = "unknown_file.h5", + **kwargs, + ): + """Load the dataframe to a file-like object.""" + if not kwargs: + kwargs = {} + with self.create_file(label, mode="w", data=fobj.read()) as file_handle: + store = InMemStore(path=label, table=file_handle, mode="w") + store.put(key="df", value=df, **kwargs) + fobj.write(file_handle.get_file_image()) diff --git a/dynamicio/io/kafka.py b/dynamicio/io/kafka.py new file mode 100644 index 0000000..0489423 --- /dev/null +++ b/dynamicio/io/kafka.py @@ -0,0 +1,87 @@ +"""I/O functions and Resource class for kafka targeted operations.""" +import logging +from pathlib import Path +from typing import Any, Callable, Dict, List, Mapping, Optional, Type, Literal + +import pandas as pd # type: ignore +import simplejson +from kafka import KafkaProducer # type: ignore +from pandera import SchemaModel +from pydantic import Field + +from dynamicio.io.resource import BaseResource +from dynamicio.io.serde import BaseSerde, JsonSerde + + +class KafkaResource(BaseResource): + # Required + topic: str + server: str + + # Defaults + key_generator: Callable[[Any, Mapping[Any, Any]], Optional[str]] = Field( + lambda idx, _: idx, + description="""Gets called with dataframe's (idx, row). Defaults to `idx`.""", + ) + key_serializer: Callable[[Any], bytes] = lambda key: key.encode("utf-8") if key else None + value_serializer: Callable[[Mapping], bytes] = lambda val: simplejson.dumps(val, ignore_nan=True).encode("utf-8") + document_transformer: Callable[[Mapping[Any, Any]], Mapping[Any, Any]] = lambda value: value + # TODO: Give descriptions to all these callables that describe what they're being called with + + # Options + kafka_producer: Optional[KafkaProducer] = None # gets instantiated in get_kafka_producer + compression_type: Literal["gzip", "snappy", "lz4", "zstd"] = "snappy" # type: ignore + producer_kwargs: Dict[str, Any] = {} + + # Resource + injectables: List[str] = ["topic", "server"] + pa_schema: Optional[Type[SchemaModel]] = None + test_path: Optional[str] = None + + def get_kafka_producer(self) -> KafkaProducer: + """Get a KafkaProducer instance.""" + if self.kafka_producer is None: + return KafkaProducer( + bootstrap_servers=self.server, + compression_type=self.compression_type, + key_serializer=self.key_serializer, + value_serializer=self.value_serializer, + **self.producer_kwargs, + ) + return self.kafka_producer + + def _write(self, df: pd.DataFrame) -> None: + """Handles Write operations for Kafka.""" + kafka_producer = self.get_kafka_producer() + + logging.info(f"Sending {len(df)} messages to Kafka topic:{self.topic}") + + messages = df.reset_index(drop=True).to_dict("records") + + for idx, message in zip(df.index.values, messages): + kafka_producer.send( + self.topic, + key=self.key_generator(idx, message), + value=self.document_transformer(message), + ) # type: ignore + + kafka_producer.flush() # type: ignore + + def _read(self) -> pd.DataFrame: + raise NotImplementedError + + def cache_key(self): + """Return the path to the fixture file.""" + if self.test_path is not None: + return Path(self.test_path) + return Path(f"kafka/{self.topic}.json") # Should server be added here? + + @property + def serde_class(self) -> Type[BaseSerde]: + return JsonSerde + + class Config: + """Pydantic Config class.""" + + arbitrary_types_allowed = True + validate_assignment = True diff --git a/dynamicio/io/postgres.py b/dynamicio/io/postgres.py new file mode 100644 index 0000000..19c0de6 --- /dev/null +++ b/dynamicio/io/postgres.py @@ -0,0 +1,155 @@ +import csv +import logging +import tempfile +from contextlib import contextmanager +from pathlib import Path +from typing import Any, Dict, Generator, List, Optional, Type + +import pandas as pd # type: ignore +from pandera import SchemaModel +from pydantic import Field # pylint: disable=no-name-in-module +from sqlalchemy import create_engine # type: ignore +from sqlalchemy.orm import Session as SqlAlchemySession # type: ignore +from sqlalchemy.orm import sessionmaker # type: ignore + +from dynamicio.io.resource import BaseResource +from dynamicio.io.serde import ParquetSerde + +Session = sessionmaker() + + +@contextmanager +def session_scope(connection_string: str, application_name: Optional[str]) -> Generator[SqlAlchemySession, None, None]: + """Connect to a database using `connection_string` and returns an active session to that connection. + + Args: + connection_string: + application_name [optional]: Name of the application that is connecting to the database (repo name). + + + Yields: + Active session + """ + application_name = application_name or "unknown-dynamicio-app" + engine = create_engine(connection_string, connect_args={"application_name": application_name}) + session = Session(bind=engine) + + try: + yield session + session.commit() + except Exception as exc: + session.rollback() + raise exc + finally: + session.close() # pylint: disable=no-member + + +class PostgresResource(BaseResource): + # Postgres Connection + db_user: str + db_password: Optional[str] + db_host: str + db_port: int = 5432 + db_name: str + db_schema: str = "public" + application_name: Optional[str] = Field(None, description="Application name to use for postgres connection.") + + # Postgres IO + truncate_and_append: bool = False + table_name: Optional[str] = Field(None, description="SQL table name. Needs to be given if no sql_query is given") + sql_query: Optional[str] = Field( + None, description="SQL query. Will fetch schema defined columns if this is not given." + ) + read_kwargs: Dict[str, Any] = {} + write_kwargs: Dict[str, Any] = {} + + # Resource + injectables: List[str] = ["table_name", "sql_query", "db_user", "db_password", "db_host", "db_name", "db_schema"] + pa_schema: Optional[Type[SchemaModel]] = None + test_path: Optional[str] = None + + @property + def connection_string(self) -> str: + """Build connection string out of components.""" + password = f":{self.db_password}" if self.db_password else "" + return f"postgresql://{self.db_user}{password}@{self.db_host}:{self.db_port}/{self.db_name}" + + @property + def final_table_name(self) -> str: + """Return schema and table name in a format of schema.table_name.""" + return f"{self.db_schema}.{self.table_name}" + + def _read(self) -> pd.DataFrame: + """Handles Read operations for Postgres.""" + if not (bool(self.sql_query) ^ bool(self.table_name)): # Xor + raise ValueError("PostgresResource must define EITHER sql_query OR table_name.") + + if self.pa_schema is not None and (not self.sql_query and self.pa_schema.Config.strict): + # filtering can now be done at sql level + columns: List[str] = list(self.pa_schema.to_schema().columns.keys()) # type: ignore + columns_str = ", ".join(columns) + sql_query = f"SELECT {columns_str} FROM {self.final_table_name}" + elif self.sql_query is None: + sql_query = f"SELECT * FROM {self.final_table_name}" + else: + sql_query = self.sql_query + + logging.info(f"Downloading table: {self.final_table_name} from: {self.db_host}:{self.db_name}") + with session_scope(self.connection_string, self.application_name) as session: + df = pd.read_sql(sql=sql_query, con=session.get_bind(), **self.read_kwargs) + + return df + + def _write(self, df: pd.DataFrame) -> None: + """Handles Write operations for Postgres.""" + if not self.table_name: + raise ValueError("PostgresResource must specify table_name for writing.") + + with session_scope(self.connection_string, self.application_name) as session: + session: SqlAlchemySession # type: ignore # this is done for IDE purposes + if self.truncate_and_append: + logging.info( + f"Writing to table (csv-hack): {self.final_table_name} from: {self.db_host}:{self.db_name}" + ) + session.execute(f"TRUNCATE TABLE {self.final_table_name};") + + # Speed hack: dump file as csv, use Postgres' CSV import function. + # https://stackoverflow.com/questions/2987433/how-to-import-csv-file-data-into-a-postgresql-table + with tempfile.NamedTemporaryFile(mode="r+") as temp_file: + df.to_csv( + temp_file, + index=False, + header=False, + sep="\t", + doublequote=False, + escapechar="\\", + quoting=csv.QUOTE_NONE, + ) + temp_file.flush() + temp_file.seek(0) + + cur = session.connection().connection.cursor() + cur.execute(f"SET search_path TO {self.db_schema};") + cur.copy_from(temp_file, self.table_name, columns=df.columns, null="") + else: + logging.info(f"Writing to table: {self.final_table_name} from: {self.db_host}:{self.db_name}") + df.to_sql( + name=self.table_name, + con=session.get_bind(), + if_exists="replace", + index=False, + schema=self.db_schema, + ) + + def cache_key(self) -> Path: + if self.test_path is not None: + return self.test_path + elif self.table_name: + return Path("postgres") / (self.final_table_name + ".parquet") + elif self.sql_query: + raise ValueError("test_path must be set if using custom sql query.") + + @property + def serde_class(self): + """Postgres uses a plain ParquetSerde for testing.""" + return ParquetSerde diff --git a/dynamicio/io/resource.py b/dynamicio/io/resource.py new file mode 100644 index 0000000..1257b73 --- /dev/null +++ b/dynamicio/io/resource.py @@ -0,0 +1,99 @@ +from abc import ABC, abstractmethod +from copy import deepcopy +from pathlib import Path +from typing import Callable, List, Optional, Type + +import pandas as pd +from pandera import SchemaModel +from pydantic import BaseModel +from uhura import Readable, Writable + +from dynamicio.inject import InjectionError, inject +from dynamicio.io.serde import BaseSerde, PickleSerde + + +def create_schema_validator(schema) -> Callable[[pd.DataFrame], pd.DataFrame]: + def validate_schema(df: pd.DataFrame): + return schema.validate(df) + + return validate_schema + + +class BaseResource(BaseModel, Readable[pd.DataFrame], Writable[pd.DataFrame], ABC): + """Base class for all resources. + + :injectables: List of attributes that can be injected with format strings. + :test_path (optional): Path to the test data. If set, the resource will be read from and written to this path. + :pa_schema (optional): Pandera schema to validate the resource. If set, the resource will be validated before writing and after reading. + """ + + injectables: List[str] + test_path: Optional[Path] = None + pa_schema: Optional[Type[SchemaModel]] = None + + def inject(self, **kwargs) -> "BaseResource": + """Inject any attributes that are marked as injectable with format strings. + + This includes the test_path and any other relevant attributes.""" + # copy object + clone = deepcopy(self) + for injectable in self.injectables: + # inject attributes + value = getattr(clone, injectable) + if isinstance(value, str) or isinstance(value, Path) or value is None: + formatted_str = inject(value, **kwargs) + setattr(clone, injectable, formatted_str) + + else: + raise InjectionError(f"Cannot inject {injectable} of type {type(value)} in {self.__class__.__name__}") + + # inject test path + if self.test_path is not None: + clone.test_path = inject(self.test_path, **kwargs) + return clone + + @abstractmethod + def _read(self) -> pd.DataFrame: + """Internal read method. Should not be called directly. Use read() instead. + + Overwrite this method to implement custom read logic. + The main read() method is replaced when in uhura testing mode.""" + raise NotImplementedError() + + @abstractmethod + def _write(self, df: pd.DataFrame) -> None: + """Internal write method. Should not be called directly. Use write() instead. + + Overwrite this method to implement custom write logic. + The main write() method is replaced when in uhura testing mode.""" + raise NotImplementedError() + + def read(self) -> pd.DataFrame: + """Read the resource.""" + df = self._read() + df = self.get_serde().validate(df) + return df + + def write(self, df: pd.DataFrame) -> None: + """Write the resource.""" + df = self.get_serde().validate(df) + self._write(df) + + def cache_key(self): + """Return the test path.""" + if self.test_path is None: + raise ValueError("No test path set.") + return str(self.test_path) + + @property + def serde_class(self) -> Type[BaseSerde]: + """Return the serde class. Default is PickleSerde.""" + return PickleSerde + + def get_serde(self) -> BaseSerde: + """Return the serde instance, with baked-in validation.""" + validations = [] + if self.pa_schema is not None: + validations.append(self.pa_schema.validate) + + return self.serde_class(validations=validations) diff --git a/dynamicio/io/s3.py b/dynamicio/io/s3.py new file mode 100644 index 0000000..dcb48d6 --- /dev/null +++ b/dynamicio/io/s3.py @@ -0,0 +1,70 @@ +from functools import partial +from pathlib import Path +from typing import Any, Dict, List, Literal, Optional, Type + +import boto3 +import pandas as pd + +from dynamicio.io.s3_contexts import s3_named_file_reader, s3_writer, s3_reader +from dynamicio.io.resource import BaseResource +from dynamicio.io.serde import BaseSerde, CsvSerde, HdfSerde, JsonSerde, ParquetSerde, PickleSerde + + +class S3Resource(BaseResource): + bucket: str + path: Path + read_kwargs: Dict[str, Any] = {} + write_kwargs: Dict[str, Any] = {} + injectables: List[str] = ["path"] + file_type: Optional[Literal["parquet", "hdf", "csv", "json", "pickle"]] = None + force_read_to_memory: bool = False + + @property + def _s3_path(self) -> str: + """For logging purposes only.""" + return f"s3://{self.bucket}/{self.path}" + + def _read(self) -> pd.DataFrame: + if self.force_read_to_memory: + with s3_reader(boto3.client("s3"), s3_bucket=self.bucket, s3_key=str(self.path)) as fobj: # type: ignore + df = self.get_serde()._read(fobj, **self.read_kwargs) # type: ignore + if df is not None: + return df + else: + raise ValueError(f"Could not read {self._s3_path}") + + with s3_named_file_reader(boto3.client("s3"), s3_bucket=self.bucket, s3_key=str(self.path)) as target_file: + return self.get_serde()._read(target_file.name, **self.read_kwargs) # type: ignore + + def _write(self, df: pd.DataFrame) -> None: + with s3_writer(boto3.client("s3"), s3_bucket=self.bucket, s3_key=str(self.path)) as fobj: + return self.get_serde()._write(fobj, df) + + @property + def serde_class(self): + file_type = self.file_type or (self.path.suffix.replace(".", "") if self.path.suffix else None) + + if file_type == "parquet": + serde_class = ParquetSerde + elif file_type == "hdf" or file_type == "h5": + serde_class = HdfSerde + elif file_type == "csv": + serde_class = CsvSerde + elif file_type == "json": + serde_class = JsonSerde + elif file_type == "pickle": + serde_class = PickleSerde + elif file_type is None: + raise ValueError(f"File type not specified for {self.path}") + else: + raise ValueError(f"Unknown file type {file_type}") + + serde_class_with_kwargs = partial(serde_class, read_kwargs=self.read_kwargs, write_kwargs=self.write_kwargs) + + return serde_class_with_kwargs + + def cache_key(self) -> Path: + if self.test_path is not None: + return self.test_path + else: + return Path("s3") / self.bucket / self.path diff --git a/dynamicio/io/s3_contexts.py b/dynamicio/io/s3_contexts.py new file mode 100644 index 0000000..a8489ef --- /dev/null +++ b/dynamicio/io/s3_contexts.py @@ -0,0 +1,68 @@ +# flake8: noqa: I101 +"""Context managers for reading and writing to S3.""" +import io +import tempfile +from contextlib import contextmanager +from pathlib import Path +from typing import IO, Generator + + +@contextmanager +def s3_named_file_reader(boto3_client, s3_bucket: str, s3_key: str) -> Generator: + """Contextmanager to abstract reading different file types in S3. + + This implementation saves the downloaded data to a temporary file. + + Args: + s3_bucket: The S3 bucket from where to read the file. + s3_key: The file-path to the target file to be read. + + Returns: + The local file path from where the file can be read, once it has been downloaded there by the boto3.client. + + """ + with tempfile.NamedTemporaryFile("wb") as target_file: + # Download the file from S3 + boto3_client.download_fileobj(s3_bucket, s3_key, target_file) + # Yield local file path to body of `with` statement + target_file.flush() + yield target_file + + +@contextmanager +def s3_reader(boto3_client, s3_bucket: str, s3_key: Path) -> Generator[io.BytesIO, None, None]: + """Contextmanager to abstract reading different file types in S3. + + This implementation only retains data in-memory, avoiding creating any temp files. + + Args: + s3_bucket: The S3 bucket from where to read the file. + s3_key: The file-path to the target file to be read. + + Returns: + The local file path from where the file can be read, once it has been downloaded there by the boto3.client. + + """ + fobj = io.BytesIO() + # Download the file from S3 + boto3_client.download_fileobj(s3_bucket, str(s3_key), fobj) + # Yield the buffer + fobj.seek(0) + yield fobj + + +@contextmanager +def s3_writer(boto3_client, s3_bucket: str, s3_key: str) -> Generator[IO[bytes], None, None]: + """Contextmanager to abstract loading different file types to S3. + + Args: + s3_bucket: The S3 bucket to upload the file to. + s3_key: The file-path where the target file should be uploaded to. + + Returns: + The local file path where to actually write the file, to be read and uploaded by boto3.client. + """ + fobj = io.BytesIO() + yield fobj + fobj.seek(0) + boto3_client.upload_fileobj(fobj, s3_bucket, s3_key, ExtraArgs={"ACL": "bucket-owner-full-control"}) diff --git a/dynamicio/io/serde.py b/dynamicio/io/serde.py new file mode 100644 index 0000000..ab58caa --- /dev/null +++ b/dynamicio/io/serde.py @@ -0,0 +1,115 @@ +"""These are the base serde classes, used for testing & when appropriate for actual IO.""" +import pickle +from abc import ABC, abstractmethod +from io import BytesIO +from threading import Lock +from typing import Callable, Optional, TypeVar, Union + +import pandas as pd +from uhura.serde import Serde + +from dynamicio import utils +from dynamicio.io.hdf import HdfIO + +SerdeType = TypeVar("SerdeType") + + +class BaseSerde(ABC, Serde[pd.DataFrame]): + file_extension = "_" + + def __init__(self, validations: Optional[Callable] = None, **kwargs): + self.validations = validations or [] + + def read_from_file(self, file) -> pd.DataFrame: + df = self._read(file) + return self.validate(df) + + @abstractmethod + def _read(self, file) -> pd.DataFrame: + raise NotImplementedError + + def write_to_file(self, path: str, obj: pd.DataFrame) -> None: + return self._write(path, obj) + + @abstractmethod + def _write(self, path: str, obj: pd.DataFrame) -> None: + raise NotImplementedError + + def validate(self, df: pd.DataFrame): + """Validation is done here to avoid double validations in the framework.""" + for validator in self.validations: + validator(df) + return df + + +class PickleSerde(BaseSerde): + def _read(self, file) -> SerdeType: + with open(file, "rb") as infile: + return pickle.load(infile) + + def _write(self, file, obj: SerdeType) -> None: + with open(file, "wb") as outfile: + pickle.dump(obj, outfile) + + +class ParquetSerde(BaseSerde): + def __init__(self, read_kwargs=None, write_kwargs=None, **kwargs): + self._read_kwargs = read_kwargs or {} + self._write_kwargs = write_kwargs or {} + super().__init__(**kwargs) + + def _read(self, file: str) -> pd.DataFrame: + return pd.read_parquet(file, **self._read_kwargs) + + def _write(self, file: str, obj: pd.DataFrame) -> None: + obj.to_parquet(file, **self._write_kwargs) + + +HDF_LOCK = Lock() + + +class HdfSerde(BaseSerde): + def __init__(self, read_kwargs=None, write_kwargs=None, **kwargs): + self._read_kwargs = read_kwargs or {} + self._write_kwargs = write_kwargs or {} + super().__init__(**kwargs) + + def _read(self, file: Union[str, BytesIO]) -> pd.DataFrame: + if isinstance(file, BytesIO): + return HdfIO().load(file) + with HDF_LOCK: + return pd.read_hdf(file, **self._read_kwargs) + + def _write(self, file: Union[str, BytesIO], obj: pd.DataFrame) -> None: + if isinstance(file, BytesIO): + with utils.pickle_protocol(protocol=4), HDF_LOCK: + HdfIO().save(obj, file, **self._write_kwargs) + else: + with utils.pickle_protocol(protocol=4), HDF_LOCK: + obj.to_hdf(file, key="df", mode="w", **self._write_kwargs) + + +class CsvSerde(BaseSerde): + def __init__(self, read_kwargs=None, write_kwargs=None, **kwargs): + self._read_kwargs = read_kwargs or {} + self._write_kwargs = write_kwargs or {"index": False} + super().__init__(**kwargs) + + def _read(self, file: str) -> pd.DataFrame: + return pd.read_csv(file, **self._read_kwargs) + + def _write(self, file: str, obj: pd.DataFrame) -> None: + obj.to_csv(file, **self._write_kwargs) + + +class JsonSerde(BaseSerde): + def __init__(self, read_kwargs=None, write_kwargs=None, **kwargs): + self._read_kwargs = read_kwargs or {} + self._write_kwargs = write_kwargs or {} + super().__init__(**kwargs) + + def _read(self, file: str) -> pd.DataFrame: + return pd.read_json(file, **self._read_kwargs) + + def _write(self, file: str, obj: pd.DataFrame) -> None: + obj.to_json(file, **self._write_kwargs) diff --git a/dynamicio/metrics.py b/dynamicio/metrics.py index 711a9e4..a193c45 100644 --- a/dynamicio/metrics.py +++ b/dynamicio/metrics.py @@ -1,178 +1,149 @@ -"""A module responsible for metrics generation and logging.""" -# pylint: disable=missing-function-docstring,missing-class-docstring import json import logging -import sys -from numbers import Number -from typing import Any, Dict, Mapping, Type +from enum import Enum +from typing import Mapping -import pandas as pd # type: ignore -from magic_logger import logger -from pythonjsonlogger import jsonlogger # type: ignore +import pandas as pd +from pandera import extensions -logHandler = logging.StreamHandler(sys.stdout) -formatter = jsonlogger.JsonFormatter() -logHandler.setFormatter(formatter) -logger.addHandler(logHandler) +logger = logging.getLogger(__name__) -__metrics__: Dict[str, Type["Metric"]] = {} +class Metric(str, Enum): + MIN = "Min" + MAX = "Max" + MEAN = "Mean" + STD = "Std" + VARIANCE = "Variance" + COUNTS = "Counts" + UNIQUE_COUNTS = "UniqueCounts" + COUNTS_PER_LABEL = "CountsPerLabel" -def get_metric(name: str) -> Type["Metric"]: - return __metrics__[name] - - -def log_metric(dataset: str, column: str, metric: str, value: float): +def log_metric(column: str, metric: str, value: float): """Logs a metric in a structured way for a given dataset column. Args: - dataset: The dataset for which the metric is logged column: Column for which the metric is logged metric: name fo the metric, e.g. "unique_vals" value: The metric's value, e.g. "10000" """ - logger.info(json.dumps({"message": "METRIC", "dataset": dataset, "column": column, "metric": metric, "value": float(value)})) - - -class Metric: - """A base class for implementing metrics classes.""" - - def __init__(self, dataset_name: str, df: pd.DataFrame, column: str): # noqa - self.dataset_name = dataset_name - self.df = df - self.column = column - - def __init_subclass__(cls): # noqa - __metrics__[cls.__name__] = cls - assert "calculate_metric" in cls.__dict__ - - def __call__(self) -> Any: # noqa - metric_value = self.calculate_metric() + logger.info(json.dumps({"message": "METRIC", "column": column, "metric": metric, "value": float(value)})) + +# This function needs to be specifically in this file. Pandera needs a chance to initialise this custom validation +# before the user can specify metrics in their Pandera classes. The moment they import any metric +# (such as `Metric.MIN`), this whole module gets executed, and this custom validation is simultaneously executed, +# so it's available on demand +@extensions.register_check_method(statistics=["metrics"]) +def log_statistics(pandas_obj, *, metrics): + """ + Implements column-level data metrics as a workaround through custom metrics + """ - if isinstance(metric_value, Mapping): - for entity in sorted(metric_value.keys()): # pylint: disable=no-member - column = metric_value[entity] # pylint: disable=unsubscriptable-object - log_metric(self.dataset_name, entity, self.metric_name, column) + col_name = str(pandas_obj.name) + + for metric in metrics: + computed_metric = None + + if metric == Metric.MIN: + computed_metric = calculate_min(pandas_obj) + elif metric == Metric.MAX: + computed_metric = calculate_max(pandas_obj) + elif metric == Metric.MEAN: + computed_metric = calculate_mean(pandas_obj) + elif metric == Metric.STD: + computed_metric = calculate_std(pandas_obj) + elif metric == Metric.VARIANCE: + computed_metric = calculate_variance(pandas_obj) + elif metric == Metric.COUNTS: + computed_metric = calculate_counts(pandas_obj) + elif metric == Metric.UNIQUE_COUNTS: + computed_metric = calculate_unique_counts(pandas_obj) + elif metric == Metric.COUNTS_PER_LABEL: + computed_metric = calculate_counts_per_label(pandas_obj) + + if isinstance(computed_metric, Mapping): + for entity in sorted(computed_metric.keys()): # pylint: disable=no-member + value = computed_metric[entity] # pylint: disable=unsubscriptable-object + log_metric(column=col_name, metric=metric, value=value) else: - log_metric(dataset=self.dataset_name, column=self.column, metric=self.metric_name, value=metric_value) - return metric_value - - @property - def metric_name(self) -> str: - """Retrieves the name of the metric from the class name. - - Returns: - The name of the metric, e.g. "Min or Mean". - """ - return self.__class__.__name__ - - def calculate_metric(self) -> Any: - """Dictates that subclasses need to implement this method. - - Returns: - NotImplemented is returned if the method is not implemented, by the subclass - inevitably pointing to the parent implementation. - """ - return NotImplemented - + log_metric(column=col_name, metric=metric, value=computed_metric) -class Min(Metric): - """A metric instance that enables generating and returning the minimum value of a column.""" + return True - def calculate_metric(self) -> Number: - """Generate and return the minimum value of a column. - Returns: - The minimum value of a column. - """ - return self.df[self.column].min() +def calculate_min(series: pd.Series) -> float: + """Generate and return the minimum value of a column. + Returns: + The minimum value of a column. + """ + return series.min() -class Max(Metric): - """A metric instance that enables generating and returning the maximum value of a column.""" - - def calculate_metric(self) -> Number: - """Generate and return the maximum value of a column. - - Returns: - The maximum value of a column. - """ - return self.df[self.column].max() - - -class Mean(Metric): - """A metric instance that enables generating and returning the mean value of a column.""" - - def calculate_metric(self) -> Number: - """Generate and return the mean value of a column. - - Returns: - The mean value of a column. - """ - return self.df[self.column].mean() +def calculate_max(series: pd.Series) -> float: + """Generate and return the maximum value of a column. -class Std(Metric): - """A metric instance that enables generating and returning the standard deviation of a column.""" + Returns: + The maximum value of a column. + """ + return series.max() - def calculate_metric(self) -> Number: - """Generate and return the standard deviation of a column. - Returns: - The standard deviation of a column. - """ - return self.df[self.column].std() +def calculate_mean(series: pd.Series) -> float: + """Generate and return the mean value of a column. + Returns: + The mean value of a column. + """ + return series.mean() -class Variance(Metric): - """A metric instance that generated and returns the variance of a column.""" - def calculate_metric(self) -> Number: - """Generate and return the variance of a column. +def calculate_std(series: pd.Series) -> float: + """Generate and return the standard deviation of a column. - Returns: - The variance of a column. - """ - return self.df[self.column].var() + Returns: + The standard deviation of a column. + """ + return series.std() -class Counts(Metric): - """A metric instance that enables generating and returning the length of a column.""" +def calculate_variance(series: pd.Series) -> float: + """Generate and return the variance of a column. - def calculate_metric(self) -> int: - """Generate and return the length of a column. + Returns: + The variance of a column. + """ + return series.var() - Returns: - The length of a column. - """ - return len(self.df[self.column]) +def calculate_counts(series: pd.Series) -> int: + """Generate and return the length of a column. -class UniqueCounts(Metric): - """A metric instance that enables generating and returning the unique values of a column.""" + Returns: + The length of a column. + """ + return len(series) - def calculate_metric(self) -> int: - """Generate and return the unique values of a column. - Returns: - The unique values of a column. - """ - return len(self.df[self.column].unique()) +def calculate_unique_counts(series: pd.Series) -> int: + """Generate and return the unique values of a column. + Returns: + The unique values of a column. + """ + return len(series.unique()) -class CountsPerLabel(Metric): - """A metric instance that enables generating and returning the counts per label in a categorical column.""" - def calculate_metric(self) -> Mapping: - """Generate and return the counts per label in a categorical column. +def calculate_counts_per_label(series: pd.Series) -> dict: + """Generate and return the counts per label in a categorical column. - Returns: - The counts per label in a categorical column - """ - column_vs_metric_value = self.df[self.column].value_counts().to_dict() - label_vs_metric_value_with_column_prefix = {} - for key in column_vs_metric_value.keys(): - new_key = self.column + "-" + key - label_vs_metric_value_with_column_prefix[new_key] = column_vs_metric_value[key] - return label_vs_metric_value_with_column_prefix + Returns: + The counts per label in a categorical column + """ + column_vs_metric_value = series.value_counts().to_dict() + label_vs_metric_value_with_column_prefix = {} + for key in column_vs_metric_value.keys(): + new_key = str(series.name) + "-" + key + label_vs_metric_value_with_column_prefix[new_key] = column_vs_metric_value[key] + return label_vs_metric_value_with_column_prefix diff --git a/dynamicio/mixins/__init__.py b/dynamicio/mixins/__init__.py deleted file mode 100644 index f928c7f..0000000 --- a/dynamicio/mixins/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -"""Default dynamicio mixins module""" - -from dynamicio.mixins.with_kafka import ( - WithKafka, -) -from dynamicio.mixins.with_local import ( - WithLocal, - WithLocalBatch, -) -from dynamicio.mixins.with_postgres import ( - WithPostgres, -) -from dynamicio.mixins.with_s3 import ( - WithS3File, - WithS3PathPrefix, -) diff --git a/dynamicio/mixins/utils.py b/dynamicio/mixins/utils.py deleted file mode 100644 index 23a66c7..0000000 --- a/dynamicio/mixins/utils.py +++ /dev/null @@ -1,141 +0,0 @@ -"""Mixin utility functions""" -# pylint: disable=no-member, protected-access, too-few-public-methods - -import inspect -import string -from contextlib import contextmanager -from functools import wraps -from types import FunctionType, MethodType -from typing import Any, Collection, Iterable, Mapping, MutableMapping, Optional, Union - -from magic_logger import logger - - -def allow_options(options: Union[Iterable, FunctionType, MethodType]): - """Validate **options for a decorated reader function. - - Args: - options: A set of valid options for a reader (e.g. `pandas.read_parquet` or `pandas.read_csv`) - - Returns: - read_with_valid_options: The input function called with modified options. - """ - - def _filter_out_irrelevant_options(kwargs: Mapping, valid_options: Iterable): - filtered_options = {} - invalid_options = {} - for key_arg in kwargs.keys(): - if key_arg in valid_options: - filtered_options[key_arg] = kwargs[key_arg] - else: - invalid_options[key_arg] = kwargs[key_arg] - if len(invalid_options) > 0: - logger.warning( - f"Options {invalid_options} were not used because they were not supported by the read or write method configured for this source. " - "Check if you expected any of those to have been used by the operation!" - ) - return filtered_options - - def read_with_valid_options(func): - @wraps(func) - def _(*args, **kwargs): - if callable(options): - return func(*args, **_filter_out_irrelevant_options(kwargs, args_of(options))) - return func(*args, **_filter_out_irrelevant_options(kwargs, options)) - - return _ - - return read_with_valid_options - - -def args_of(func): - """Retrieve allowed options for a given function. - - Args: - func: A function like, e.g., pd.read_csv - - Returns: - A set of allowed options - """ - return set(inspect.signature(func).parameters.keys()) - - -def get_string_template_field_names(s: str) -> Collection[str]: # pylint: disable=C0103 - """Given a string `s`, it parses the string to identify any template fields and returns the names of those fields. - - If `s` is not a string template, the returned `Collection` is empty. - - Args: - s: - - Returns: - Collection[str] - - Example: - - >>> get_string_template_field_names("abc{def}{efg}") - ["def", "efg"] - >>> get_string_template_field_names("{0}-{1}") - ["0", "1"] - >>> get_string_template_field_names("hello world") - [] - """ - # string.Formatter.parse returns a 4-tuple of: - # `literal_text`, `field_name`, `form_at_spec`, `conversion` - # More info here https://docs.python.org/3.8/library/string.html#string.Formatter.parse - field_names = [group[1] for group in string.Formatter().parse(s) if group[1] is not None] - - return field_names - - -def resolve_template(path: str, options: MutableMapping[str, Any]) -> str: # pylint: disable=C0103 - """Given a string `path`, it attempts to replace all templates fields with values provided in `options`. - - If `path` is not a string template, `path` is returned. - - Args: - path: A string which is either a template, e.g. /path/to/file/{replace_me}.h5 or just a path /path/to/file/dont_replace_me.h5 - options: A dynamic name for the "replace_me" field in the templated string. e.g. {"replace_me": "name_of_file"} - - Returns: - str: Returns a static path replaced with the value in the options mapping. - - Raises: - ValueError: if any template fields in s are not named using valid Python identifiers - ValueError: if a given template field cannot be resolved in `options` - """ - fields = get_string_template_field_names(path) - - if len(fields) == 0: - return path - - if not all(field.isidentifier() for field in fields): - raise ValueError(f"Expected valid Python identifiers, found {fields}") - - if not all(field in options for field in fields): - raise ValueError(f"Expected values for all fields in {fields}, found {list(options.keys())}") - - path = path.format(**{field: options[field] for field in fields}) - for field in fields: - options.pop(field) - - return path - - -@contextmanager -def pickle_protocol(protocol: Optional[int]): - """Downgrade to the provided pickle protocol within the context manager. - - Args: - protocol: The number of the protocol HIGHEST_PROTOCOL to downgrade to. Defaults to 4, which covers python 3.4 and higher. - """ - import pickle # pylint: disable=import-outside-toplevel - - previous = pickle.HIGHEST_PROTOCOL - try: - pickle.HIGHEST_PROTOCOL = 4 - if protocol: - pickle.HIGHEST_PROTOCOL = protocol - yield - finally: - pickle.HIGHEST_PROTOCOL = previous diff --git a/dynamicio/mixins/with_kafka.py b/dynamicio/mixins/with_kafka.py deleted file mode 100644 index 13b1019..0000000 --- a/dynamicio/mixins/with_kafka.py +++ /dev/null @@ -1,162 +0,0 @@ -# pylint: disable=no-member, protected-access, too-few-public-methods - -"""This module provides mixins that are providing Kafka I/O support.""" - - -from typing import Any, Callable, Iterable, Mapping, MutableMapping, Optional - -import pandas as pd # type: ignore -import simplejson -from kafka import KafkaProducer # type: ignore -from magic_logger import logger - - -from dynamicio.config.pydantic import DataframeSchema, KafkaDataEnvironment -from dynamicio.mixins import utils - - -class WithKafka: - """Handles I/O operations for Kafka. - - Args: - - options: - - Standard: Keyword-arguments passed to the KafkaProducer constructor (see `KafkaProducer.DEFAULT_CONFIG.keys()`). - - Additional Options: - - - `key_generator: Callable[[Any, Mapping], T]`: defines the keying policy to be used for sending keyed-messages to Kafka. It is a `Callable` that takes a - `tuple(idx, row)` and returns a string that will serve as the message's key, invoked prior to serialising the key. It defaults to the dataframe's index - (which may not be composed of unique values or string type keys). It goes hand in hand with the default `key-serialiser`, which assumes that the keys - are strings and encode's them as such. - - - `key_serializer: Callable[T, bytes]`: Custom key serialiser; if not provided, a default key-serializer will be used, applied on a string-key (unless key is None). - - N.B. Providing a custom key-generator that generates a non-string key is best provided alongside a custom key-serializer best suited to handle the custom key-type. - - - `document_transformer: Callable[[Mapping[Any, Any]`: Manipulates the messages/rows sent to Kafka as values. It is a `Callable` taking a `Mapping` as its only - argument and return a `Mapping`, then this callable will be invoked prior to serializing each document. This can be used, for example, to add metadata to each - document that will be written to the target Kafka topic. - - - `value_serializer: Callable[Mapping, bytes]`: Custom value serialiser; if not provided, a default value-serializer will be used applied on a Mapping.. - - Example: - >>> # Given - >>> keyed_test_df = pd.DataFrame.from_records( - >>> [ - >>> ["key-01", "cm_1", "id_1", 1000, "ABC"], - >>> ["key-02", "cm_2", "id_2", 1000, "ABC"], - >>> ["key-03", "cm_3", "id_3", 1000, "ABC"], - >>> ], - >>> columns=["key", "id", "foo", "bar", "baz"], - >>> ).set_index("key") - >>> - >>> kafka_cloud_config = IOConfig( - >>> path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "processed.yaml")), - >>> env_identifier="CLOUD", - >>> dynamic_vars=constants, - >>> ).get(source_key="WRITE_TO_KAFKA_JSON") - >>> - >>> write_kafka_io = WriteKafkaIO(kafka_cloud_config, key_generator=lambda key, _: key, document_transformer=lambda doc: doc["new_field"]="new_value") - >>> - >>> # When - >>> with patch.object(mixins, "KafkaProducer") as mock__kafka_producer: - >>> mock__kafka_producer.DEFAULT_CONFIG = KafkaProducer.DEFAULT_CONFIG - >>> mock_producer = MockKafkaProducer() - >>> mock__kafka_producer.return_value = mock_producer - >>> write_kafka_io.write(keyed_test_df) - >>> - >>> # Then - >>> assert mock_producer.my_stream == [ - >>> {"key": "key-01", "value": {"bar": 1000, "baz": "ABC", "foo": "id_1", "id": "cm_1", "new_field": "new_value"}}, - >>> {"key": "key-02", "value": {"bar": 1000, "baz": "ABC", "foo": "id_2", "id": "cm_2", "new_field": "new_value"}}, - >>> {"key": "key-03", "value": {"bar": 1000, "baz": "ABC", "foo": "id_3", "id": "cm_3", "new_field": "new_value"}}, - >>> ] - """ - - sources_config: KafkaDataEnvironment - schema: DataframeSchema - options: MutableMapping[str, Any] - __kafka_config: Optional[Mapping] = None - __producer: Optional[KafkaProducer] = None - __key_generator: Optional[Callable[[Any, Mapping[Any, Any]], Optional[str]]] = None - __document_transformer: Optional[Callable[[Mapping[Any, Any]], Mapping[Any, Any]]] = None - - def _write_to_kafka(self, df: pd.DataFrame) -> None: - """Given a dataframe where each row is a message to be sent to a Kafka Topic, iterate through all rows and send them to a Kafka topic. - - The topic is defined in `self.sources_config["kafka"]` and using a kafka producer, which is flushed at the - end of this process. - - Args: - df: A dataframe where each row is a message to be sent to a Kafka Topic. - """ - if self.__key_generator is None: - self.__key_generator = lambda idx, __: idx # default key generator uses the dataframe's index - if self.options.get("key_generator") is not None: - self.__key_generator = self.options.pop("key_generator") - - if self.__document_transformer is None: - self.__document_transformer = lambda value: value - if self.options.get("document_transformer") is not None: - self.__document_transformer = self.options.pop("document_transformer") - - if self.__producer is None: - self.__producer = self._get_producer(self.sources_config.kafka.kafka_server, **self.options) - - self._send_messages(df=df, topic=self.sources_config.kafka.kafka_topic) - - @utils.allow_options(KafkaProducer.DEFAULT_CONFIG.keys()) - def _get_producer(self, server: str, **options: MutableMapping[str, Any]) -> KafkaProducer: - """Generate and return a Kafka Producer. - - Default options are used to generate the producer. Specifically: - - `bootstrap_servers`: Passed on through the source_config - - `value_serializer`: Uses a default_value_serializer defined in this mixin - - More options can be added to the producer by passing them as keyword arguments, through valid options. - - These can also override the default options. - - Args: - server: The host name. - **options: Keyword arguments to pass to the KafkaProducer. - - Returns: - A Kafka producer instance. - """ - self.__kafka_config = { - **{ - "bootstrap_servers": server, - "compression_type": "snappy", - "key_serializer": self._default_key_serializer, - "value_serializer": self._default_value_serializer, - }, - **options, - } - return KafkaProducer(**self.__kafka_config) - - def _send_messages(self, df: pd.DataFrame, topic: str) -> None: - logger.info(f"Sending {len(df)} messages to Kafka topic:{topic}.") - - messages = df.reset_index(drop=True).to_dict("records") - for idx, message in zip(df.index.values, messages): - self.__producer.send(topic, key=self.__key_generator(idx, message), value=self.__document_transformer(message)) # type: ignore - - self.__producer.flush() # type: ignore - - @staticmethod - def _default_key_serializer(key: Optional[str]) -> Optional[bytes]: - if key: - return key.encode("utf-8") - return None - - @staticmethod - def _default_value_serializer(value: Mapping) -> bytes: - return simplejson.dumps(value, ignore_nan=True).encode("utf-8") - - def _read_from_kafka(self) -> Iterable[Mapping]: # type: ignore - """Read messages from a Kafka Topic and convert them to separate dataframes. - - Returns: - Multiple dataframes, one per message read from the Kafka topic of interest. - """ - # TODO: Implement kafka reader diff --git a/dynamicio/mixins/with_local.py b/dynamicio/mixins/with_local.py deleted file mode 100644 index 89c951d..0000000 --- a/dynamicio/mixins/with_local.py +++ /dev/null @@ -1,257 +0,0 @@ -# pylint: disable=no-member, protected-access, too-few-public-methods - -"""This module provides mixins that are providing Local FS I/O support.""" - -import glob -import os -from threading import Lock -from typing import Any, MutableMapping - -import pandas as pd # type: ignore -from fastparquet import ParquetFile, write # type: ignore -from pyarrow.parquet import read_table, write_table # type: ignore # pylint: disable=no-name-in-module - -from dynamicio.config.pydantic import DataframeSchema, LocalBatchDataEnvironment, LocalDataEnvironment -from dynamicio.mixins import utils - -hdf_lock = Lock() - - -class WithLocal: - """Handles local I/O operations.""" - - schema: DataframeSchema - sources_config: LocalDataEnvironment - options: MutableMapping[str, Any] - - def _read_from_local(self) -> pd.DataFrame: - """Read a local file as a `DataFrame`. - - The configuration object is expected to have two keys: - - `file_path` - - `file_type` - - To actually read the file, a method is dynamically invoked by name, using - "_read_{file_type}_file". - - Returns: - DataFrame - """ - local_config = self.sources_config.local - file_path = utils.resolve_template(local_config.file_path, self.options) - file_type = local_config.file_type - - return getattr(self, f"_read_{file_type}_file")(file_path, self.schema, **self.options) - - def _write_to_local(self, df: pd.DataFrame): - """Write a dataframe locally based on the {file_type} of the config_io configuration. - - The configuration object is expected to have two keys: - - - `file_path` - - `file_type` - - To actually write the file, a method is dynamically invoked by name, using - "_write_{file_type}_file". - - Args: - df: The dataframe to be written out. - """ - local_config = self.sources_config.local - file_path = utils.resolve_template(local_config.file_path, self.options) - file_type = local_config.file_type - - getattr(self, f"_write_{file_type}_file")(df, file_path, **self.options) - - @staticmethod - @utils.allow_options(pd.read_hdf) - def _read_hdf_file(file_path: str, schema: DataframeSchema, **options: Any) -> pd.DataFrame: - """Read a HDF file as a DataFrame using `pd.read_hdf`. - - All `options` are passed directly to `pd.read_hdf`. - - Caveats: As HDFs are not thread-safe, we use a Lock on this operation. This, practically means - that when used with asyncio through `async_read()` HDF files will be read sequentially. - For more information see: https://pandas.pydata.org/pandas-docs/dev/user_guide/io.html#caveats - - Args: - file_path: The path to the hdf file to be read. - options: The pandas `read_hdf` options. - - Returns: - DataFrame: The dataframe read from the hdf file. - """ - with hdf_lock: - df = pd.read_hdf(file_path, **options) - - columns = [column for column in df.columns.to_list() if column in schema.column_names] - df = df[columns] - return df - - @staticmethod - @utils.allow_options(pd.read_csv) - def _read_csv_file(file_path: str, schema: DataframeSchema, **options: Any) -> pd.DataFrame: - """Read a CSV file as a DataFrame using `pd.read_csv`. - - All `options` are passed directly to `pd.read_csv`. - - Args: - file_path: The path to the csv file to be read. - options: The pandas `read_csv` options. - - Returns: - DataFrame: The dataframe read from the csv file. - """ - options["usecols"] = list(schema.column_names) - return pd.read_csv(file_path, **options) - - @staticmethod - @utils.allow_options(pd.read_json) - def _read_json_file(file_path: str, schema: DataframeSchema, **options: Any) -> pd.DataFrame: - """Read a json file as a DataFrame using `pd.read_hdf`. - - All `options` are passed directly to `pd.read_hdf`. - - Args: - file_path: - options: - - Returns: - DataFrame - """ - df = pd.read_json(file_path, **options) - columns = [column for column in df.columns.to_list() if column in schema.column_names] - df = df[columns] - return df - - @staticmethod - def _read_parquet_file(file_path: str, schema: DataframeSchema, **options: Any) -> pd.DataFrame: - """Read a Parquet file as a DataFrame using `pd.read_parquet`. - - All `options` are passed directly to `pd.read_parquet`. - - Args: - file_path: The path to the parquet file to be read. - options: The pandas `read_parquet` options. - - Returns: - DataFrame: The dataframe read from the parquet file. - """ - options["columns"] = list(schema.column_names) - - if options.get("engine") == "fastparquet": - return WithLocal.__read_with_fastparquet(file_path, **options) - return WithLocal.__read_with_pyarrow(file_path, **options) - - @classmethod - @utils.allow_options([*utils.args_of(pd.read_parquet), *utils.args_of(read_table)]) - def __read_with_pyarrow(cls, file_path: str, **options: Any) -> pd.DataFrame: - return pd.read_parquet(file_path, **options) - - @classmethod - @utils.allow_options([*utils.args_of(pd.read_parquet), *utils.args_of(ParquetFile)]) - def __read_with_fastparquet(cls, file_path: str, **options: Any) -> pd.DataFrame: - return pd.read_parquet(file_path, **options) - - @staticmethod - @utils.allow_options([*utils.args_of(pd.DataFrame.to_hdf), *["protocol"]]) - def _write_hdf_file(df: pd.DataFrame, file_path: str, **options: Any): - """Write a dataframe to hdf using `df.to_hdf`. - - All `options` are passed directly to `df.to_hdf`. - - Caveats: As HDFs are not thread-safe, we use a Lock on this operation. This, practically means - that when used with asyncio through `async_read()` HDF files will be written sequentially. - For more information see: https://pandas.pydata.org/pandas-docs/dev/user_guide/io.html#caveats - - Args: - df: A dataframe write out. - file_path: The location where the file needs to be written. - options: The pandas `to_hdf` options. - - - The pandas `to_hdf` options, &; - - protocol: The pickle protocol to use for writing the hdf file out; a value <=5. - """ - with utils.pickle_protocol(protocol=options.pop("protocol", None)), hdf_lock: - df.to_hdf(file_path, key="df", mode="w", **options) - - @staticmethod - @utils.allow_options(pd.DataFrame.to_csv) - def _write_csv_file(df: pd.DataFrame, file_path: str, **options: Any): - """Write a dataframe as a CSV file using `df.to_csv`. - - All `options` are passed directly to `df.to_csv`. - - Args: - df: A dataframe write out. - file_path: The location where the file needs to be written. - options: Options relative to writing a csv file. - """ - df.to_csv(file_path, **options) - - @staticmethod - @utils.allow_options(pd.DataFrame.to_json) - def _write_json_file(df: pd.DataFrame, file_path: str, **options: Any): - """Write a dataframe as a json file using `df.to_json`. - - All `options` are passed directly to `df.to_json`. - - Args: - df: A dataframe write out. - file_path: The location where the file needs to be written. - options: Options relative to writing a json file. - """ - df.to_json(file_path, **options) - - @staticmethod - def _write_parquet_file(df: pd.DataFrame, file_path: str, **options: Any): - """Write a dataframe as a parquet file using `df.to_parquet`. - - All `options` are passed directly to `df.to_parquet`. - - Args: - df: A dataframe write out. - file_path: The location where the file needs to be written. - options: Options relative to writing a parquet file. - """ - if options.get("engine") == "fastparquet": - return WithLocal.__write_with_fastparquet(df, file_path, **options) - return WithLocal.__write_with_pyarrow(df, file_path, **options) - - @classmethod - @utils.allow_options([*utils.args_of(pd.DataFrame.to_parquet), *utils.args_of(write_table)]) - def __write_with_pyarrow(cls, df: pd.DataFrame, filepath: str, **options: Any) -> pd.DataFrame: - return df.to_parquet(filepath, **options) - - @classmethod - @utils.allow_options([*utils.args_of(pd.DataFrame.to_parquet), *utils.args_of(write)]) - def __write_with_fastparquet(cls, df: pd.DataFrame, filepath: str, **options: Any) -> pd.DataFrame: - return df.to_parquet(filepath, **options) - - -class WithLocalBatch(WithLocal): - """Responsible for batch reading local files.""" - - sources_config: LocalBatchDataEnvironment # type: ignore - - def _read_from_local_batch(self) -> pd.DataFrame: - """Reads a set of files for a specified file type, concatenates them and returns a dataframe. - - Returns: - A concatenated dataframe composed of all files read through local_batch. - """ - local_batch_config = self.sources_config.local - - file_type = local_batch_config.file_type - filtering_file_type = file_type.value - if filtering_file_type == "hdf": - filtering_file_type = "h5" - - files = glob.glob(os.path.join(local_batch_config.path_prefix, f"*.{filtering_file_type}")) - - dfs_to_concatenate = [] - for file in files: - file_to_load = os.path.join(local_batch_config.path_prefix, file) - dfs_to_concatenate.append(getattr(self, f"_read_{file_type}_file")(file_to_load, self.schema, **self.options)) # type: ignore - - return pd.concat(dfs_to_concatenate).reset_index(drop=True) diff --git a/dynamicio/mixins/with_postgres.py b/dynamicio/mixins/with_postgres.py deleted file mode 100644 index 051d893..0000000 --- a/dynamicio/mixins/with_postgres.py +++ /dev/null @@ -1,197 +0,0 @@ -# pylint: disable=no-member, protected-access, too-few-public-methods - -"""This module provides mixins that are providing Postgres I/O support.""" - -import csv -import tempfile -from contextlib import contextmanager -from typing import Any, Dict, Generator, MutableMapping, Union - -import pandas as pd # type: ignore -from magic_logger import logger -from sqlalchemy import BigInteger, Boolean, Column, create_engine, Date, DateTime, Float, Integer, String # type: ignore -from sqlalchemy.ext.declarative import declarative_base # type: ignore -from sqlalchemy.orm import Query # type: ignore -from sqlalchemy.orm.decl_api import DeclarativeMeta # type: ignore -from sqlalchemy.orm.session import Session as SqlAlchemySession # type: ignore -from sqlalchemy.orm.session import sessionmaker # type: ignore - -from dynamicio.config.pydantic import DataframeSchema, PostgresDataEnvironment -from dynamicio.mixins import utils - -Session = sessionmaker(autoflush=True) - -Base = declarative_base() -_type_lookup = { - "bool": Boolean, - "boolean": Boolean, - "object": String(64), - "int64": Integer, - "float64": Float, - "int": Integer, - "date": Date, - "datetime64[ns]": DateTime, - "bigint": BigInteger, -} - - -@contextmanager -def session_for(connection_string: str) -> Generator[SqlAlchemySession, None, None]: - """Connect to a database using `connection_string` and returns an active session to that connection. - - Args: - connection_string: - - Yields: - Active session - """ - engine = create_engine(connection_string) - session = Session(bind=engine) - - try: - yield session - finally: - session.close() # pylint: disable=no-member - - -class WithPostgres: - """Handles I/O operations for Postgres. - - Args: - - options: - - `truncate_and_append: bool`: If set to `True`, truncates the table and then appends the new rows. Otherwise, it drops the table and recreates it with the new rows. - """ - - sources_config: PostgresDataEnvironment - schema: DataframeSchema - options: MutableMapping[str, Any] - - def _read_from_postgres(self) -> pd.DataFrame: - """Read data from postgres as a `DataFrame`. - - The configuration object is expected to have the following keys: - - `db_user` - - `db_password` - - `db_host` - - `db_port` - - `db_name` - - Returns: - DataFrame - """ - postgres_config = self.sources_config.postgres - db_user = postgres_config.db_user - db_password = postgres_config.db_password - db_host = postgres_config.db_host - db_port = postgres_config.db_port - db_name = postgres_config.db_name - - connection_string = f"postgresql://{db_user}:{db_password}@{db_host}:{db_port}/{db_name}" - - sql_query = self.options.pop("sql_query", None) - - assert self.sources_config.dynamicio_schema is not None, "The schema must be specified for SQL tables" - model = self._generate_model_from_schema(self.sources_config.dynamicio_schema) - - query = Query(self._get_table_columns(model)) - if sql_query: - query = sql_query - - logger.info(f"[postgres] Started downloading table: {self.sources_config.dynamicio_schema.name} from: {db_host}:{db_name}") - with session_for(connection_string) as session: - return self._read_database(session, query, **self.options) - - @staticmethod - def _generate_model_from_schema(schema: DataframeSchema) -> DeclarativeMeta: - json_cls_schema: Dict[str, Any] = {"tablename": schema.name, "columns": []} - - for col in schema.columns.values(): - sql_type = _type_lookup.get(col.data_type) - if sql_type: - json_cls_schema["columns"].append({"name": col.name, "type": sql_type}) - - class_name = "".join(word.capitalize() or "_" for word in schema.name.split("_")) + "Model" - - class_dict = {"clsname": class_name, "__tablename__": schema.name, "__table_args__": {"extend_existing": True}} - class_dict.update({column["name"]: Column(column["type"], primary_key=True) if idx == 0 else Column(column["type"]) for idx, column in enumerate(json_cls_schema["columns"])}) - - generated_model = type(class_name, (Base,), class_dict) - return generated_model - - @staticmethod - def _get_table_columns(model): - tables_colums = [] - if model: - for col in list(model.__table__.columns): - tables_colums.append(getattr(model, col.name)) - return tables_colums - - @staticmethod - @utils.allow_options(pd.read_sql) - def _read_database(session: SqlAlchemySession, query: Union[str, Query], **options: Any) -> pd.DataFrame: - """Run `query` against active `session` and returns the result as a `DataFrame`. - - Args: - session: Active session - query: If a `Query` object is given, it should be unbound. If a `str` is given, the - value is used as-is. - - Returns: - DataFrame - """ - if isinstance(query, Query): - query = query.with_session(session).statement - return pd.read_sql(sql=query, con=session.get_bind(), **options) - - def _write_to_postgres(self, df: pd.DataFrame): - """Write a dataframe to postgres based on the {file_type} of the config_io configuration. - - Args: - df: The dataframe to be written - """ - postgres_config = self.sources_config.postgres - db_user = postgres_config.db_user - db_password = postgres_config.db_password - db_host = postgres_config.db_host - db_port = postgres_config.db_port - db_name = postgres_config.db_name - - connection_string = f"postgresql://{db_user}:{db_password}@{db_host}:{db_port}/{db_name}" - - assert self.sources_config.dynamicio_schema is not None, "The schema must be specified for SQL tables" - model = self._generate_model_from_schema(self.sources_config.dynamicio_schema) - - is_truncate_and_append = self.options.get("truncate_and_append", False) - - logger.info(f"[postgres] Started downloading table: {self.sources_config.dynamicio_schema.name} from: {db_host}:{db_name}") - with session_for(connection_string) as session: - self._write_to_database(session, model.__tablename__, df, is_truncate_and_append) # type: ignore - - @staticmethod - def _write_to_database(session: SqlAlchemySession, table_name: str, df: pd.DataFrame, is_truncate_and_append: bool): - """Write a dataframe to any database provided a session with a data model and a table name. - - Args: - session: Generated from a data model and a table name - table_name: The name of the table to read from a DB - df: The dataframe to be written out - is_truncate_and_append: Supply to truncate the table and append new rows to it; otherwise, delete and replace - """ - if is_truncate_and_append: - session.execute(f"TRUNCATE TABLE {table_name};") - - # Below is a speedup hack in place of `df.to_csv` with the multipart option. As of today, even with - # `method="multi"`, uploading to Postgres is painfully slow. Hence, we're resorting to dumping the file as - # csv and using Postgres's CSV import function. - # https://stackoverflow.com/questions/2987433/how-to-import-csv-file-data-into-a-postgresql-table - with tempfile.NamedTemporaryFile(mode="r+") as temp_file: - df.to_csv(temp_file, index=False, header=False, sep="\t", doublequote=False, escapechar="\\", quoting=csv.QUOTE_NONE) - temp_file.flush() - temp_file.seek(0) - - cur = session.connection().connection.cursor() - cur.copy_from(temp_file, table_name, columns=df.columns, null="") - else: - df.to_sql(name=table_name, con=session.get_bind(), if_exists="replace", index=False) - - session.commit() diff --git a/dynamicio/mixins/with_s3.py b/dynamicio/mixins/with_s3.py deleted file mode 100644 index 92ad958..0000000 --- a/dynamicio/mixins/with_s3.py +++ /dev/null @@ -1,397 +0,0 @@ -# pylint: disable=no-member, protected-access, too-few-public-methods - -"""This module provides mixins that are providing S3 I/O support.""" - -import dataclasses -import io -import os -import tempfile -import urllib.parse -import uuid -from contextlib import contextmanager -from typing import Generator, IO, Optional - -import boto3 # type: ignore -import pandas as pd # type: ignore -import s3transfer.futures # type: ignore -import tables # type: ignore -from awscli.clidriver import create_clidriver # type: ignore -from magic_logger import logger - -from dynamicio.config.pydantic import DataframeSchema, S3DataEnvironment, S3PathPrefixEnvironment -from dynamicio.mixins import ( - utils, - with_local, -) - - -class InMemStore(pd.io.pytables.HDFStore): - """A subclass of pandas HDFStore that does not manage the pytables File object""" - - _in_mem_table = None - - def __init__(self, path: str, table: tables.File, mode: str = "r"): - self._in_mem_table = table - super().__init__(path=path, mode=mode) - - def open(self, *_args, **_kwargs): - pd.io.pytables._tables() - self._handle = self._in_mem_table - - def close(self, *_args, **_kwargs): - pass - - @property - def is_open(self): - return self._handle is not None - - -class HdfIO: - """Class providing stream support for HDF tables""" - - @contextmanager - def create_file(self, label: str, mode: str, data: bytes = None) -> Generator[tables.File, None, None]: - """Create an in-memory pytables table""" - extra_kw = {} - if data: - extra_kw["driver_core_image"] = data - file_handle = tables.File(f"{label}_{uuid.uuid4()}.h5", mode, title=label, root_uep="/", filters=None, driver="H5FD_CORE", driver_core_backing_store=0, **extra_kw) - try: - yield file_handle - finally: - file_handle.close() - - def load(self, fobj: IO[bytes], label: str = "unknown_file.h5") -> pd.DataFrame: - """Load the dataframe from an file-like object""" - with self.create_file(label, mode="r", data=fobj.read()) as file_handle: - return pd.read_hdf(InMemStore(label, file_handle)) - - def save(self, df: pd.DataFrame, fobj: IO[bytes], label: str = "unknown_file.h5", options: Optional[dict] = None): - """Load the dataframe to a file-like object""" - if not options: - options = {} - with self.create_file(label, mode="w", data=fobj.read()) as file_handle: - store = InMemStore(path=label, table=file_handle, mode="w") - store.put(key="df", value=df, **options) - fobj.write(file_handle.get_file_image()) - - -def awscli_runner(*cmd: str): - """Runs the awscli command provided. - - Args: - *cmd: A list of args used in the command. - - Raises: - A runtime error exception is raised if download fails. - - Example: - - >>> awscli_runner("s3", "sync", "s3://mock-bucket/mock-key", ".") - """ - # Run - exit_code = create_clidriver().main(cmd) - - if exit_code > 0: - raise RuntimeError(f"AWS CLI exited with code {exit_code}") - - -@dataclasses.dataclass -class S3TransferHandle: - """A dataclass used to track an ongoing data download from the s3""" - - s3_object: object # boto3.resource('s3').ObjectSummary - fobj: IO[bytes] # file-like object the data is being downloaded to - done_future: s3transfer.futures.BaseTransferFuture - - -class WithS3PathPrefix(with_local.WithLocal): - """Handles I/O operations for AWS S3; implements read operations only. - - This mixin assumes that the directories it reads from will only contain a single file-type. - """ - - sources_config: S3PathPrefixEnvironment # type: ignore - schema: DataframeSchema - - boto3_resource = boto3.resource("s3") - boto3_client = boto3.client("s3") - - def _write_to_s3_path_prefix(self, df: pd.DataFrame): - """Write a DataFrame to an S3 path prefix. - - The configuration object is expected to have the following keys: - - `bucket` - - `path_prefix` - - `file_type` - - Args: - df (pd.DataFrame): the DataFrame to be written to S3 - - Raises: - ValueError: In case `path_prefix` is missing from config - ValueError: In case the `partition_cols` arg is missing while trying to write a parquet file - """ - s3_config = self.sources_config.s3 - - file_type = s3_config.file_type - if file_type != "parquet": - raise ValueError(f"File type not supported: {file_type}, only parquet files can be written to an S3 key") - if "partition_cols" not in self.options: - raise ValueError("`partition_cols` is required as an option to write partitioned parquet files to S3") - - bucket = s3_config.bucket - path_prefix = s3_config.path_prefix - full_path_prefix = utils.resolve_template(f"s3://{bucket}/{path_prefix}", self.options) - - with tempfile.TemporaryDirectory() as temp_dir: - self._write_parquet_file(df, temp_dir, **self.options) - awscli_runner( - "s3", - "sync", - temp_dir, - full_path_prefix, - "--acl", - "bucket-owner-full-control", - "--only-show-errors", - "--exact-timestamps", - ) - - def _read_from_s3_path_prefix(self) -> pd.DataFrame: - """Read all files under a path prefix from an S3 bucket as a `DataFrame`. - - The configuration object is expected to have the following keys: - - `bucket` - - `path_prefix` - - `file_type` - - To actually read the file, a method is dynamically invoked by name, using - "_read_{file_type}_path_prefix". - - Returns: - DataFrame - """ - s3_config = self.sources_config.s3 - file_type = s3_config.file_type - if file_type not in {"parquet", "csv", "hdf", "json"}: - raise ValueError(f"File type not supported: {file_type}") - - bucket = s3_config.bucket - path_prefix = s3_config.path_prefix - full_path_prefix = utils.resolve_template(f"s3://{bucket}/{path_prefix}", self.options) - - # The `no_disk_space` option should be used only when reading a subset of columns from S3 - if self.options.pop("no_disk_space", False): - if file_type == "parquet": - return self._read_parquet_file(full_path_prefix, self.schema, **self.options) - if file_type == "hdf": - dfs = [] - for fobj in self._iter_s3_files( - full_path_prefix, - file_ext=".h5", - max_memory_use=1024**3, # 1 gib - ): - dfs.append(HdfIO().load(fobj)) - df = pd.concat(dfs, ignore_index=True) - columns = [column for column in df.columns.to_list() if column in self.schema.columns.keys()] - return df[columns] - - with tempfile.TemporaryDirectory() as temp_dir: - # aws-cli is shown to be up to 6 times faster when downloading the complete dataset from S3 than using the boto3 - # client or pandas directly. This is because aws-cli uses the parallel downloader, which is much faster than the - # boto3 client. - awscli_runner( - "s3", - "sync", - full_path_prefix, - temp_dir, - "--acl", - "bucket-owner-full-control", - "--only-show-errors", - "--exact-timestamps", - ) - - dfs = [] - for file in os.listdir(temp_dir): - df = getattr(self, f"_read_{file_type}_file")(os.path.join(temp_dir, file), self.schema, **self.options) # type: ignore - if len(df) > 0: - dfs.append(df) - - return pd.concat(dfs, ignore_index=True) - - def _iter_s3_files(self, s3_prefix: str, file_ext: Optional[str] = None, max_memory_use: int = -1) -> Generator[IO[bytes], None, None]: # pylint: disable=too-many-locals - """Download sways of S3 objects. - - Parameters: - s3_prefix: s3 url to fetch objects with - file_ext: extension of s3 objects to allow through - max_memory_use: The approximate number of bytes to allocate on each yield of Generator - """ - parsed_url = urllib.parse.urlparse(s3_prefix) - assert parsed_url.scheme == "s3", f"{s3_prefix!r} should be an s3 url" - bucket_name = parsed_url.netloc - file_prefix = f"{parsed_url.path.strip('/')}/" - s3_objects_to_fetch = [] - # Collect objects to be loaded - for s3_object in self.boto3_resource.Bucket(bucket_name).objects.filter(Prefix=file_prefix): - good_object = (not file_ext) or (s3_object.key.endswith(file_ext)) - if good_object: - s3_objects_to_fetch.append(s3_object) - - if max_memory_use < 0: - # Unlimited memory use - fetch ALL - max_memory_use = sum(s3_obj.size for s3_obj in s3_objects_to_fetch) * 2 - transfer_config = boto3.s3.transfer.TransferConfig(max_concurrency=20) - while s3_objects_to_fetch: - mem_use_left = max_memory_use - handles = [] - with boto3.s3.transfer.create_transfer_manager(self.boto3_client, transfer_config) as transfer_manager: - while mem_use_left > 0 and s3_objects_to_fetch: - s3_object = s3_objects_to_fetch.pop() - fobj = io.BytesIO() - future = transfer_manager.download(bucket_name, s3_object.key, fobj) - handles.append(S3TransferHandle(s3_object, fobj, future)) - mem_use_left -= s3_object.size - # Leaving the `transfer_manager` context implicitly waits for all downloads to complete - # Rewind and yield all fobjs - for handle in handles: - handle.fobj.seek(0) - yield handle.fobj - - -class WithS3File(with_local.WithLocal): - """Handles I/O operations for AWS S3. - - All files are persisted to disk first using boto3 as this has proven to be faster than reading them into memory. - Note that reading things into memory is available for csv, json and parquet types only. Unfortunately, until support - for generic buffer is added to read_hdf, we need to download and persists the file to disk first anyway. - - Options: - no_disk_space: If `True`, then s3fs + fsspec will be used to read data directly into memory. - """ - - sources_config: S3DataEnvironment # type: ignore - schema: DataframeSchema - - boto3_client = boto3.client("s3") - - @contextmanager - def _s3_named_file_reader(self, s3_bucket: str, s3_key: str) -> Generator: - """Contextmanager to abstract reading different file types in S3. - - This implementation saves the downloaded data to a temporary file. - - Args: - s3_bucket: The S3 bucket from where to read the file. - s3_key: The file-path to the target file to be read. - - Returns: - The local file path from where the file can be read, once it has been downloaded there by the boto3.client. - - """ - with tempfile.NamedTemporaryFile("wb") as target_file: - # Download the file from S3 - self.boto3_client.download_fileobj(s3_bucket, s3_key, target_file) - # Yield local file path to body of `with` statement - target_file.flush() - yield target_file - - @contextmanager - def _s3_reader(self, s3_bucket: str, s3_key: str) -> Generator[io.BytesIO, None, None]: - """Contextmanager to abstract reading different file types in S3. - - This implementation only retains data in-memory, avoiding creating any temp files. - - Args: - s3_bucket: The S3 bucket from where to read the file. - s3_key: The file-path to the target file to be read. - - Returns: - The local file path from where the file can be read, once it has been downloaded there by the boto3.client. - - """ - fobj = io.BytesIO() - # Download the file from S3 - self.boto3_client.download_fileobj(s3_bucket, s3_key, fobj) - # Yield the buffer - fobj.seek(0) - yield fobj - - @contextmanager - def _s3_writer(self, s3_bucket: str, s3_key: str) -> Generator[IO[bytes], None, None]: - """Contextmanager to abstract loading different file types to S3. - - Args: - s3_bucket: The S3 bucket to upload the file to. - s3_key: The file-path where the target file should be uploaded to. - - Returns: - The local file path where to actually write the file, to be read and uploaded by boto3.client. - """ - fobj = io.BytesIO() - yield fobj - fobj.seek(0) - self.boto3_client.upload_fileobj(fobj, s3_bucket, s3_key, ExtraArgs={"ACL": "bucket-owner-full-control"}) - - def _read_from_s3_file(self) -> pd.DataFrame: - """Read a file from an S3 bucket as a `DataFrame`. - - The configuration object is expected to have the following keys: - - `bucket` - - `file_path` - - `file_type` - - To actually read the file, a method is dynamically invoked by name, using "_read_{file_type}_file". - - Returns: - DataFrame - """ - s3_config = self.sources_config.s3 - file_type = s3_config.file_type - file_path = utils.resolve_template(s3_config.file_path, self.options) - bucket = s3_config.bucket - - logger.info(f"[s3] Started downloading: s3://{s3_config.bucket}/{file_path}") - if self.options.pop("no_disk_space", None): - no_disk_space_rv = None - if file_type in ["csv", "json", "parquet"]: - no_disk_space_rv = getattr(self, f"_read_{file_type}_file")(f"s3://{s3_config.bucket}/{file_path}", self.schema, **self.options) # type: ignore - elif file_type == "hdf": - with self._s3_reader(s3_bucket=bucket, s3_key=file_path) as fobj: # type: ignore - no_disk_space_rv = HdfIO().load(fobj) # type: ignore - else: - raise NotImplementedError(f"Unsupported file type {file_type!r}.") - if no_disk_space_rv is not None: - return no_disk_space_rv - with self._s3_named_file_reader(s3_bucket=bucket, s3_key=file_path) as target_file: # type: ignore - return getattr(self, f"_read_{file_type}_file")(target_file.name, self.schema, **self.options) # type: ignore - - def _write_to_s3_file(self, df: pd.DataFrame): - """Write a dataframe to s3 based on the {file_type} of the config_io configuration. - - The configuration object is expected to have two keys: - - - `file_path` - - `file_type` - - To actually write the file, a method is dynamically invoked by name, using "_write_{file_type}_file". - - Args: - df: The dataframe to be written out - """ - s3_config = self.sources_config.s3 - bucket = s3_config.bucket - file_path = utils.resolve_template(s3_config.file_path, self.options) - file_type = s3_config.file_type - - logger.info(f"[s3] Started uploading: s3://{bucket}/{file_path}") - if file_type in ["csv", "json", "parquet"]: - getattr(self, f"_write_{file_type}_file")(df, f"s3://{bucket}/{file_path}", **self.options) # type: ignore - elif file_type == "hdf": - hdf_options = dict(self.options) - pickle_protocol = hdf_options.pop("pickle_protocol", None) - with self._s3_writer(s3_bucket=s3_config.bucket, s3_key=file_path) as target_file, utils.pickle_protocol(protocol=pickle_protocol): - HdfIO().save(df, target_file, hdf_options) # type: ignore - else: - raise ValueError(f"File type: {file_type} not supported!") - logger.info(f"[s3] Finished uploading: s3://{bucket}/{file_path}") diff --git a/dynamicio/py.typed b/dynamicio/py.typed deleted file mode 100644 index e69de29..0000000 diff --git a/dynamicio/utils.py b/dynamicio/utils.py new file mode 100644 index 0000000..9bc6cbb --- /dev/null +++ b/dynamicio/utils.py @@ -0,0 +1,20 @@ +"""Utilities for dynamicio.""" + +from contextlib import contextmanager + + +@contextmanager +def pickle_protocol(protocol: int): + """Downgrade to the provided pickle protocol within the context manager. + + Args: + protocol: The number of the protocol HIGHEST_PROTOCOL to downgrade to. + """ + import pickle # pylint: disable=import-outside-toplevel + + previous = pickle.HIGHEST_PROTOCOL + try: + pickle.HIGHEST_PROTOCOL = protocol + yield + finally: + pickle.HIGHEST_PROTOCOL = previous diff --git a/demo/__init__.py b/dynamicio/v5_migration/__init__.py similarity index 100% rename from demo/__init__.py rename to dynamicio/v5_migration/__init__.py diff --git a/dynamicio/v5_migration/__main__.py b/dynamicio/v5_migration/__main__.py new file mode 100644 index 0000000..013ae30 --- /dev/null +++ b/dynamicio/v5_migration/__main__.py @@ -0,0 +1,8 @@ +# pylint: skip-file +# noqa +# type: ignore + + +from dynamicio.v5_migration.app import app + +app() diff --git a/dynamicio/v5_migration/app.py b/dynamicio/v5_migration/app.py new file mode 100644 index 0000000..1927f4d --- /dev/null +++ b/dynamicio/v5_migration/app.py @@ -0,0 +1,203 @@ +# pylint: skip-file +# noqa +# type: ignore + +from __future__ import annotations + +from copy import deepcopy +from dataclasses import dataclass +from pathlib import Path +from typing import Callable + +import typer +import yaml +from rich import print as rich_print + +from dynamicio.v5_migration.resource_migration import ( + convert_single_resource_file, + is_resource_dict, + resources_import_str, +) +from dynamicio.v5_migration.schema_migration import convert_single_schema_file, is_schema_dict, schema_import_str + +app = typer.Typer() + + +@app.command() +def convert_everything(source: Path, destination: Path): + """Converts every item as far as possible. Paths can be dirs or files.""" + schemas_source_destination, schemas_to_be_written = gather_schema_migration_actions(source, destination) + resources_source_destination, resources_to_be_written = gather_resource_migration_actions(source, destination) + + source_destination_pairs = schemas_source_destination + resources_source_destination + files_to_be_written = schemas_to_be_written + resources_to_be_written + + confirm_migration_actions(source_destination_pairs, files_to_be_written) + write_files(files_to_be_written) + + +@dataclass +class SourceDestinationPair: + source: Path + destination: Path + + +@dataclass +class FilesToBeWritten: + target_file: Path + target_content: str + + +@app.command() +def convert_resources(source: Path, destination: Path): + """Converts only resource yamls.""" + files_to_be_written, source_destination_pairs = gather_resource_migration_actions(destination, source) + + confirm_migration_actions(source_destination_pairs, files_to_be_written) + write_files(files_to_be_written) + + +def gather_resource_migration_actions(source: Path, destination: Path): + source_content, source_path = handle_source_path(source) + # make lower case 2 levels + source_content = { + source: { + k.lower(): {kk.lower(): vv for kk, vv in v.items()} if isinstance(v, dict) else v + for k, v in deepcopy(contents).items() + } + for source, contents in source_content.items() + } + + source_content = {source: contents for source, contents in source_content.items() if is_resource_dict(contents)} + source_destination_pairs: list[SourceDestinationPair] + files_to_be_written: list[FilesToBeWritten] + source_destination_pairs, files_to_be_written = generate_source_destination_actions( + source_path, + source_content, + destination, + resources_import_str, + convert_single_resource_file, + ) + return source_destination_pairs, files_to_be_written + + +@app.command() +def convert_schemas(source: Path, destination: Path): + """Converts only schemas.""" + source_destination_pairs, files_to_be_written = gather_schema_migration_actions(source, destination) + + confirm_migration_actions(source_destination_pairs, files_to_be_written) + write_files(files_to_be_written) + + +def gather_schema_migration_actions( + source: Path, destination: Path +) -> tuple[list[SourceDestinationPair], list[FilesToBeWritten]]: + """Gathers the source destination pairs and files to be written.""" + + source_content, source_path = handle_source_path(source) + source_content = {source: contents for source, contents in source_content.items() if is_schema_dict(contents)} + + source_destination_pairs: list[SourceDestinationPair] + files_to_be_written: list[FilesToBeWritten] + source_destination_pairs, files_to_be_written = generate_source_destination_actions( + source_path, + source_content, + destination, + schema_import_str, + convert_single_schema_file, + ) + + return source_destination_pairs, files_to_be_written + + +# ------------------ + + +def generate_source_destination_actions( + source_path: Path, + source_content: dict[Path, dict], + destination: Path, + import_str: str, + contents_to_code_conversion_func: Callable[[dict], str], +) -> tuple[list[SourceDestinationPair], list[FilesToBeWritten]]: + """Generates the source destination pairs and files to be written.""" + source_destination_pairs: list[SourceDestinationPair] = [] + files_to_be_written: list[FilesToBeWritten] = [] + + if not source_content: + return [], [] + + if destination.suffix == ".py": + python_str = import_str + for _source, contents in source_content.items(): + rich_print(f"Converting [green]{len(source_destination_pairs)}[/green]") + python_str += contents_to_code_conversion_func(contents) + source_destination_pairs.append(SourceDestinationPair(_source, destination)) + + files_to_be_written.append(FilesToBeWritten(destination.with_suffix(".py"), python_str)) + + elif destination.suffix == "": + for _source, contents in source_content.items(): + python_str = import_str + python_str += contents_to_code_conversion_func(contents) + + sub_path = _source.relative_to(source_path) + destination_path = destination / sub_path.with_suffix(".py") + + source_destination_pairs.append(SourceDestinationPair(_source, destination_path)) + files_to_be_written.append(FilesToBeWritten(destination_path, python_str)) + else: + raise ValueError( + f"Destination {destination} is not a directory or python file. Found suffix {destination.suffix}." + ) + return source_destination_pairs, files_to_be_written + + +def handle_source_path(source: Path) -> tuple[dict[Path, dict], Path]: + """returns a tuple of source_content and source_path + + source_content is a dict of paths and their yaml contents. + source_path is the path of the source directory or parent directory of source if source is a path. + """ + if source.is_file(): + sources = [source] + source_path = source.parent + elif source.is_dir(): + sources = list(source.glob("**/*.yaml")) + source_path = source + else: + raise ValueError(f"Source {source} is not a file or directory") + + source_content = {source: yaml.safe_load(source.open()) for source in sources} + # Make yaml keys lowercase + source_content = {sc_key: {k.lower(): v for k, v in sc_val.items()} for sc_key, sc_val in source_content.items()} + return source_content, source_path + + +def write_files(files_to_be_written: list[FilesToBeWritten]): + """Writes the files to be written.""" + for write_file in files_to_be_written: + write_file.target_file.parent.mkdir(parents=True, exist_ok=True) + write_file.target_file.write_text(write_file.target_content) + rich_print( + f"[red]WARNING [/red][blue]Fix warnings emitted above (if any), some validations may need manual edits.[/blue]" + ) + + +def confirm_migration_actions( + source_destination_pairs: list[SourceDestinationPair], + files_to_be_written: list[FilesToBeWritten], +): + """Confirms the migration actions.""" + rich_print(f"[bold red]Found [green]{len(source_destination_pairs)}[/green] source destination pairs:[/bold red]") + for pair in source_destination_pairs: + rich_print(f"[blue] - [/blue]{pair.source} -> {pair.destination}") + + rich_print(f"[bold red]Found [green]{len(files_to_be_written)}[/green] files to be written:[/bold red]") + + for write_file in files_to_be_written: + loc = write_file.target_content.count("\n") + rich_print(f"[bold blue] - [/bold blue]{write_file.target_file} - ({loc} lines of code.)") + + typer.confirm("\nProceed writing?", abort=True) diff --git a/dynamicio/v5_migration/resource_migration.py b/dynamicio/v5_migration/resource_migration.py new file mode 100644 index 0000000..a2d61fb --- /dev/null +++ b/dynamicio/v5_migration/resource_migration.py @@ -0,0 +1,86 @@ +# pylint: skip-file +# noqa +# type: ignore + + +from __future__ import annotations + +from copy import deepcopy + +from rich import print as rich_print + +from dynamicio.v5_migration.resource_templates import ( + KafkaTemplate, + LocalTemplate, + PostgresTemplate, + ReadyTemplate, + S3Template, +) + + +def is_resource_dict(candidate_dict: dict) -> bool: + """Checks if a dict is a resource dict.""" + # make lower case + check_dict = { + k.lower(): {kk.lower(): vv for kk, vv in v.items()} if isinstance(v, dict) else v + for k, v in deepcopy(candidate_dict).items() + } + for key, value in check_dict.items(): + if not isinstance(value, dict): + return False + if "cloud" not in value: + rich_print(f"[red]No cloud key in {key} resource - not parsing as resource.[/red]") + return False + return True + + +def convert_single_resource_file(file_contents: dict) -> str: + """Converts a single resource file (yaml) to valid python code without imports.""" + + result = convert_resource_dict(file_contents) + + return "\n".join([resource.render_template() for resource in result]) + + +def convert_resource_dict(parsed_yaml: dict) -> list[ReadyTemplate]: + """Converts a single resource dict to a list of keyed resource templates.""" + ready_templates = [] + for resource_key, resource_dict in parsed_yaml.items(): + resource_name = f"{resource_key.lower()}_resource" + has_parsed = False + for resource_type in [S3Template, LocalTemplate, KafkaTemplate, PostgresTemplate]: + if resource_type.is_dict_parseable(resource_dict): # type: ignore + try: + ready_template = resource_type.from_dict(resource_dict, resource_name) + ready_templates.append(ready_template) # type: ignore + has_parsed = True + except Exception as e: + print(e) + has_parsed = False + if not has_parsed: + rich_print(f"Could not parse resource [red]{resource_key}[/red]") + + return ready_templates + + +def parse_resource_configs(parsed_yaml_entry: dict[str, str]) -> list: + """Parses a single resource config dict.""" + resource_configs = [] + + for key, val in parsed_yaml_entry.items(): + if key == "schema": + continue + + for resource_type in [S3Template, LocalTemplate, KafkaTemplate, PostgresTemplate]: + if resource_type.is_dict_parseable(val): # type: ignore + resource_configs.append(resource_type.from_dict(val, key.lower())) # type: ignore + + return resource_configs + + +resources_import_str = ( + "from dynamicio import " + "ParquetResource, CsvResource, JsonResource, HdfResource," + "S3ParquetResource, S3CsvResource, S3JsonResource, S3HdfResource, " + "KafkaResource, PostgresResource\n" +) diff --git a/dynamicio/v5_migration/resource_templates.py b/dynamicio/v5_migration/resource_templates.py new file mode 100644 index 0000000..e6e6e20 --- /dev/null +++ b/dynamicio/v5_migration/resource_templates.py @@ -0,0 +1,231 @@ +# pylint: skip-file +# noqa +# type: ignore + +from __future__ import annotations + +from abc import ABC, abstractmethod +from dataclasses import dataclass +from typing import Optional + + +# @dataclass +# class KeyedResourceTemplate: +# resources: list +# resource_name: str +# template: str = """ +# {resource_name} = KeyedResource( +# {{ +# {resources} +# }} +# ) +# """ +# +# def render_template(self) -> str: +# return self.template.format( +# resource_name=self.resource_name, +# resources="\n".join([resource.render_own_resource() for resource in self.resources]), +# ) + + +class ReadyTemplate(ABC): + @abstractmethod + def render_template(self) -> str: + raise NotImplementedError + + +s3_file_type_class_map = { + "parquet": "S3ParquetResource", + "csv": "S3CsvResource", + "json": "S3JsonResource", + "hdf": "S3HdfResource", +} + + +@dataclass +class S3Template(ReadyTemplate): + resource_name: str + bucket: str + file_path: str + test_path: Optional[str] + class_name: str + template: str = """ +{resource_name} = {class_name}( + bucket="{bucket}", + path="{file_path}"{test_path_str} +) +""" + + @classmethod + def from_dict(cls, resource_dict: dict[str, ...], resource_name: str) -> "S3Template": + file_type = resource_dict["cloud"]["s3"]["file_type"] + test_path = resource_dict.get("local", {}).get("local", {}).get("file_path", None) + # Warning: if local filetype does not match cloud filetype. This will not work. + return cls( + resource_name=resource_name, + bucket=resource_dict["cloud"]["s3"]["bucket"], + file_path=resource_dict["cloud"]["s3"]["file_path"], + class_name=s3_file_type_class_map[file_type], + test_path=test_path, + ) + + @staticmethod + def is_dict_parseable(resource_dict: dict[str, ...]): + return resource_dict["cloud"]["type"] == "s3_file" and resource_dict["cloud"]["s3"]["file_type"] in list( + s3_file_type_class_map.keys() + ) + + def render_template(self) -> str: + test_path_str = f',\n test_path="{self.test_path}"' if self.test_path else "" + return self.template.format( + resource_name=self.resource_name, + class_name=self.class_name, + bucket=self.bucket, + file_path=self.file_path, + test_path_str=test_path_str, + ) + + +FILE_TYPES = ["parquet", "csv", "json", "hdf"] + + +def replace_double_brackets(string: Optional[str]) -> Optional[str]: + if not string: + return None + return string.replace("[[", "{").replace("]]", "}") + + +@dataclass +class LocalTemplate(ReadyTemplate): + resource_name: str + file_path: str + file_type: Optional[str] + test_path: Optional[str] + template: str = """ +{resource_name} = FileResource( + path="{file_path}"{test_path_str}{file_type_str} +) +""" + + @classmethod + def from_dict(cls, resource_dict: dict[str, ...], resource_name: str) -> "LocalTemplate": + test_path = resource_dict.get("local", {}).get("local", {}).get("file_path", None) + file_path = resource_dict["cloud"]["local"]["file_path"] + file_type = resource_dict["cloud"]["local"]["file_type"] + + test_path = replace_double_brackets(test_path) + file_path = replace_double_brackets(file_path) + # these can be inferred from the file_path + if any([file_path.endswith("." + ext) for ext in FILE_TYPES]): + file_type = None + return cls( + resource_name=resource_name, + file_path=file_path, + file_type=file_type, + test_path=test_path, + ) + + @staticmethod + def is_dict_parseable(resource_dict: dict[str, ...]) -> bool: + file_type = resource_dict["cloud"]["local"]["file_type"] + return resource_dict["cloud"]["type"] == "local" and file_type in FILE_TYPES + + def render_template(self) -> str: + test_path_str = f',\n test_path="{self.test_path}"' if self.test_path else "" + file_type_str = f',\n file_type="{self.file_type}"' if self.file_type else "" + return self.template.format( + resource_name=self.resource_name, + file_type_str=file_type_str, + file_path=self.file_path, + test_path_str=test_path_str, + ) + + +@dataclass +class KafkaTemplate(ReadyTemplate): + resource_name: str + topic: str + server: str + test_path: Optional[str] + template: str = """ +{resource_name} = KafkaResource( + server="{server}", + topic="{topic}"{test_path_str} +) +""" + + @classmethod + def from_dict(cls, resource_dict: dict[str, ...], resource_name: str) -> "KafkaTemplate": + test_path = resource_dict.get("local", {}).get("local", {}).get("file_path", None) + return cls( + resource_name=resource_name, + topic=resource_dict["cloud"]["kafka"]["kafka_topic"], + server=resource_dict["cloud"]["kafka"]["kafka_server"], + test_path=test_path, + ) + + @staticmethod + def is_dict_parseable(resource_dict: dict[str, ...]): + return resource_dict["cloud"]["type"] == "kafka" + + def render_template(self) -> str: + test_path_str = f',\n test_path="{self.test_path}"' if self.test_path else "" + return self.template.format( + resource_name=self.resource_name, + server=self.server, + topic=self.topic, + test_path_str=test_path_str, + ) + + +@dataclass +class PostgresTemplate(ReadyTemplate): + resource_name: str + db_host: str + db_port: str + db_name: str + db_user: str + db_password: str + class_name: str = "PostgresResource" + test_path: Optional[str] = None + template: str = """ +{resource_name} = {class_name}( + db_host="{db_host}", + db_port="{db_port}", + db_name="{db_name}", + db_user="{db_user}", + db_password="{db_password}", + table_name=None, + sql_query=...{test_path_str} +) +""" + + @classmethod + def from_dict(cls, resource_dict: dict[str, dict[str, str]], resource_name: str) -> "PostgresTemplate": + test_path = resource_dict.get("local", {}).get("local", {}).get("file_path", None) + return cls( + resource_name=resource_name, + db_host=resource_dict["cloud"]["postgres"]["db_host"], + db_port=resource_dict["cloud"]["postgres"]["db_port"], + db_name=resource_dict["cloud"]["postgres"]["db_name"], + db_user=resource_dict["cloud"]["postgres"]["db_user"], + db_password=resource_dict["cloud"]["postgres"]["db_password"], + test_path=test_path, + ) + + @staticmethod + def is_dict_parseable(resource_dict: dict[str, ...]): + return resource_dict["cloud"]["type"] == "postgres" + + def render_template(self) -> str: + test_path_str = f',\n test_path="{self.test_path}"' if self.test_path else "" + return self.template.format( + resource_name=self.resource_name, + class_name=self.class_name, + db_host=self.db_host, + db_port=self.db_port, + db_name=self.db_name, + db_user=self.db_user, + db_password=self.db_password, + test_path_str=test_path_str, + ) diff --git a/dynamicio/v5_migration/schema_migration.py b/dynamicio/v5_migration/schema_migration.py new file mode 100644 index 0000000..5d7275d --- /dev/null +++ b/dynamicio/v5_migration/schema_migration.py @@ -0,0 +1,498 @@ +# pylint: skip-file +# noqa + + +from __future__ import annotations + +import abc +import re +from dataclasses import dataclass +from string import ascii_lowercase, digits +from typing import Any + +from rich import print as rich_print + +from dynamicio.metrics import Metric + +schema_import_str = """from datetime import datetime + +import pandera as pa +from pandera import SchemaModel +from pandera.typing import Series +from dynamicio.metrics import Metric + +""" + +_numpy_type_to_pandera_mapping = { + r"object": "str", + r"float.*": "float", + r"int.*": "int", + r"datetime.*": "datetime", + r"bool": "bool", +} + + +def is_schema_dict(yaml_schema: dict) -> bool: + if "columns" not in yaml_schema: + return False + for name, info in yaml_schema["columns"].items(): + if not isinstance(name, str): + return False + if "type" not in info: + return False + return True + + +def convert_single_schema_file(yaml_contents: dict) -> str: + name = yaml_contents["name"] + columns = _collect_columns(yaml_contents) + + schema_class = SchemaClass(name=name, columns=columns) + + return schema_class.render_template() + + +class Validation(abc.ABC): + @abc.abstractmethod + def render_own_template(self) -> str: + raise NotImplementedError() + + +@dataclass +class HasNoNulls(Validation): + @staticmethod + def is_matched(validation_name: str) -> bool: + return validation_name == "has_no_null_values" + + @classmethod + def parse_from_dict(cls, candidate: dict[str, Any]) -> "HasNoNulls": + return cls() + + def render_own_template(self) -> str: + return "nullable=False" + + +@dataclass +class IsIn(Validation): + categories: list[str] + match_all: bool + template: str = "isin=[{categories}]" + + @staticmethod + def is_matched(validation_name: str) -> bool: + return validation_name == "is_in" + + @classmethod + def parse_from_dict(cls, candidate: dict[str, Any]) -> "IsIn": + match_all = candidate["options"].get("match_all", True) + if not match_all: + rich_print( + f"[bold red]The migration of validation `is_in` with `match_all = False` is not supported. " + f"`match_all: false` actually means that unique values of column should be equal to given categories, " + f"without any missing (yes that sounds the wrong way round). " + f"Please implement it manually by specifying the a custom check in your pandera schema " + f"as follows: [/bold red]" + ) + rich_print( + f"\n" + f'@pa.check("column_name")\n' + f"def is_in_check(cls, series: Series[str]) -> Series[bool]:\n" + f" # Implementation\n" + f" return ...\n\n" + ) + return cls(categories=candidate["options"]["categorical_values"], match_all=match_all) + + def render_own_template(self) -> str: + if not self.match_all: + return "" + return self.template.format(categories=",".join(f'"{cat}"' for cat in self.categories)) + + +@dataclass +class HasUniqueValues(Validation): + @staticmethod + def is_matched(validation_name: str) -> bool: + return validation_name == "has_unique_values" + + @classmethod + def parse_from_dict(cls, candidate: dict[str, Any]) -> "HasUniqueValues": + return cls() + + def render_own_template(self) -> str: + return "unique=True" + + +@dataclass +class IsGreaterThan(Validation): + threshold: float + template: str = "gt={threshold}" + + @staticmethod + def is_matched(validation_name: str) -> bool: + return validation_name == "is_greater_than" + + @classmethod + def parse_from_dict(cls, candidate: dict[str, Any]) -> "IsGreaterThan": + return cls(threshold=candidate["options"]["threshold"]) + + def render_own_template(self) -> str: + return self.template.format(threshold=self.threshold) + + +@dataclass +class IsGreaterThanOrEquals(Validation): + threshold: float + template: str = "ge={threshold}" + + @staticmethod + def is_matched(validation_name: str) -> bool: + return validation_name == "is_greater_than_or_equal" + + @classmethod + def parse_from_dict(cls, candidate: dict[str, Any]) -> "IsGreaterThanOrEquals": + return cls(threshold=candidate["options"]["threshold"]) + + def render_own_template(self) -> str: + return self.template.format(threshold=self.threshold) + + +@dataclass +class IsLessThan(Validation): + threshold: float + template: str = "lt={threshold}" + + @staticmethod + def is_matched(validation_name: str) -> bool: + return validation_name == "is_lower_than" + + @classmethod + def parse_from_dict(cls, candidate: dict[str, Any]) -> "IsLessThan": + return cls(threshold=candidate["options"]["threshold"]) + + def render_own_template(self) -> str: + return self.template.format(threshold=self.threshold) + + +@dataclass +class IsLessThanOrEquals(Validation): + threshold: float + template: str = "le={threshold}" + + @staticmethod + def is_matched(validation_name: str) -> bool: + return validation_name == "is_lower_than_or_equal" + + @classmethod + def parse_from_dict(cls, candidate: dict[str, Any]) -> "IsLessThanOrEquals": + return cls(threshold=candidate["options"]["threshold"]) + + def render_own_template(self) -> str: + return self.template.format(threshold=self.threshold) + + +@dataclass +class IsBetween(Validation): + min_value: float + max_value: float + include_min: bool + include_max: bool + template: str = 'in_range={{"min_value":{min_value}, "max_value":{max_value}, "include_min":{include_min}, "include_max":{include_max}}}' + + @staticmethod + def is_matched(validation_name: str) -> bool: + return validation_name == "is_between" + + @classmethod + def parse_from_dict(cls, candidate: dict[str, Any]) -> "IsBetween": + return cls( + min_value=candidate["options"]["lower"], + max_value=candidate["options"]["upper"], + include_min=candidate["options"]["include_left"] if "include_left" in candidate["options"] else False, + include_max=candidate["options"]["include_right"] if "include_right" in candidate["options"] else False, + ) + + def render_own_template(self) -> str: + return self.template.format( + min_value=self.min_value, + max_value=self.max_value, + include_min=self.include_min, + include_max=self.include_max, + ) + + +@dataclass +class HasAcceptablePercentageOfNulls(Validation): + @staticmethod + def is_matched(validation_name: str) -> bool: + template: str = """ +@pa.check("column_name") +def has_acceptable_percentage_of_nulls_check(cls, series: Series[str]) -> Series[bool]: + # Implementation + return ... + """ + + if validation_name == "has_acceptable_percentage_of_nulls": + rich_print( + f"[bold red]The migration of validation `has_acceptable_percentage_of_nulls` is not supported. " + f"Please implement it manually by specifying the a custom check in your pandera schema " + f"as follows: [/bold red]" + ) + rich_print(template) + + return False + + def render_own_template(self) -> str: + return "" + + +_supported_validations = [ + HasNoNulls, + IsIn, + HasUniqueValues, + IsGreaterThan, + IsGreaterThanOrEquals, + IsLessThan, + IsLessThanOrEquals, + IsBetween, + HasAcceptablePercentageOfNulls, +] + + +class MetricLogger(abc.ABC): + @abc.abstractmethod + def render_own_template(self) -> str: + raise NotImplementedError() + + +class Min(MetricLogger): + @staticmethod + def is_matched(metric_name: str) -> bool: + return metric_name == Metric.MIN.value + + def render_own_template(self) -> str: + return "Metric.MIN" + + +class Max(MetricLogger): + @staticmethod + def is_matched(metric_name: str) -> bool: + return metric_name == Metric.MAX.value + + def render_own_template(self) -> str: + return "Metric.MAX" + + +class Mean(MetricLogger): + @staticmethod + def is_matched(metric_name: str) -> bool: + return metric_name == Metric.MEAN.value + + def render_own_template(self) -> str: + return "Metric.MEAN" + + +class Std(MetricLogger): + @staticmethod + def is_matched(metric_name: str) -> bool: + return metric_name == Metric.STD.value + + def render_own_template(self) -> str: + return "Metric.STD" + + +class Variance(MetricLogger): + @staticmethod + def is_matched(metric_name: str) -> bool: + return metric_name == Metric.VARIANCE.value + + def render_own_template(self) -> str: + return "Metric.VARIANCE" + + +class Counts(MetricLogger): + @staticmethod + def is_matched(metric_name: str) -> bool: + return metric_name == Metric.COUNTS.value + + def render_own_template(self) -> str: + return "Metric.COUNTS" + + +class UniqueCounts(MetricLogger): + @staticmethod + def is_matched(metric_name: str) -> bool: + return metric_name == Metric.UNIQUE_COUNTS.value + + def render_own_template(self) -> str: + return "Metric.UNIQUE_COUNTS" + + +class CountsPerLabel(MetricLogger): + @staticmethod + def is_matched(metric_name: str) -> bool: + return metric_name == Metric.COUNTS_PER_LABEL.value + + def render_own_template(self) -> str: + return "Metric.COUNTS_PER_LABEL" + + +_supported_metrics = [ + Min, + Max, + Mean, + Std, + Variance, + Counts, + UniqueCounts, + CountsPerLabel, +] + + +@dataclass +class Column: + name: str + data_type: str + validations: list[Validation] + metrics: list[MetricLogger] + template_python_compatible = "{name}: Series[{data_type}] = pa.Field({options})" + _allowed_chars: list[str] = ascii_lowercase + digits + "_" + + @property + def is_python_normalized(self) -> bool: + assert len(self.name) >= 1, "Column name cannot be empty" + + s = self.name + + is_lowercase = s == s.lower() + is_starts_with_alpha = s[0].isalpha() + + is_only_alpha_num_and_underscore = all([(c in self._allowed_chars) for c in s]) + + return is_lowercase and is_starts_with_alpha and is_only_alpha_num_and_underscore + + def _python_normalize(self) -> str: + normalized_name = self.name + + # Lowercase the name + normalized_name = normalized_name.lower() + + normalized_name_tmp = list(normalized_name) + + # Replace all non-allowed characters (including spaces) with underscores + for idx, c in enumerate(list(normalized_name_tmp)): + if c not in self._allowed_chars: + normalized_name_tmp[idx] = "_" + normalized_name = "".join(normalized_name_tmp) + + # Make sure the name doesn't begin with a number + if normalized_name[0] in digits: + normalized_name = normalized_name[1:] + + # Accounts for the edge case when the unnormalized column name is just a single number, + # which results in an empty normalized name + + if not normalized_name: + return f"_{self.name}" + + return normalized_name + + def render_template(self) -> str: + options = [option for option in self._render_options() if option] # Remove empty options + + if self.is_python_normalized: + return self.template_python_compatible.format( + name=self.name, data_type=self.data_type, options=",".join(options) + ) + else: + options.append(f'alias="{self.name}"') + + return self.template_python_compatible.format( + name=self._python_normalize(), data_type=self.data_type, options=",".join(options) + ) + + def _render_options(self) -> list[str]: + options = [] + + for v in self.validations: + options.append(v.render_own_template()) + + # We default to all fields being nullable unless otherwise specified by the validations + if "nullable=False" not in options: + options.append("nullable=True") + + # Optionally parse and append the metrics + if self.metrics: + metrics_template = 'log_statistics={{"metrics": [{metrics}]}}'.format( + metrics=",".join([m.render_own_template() for m in self.metrics]) + ) + + options.append(metrics_template) + + return options + + +@dataclass +class SchemaClass: + name: str + columns: list[Column] + + template = """ +class {class_name}(SchemaModel): +{columns} + + class Config: + coerce = True + strict = "filter" + """ + + def _python_normalize(self) -> str: + normalized_class_name = "" + + for word in self.name.split("_"): + normalized_class_name += word.lower().capitalize() + + return normalized_class_name + + def render_template(self) -> str: + rendered_columns = "\n".join([" " + col.render_template() for col in self.columns]) + + return self.template.format( + class_name=f"{self._python_normalize()}Schema", + columns=rendered_columns, + ) + + +def _collect_columns(yaml_schema) -> list[Column]: + columns = [] + for col_name, col_info in yaml_schema["columns"].items(): + parsed_numpy_dtype = col_info["type"] + parsed_validations = [] + parsed_metrics = [] + + for candidate_type in _numpy_type_to_pandera_mapping: + if re.search(candidate_type, parsed_numpy_dtype) is not None: + derived_pandera_type = _numpy_type_to_pandera_mapping[candidate_type] + + if col_info.get("validations"): + for validation_name, validation_body in col_info.get("validations").items(): + for candidate_validation in _supported_validations: + if candidate_validation.is_matched(validation_name): + parsed_validations.append(candidate_validation.parse_from_dict(validation_body)) + if col_info.get("metrics"): + for metric_name in col_info["metrics"]: + for metric_candidate in _supported_metrics: + if metric_candidate.is_matched(metric_name): + parsed_metrics.append(metric_candidate()) + + assert derived_pandera_type is not None, "Could not match the numpy dtype to pandera type" + + columns.append( + Column( + name=col_name, + data_type=derived_pandera_type, + validations=parsed_validations, + metrics=parsed_metrics, + ) + ) + + return columns diff --git a/dynamicio/validations.py b/dynamicio/validations.py deleted file mode 100644 index 26bd843..0000000 --- a/dynamicio/validations.py +++ /dev/null @@ -1,347 +0,0 @@ -"""Implements the Validator class responsible for various generic data validations and metrics generation.""" -import operator -from typing import Callable, NamedTuple, Set - -import pandas as pd # type: ignore - -ALL_VALIDATORS = {} # name -> function - - -def validator(func: Callable): - """A decorator to add the function to the ALL_VALIDATORS dict""" - name = func.__name__ - assert name not in ALL_VALIDATORS - ALL_VALIDATORS[name] = func - return func - - -class ValidationResult(NamedTuple): - """A NamedTuple for capturing different outputs after a validation.""" - - valid: bool - message: str - value: float - - -@validator -def has_unique_values(dataset: str, df: pd.DataFrame, column: str) -> ValidationResult: - """Checks if values in column are unique. - - Args: - dataset: Name fo the dataset_name - df: A pandas DataFrame - column: The column to be validated - - Returns: - An instance of ValidationResult where `Validation.Result.valid` is a bool indicate the success of the validation, - `Validation.Result.message` is a message (usually used in exceptions), and `Validation.Result.value` is no_of_duplicated_elements - """ - counts = df[column].value_counts() - if not (counts > 1).any(): - return ValidationResult(valid=True, message=f"{dataset}[{column}] has unique values", value=0) - - duplicates = counts[counts > 1].index.to_list() - return ValidationResult(valid=False, message=f"Values {duplicates} for {dataset}[{column}] are duplicated!", value=len(duplicates)) - - -@validator -def has_no_null_values(dataset: str, df: pd.DataFrame, column: str) -> ValidationResult: - """Checks if column has any null values (including NaN and NaT values). - - Args: - dataset: Name fo the dataset_name - df: A pandas DataFrame - column: The column to be validated - - Returns: - An instance of ValidationResult where `Validation.Result.valid` is a bool indicate the success of the validation, - `Validation.Result.message` is a message (usually used in exceptions), and `Validation.Result.value` is no_of_nulls - """ - mask = df[column].isnull() - no_of_nulls = mask.sum() - return ValidationResult(valid=not mask.any(), message=f"{dataset}[{column}] has {no_of_nulls} nulls", value=no_of_nulls) - - -@validator -def has_acceptable_percentage_of_nulls( - dataset: str, - df: pd.DataFrame, - column: str, - threshold: float, -) -> ValidationResult: - """Checks if a provided threshold of max nulls has been exceeded. - - Note: For an empty df the validation will always be successful - - Args: - dataset: Name fo the dataset_name - df: A pandas DataFrame - column: The column to be validated - threshold: Maximum allowed threshold - - Returns: - An instance of ValidationResult where `Validation.Result.valid` is a bool indicate the success of the validation, - `Validation.Result.message` is a message (usually used in exceptions), and `Validation.Result.value` is percentage_of_nulls - """ - if threshold <= 0 or threshold >= 1: - raise ValueError(f"Threshold value: {threshold} must be a value between 0 and 1.") - - no_of_nulls = df[column].isnull().sum() - if len(df) == 0: - percentage_of_nulls = 0 - else: - percentage_of_nulls = no_of_nulls / len(df) - - if percentage_of_nulls < threshold: - return ValidationResult( - valid=True, - message=f"Percentage of nulls of for {dataset}[{column}] is {percentage_of_nulls}", - value=percentage_of_nulls, - ) - return ValidationResult( - valid=False, - message=f"Percentage of nulls of for {dataset}[{column}] is {percentage_of_nulls} which exceeds threshold: {threshold}", - value=percentage_of_nulls, - ) - - -@validator -def is_in(dataset: str, df: pd.DataFrame, column: str, categorical_values: Set[str], match_all: bool = True) -> ValidationResult: - """Checks if the column only has allowed categorical values as per the set provided. - - Note: - Ignores nulls - - Args: - dataset: Name fo the dataset_name - df: A DataFrame - column: The DataFrame column to be validated - categorical_values: The allowed set of categorical values - match_all: If True, the categorical values must be a subset of the allowed set, otherwise they must be equal - - Returns: - An instance of ValidationResult where `Validation.Result.valid` is a bool indicate the success of the validation, - `Validation.Result.message` is a message (usually used in exceptions), and `Validation.Result.value` is no_of_not_acceptable - """ - unique_values = set(df[column][df[column].notna()].unique()) - - if match_all: - return _validate_categoricals_are_a_subset_of_the_acceptable(categorical_values, unique_values, column, dataset, df) - return _validate_all_acceptable_categoricals_are_present(categorical_values, unique_values, column, dataset, df) - - -@validator -def _validate_all_acceptable_categoricals_are_present(acceptable_categoricals: Set[str], unique_values: Set[str], column: str, dataset: str, df: pd.DataFrame) -> ValidationResult: - if unique_values == acceptable_categoricals: - validation_result = ValidationResult(valid=True, message=f"All acceptable categorical values for {dataset}[{column}] are present", value=0) - elif unique_values < acceptable_categoricals: - validation_result = ValidationResult( - valid=False, - message=f"Missing categorical values for {dataset}[{column}]: {acceptable_categoricals - unique_values}", - value=len(acceptable_categoricals - unique_values), - ) - else: - count_invalid = (~df[column].isin(acceptable_categoricals)).sum() - validation_result = ValidationResult( - valid=False, - message=f"Values {unique_values - set(acceptable_categoricals)} for {dataset}[{column}] are not acceptable for {count_invalid} cells", - value=count_invalid, - ) - return validation_result - - -@validator -def _validate_categoricals_are_a_subset_of_the_acceptable(acceptable_categoricals: Set[str], unique_values: Set[str], column: str, dataset: str, df: pd.DataFrame) -> ValidationResult: - if unique_values.issubset(acceptable_categoricals): - return ValidationResult(valid=True, message=f"Categorical values for {dataset}[{column}] are acceptable", value=0) - count_invalid = (~df[column].isin(acceptable_categoricals)).sum() - return ValidationResult( - valid=False, - message=f"Values {unique_values - set(acceptable_categoricals)} for {dataset}[{column}] are not acceptable for {count_invalid} cells", - value=count_invalid, - ) - - -@validator -def is_greater_than( - dataset: str, - df: pd.DataFrame, - column: str, - threshold: float, -) -> ValidationResult: - """Confirms column values are above a given threshold. - - Args: - dataset: Name fo the dataset_name - df: A DataFrame - column: The DataFrame column to be validated - threshold: A lower bound threshold not to be exceeded - - Returns: - An instance of ValidationResult where `Validation.Result.valid` is a bool indicate the success of the validation, - `Validation.Result.message` is a message (usually used in exceptions), and `Validation.Result.value` is the - percentage of invalid values - """ - no_nulls_for_column_df = df[~df[column].isnull()][column] - valid = no_nulls_for_column_df > threshold - - if valid.all(): - return ValidationResult(valid=True, message=f"All values of {dataset}[{column}] are above {threshold}", value=0) - - no_of_invalid = (~valid).sum() - return ValidationResult( - valid=False, - message=f"{no_of_invalid} cell values for {dataset}[{column}] are below {threshold}", - value=no_of_invalid / len(no_nulls_for_column_df), - ) - - -@validator -def is_greater_than_or_equal( - dataset: str, - df: pd.DataFrame, - column: str, - threshold: float, -) -> ValidationResult: - """Confirms column values are above a given threshold. - - Args: - dataset: Name fo the dataset_name - df: A DataFrame - column: The DataFrame column to be validated - threshold: A lower bound threshold not to be exceeded - - Returns: - An instance of ValidationResult where `Validation.Result.valid` is a bool indicate the success of the validation, - `Validation.Result.message` is a message (usually used in exceptions), and `Validation.Result.value` is the - percentage of invalid values - """ - no_nulls_for_column_df = df[~df[column].isnull()][column] - valid = no_nulls_for_column_df >= threshold - - if valid.all(): - return ValidationResult(valid=True, message=f"All values of {dataset}[{column}] are above {threshold}", value=0) - - no_of_invalid = (~valid).sum() - return ValidationResult( - valid=False, - message=f"{no_of_invalid} cell values for {dataset}[{column}] are below {threshold}", - value=no_of_invalid / len(no_nulls_for_column_df), - ) - - -@validator -def is_lower_than( - dataset: str, - df: pd.DataFrame, - column: str, - threshold: float, -) -> ValidationResult: - """Confirms column values are below a given threshold. - - IMPORTANT NOTE: Ignores nulls! - - Args: - dataset: Name fo the dataset_name - df: A DataFrame - column: The DataFrame column to be validated - threshold: A lower bound threshold not to be exceeded - - Returns: - An instance of ValidationResult where `Validation.Result.valid` is a bool indicate the success of the validation, - `Validation.Result.message` is a message (usually used in exceptions), and `Validation.Result.value` is the percentage of - invalid values - """ - no_nulls_for_column_df = df[~df[column].isnull()][column] - valid = no_nulls_for_column_df < threshold # pd.DataFrame - - if valid.all(): - return ValidationResult(valid=True, message=f"All values of {dataset}[{column}] are below {threshold}", value=0) - - no_of_invalid = (~valid).sum() - return ValidationResult( - valid=False, - message=f"{no_of_invalid} cell values for {dataset}[{column}] are above {threshold}", - value=no_of_invalid / len(no_nulls_for_column_df), - ) - - -@validator -def is_lower_than_or_equal( - dataset: str, - df: pd.DataFrame, - column: str, - threshold: float, -) -> ValidationResult: - """Confirms column values are below a given threshold. - - IMPORTANT NOTE: Ignores nulls! - - Args: - dataset: Name fo the dataset_name - df: A DataFrame - column: The DataFrame column to be validated - threshold: A lower bound threshold not to be exceeded - - Returns: - An instance of ValidationResult where `Validation.Result.valid` is a bool indicate the success of the validation, - `Validation.Result.message` is a message (usually used in exceptions), and `Validation.Result.value` is the percentage of - invalid values - """ - no_nulls_for_column_df = df[~df[column].isnull()][column] - valid = no_nulls_for_column_df <= threshold - - if valid.all(): - return ValidationResult(valid=True, message=f"All values of {dataset}[{column}] are below {threshold}", value=0) - - no_of_invalid = (~valid).sum() - return ValidationResult( - valid=False, - message=f"{no_of_invalid} cell values for {dataset}[{column}] are above {threshold}", - value=no_of_invalid / len(no_nulls_for_column_df), - ) - - -@validator -def is_between( - dataset: str, - df: pd.DataFrame, - column: str, - lower: float, - upper: float, - include_left: bool = False, - include_right: bool = False, -) -> ValidationResult: - """Confirms column values are between a lower bound and an upper bound thresholds. - - IMPORTANT NOTE: Ignores nulls! - - Args: - dataset: Name fo the dataset_name - df: A DataFrame - column: The DataFrame column to be validated - lower: The lower bound (left) - upper: The upper bound (right) - include_left: `left <= df[column]` - include_right: `df[column] <=right` - - Returns: - An instance of ValidationResult where `Validation.Result.valid` is a bool indicate the success of the validation, - `Validation.Result.message` is a message (usually used in exceptions), and `Validation.Result.value` is the percentage of - invalid values - """ - no_nulls_for_column_df = df[~df[column].isnull()][column] - lower_bound_operator = operator.ge if include_left else operator.gt - upper_bound_operator = operator.le if include_right else operator.lt - - valid = lower_bound_operator(no_nulls_for_column_df, lower) & upper_bound_operator(no_nulls_for_column_df, upper) - - if valid.all(): - return ValidationResult(valid=True, message=f"All values of {dataset}[{column}] is between {lower} and {upper} thresholds", value=0) - - no_of_invalid = (~valid).sum() - return ValidationResult( - valid=False, - message=f"{no_of_invalid} cell values for {dataset}[{column}] are either below {lower} or above {upper}", - value=no_of_invalid / len(no_nulls_for_column_df), - ) diff --git a/dynamicio/validators.py b/dynamicio/validators.py new file mode 100644 index 0000000..fa45d12 --- /dev/null +++ b/dynamicio/validators.py @@ -0,0 +1 @@ +"""Custom validators for the dynamicio, to be used with pandera schemas.""" diff --git a/mypy.ini b/mypy.ini new file mode 100644 index 0000000..886e142 --- /dev/null +++ b/mypy.ini @@ -0,0 +1,4 @@ +[mypy-pandera] +ignore_errors = True +[mypy] +warn_unused_configs = True diff --git a/pyproject.toml b/pyproject.toml index 855818d..2386f63 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [tool.black] py38 = true -line-length = 185 +line-length = 120 include = '\.pyi?$' exclude = ''' ( @@ -30,7 +30,3 @@ exclude = ''' ] addopts = "-p no:warnings" log_cli = false - -[tool.pydocstyle] -convention = 'google' -add_ignore = 'D103' # Ignore missing docstring in public function diff --git a/requirements-dev.txt b/requirements-dev.txt index e96b75e..9633d9c 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -8,7 +8,6 @@ gitlint==0.17.0 mock==4.0.3 mypy==0.990 pre-commit==2.20.0 -pydocstyle==6.1.1 pylint==2.15.5 pytest-asyncio==0.20.2 pytest-cov==4.0.0 diff --git a/requirements.txt b/requirements.txt index bb39097..4a6a521 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,14 +4,16 @@ fastparquet>=0.8.0 fsspec==2022.3.0 kafka-python~=2.0.2 logzero>=1.7.0 -magic-logger>=1.0.2 -pandas>=1.2.4 +pandas~=1.2 psycopg2-binary~=2.9.3 pyarrow>=7.0.0 python-json-logger~=2.0.1 -PyYAML~=5.4.1 +PyYAML>=5.4.1 s3fs==0.4.2 simplejson~=3.17.2 SQLAlchemy~=1.4.11 tables~=3.7.0 pydantic~=1.10.2 +pandera~=0.14.5 +typer==0.9.0 +uhura~=1.5.0 diff --git a/tests/conftest.py b/tests/conftest.py index e03a7c4..1540430 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,913 +1,34 @@ -# pylint: disable=missing-module-docstring, missing-class-docstring, missing-function-docstring, -import os -import pickle -import pickletools -import tempfile -from unittest.mock import Mock, patch - -import numpy as np import pandas as pd import pytest -from dynamicio import WithS3PathPrefix -from tests import constants -from tests.mocking.models import ERModel - -TEST_SQL_DIR = os.path.dirname(os.path.abspath(__file__)) + "/test_sql/" -__pickle_loads = pickle.loads - - -def mock_pickle_loads(data): - global MAX_PROTO_FOUND # pylint: disable=global-variable-undefined - op, fst, _ = next(pickletools.genops(data)) # pylint: disable=invalid-name) - if op.name == "PROTO": - proto = fst - MAX_PROTO_FOUND = max(MAX_PROTO_FOUND, proto) - return __pickle_loads(data) - - -def max_pklproto_hdf(hdf_filename): - global MAX_PROTO_FOUND # pylint: disable=global-variable-undefined - MAX_PROTO_FOUND = -1 - with pytest.MonkeyPatch().context() as mocked_context: - mocked_context.setattr(pickle, "loads", mock_pickle_loads) - try: - pd.read_hdf(hdf_filename) - except ValueError: - pass - return MAX_PROTO_FOUND - - -class DummyYaml: - def __init__(self, path): - self.path = path - - def __repr__(self): - return f"DummyYaml({self.path!r})" - - def __enter__(self): - return Mock(), None - - def __exit__(self, *args): - return None - - -@pytest.fixture -def expected_input_yaml_dict(): - return { - "bindings": { - "READ_FROM_S3_CSV_ALT": { - "name": "READ_FROM_S3_CSV_ALT", - "environments": { - "LOCAL": { - "options": {}, - "data_backend_type": "local", - "local": { - "file_path": f"{constants.TEST_RESOURCES}/data/input/some_csv_to_read.csv", - "file_type": "csv", - }, - }, - "CLOUD": { - "options": {}, - "data_backend_type": "s3", - "s3": { - "file_path": "mock-key", - "file_type": "csv", - "bucket": "mock-bucket", - }, - }, - }, - "dynamicio_schema": None, - }, - "READ_FROM_S3_CSV": { - "name": "READ_FROM_S3_CSV", - "environments": { - "LOCAL": { - "options": {}, - "data_backend_type": "local", - "local": { - "file_path": f"{constants.TEST_RESOURCES}/data/input/some_csv_to_read.csv", - "file_type": "csv", - }, - }, - "CLOUD": { - "options": {}, - "data_backend_type": "s3", - "s3": { - "file_path": "mock-key", - "file_type": "csv", - "bucket": "mock-bucket", - }, - }, - }, - "dynamicio_schema": { - "name": "read_from_s3_csv", - "columns": { - "id": { - "name": "id", - "data_type": "int64", - "validations": [ - {"name": "has_unique_values", "apply": True, "options": {}}, - { - "name": "has_no_null_values", - "apply": True, - "options": {}, - }, - ], - "metrics": ["UniqueCounts", "Counts"], - }, - "foo_name": { - "name": "foo_name", - "data_type": "object", - "validations": [ - { - "name": "has_no_null_values", - "apply": True, - "options": {}, - }, - { - "name": "is_in", - "apply": True, - "options": { - "categorical_values": [ - "class_a", - "class_b", - "class_c", - ] - }, - }, - ], - "metrics": ["CountsPerLabel"], - }, - "bar": { - "name": "bar", - "data_type": "int64", - "validations": [ - { - "name": "has_no_null_values", - "apply": True, - "options": {}, - }, - { - "name": "is_greater_than", - "apply": True, - "options": {"threshold": 1000}, - }, - { - "name": "is_lower_than", - "apply": True, - "options": {"threshold": 2000}, - }, - ], - "metrics": ["Min", "Max", "Mean", "Std", "Variance"], - }, - }, - }, - }, - "READ_FROM_S3_JSON": { - "name": "READ_FROM_S3_JSON", - "environments": { - "LOCAL": { - "options": {}, - "data_backend_type": "local", - "local": { - "file_path": f"{constants.TEST_RESOURCES}/data/input/some_json_to_read.json", - "file_type": "json", - }, - }, - "CLOUD": { - "options": {}, - "data_backend_type": "s3", - "s3": { - "file_path": "mock-key", - "file_type": "json", - "bucket": "mock-bucket", - }, - }, - }, - "dynamicio_schema": None, - }, - "READ_FROM_S3_HDF": { - "name": "READ_FROM_S3_HDF", - "environments": { - "LOCAL": { - "options": {}, - "data_backend_type": "local", - "local": { - "file_path": f"{constants.TEST_RESOURCES}/data/input/some_hdf_to_read.h5", - "file_type": "hdf", - }, - }, - "CLOUD": { - "options": {}, - "data_backend_type": "s3", - "s3": { - "file_path": "mock-key", - "file_type": "hdf", - "bucket": "mock-bucket", - }, - }, - }, - "dynamicio_schema": None, - }, - "READ_FROM_S3_PARQUET": { - "name": "READ_FROM_S3_PARQUET", - "environments": { - "LOCAL": { - "options": {}, - "data_backend_type": "local", - "local": { - "file_path": f"{constants.TEST_RESOURCES}/data/input/some_parquet_to_read.parquet", - "file_type": "parquet", - }, - }, - "CLOUD": { - "options": {}, - "data_backend_type": "s3", - "s3": { - "file_path": "s3:sample-prefix/mock-key", - "file_type": "parquet", - "bucket": "mock-bucket", - }, - }, - }, - "dynamicio_schema": None, - }, - "READ_FROM_POSTGRES": { - "name": "READ_FROM_POSTGRES", - "environments": { - "LOCAL": { - "options": {}, - "data_backend_type": "local", - "local": { - "file_path": f"{constants.TEST_RESOURCES}/data/input/some_pg_parquet_to_read.parquet", - "file_type": "parquet", - }, - }, - "CLOUD": { - "options": {}, - "data_backend_type": "postgres", - "postgres": { - "db_host": "127.0.0.1", - "db_port": "17039", - "db_name": "backend", - "db_user": "user", - "db_password": "pass", - }, - }, - }, - "dynamicio_schema": None, - }, - "READ_FROM_KAFKA": { - "name": "READ_FROM_KAFKA", - "environments": { - "LOCAL": { - "options": {}, - "data_backend_type": "local", - "local": { - "file_path": f"{constants.TEST_RESOURCES}/data/input/some_parquet_to_read.parquet", - "file_type": "parquet", - }, - }, - "CLOUD": { - "options": {}, - "data_backend_type": "kafka", - "kafka": { - "kafka_server": "mock-kafka-server", - "kafka_topic": "mock-kafka-topic", - }, - }, - }, - "dynamicio_schema": None, - }, - "TEMPLATED_FILE_PATH": { - "name": "TEMPLATED_FILE_PATH", - "environments": { - "LOCAL": { - "options": {}, - "data_backend_type": "local", - "local": { - "file_path": f"{constants.TEST_RESOURCES}/data/input/{{file_name_to_replace}}.csv", - "file_type": "csv", - }, - }, - "CLOUD": { - "options": {}, - "data_backend_type": "s3", - "s3": { - "file_path": "path/to/{file_name_to_replace}.csv", - "file_type": "csv", - "bucket": "mock-bucket", - }, - }, - }, - "dynamicio_schema": None, - }, - "READ_FROM_PARQUET_TEMPLATED": { - "name": "READ_FROM_PARQUET_TEMPLATED", - "environments": { - "LOCAL": { - "options": {}, - "data_backend_type": "local", - "local": { - "file_path": f"{constants.TEST_RESOURCES}/data/input/{{file_name_to_replace}}.parquet", - "file_type": "parquet", - }, - }, - "CLOUD": { - "options": {}, - "data_backend_type": "s3", - "s3": { - "file_path": "path/to/{file_name_to_replace}.parquet", - "file_type": "parquet", - "bucket": "mock-bucket", - }, - }, - }, - "dynamicio_schema": None, - }, - "REPLACE_SCHEMA_WITH_DYN_VARS": { - "name": "REPLACE_SCHEMA_WITH_DYN_VARS", - "environments": { - "LOCAL": { - "options": {}, - "data_backend_type": "local", - "local": { - "file_path": f"{constants.TEST_RESOURCES}/data/input/{{file_name_to_replace}}.parquet", - "file_type": "parquet", - }, - } - }, - "dynamicio_schema": { - "name": "bar", - "columns": { - "column_a": { - "name": "column_a", - "data_type": "object", - "validations": [ - {"name": "has_unique_values", "apply": True, "options": {}} - ], - "metrics": ["Counts"], - }, - "column_b": { - "name": "column_b", - "data_type": "object", - "validations": [ - {"name": "has_no_null_values", "apply": True, "options": {}} - ], - "metrics": ["CountsPerLabel"], - }, - "column_c": { - "name": "column_c", - "data_type": "float64", - "validations": [ - { - "name": "is_greater_than", - "apply": True, - "options": {"threshold": 1000}, - } - ], - "metrics": [], - }, - "column_d": { - "name": "column_d", - "data_type": "float64", - "validations": [ - { - "name": "is_lower_than", - "apply": True, - "options": {"threshold": 1000.0}, - } - ], - "metrics": ["Min", "Max", "Mean", "Std", "Variance"], - }, - "0": { - "name": "0", - "data_type": "object", - "validations": [], - "metrics": [], - }, - "1": { - "name": "1", - "data_type": "object", - "validations": [], - "metrics": [], - }, - }, - }, - }, - } - } - - -@pytest.fixture -def expected_s3_csv_local_mapping(): - return { - "name": "READ_FROM_S3_CSV", - "environments": { - "LOCAL": { - "options": {}, - "data_backend_type": "local", - "local": { - "file_path": f"{constants.TEST_RESOURCES}/data/input/some_csv_to_read.csv", - "file_type": "csv", - }, - }, - "CLOUD": { - "options": {}, - "data_backend_type": "s3", - "s3": { - "file_path": "mock-key", - "file_type": "csv", - "bucket": "mock-bucket", - }, - }, - }, - "dynamicio_schema": { - "name": "read_from_s3_csv", - "columns": { - "id": { - "name": "id", - "data_type": "int64", - "validations": [ - {"name": "has_unique_values", "apply": True, "options": {}}, - {"name": "has_no_null_values", "apply": True, "options": {}}, - ], - "metrics": ["UniqueCounts", "Counts"], - }, - "foo_name": { - "name": "foo_name", - "data_type": "object", - "validations": [ - {"name": "has_no_null_values", "apply": True, "options": {}}, - { - "name": "is_in", - "apply": True, - "options": { - "categorical_values": ["class_a", "class_b", "class_c"] - }, - }, - ], - "metrics": ["CountsPerLabel"], - }, - "bar": { - "name": "bar", - "data_type": "int64", - "validations": [ - {"name": "has_no_null_values", "apply": True, "options": {}}, - { - "name": "is_greater_than", - "apply": True, - "options": {"threshold": 1000}, - }, - { - "name": "is_lower_than", - "apply": True, - "options": {"threshold": 2000}, - }, - ], - "metrics": ["Min", "Max", "Mean", "Std", "Variance"], - }, - }, - }, - } - - -@pytest.fixture -def expected_s3_csv_cloud_mapping(): - return { - "name": "read_from_s3_csv", - "columns": { - "id": { - "name": "id", - "data_type": "int64", - "validations": [ - {"name": "has_unique_values", "apply": True, "options": {}}, - {"name": "has_no_null_values", "apply": True, "options": {}}, - ], - "metrics": ["UniqueCounts", "Counts"], - }, - "foo_name": { - "name": "foo_name", - "data_type": "object", - "validations": [ - {"name": "has_no_null_values", "apply": True, "options": {}}, - { - "name": "is_in", - "apply": True, - "options": { - "categorical_values": ["class_a", "class_b", "class_c"] - }, - }, - ], - "metrics": ["CountsPerLabel"], - }, - "bar": { - "name": "bar", - "data_type": "int64", - "validations": [ - {"name": "has_no_null_values", "apply": True, "options": {}}, - { - "name": "is_greater_than", - "apply": True, - "options": {"threshold": 1000}, - }, - { - "name": "is_lower_than", - "apply": True, - "options": {"threshold": 2000}, - }, - ], - "metrics": ["Min", "Max", "Mean", "Std", "Variance"], - }, - }, - } - - -@pytest.fixture -def expected_postgres_cloud_mapping(): - return { - "options": {}, - "data_backend_type": "postgres", - "postgres": { - "db_host": "127.0.0.1", - "db_port": "17039", - "db_name": "backend", - "db_user": "user", - "db_password": "pass", - }, - } - - -@pytest.fixture -def expected_s3_parquet_df(): - return pd.read_parquet(f"{constants.TEST_RESOURCES}/data/input/some_parquet_to_read.parquet") - - -@pytest.fixture(scope="class") -def expected_s3_hdf_file_path(): - return f"{constants.TEST_RESOURCES}/data/input/some_hdf_to_read.h5" - - -@pytest.fixture(scope="class") -def expected_s3_hdf_df(expected_s3_hdf_file_path): # pylint: disable=redefined-outer-name - return pd.read_hdf(expected_s3_hdf_file_path) - - -@pytest.fixture -def expected_s3_json_df(): - return pd.read_json(f"{constants.TEST_RESOURCES}/data/input/some_json_to_read.json", orient="columns") - - -@pytest.fixture -def expected_s3_csv_df(): - return pd.read_csv(f"{constants.TEST_RESOURCES}/data/input/some_csv_to_read.csv") - - -@pytest.fixture -def expected_df_with_less_columns(): - df = pd.DataFrame.from_records( - [ - [1, "name_a"], - [2, "name_b"], - [3, "name_a"], - [4, "name_b"], - [5, "name_a"], - [6, "name_b"], - [7, "name_a"], - [8, "name_b"], - [9, "name_a"], - [10, "name_b"], - [11, "name_a"], - [12, "name_b"], - [13, "name_a"], - [14, "name_b"], - [15, "name_a"], - ], - columns=["id", "foo_name"], - ) - return df - - -@pytest.fixture -def dataset_with_more_columns_than_dictated_in_schema(): - df = pd.DataFrame.from_records( - [ - [1, "foo_a", 1, 1500, 1600, "pass_through"], - [2, "foo_b", 2, 1500, 1600, "pass_through"], - [3, "foo_a", 3, 1500, 1600, "pass_through"], - [4, "foo_b", 4, 1500, 1600, "pass_through"], - [5, "foo_a", 5, 1500, 1600, "pass_through"], - [6, "foo_b", 6, 1500, 1600, "pass_through"], - [7, "foo_a", 7, 1500, 1600, "pass_through"], - [8, "foo_b", 8, 1500, 1600, "pass_through"], - [9, "foo_a", 9, 1500, 1600, "pass_through"], - [10, "foo_b", 10, 1500, 1600, "pass_through"], - [11, "foo_a", 11, 1500, 1600, "pass_through"], - [12, "foo_b", 12, 1500, 1600, "pass_through"], - [13, "foo_a", 13, 1500, 1600, "pass_through"], - [14, "foo_b", 14, 1500, 1600, "pass_through"], - [15, "foo_a", 15, 1500, 1600, "pass_through"], - ], - columns=["id", "foo_name", "bar", "start_odometer", "end_odometer", "event_type"], - ) - return df - @pytest.fixture def test_df(): - df = pd.DataFrame.from_records( - [ - ["cm_1", "id_1", 1000, "ABC"], - ["cm_2", "id_2", 1000, "ABC"], - ["cm_3", "id_3", 1000, "ABC"], - ], - columns=["id", "foo", "bar", "baz"], - ) - return df - - -@pytest.fixture -def expected_columns(): - return [ERModel.id, ERModel.foo, ERModel.bar, ERModel.baz] - - -@pytest.fixture -def expected_kwargs_for_read_parquet(): - return {"engine", "columns", "kwargs", "path", "use_nullable_dtypes"} - - -@pytest.fixture -def expected_value_serializer(): - return {'value_serializer': 'WithKafka._default_value_serializer'} + return pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"], "c": [True, False, True]}) @pytest.fixture -def input_messages_df(): - return pd.DataFrame.from_dict( - [ - {"id": "message01", "foo": "xxxxxxxx", "bar": 0, "baz": ["a", "b", "c"]}, - {"id": "message02", "foo": "yyyyyyyy", "bar": 1, "baz": ["d", "e", "f"]}, - ] - ) +def injectable_string(): + return "{var1}/{var2}" @pytest.fixture -def input_schema_definition(): - return { - "columns": { - "id": { - "metrics": ["UniqueCounts", "Counts"], - "type": "int64", - "validations": { - "has_no_null_values": {"apply": True, "options": {}}, - "has_unique_values": {"apply": True, "options": {}}, - }, - }, - "bar": { - "metrics": ["Min", "Max", "Mean", "Std", "Var"], - "type": "int64", - "validations": { - "has_no_null_values": {"apply": True, "options": {}}, - "is_greater_than": {"apply": True, "options": {"threshold": 1000}}, - "is_lower_than": {"apply": True, "options": {"threshold": 2000}}, - }, - }, - "foo_name": { - "metrics": None, - "type": "object", - "validations": { - "is_in": { - "apply": True, - "options": {"categorical_values": ["class_a", "class_b", "class_c"]}, - }, - "has_no_null_values": {"apply": True, "options": {}}, - }, - }, - }, - "name": "read_from_s3_csv", - } - - -# @pytest.fixture -# def expected_schema(): -# return {"id": "int64", "foo_name": "object", "bar": "int64"} - - -@pytest.fixture -def expected_schema_definition(): - return { - "name": "READ_FROM_S3_CSV", - "environments": { - "LOCAL": { - "options": {}, - "data_backend_type": "local", - "local": { - "file_path": f"{constants.TEST_RESOURCES}/data/input/some_csv_to_read.csv", - "file_type": "csv", - }, - }, - "CLOUD": { - "options": {}, - "data_backend_type": "s3", - "s3": { - "file_path": "mock-key", - "file_type": "csv", - "bucket": "mock-bucket", - }, - }, - }, - "dynamicio_schema": { - "name": "read_from_s3_csv", - "columns": { - "id": { - "name": "id", - "data_type": "int64", - "validations": [ - {"name": "has_unique_values", "apply": True, "options": {}}, - {"name": "has_no_null_values", "apply": True, "options": {}}, - ], - "metrics": ["UniqueCounts", "Counts"], - }, - "foo_name": { - "name": "foo_name", - "data_type": "object", - "validations": [ - {"name": "has_no_null_values", "apply": True, "options": {}}, - { - "name": "is_in", - "apply": True, - "options": { - "categorical_values": ["class_a", "class_b", "class_c"] - }, - }, - ], - "metrics": ["CountsPerLabel"], - }, - "bar": { - "name": "bar", - "data_type": "int64", - "validations": [ - {"name": "has_no_null_values", "apply": True, "options": {}}, - { - "name": "is_greater_than", - "apply": True, - "options": {"threshold": 1000}, - }, - { - "name": "is_lower_than", - "apply": True, - "options": {"threshold": 2000}, - }, - ], - "metrics": ["Min", "Max", "Mean", "Std", "Variance"], - }, - }, - }, - } - - -@pytest.fixture -def valid_dataframe(): - return pd.DataFrame.from_dict( - { - "id": [3, 2, 1, 0], - "foo_name": ["class_a", "class_b", "class_c", "class_a"], - "bar": [1500, 1500, 1500, 1500], - } - ) - - -@pytest.fixture -def invalid_dataframe(): - return pd.DataFrame.from_dict( - { - "id": [3, 2, 0, 0], - "foo_name": ["class_a", "class_b", "class_d", "class_a"], - "bar": [999, 1500, 2500, 1500], - } - ) - - -@pytest.fixture -def expected_messages(): - return { - "has_unique_values", - "is_in", - "is_greater_than", - "is_lower_than", - } - - -@pytest.fixture -def input_df(): - return pd.DataFrame.from_records( - [ - ["event_0", "A", "A", "discharge", 10.01234, pd.NA, pd.Timestamp("2021-03-30"), 100.01234, 5, 5, ], - ["event_1", "B", "B", "pass_through", 10.01234, None, pd.Timestamp("2021-03-30"), 100.01234, 6, 6, ], - ["event_2", "A", "A", "load", None, None, pd.NaT, pd.NA, 7, 7], - ["event_3", "B", "B", "pass_through", 10.01234, 10.01234, pd.Timestamp("2021-03-30"), 100.01234, 8, 8, ], - ["event_4", "C", pd.NA, "load", 10.01234, 10.01234, pd.Timestamp("2021-03-30"), 100.01234, 9, 9, ], - ["event_5", "A", "A", "pass_through", 10.01234, 10.01234, pd.Timestamp("2021-03-30"), 100.01234, 8, 8, ], - ["event_6", "C", "C", "discharge", 10.01234, 10.01234, pd.Timestamp("2021-03-30"), 100.01234, 7, 7, ], - ["event_7", "A", None, "discharge", 10.01234, 10.01234, pd.Timestamp("2021-03-30"), 100.01234, 6, 6, ], - ["event_8", None, np.nan, "discharge", 10.01234, 10.01234, pd.Timestamp("2021-03-30"), 100.01234, 5, 5, ], - ["event_9", "A", "A", "discharge", 10.01234, 10.01234, pd.Timestamp("2021-03-30"), 100.01234, 5, None, ], - ], - columns=["id", "category_a", "category_b", "activity", "duration_a", "duration_b", "start_time", "load", "weight_a", "weight_b", ], - ) - - -@pytest.fixture -def empty_df(): - return pd.DataFrame.from_records( - [], - columns=["id", "category_a", "category_b", "activity", "duration_a", "duration_b", "start_time", "load", "weight_a", "weight_b", ], - ) - - -# Mocks -s3_obj_file_names = ["s3://path/to/obj_1.h5", "s3://path/to/obj_2.h5", "s3://path/to/obj_3.h5"] -invalid_s3_obj_file_names = ["s3://path/to/.gitkeep", "s3://path/to/obj_2.h5", "s3://path/to/obj_3.h5"] -local_obj_file_names = ["obj_1.h5", "obj_2.h5", "obj_3.h5"] -invalid_local_obj_file_names = ["obj_2.h5", "obj_3.h5"] - - -@pytest.fixture -def mock__read_hdf_file(): - def return_mock_df(path, _schema, **_options): - path_id_map = {"temp/" + f: i + 1 for i, f in enumerate(local_obj_file_names)} - - return pd.DataFrame({"id": [path_id_map[path]], "foo_name": ["class_a"], "bar": [1001]}) - - with patch.object(WithS3PathPrefix, "_read_hdf_file", side_effect=return_mock_df) as mock: - yield mock - - -@pytest.fixture -def mock__read_parquet_file(): - def return_mock_df(path, _schema, **_options): - path_id_map = {"temp/" + f: i + 1 for i, f in enumerate(local_obj_file_names)} - - return pd.DataFrame({"id": [path_id_map[path]], "foo_name": ["class_a"], "bar": [1001]}) - - with patch.object(WithS3PathPrefix, "_read_parquet_file", side_effect=return_mock_df) as mock: - yield mock - - -@pytest.fixture -def mock__read_csv_file(): - def return_mock_df(path, _schema, **_options): - path_id_map = {"temp/" + f: i + 1 for i, f in enumerate(local_obj_file_names)} - - return pd.DataFrame({"id": [path_id_map[path]], "foo_name": ["class_a"], "bar": [1001]}) - - with patch.object(WithS3PathPrefix, "_read_csv_file", side_effect=return_mock_df) as mock: - yield mock - - -@pytest.fixture -def mock__read_json_file(): - def return_mock_df(path, _schema, **_options): - path_id_map = {"temp/" + f: i + 1 for i, f in enumerate(local_obj_file_names)} - - return pd.DataFrame({"id": [path_id_map[path]], "foo_name": ["class_a"], "bar": [1001]}) - - with patch.object(WithS3PathPrefix, "_read_json_file", side_effect=return_mock_df) as mock: - yield mock +def failing_injections(): + return {"var1": Exception()} @pytest.fixture -# pylint: disable=invalid-name -def mock_temporary_directory(): - with patch.object(tempfile, "TemporaryDirectory") as mock: - mock.return_value.__enter__.return_value = "temp" - yield mock +def passing_injections(): + return {"var1": "hello", "var2": "there"} -@pytest.fixture -def mock_listdir(): - with patch.object(os, "listdir", return_value=local_obj_file_names) as mock: - yield mock - - -@pytest.fixture -def mock_invalid_listdir(): - with patch.object(os, "listdir", return_value=invalid_local_obj_file_names) as mock: - yield mock - - -@pytest.fixture -# pylint: disable=invalid-name -def mock_parquet_temporary_directory(): - with patch.object(tempfile, "TemporaryDirectory") as mock: - mock.return_value.__enter__.return_value = os.path.join(constants.TEST_RESOURCES, "data/input/batch/parquet") - yield mock - - -@pytest.fixture -# pylint: disable=invalid-name -def mock_parquet_temporary_directory_w_empty_files(): - with patch.object(tempfile, "TemporaryDirectory") as mock: - mock.return_value.__enter__.return_value = os.path.join(constants.TEST_RESOURCES, "data/input/batch/parquet_w_empty_files") - yield mock +@pytest.fixture( + params=[ + "sample.csv", + "sample.parquet", + "sample.json", + "sample.h5", + ] +) +def file_name(request): + return request.param diff --git a/tests/constants.py b/tests/constants.py index 62cb1ec..d22b4e4 100644 --- a/tests/constants.py +++ b/tests/constants.py @@ -1,17 +1,5 @@ -"""A module for configuring all dynamic environment variables for testing purposes""" +"""A module with constants used in tests.""" -import os +from pathlib import Path -TEST_RESOURCES = os.path.join(os.path.dirname(os.path.realpath(__file__)), "resources") - -# Dynamic Vars -MOCK_BUCKET = "mock-bucket" -MOCK_KEY = "mock-key" -KAFKA_SERVER = "mock-kafka-server" -KAFKA_TOPIC = "mock-kafka-topic" -DB_HOST = "127.0.0.1" -DB_PORT = "17039" -DB_NAME = "backend" -DB_USER = "user" -DB_PASS = "pass" -LOWER_THAN_LIMIT = 1000 +TEST_FIXTURES = Path(__file__).parent / "fixtures" diff --git a/tests/fixtures/sample.csv b/tests/fixtures/sample.csv new file mode 100644 index 0000000..1df431d --- /dev/null +++ b/tests/fixtures/sample.csv @@ -0,0 +1,4 @@ +a,b,c +1,x,True +2,y,False +3,z,True diff --git a/tests/resources/data/input/batch/not_just_hdf/part_01.h5 b/tests/fixtures/sample.h5 similarity index 99% rename from tests/resources/data/input/batch/not_just_hdf/part_01.h5 rename to tests/fixtures/sample.h5 index f8f5e23..97ea786 100644 Binary files a/tests/resources/data/input/batch/not_just_hdf/part_01.h5 and b/tests/fixtures/sample.h5 differ diff --git a/tests/resources/data/input/batch/hdf/part_01.h5 b/tests/fixtures/sample.hdf similarity index 99% rename from tests/resources/data/input/batch/hdf/part_01.h5 rename to tests/fixtures/sample.hdf index f8f5e23..6d5af3c 100644 Binary files a/tests/resources/data/input/batch/hdf/part_01.h5 and b/tests/fixtures/sample.hdf differ diff --git a/tests/fixtures/sample.json b/tests/fixtures/sample.json new file mode 100644 index 0000000..e7f1aa0 --- /dev/null +++ b/tests/fixtures/sample.json @@ -0,0 +1 @@ +{"a":{"0":1,"1":2,"2":3},"b":{"0":"x","1":"y","2":"z"},"c":{"0":true,"1":false,"2":true}} \ No newline at end of file diff --git a/tests/fixtures/sample.parquet b/tests/fixtures/sample.parquet new file mode 100644 index 0000000..46d7e96 Binary files /dev/null and b/tests/fixtures/sample.parquet differ diff --git a/tests/fixtures/schemas.py b/tests/fixtures/schemas.py new file mode 100644 index 0000000..aecdab9 --- /dev/null +++ b/tests/fixtures/schemas.py @@ -0,0 +1,10 @@ +# pylint: disable=missing-module-docstring, missing-class-docstring, missing-function-docstring, R0801 + +from pandera import SchemaModel +from pandera.typing import Series + + +class SampleSchema(SchemaModel): + a: Series[int] + b: Series[str] + c: Series[bool] diff --git a/tests/mocking/__init__.py b/tests/mocking/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/mocking/io.py b/tests/mocking/io.py deleted file mode 100644 index 8ed59fb..0000000 --- a/tests/mocking/io.py +++ /dev/null @@ -1,188 +0,0 @@ -# pylint: disable=missing-class-docstring, missing-module-docstring, missing-function-docstring - -from dynamicio import UnifiedIO -from dynamicio.core import SCHEMA_FROM_FILE - - -class ReadS3IO(UnifiedIO): - schema = {"id": "int64"} - - -class ReadMockS3CsvIO(UnifiedIO): - schema = SCHEMA_FROM_FILE - - -class TemplatedFile(UnifiedIO): - schema = {"id": "int64", "foo_name": "object", "bar": "int64"} - - -class ReadLocalParquetTemplated(UnifiedIO): - schema = {"id": "int64", "foo_name": "object", "bar": "int64"} - - -class ReadS3CsvIO(UnifiedIO): - schema = SCHEMA_FROM_FILE - - -class ReadS3DataWithLessColumnsIO(UnifiedIO): - schema = {"id": "int64", "foo_name": "object"} - - -class ReadS3DataWithFalseTypes(UnifiedIO): - schema = {"id": "float64", "foo_name": "object"} - - -class ReadS3DataWithLessColumnsAndMessedOrderOfColumnsIO(UnifiedIO): - schema = { - "bar": "int64", - "foo_name": "object", - "a_number": "int64", - "b_number": "int64", - "bar_type": "object", - } - - -class ReadS3ParquetIO(UnifiedIO): - schema = {"id": "int64", "foo_name": "object", "bar": "int64"} - - -class ReadS3ParquetWEmptyFilesIO(UnifiedIO): - schema = {"id": "object", "bar": "int64"} - - -class ReadS3ParquetWithLessColumnsIO(UnifiedIO): - schema = {"id": "int64", "foo_name": "object"} - - -class ReadS3HdfIO(UnifiedIO): - schema = {"id": "int64", "foo_name": "object", "bar": "int64"} - - -class AsyncReadS3HdfIO(UnifiedIO): - schema = {"col_1": "int64", "col_2": "object"} - - -class ReadS3JsonIO(UnifiedIO): - schema = {"id": "int64", "foo_name": "object", "bar": "int64"} - - -class WriteS3ParquetIO(UnifiedIO): - schema = {"col_1": "int64", "col_2": "object"} - - -class WriteS3ParquetExternalIO(UnifiedIO): - schema = { - "bar": "int64", - "event_type": "object", - "id": "int64", - "end_odometer": "int64", - "foo_name": "object", - } - - -class WriteS3CsvIO(UnifiedIO): - schema = {"id": "int64", "foo_name": "object", "bar": "int64"} - - -class WriteS3CsvWithSchema(UnifiedIO): - schema = SCHEMA_FROM_FILE - - -class WriteS3HdfIO(UnifiedIO): - schema = {"col_1": "int64", "col_2": "object"} - - -class WriteS3JsonIO(UnifiedIO): - schema = {"col_1": "int64", "col_2": "object"} - - -class ReadPostgresIO(UnifiedIO): - schema = {"id": "object", "foo": "object", "bar": "int64", "baz": "object"} - - -class WritePostgresIO(UnifiedIO): - schema = {"id": "object", "foo": "object", "bar": "int64", "baz": "object"} - - -class WriteExtendedPostgresIO(UnifiedIO): - schema = {"id": "object", "foo": "object", "bar": "int64", "start_date": "datetime64[ns]", "active": "bool", "net": "float64"} - - -class WriteKafkaIO(UnifiedIO): - schema = {"id": "object", "foo": "object", "bar": "int64", "baz": "object"} - - -class WriteKeyedKafkaIO(UnifiedIO): - schema = {"key": "object", "id": "object", "foo": "object", "bar": "int64", "baz": "object"} - - -class MockKafkaProducer: - def __init__(self): - self.my_stream = [] - - def send(self, topic: str, value: dict, key: str = None): # pylint: disable=unused-argument - self.my_stream.append({"key": key, "value": value}) - - def flush(self): - pass - - def close(self): - pass - - -class ReadS3ParquetWithDifferentCastableDTypeIO(UnifiedIO): - schema = {"id": "int64", "foo_name": "object", "bar": "int64"} - - # Input format of some_parquet_to_read.parquet is: - # id,foo_name,bar - # 1,foo_a,1 - # 2,foo_b,2 - # ... - # 15,foo_a,15 - - -class ReadS3ParquetWithDifferentNonCastableDTypeIO(UnifiedIO): - schema = {"id": "int64", "foo_name": "int64", "bar": "int64"} - - # Input format of some_parquet_to_read.parquet is: - # id,foo_name,bar - # 1,foo_a,1 - # 2,foo_b,2 - # ... - # 15,foo_a,15 - - -class ReadFromBatchLocalParquet(UnifiedIO): - schema = {"id": "int64", "foo_name": "object", "bar": "int64"} - - -class ReadFromBatchLocalHdf(UnifiedIO): - schema = {"id": "int64", "foo_name": "object", "bar": "int64"} - - -class ParquetWithSomeBool(UnifiedIO): - schema = {"id": "int64", "foo_name": "object", "bar": "int64", "bool_col": "bool"} - - -class CsvWithSomeBool(UnifiedIO): - schema = {"id": "int64", "foo_name": "object", "bar": "int64", "bool_col": "bool"} - - -class HdfWithSomeBool(UnifiedIO): - schema = {"id": "int64", "foo_name": "object", "bar": "int64", "bool_col": "bool"} - - -class JsonWithSomeBool(UnifiedIO): - schema = {"id": "int64", "foo_name": "object", "bar": "int64", "bool_col": "bool"} - - -class ParquetWithCustomValidate(UnifiedIO): - schema = {"id": "int64", "foo_name": "object", "bar": "int64", "bool_col": "bool"} - - @staticmethod - def validate(df): - if not df["id"].is_unique: - return False - if df["bar"].isna().any(): - return False - return True diff --git a/tests/mocking/models.py b/tests/mocking/models.py deleted file mode 100644 index e19d09f..0000000 --- a/tests/mocking/models.py +++ /dev/null @@ -1,33 +0,0 @@ -"""A module for defining sql_alchemy models.""" -# pylint: disable=too-few-public-methods, R0801, C0104 -__all__ = ["ERModel"] - -from sqlalchemy import Column, Integer, String -from sqlalchemy.ext.declarative import declarative_base - -Base = declarative_base() - - -class ERModel(Base): - """ - Sql_alchemy model for example table - - """ - - __tablename__ = "example" - - id = Column(String, primary_key=True) - foo = Column(String) - bar = Column(Integer) - baz = Column(String) - - -clsdict = { - "clsname": "PgModel", - "__tablename__": "pg", - "id": Column(String(64), primary_key=True, nullable=False), - "foo": Column(String(64)), - "bar": Column(Integer()), - "baz": Column(String(64)), -} -PgModel = type(clsdict["clsname"], (Base,), clsdict) diff --git a/tests/resource_tests/test_kafka.py b/tests/resource_tests/test_kafka.py new file mode 100644 index 0000000..cfaea28 --- /dev/null +++ b/tests/resource_tests/test_kafka.py @@ -0,0 +1,52 @@ +# flake8: noqa: I101 + +from unittest.mock import MagicMock, call, patch + +import pytest + +from dynamicio import KafkaResource + + +@pytest.fixture +def mocked_kafka_producer(): + mocked_kafka_producer = MagicMock() + with patch("dynamicio.io.kafka.KafkaProducer") as kafka_producer: + kafka_producer.return_value = mocked_kafka_producer + yield mocked_kafka_producer + + +@pytest.fixture +def kafka_resource() -> KafkaResource: + return KafkaResource(topic="test_topic", server="test_server") + + +def test_kafka_resource_write(test_df, kafka_resource, mocked_kafka_producer): + kafka_resource.write(test_df) + mocked_kafka_producer.send.assert_has_calls( + [ + call("test_topic", key=0, value={"a": 1, "b": "x", "c": True}), + call("test_topic", key=1, value={"a": 2, "b": "y", "c": False}), + call("test_topic", key=2, value={"a": 3, "b": "z", "c": True}), + ] + ) + + +def test_kafka_resource_read(kafka_resource): + with pytest.raises(NotImplementedError): + kafka_resource.read() + + +def test_kafka_inject_success(kafka_resource, passing_injections, test_df, mocked_kafka_producer): + kafka_resource.topic = "{var1}" + kafka_resource.server = "{var2}" + kafka_resource = kafka_resource.inject(**passing_injections) + assert kafka_resource.topic == passing_injections["var1"] + assert kafka_resource.server == passing_injections["var2"] + kafka_resource.write(test_df) + mocked_kafka_producer.send.assert_has_calls( + [ + call(passing_injections["var1"], key=0, value={"a": 1, "b": "x", "c": True}), + call(passing_injections["var1"], key=1, value={"a": 2, "b": "y", "c": False}), + call(passing_injections["var1"], key=2, value={"a": 3, "b": "z", "c": True}), + ] + ) diff --git a/tests/resource_tests/test_local_file.py b/tests/resource_tests/test_local_file.py new file mode 100644 index 0000000..47c0b82 --- /dev/null +++ b/tests/resource_tests/test_local_file.py @@ -0,0 +1,28 @@ +import pandas as pd + +from dynamicio.io import LocalFileResource +from tests import constants +from tests.fixtures.schemas import SampleSchema + + +def test_read(test_df, file_name): + resource = LocalFileResource(path=constants.TEST_FIXTURES / file_name) + df = resource.read() + pd.testing.assert_frame_equal(df, test_df) + + +def test_read_with_schema(test_df, file_name): + resource = LocalFileResource(path=constants.TEST_FIXTURES / file_name, pa_schema=SampleSchema) + df = resource.read() + pd.testing.assert_frame_equal(df, test_df) + + +def test_write(test_df, tmpdir, file_name): + resource = LocalFileResource(path=tmpdir / file_name) + resource.write(test_df) + # reading should probably not be done with the config here + df = resource.read() + pd.testing.assert_frame_equal(df, test_df) + + +# TODO: test float json thing diff --git a/tests/resource_tests/test_postgres.py b/tests/resource_tests/test_postgres.py new file mode 100644 index 0000000..4543cf8 --- /dev/null +++ b/tests/resource_tests/test_postgres.py @@ -0,0 +1,241 @@ +from unittest.mock import ANY, MagicMock, Mock, patch + +import pandas as pd +import pytest + +from dynamicio import PostgresResource +from tests import constants +from tests.fixtures.schemas import SampleSchema + +sample_path = f"{constants.TEST_FIXTURES}/sample.parquet" + + +@pytest.fixture +def postgres_table_resource() -> PostgresResource: + return PostgresResource( + db_user="test_user", + db_host="test_host", + db_port=1234, + db_name="test_db", + db_schema="republic", + table_name="test_table", + ) + + +@pytest.fixture +def postgres_query_resource() -> PostgresResource: + return PostgresResource( + db_user="test_user", + db_host="test_host", + db_port=1234, + db_name="test_db", + db_schema="republic", + sql_query="SELECT * FROM other_table", + ) + + +@pytest.fixture +def mock_cursor(): + return MagicMock() + + +@pytest.fixture +def mock_binding(): + return "mock_binding" + + +@pytest.fixture +def mocked_session(mock_cursor, mock_binding): + mock_session = MagicMock() + mock_session.connection.return_value.connection.cursor.return_value = mock_cursor + mock_session.get_bind.return_value = mock_binding + mock_session_maker = Mock(return_value=mock_session) + with patch("dynamicio.io.postgres.Session", mock_session_maker): + yield mock_session + + +@pytest.fixture +def postgres_df(postgres_table_resource) -> pd.DataFrame: + return pd.read_parquet(sample_path) + + +@pytest.fixture +def read_sql_mock(postgres_df): + with patch("pandas.read_sql", return_value=postgres_df) as mock: + yield mock + + +@pytest.fixture +def to_sql_mock(postgres_df): + with patch("pandas.DataFrame.to_sql", return_value=None) as mock: + yield mock + + +def test_postgres_resource_read(postgres_table_resource, postgres_df, read_sql_mock, mocked_session, mock_binding): + df = postgres_table_resource.read() + read_sql_mock.assert_called_once_with(sql="SELECT * FROM republic.test_table", con=mock_binding) + pd.testing.assert_frame_equal(df, postgres_df) + + +def test_postgres_resource_read_with_schema(postgres_df, read_sql_mock, mocked_session, mock_binding): + resource = PostgresResource( + db_user="test_user", + db_host="test_host", + db_port=1234, + db_name="test_db", + db_schema="republic", + table_name="test_table", + pa_schema=SampleSchema, + ) + df = resource.read() + read_sql_mock.assert_called_once_with(sql="SELECT * FROM republic.test_table", con=mock_binding) + pd.testing.assert_frame_equal(df, postgres_df) + + +def test_postgres_resource_read_without_application_name(): + mocked_session_scope = MagicMock() + with patch("dynamicio.io.postgres.session_scope", mocked_session_scope): + resource = PostgresResource( + db_user="test_user", + db_host="test_host", + db_port=1234, + db_name="test_db", + db_schema="republic", + table_name="test_table", + pa_schema=SampleSchema, + ) + try: + df = resource.read() + except Exception as e: + pass + + mocked_session_scope.assert_called_once_with("postgresql://test_user@test_host:1234/test_db", None) + + +def test_postgres_resource_read_with_application_name(): + mocked_session_scope = MagicMock() + with patch("dynamicio.io.postgres.session_scope", mocked_session_scope): + resource = PostgresResource( + db_user="test_user", + db_host="test_host", + db_port=1234, + db_name="test_db", + db_schema="republic", + table_name="test_table", + pa_schema=SampleSchema, + application_name="test_app", + ) + try: + df = resource.read() + except Exception as e: + pass + + mocked_session_scope.assert_called_once_with("postgresql://test_user@test_host:1234/test_db", "test_app") + + +class PgFilterSampleSchema(SampleSchema): + class Config: + strict = "filter" + + +def test_postgres_resource_read_with_filter_schema( + postgres_table_resource, postgres_df, read_sql_mock, mocked_session, mock_binding +): + postgres_table_resource.pa_schema = PgFilterSampleSchema + df = postgres_table_resource.read() + read_sql_mock.assert_called_once_with( + sql="SELECT a, b, c FROM republic.test_table", + con=mock_binding, + ) + pd.testing.assert_frame_equal(df, postgres_df) + + +def test_postgres_query_resource_read( + postgres_query_resource, postgres_df, read_sql_mock, mocked_session, mock_binding +): + df = postgres_query_resource.read() + read_sql_mock.assert_called_once_with(sql="SELECT * FROM other_table", con=mock_binding) + pd.testing.assert_frame_equal(df, postgres_df) + + +# --- Write tests --- + + +def test_postgres_resource_write( + postgres_table_resource, postgres_df, to_sql_mock, mocked_session, mock_binding, mock_cursor +): + postgres_table_resource.write(postgres_df) + to_sql_mock.assert_called_once_with( + name="test_table", con=mock_binding, if_exists="replace", index=False, schema="republic" + ) + + +def test_postgres_resource_write_truncate_and_append( + postgres_table_resource, postgres_df, to_sql_mock, mocked_session, mock_binding, mock_cursor +): + postgres_table_resource.truncate_and_append = True + postgres_table_resource.write(postgres_df) + mocked_session.execute.assert_called_once_with("TRUNCATE TABLE republic.test_table;") + mock_cursor.execute.assert_called_once_with("SET search_path TO republic;") + mock_cursor.copy_from.assert_called_once_with(ANY, "test_table", columns=postgres_df.columns, null="") + + +def test_postgres_resource_inject_and_read(postgres_df, read_sql_mock, mocked_session, mock_binding): + resource = PostgresResource( + db_user="{db_user}", + db_host="{db_host}", + db_port=1234, + db_name="that_{db_name}", + db_schema="{republic}", + table_name="{table}", + ) + resource = resource.inject( + db_user="test_user", db_host="test_host", db_name="test_db", table="test_table", republic="republic" + ) + df = resource.read() + read_sql_mock.assert_called_once_with(sql="SELECT * FROM republic.test_table", con=mock_binding) + pd.testing.assert_frame_equal(df, postgres_df) + + +def test_postgres_resource_inject_and_read_query(postgres_df, read_sql_mock, mocked_session, mock_binding): + resource = PostgresResource( + db_user="{db_user}", + db_host="{db_host}", + db_port=1234, + db_name="that_{db_name}", + db_schema="{republic}", + sql_query="SELECT * FROM {republic}.{table}", + ) + resource = resource.inject( + db_user="test_user", db_host="test_host", db_name="test_db", table="test_table", republic="republic" + ) + df = resource.read() + read_sql_mock.assert_called_once_with(sql="SELECT * FROM republic.test_table", con=mock_binding) + pd.testing.assert_frame_equal(df, postgres_df) + + +def test_postgres_resource_raises_on_wrong_read_configuration(postgres_df, read_sql_mock, mocked_session, mock_binding): + resource = PostgresResource( + db_user="test_user", + db_host="test_host", + db_port=1234, + db_name="test_db", + table_name="test_table", + sql_query="SELECT * FROM other_table", + ) + with pytest.raises(ValueError): + resource.read() + + +def test_postgres_resource_raises_on_wrong_write_configuration( + postgres_df, read_sql_mock, mocked_session, mock_binding, to_sql_mock, mock_cursor +): + resource = PostgresResource( + db_user="test_user", + db_host="test_host", + db_port=1234, + db_name="test_db", + sql_query="SELECT * FROM other_table", + ) + with pytest.raises(ValueError): + resource.write(postgres_df) diff --git a/tests/resource_tests/test_s3.py b/tests/resource_tests/test_s3.py new file mode 100644 index 0000000..245f275 --- /dev/null +++ b/tests/resource_tests/test_s3.py @@ -0,0 +1,75 @@ +from contextlib import contextmanager +from typing import Generator + +import boto3 +import pandas as pd +import pytest +from botocore.stub import Stubber +from unittest.mock import patch + +from dynamicio.io import S3Resource +from tests import constants +from tests.fixtures.schemas import SampleSchema + + +@pytest.fixture +def with_s3_stubber(): + s3_client = boto3.client("s3") + Stubber(s3_client) + + with patch("boto3.client"): + yield s3_client + + +@pytest.fixture +def with_mocked_named_reader(): + @contextmanager + def mocked_named_reader(s3_client, s3_bucket: str, s3_key: str) -> Generator: + name = s3_bucket + "/" + s3_key + target_file = type("MockNamedTemporaryFile", (object,), {"name": name})() + yield target_file + + with patch(f"dynamicio.io.s3.s3_named_file_reader", new=mocked_named_reader) as target: + yield target + + +@contextmanager +def mocked_s3_generator(s3_client, s3_bucket: str, s3_key: str) -> Generator: + yield s3_bucket + "/" + s3_key + + +@pytest.fixture +def with_mocked_reader(): + with patch(f"dynamicio.io.s3.s3_reader", new=mocked_s3_generator) as target: + yield target + + +@pytest.fixture +def with_mocked_writer(): + with patch(f"dynamicio.io.s3.s3_writer", new=mocked_s3_generator) as target: + yield target + + +@pytest.fixture(autouse=True) +def with_mocked_s3(with_mocked_named_reader, with_mocked_reader, with_mocked_writer, with_s3_stubber): + yield + + +def test_read(test_df, file_name): + resource = S3Resource(bucket=str(constants.TEST_FIXTURES), path=file_name) + df = resource.read() + pd.testing.assert_frame_equal(df, test_df) + + +def test_read_with_schema(test_df, file_name): + resource = S3Resource(bucket=str(constants.TEST_FIXTURES), path=file_name, pa_schema=SampleSchema) + df = resource.read() + pd.testing.assert_frame_equal(df, test_df) + + +def test_write(test_df, tmpdir, file_name): + resource = S3Resource(bucket=str(tmpdir), path=file_name) + resource.write(test_df) + # reading should probably not be done with the config here + df = resource.read() + pd.testing.assert_frame_equal(df, test_df) diff --git a/tests/resources/data/external/h5_with_more_columns.h5 b/tests/resources/data/external/h5_with_more_columns.h5 deleted file mode 100644 index 37b4fa4..0000000 Binary files a/tests/resources/data/external/h5_with_more_columns.h5 and /dev/null differ diff --git a/tests/resources/data/external/json_with_more_columns.json b/tests/resources/data/external/json_with_more_columns.json deleted file mode 100644 index 75bd97c..0000000 --- a/tests/resources/data/external/json_with_more_columns.json +++ /dev/null @@ -1,104 +0,0 @@ -{ - "id":{ - "0":1, - "1":2, - "2":3, - "3":4, - "4":5, - "5":6, - "6":7, - "7":8, - "8":9, - "9":10, - "10":11, - "11":12, - "12":13, - "13":14, - "14":15 - }, - "foo_name":{ - "0":"foo_a", - "1":"foo_b", - "2":"foo_a", - "3":"foo_b", - "4":"foo_a", - "5":"foo_b", - "6":"foo_a", - "7":"foo_b", - "8":"foo_a", - "9":"foo_b", - "10":"foo_a", - "11":"foo_b", - "12":"foo_a", - "13":"foo_b", - "14":"foo_a" - }, - "bar":{ - "0":1, - "1":2, - "2":3, - "3":4, - "4":5, - "5":6, - "6":7, - "7":8, - "8":9, - "9":10, - "10":11, - "11":12, - "12":13, - "13":14, - "14":15 - }, - "bar_type":{ - "0":"my-type", - "1":"my-type", - "2":"my-type", - "3":"my-type", - "4":"my-type", - "5":"my-type", - "6":"my-type", - "7":"my-type", - "8":"my-type", - "9":"my-type", - "10":"my-type", - "11":"my-type", - "12":"my-type", - "13":"my-type", - "14":"my-type" - }, - "a_number":{ - "0":1500, - "1":1500, - "2":1500, - "3":1500, - "4":1500, - "5":1500, - "6":1500, - "7":1500, - "8":1500, - "9":1500, - "10":1500, - "11":1500, - "12":1500, - "13":1500, - "14":1500 - }, - "b_number":{ - "0":1600, - "1":1600, - "2":1600, - "3":1600, - "4":1600, - "5":1600, - "6":1600, - "7":1600, - "8":1600, - "9":1600, - "10":1600, - "11":1600, - "12":1600, - "13":1600, - "14":1600 - } -} \ No newline at end of file diff --git a/tests/resources/data/input/batch/hdf/part_02.h5 b/tests/resources/data/input/batch/hdf/part_02.h5 deleted file mode 100644 index 62b6174..0000000 Binary files a/tests/resources/data/input/batch/hdf/part_02.h5 and /dev/null differ diff --git a/tests/resources/data/input/batch/not_just_hdf/part_02.h5 b/tests/resources/data/input/batch/not_just_hdf/part_02.h5 deleted file mode 100644 index 62b6174..0000000 Binary files a/tests/resources/data/input/batch/not_just_hdf/part_02.h5 and /dev/null differ diff --git a/tests/resources/data/input/batch/not_just_hdf/something_to_ignore.txt b/tests/resources/data/input/batch/not_just_hdf/something_to_ignore.txt deleted file mode 100644 index e69de29..0000000 diff --git a/tests/resources/data/input/batch/not_just_parquet/part_01.parquet b/tests/resources/data/input/batch/not_just_parquet/part_01.parquet deleted file mode 100644 index fb5deb4..0000000 Binary files a/tests/resources/data/input/batch/not_just_parquet/part_01.parquet and /dev/null differ diff --git a/tests/resources/data/input/batch/not_just_parquet/part_02.parquet b/tests/resources/data/input/batch/not_just_parquet/part_02.parquet deleted file mode 100644 index 00934a8..0000000 Binary files a/tests/resources/data/input/batch/not_just_parquet/part_02.parquet and /dev/null differ diff --git a/tests/resources/data/input/batch/not_just_parquet/something_to_ignore.txt b/tests/resources/data/input/batch/not_just_parquet/something_to_ignore.txt deleted file mode 100644 index e69de29..0000000 diff --git a/tests/resources/data/input/batch/parquet/part_01.parquet b/tests/resources/data/input/batch/parquet/part_01.parquet deleted file mode 100644 index fb5deb4..0000000 Binary files a/tests/resources/data/input/batch/parquet/part_01.parquet and /dev/null differ diff --git a/tests/resources/data/input/batch/parquet/part_02.parquet b/tests/resources/data/input/batch/parquet/part_02.parquet deleted file mode 100644 index 00934a8..0000000 Binary files a/tests/resources/data/input/batch/parquet/part_02.parquet and /dev/null differ diff --git a/tests/resources/data/input/batch/parquet_w_empty_files/emptyfile.parquet b/tests/resources/data/input/batch/parquet_w_empty_files/emptyfile.parquet deleted file mode 100644 index 9eea44b..0000000 Binary files a/tests/resources/data/input/batch/parquet_w_empty_files/emptyfile.parquet and /dev/null differ diff --git a/tests/resources/data/input/batch/parquet_w_empty_files/fullfile.parquet b/tests/resources/data/input/batch/parquet_w_empty_files/fullfile.parquet deleted file mode 100644 index 5fdaca4..0000000 Binary files a/tests/resources/data/input/batch/parquet_w_empty_files/fullfile.parquet and /dev/null differ diff --git a/tests/resources/data/input/some_csv_to_read.csv b/tests/resources/data/input/some_csv_to_read.csv deleted file mode 100644 index a6f8c5f..0000000 --- a/tests/resources/data/input/some_csv_to_read.csv +++ /dev/null @@ -1,16 +0,0 @@ -id,foo_name,bar -1,name_a,1 -2,name_b,2 -3,name_a,3 -4,name_b,4 -5,name_a,5 -6,name_b,6 -7,name_a,7 -8,name_b,8 -9,name_a,9 -10,name_b,10 -11,name_a,11 -12,name_b,12 -13,name_a,13 -14,name_b,14 -15,name_a,15 \ No newline at end of file diff --git a/tests/resources/data/input/some_hdf_to_read.h5 b/tests/resources/data/input/some_hdf_to_read.h5 deleted file mode 100644 index 39e6995..0000000 Binary files a/tests/resources/data/input/some_hdf_to_read.h5 and /dev/null differ diff --git a/tests/resources/data/input/some_json_to_read.json b/tests/resources/data/input/some_json_to_read.json deleted file mode 100644 index 0e6bd00..0000000 --- a/tests/resources/data/input/some_json_to_read.json +++ /dev/null @@ -1,53 +0,0 @@ -{ - "id":{ - "0":1, - "1":2, - "2":3, - "3":4, - "4":5, - "5":6, - "6":7, - "7":8, - "8":9, - "9":10, - "10":11, - "11":12, - "12":13, - "13":14, - "14":15 - }, - "foo_name":{ - "0":"name_a", - "1":"name_b", - "2":"name_a", - "3":"name_b", - "4":"name_a", - "5":"name_b", - "6":"name_a", - "7":"name_b", - "8":"name_a", - "9":"name_b", - "10":"name_a", - "11":"name_b", - "12":"name_a", - "13":"name_b", - "14":"name_a" - }, - "bar":{ - "0":1, - "1":2, - "2":3, - "3":4, - "4":5, - "5":6, - "6":7, - "7":8, - "8":9, - "9":10, - "10":11, - "11":12, - "12":13, - "13":14, - "14":15 - } -} \ No newline at end of file diff --git a/tests/resources/data/input/some_parquet_to_read.parquet b/tests/resources/data/input/some_parquet_to_read.parquet deleted file mode 100644 index 9eb054b..0000000 Binary files a/tests/resources/data/input/some_parquet_to_read.parquet and /dev/null differ diff --git a/tests/resources/data/input/some_pg_parquet_to_read.parquet b/tests/resources/data/input/some_pg_parquet_to_read.parquet deleted file mode 100644 index ddf1d30..0000000 Binary files a/tests/resources/data/input/some_pg_parquet_to_read.parquet and /dev/null differ diff --git a/tests/resources/data/temp/.gitkeep b/tests/resources/data/temp/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/tests/resources/definitions/external.yaml b/tests/resources/definitions/external.yaml deleted file mode 100644 index 8c17d7e..0000000 --- a/tests/resources/definitions/external.yaml +++ /dev/null @@ -1,83 +0,0 @@ ---- -READ_FROM_ATHENA: - LOCAL: - type: "local" - local: - file_path: "[[ TEST_RESOURCES ]]/data/input/some_parquet_to_read.parquet" - file_type: "parquet" - CLOUD: - type: "athena" - athena: - db_host: "[[ DB_HOST ]]" - db_port: "[[ DB_PORT ]]" - db_name: "[[ DB_NAME ]]" - db_user: "[[ DB_USER ]]" - db_password: "[[ DB_PASS ]]" - -READ_MOCK_S3_CSV: - LOCAL: - type: "local" - local: - file_path: "[[ TEST_RESOURCES ]]/data/input/some_csv_to_read.csv" - file_type: "csv" - CLOUD: - type: "s3" - s3: - bucket: "[[ MOCK_BUCKET ]]" - file_path: "[[ MOCK_KEY ]]" - file_type: "csv" - # Missing SCHEMA - -READ_FROM_S3_JSON: - LOCAL: - type: "local" - local: - file_path: "[[ TEST_RESOURCES ]]/data/external/json_with_more_columns.json" - file_type: "json" - CLOUD: - type: "s3" - s3: - bucket: "[[ MOCK_BUCKET ]]" - file_path: "[[ MOCK_KEY ]]" - file_type: "json" - -READ_FROM_S3_HDF: - LOCAL: - type: "local" - local: - file_path: "[[ TEST_RESOURCES ]]/data/external/h5_with_more_columns.h5" - file_type: "hdf" - CLOUD: - type: "s3" - s3: - bucket: "[[ MOCK_BUCKET ]]" - file_path: "[[ MOCK_KEY ]]" - file_type: "hdf" - -WRITE_TO_S3_PARQUET: - LOCAL: - type: "local" - local: - file_path: "[[ TEST_RESOURCES ]]/data/external/some_parquet_with_schema_dictated_column_order.parquet" - file_type: "parquet" - CLOUD: - type: "s3" - s3: - bucket: "[[ MOCK_BUCKET ]]" - file_path: "test/write_some_parquet.parquet" - file_type: "parquet" - -WRITE_TO_S3_CSV: - LOCAL: - type: "local" - local: - file_path: "[[ TEST_RESOURCES ]]/data/external/write_some_csv.csv" - file_type: "csv" - CLOUD: - type: "s3" - s3: - bucket: "[[ MOCK_BUCKET ]]" - file_path: "test/write_some_csv.csv" - file_type: "csv" - schema: - file_path: "[[ TEST_RESOURCES ]]/schemas/write_to_s3_csv.yaml" diff --git a/tests/resources/definitions/input.yaml b/tests/resources/definitions/input.yaml deleted file mode 100644 index 2edeac3..0000000 --- a/tests/resources/definitions/input.yaml +++ /dev/null @@ -1,287 +0,0 @@ ---- -READ_FROM_S3_CSV_ALT: - LOCAL: - type: "local" - local: - file_path: "[[ TEST_RESOURCES ]]/data/input/some_csv_to_read.csv" - file_type: "csv" - CLOUD: - type: "s3_file" - s3: - bucket: "[[ MOCK_BUCKET ]]" - file_path: "[[ MOCK_KEY ]]" - file_type: "csv" - -READ_FROM_S3_CSV: - LOCAL: - type: "local" - local: - file_path: "[[ TEST_RESOURCES ]]/data/input/some_csv_to_read.csv" - file_type: "csv" - CLOUD: - type: "s3_file" - s3: - bucket: "[[ MOCK_BUCKET ]]" - file_path: "[[ MOCK_KEY ]]" - file_type: "csv" - schema: - file_path: "[[ TEST_RESOURCES ]]/schemas/read_from_s3_csv.yaml" - -READ_FROM_S3_JSON: - LOCAL: - type: "local" - local: - file_path: "[[ TEST_RESOURCES ]]/data/input/some_json_to_read.json" - file_type: "json" - CLOUD: - type: "s3_file" - s3: - bucket: "[[ MOCK_BUCKET ]]" - file_path: "[[ MOCK_KEY ]]" - file_type: "json" - -READ_FROM_S3_HDF: - LOCAL: - type: "local" - local: - file_path: "[[ TEST_RESOURCES ]]/data/input/some_hdf_to_read.h5" - file_type: "hdf" - CLOUD: - type: "s3_file" - s3: - bucket: "[[ MOCK_BUCKET ]]" - file_path: "[[ MOCK_KEY ]]" - file_type: "hdf" - -READ_FROM_S3_PARQUET: - LOCAL: - type: "local" - local: - file_path: "[[ TEST_RESOURCES ]]/data/input/some_parquet_to_read.parquet" - file_type: "parquet" - CLOUD: - type: "s3_file" - s3: - bucket: "[[ MOCK_BUCKET ]]" - file_path: "s3:sample-prefix/[[ MOCK_KEY ]]" - file_type: "parquet" - -READ_FROM_S3_PATH_PREFIX_CSV: - LOCAL: - type: "local" - local: - file_path: "[[ TEST_RESOURCES ]]/data/input/some_csv_to_read.csv" - file_type: "csv" - CLOUD: - type: "s3_path_prefix" - s3: - bucket: "[[ MOCK_BUCKET ]]" - path_prefix: "[[ MOCK_KEY ]]" - file_type: "csv" - schema: - file_path: "[[ TEST_RESOURCES ]]/schemas/read_from_s3_csv.yaml" - -READ_FROM_S3_PATH_PREFIX_PARQUET: - LOCAL: - type: "local" - local: - file_path: "[[ TEST_RESOURCES ]]/data/input/some_parquet_to_read.parquet" - file_type: "parquet" - CLOUD: - type: "s3_path_prefix" - s3: - bucket: "[[ MOCK_BUCKET ]]" - path_prefix: "[[ MOCK_KEY ]]" - file_type: "parquet" - -READ_FROM_S3_PATH_PREFIX_HDF: - LOCAL: - type: "local" - local: - file_path: "[[ TEST_RESOURCES ]]/data/input/some_hdf_to_read.h5" - file_type: "hdf" - CLOUD: - type: "s3_path_prefix" - s3: - bucket: "[[ MOCK_BUCKET ]]" - path_prefix: "[[ MOCK_KEY ]]" - file_type: "hdf" - -READ_FROM_S3_PATH_PREFIX_JSON: - LOCAL: - type: "local" - local: - file_path: "[[ TEST_RESOURCES ]]/data/input/some_json_to_read.json" - file_type: "hdf" - CLOUD: - type: "s3_path_prefix" - s3: - bucket: "[[ MOCK_BUCKET ]]" - path_prefix: "[[ MOCK_KEY ]]" - file_type: "json" - -READ_FROM_POSTGRES: - LOCAL: - type: "local" - local: - file_path: "[[ TEST_RESOURCES ]]/data/input/some_pg_parquet_to_read.parquet" - file_type: "parquet" - CLOUD: - type: "postgres" - postgres: - db_host: "[[ DB_HOST ]]" - db_port: "[[ DB_PORT ]]" - db_name: "[[ DB_NAME ]]" - db_user: "[[ DB_USER ]]" - db_password: "[[ DB_PASS ]]" - schema: - file_path: "[[ TEST_RESOURCES ]]/schemas/pg.yaml" - -READ_FROM_POSTGRES_WITH_QUERY_IN_OPTIONS: - CLOUD: - type: "postgres" - postgres: - db_host: "[[ DB_HOST ]]" - db_port: "[[ DB_PORT ]]" - db_name: "[[ DB_NAME ]]" - db_user: "[[ DB_USER ]]" - db_password: "[[ DB_PASS ]]" - options: - sql_query: "SELECT * FROM table_name_from_yaml_options" - schema: - file_path: "[[ TEST_RESOURCES ]]/schemas/pg.yaml" - -READ_FROM_KAFKA: - LOCAL: - type: "local" - local: - file_path: "[[ TEST_RESOURCES ]]/data/input/some_parquet_to_read.parquet" - file_type: "parquet" - CLOUD: - type: "kafka" - kafka: - kafka_server: "[[ KAFKA_SERVER ]]" - kafka_topic: "[[ KAFKA_TOPIC ]]" - -TEMPLATED_FILE_PATH: - LOCAL: - type: "local" - local: - file_path: "[[ TEST_RESOURCES ]]/data/input/{file_name_to_replace}.csv" - file_type: "csv" - CLOUD: - type: "s3_file" - s3: - bucket: "[[ MOCK_BUCKET ]]" - file_path: "path/to/{file_name_to_replace}.csv" - file_type: "csv" - -READ_FROM_PARQUET_TEMPLATED: - LOCAL: - type: "local" - local: - file_path: "[[ TEST_RESOURCES ]]/data/input/{file_name_to_replace}.parquet" - file_type: "parquet" - CLOUD: - type: "s3_file" - s3: - bucket: "[[ MOCK_BUCKET ]]" - file_path: "path/to/{file_name_to_replace}.parquet" - file_type: "parquet" - -READ_FROM_BATCH_LOCAL_PARQUET: - LOCAL: - type: "local_batch" - local: - path_prefix: "[[ TEST_RESOURCES ]]/data/input/batch/parquet/" - file_type: "parquet" - CLOUD: - type: "s3_path_prefix" - s3: - bucket: "[[ MOCK_BUCKET ]]/data/input/{file_name_to_replace}.parquet" - file_type: "parquet" - -READ_FROM_BATCH_LOCAL_NOT_JUST_PARQUET: - LOCAL: - type: "local_batch" - local: - path_prefix: "[[ TEST_RESOURCES ]]/data/input/batch/not_just_parquet/" - file_type: "parquet" - CLOUD: - type: "s3_path_prefix" - s3: - bucket: "[[ MOCK_BUCKET ]]/data/input/{file_name_to_replace}.parquet" - file_type: "parquet" - -READ_FROM_BATCH_LOCAL_HDF: - LOCAL: - type: "local_batch" - local: - path_prefix: "[[ TEST_RESOURCES ]]/data/input/batch/hdf/" - file_type: "hdf" - CLOUD: - type: "s3_path_prefix" - s3: - bucket: "[[ MOCK_BUCKET ]]/data/input/{file_name_to_replace}.hdf" - file_type: "hdf" - -S3_PARQUET_WITH_BOOL: - LOCAL: - type: "local" - local: - file_path: "[[ TEST_RESOURCES ]]/data/input/some_parquet_with_bool_vals.parquet" - file_type: "parquet" - -S3_CSV_WITH_BOOL: - LOCAL: - type: "local" - local: - file_path: "[[ TEST_RESOURCES ]]/data/input/some_csv_with_bool_vals.csv" - file_type: "csv" - -S3_HDF_WITH_BOOL: - LOCAL: - type: "local" - local: - file_path: "[[ TEST_RESOURCES ]]/data/input/some_hdf_with_bool_vals.h5" - file_type: "hdf" - -S3_JSON_WITH_BOOL: - LOCAL: - type: "local" - local: - file_path: "[[ TEST_RESOURCES ]]/data/input/some_json_with_bool_vals.json" - file_type: "json" - -S3_PARQUET_WITH_CUSTOM_VALIDATE: - LOCAL: - type: "local" - local: - file_path: "[[ TEST_RESOURCES ]]/data/input/some_parquet_with_bool_vals.parquet" - file_type: "parquet" - -S3_PARQUET_WITH_OPTIONS_IN_CODE: - LOCAL: - type: "local" - local: - file_path: "[[ TEST_RESOURCES ]]/data/input/some_parquet_to_read.parquet" - file_type: "parquet" - -S3_PARQUET_WITH_OPTIONS_IN_DEFINITION: - LOCAL: - type: "local" - local: - file_path: "[[ TEST_RESOURCES ]]/data/input/some_parquet_to_read.parquet" - file_type: "parquet" - options: - option_3: false - option_4: true - -WRITE_TO_S3_PATH_PREFIX_PARQUET: - CLOUD: - type: "s3_path_prefix" - s3: - bucket: "[[ MOCK_BUCKET ]]" - path_prefix: "[[ MOCK_KEY ]]" - file_type: "parquet" - diff --git a/tests/resources/definitions/processed.yaml b/tests/resources/definitions/processed.yaml deleted file mode 100644 index b54bfa1..0000000 --- a/tests/resources/definitions/processed.yaml +++ /dev/null @@ -1,81 +0,0 @@ ---- -WRITE_TO_S3_PARQUET: - LOCAL: - type: "local" - local: - file_path: "[[ TEST_RESOURCES ]]/data/processed/write_some_parquet.parquet" - file_type: "parquet" - CLOUD: - type: "s3_file" - s3: - bucket: "[[ MOCK_BUCKET ]]" - file_path: "test/write_some_parquet.parquet" - file_type: "parquet" - -WRITE_TO_S3_CSV: - LOCAL: - type: "local" - local: - file_path: "[[ TEST_RESOURCES ]]/data/processed/write_some_csv.csv" - file_type: "csv" - CLOUD: - type: "s3_file" - s3: - bucket: "[[ MOCK_BUCKET ]]" - file_path: "test/write_some_csv.csv" - file_type: "csv" - -WRITE_TO_S3_JSON: - LOCAL: - type: "local" - local: - file_path: "[[ TEST_RESOURCES ]]/data/processed/write_some_json.json" - file_type: "json" - CLOUD: - type: "s3_file" - s3: - bucket: "[[ MOCK_BUCKET ]]" - file_path: "test/write_some_json.json" - file_type: "json" - -WRITE_TO_S3_HDF: - LOCAL: - type: "local" - local: - file_path: "[[ TEST_RESOURCES ]]/data/processed/write_some_h5.h5" - file_type: "hdf" - CLOUD: - type: "s3_file" - s3: - bucket: "[[ MOCK_BUCKET ]]" - file_path: "test/write_some_h5.h5" - file_type: "hdf" - -WRITE_TO_KAFKA_JSON: - LOCAL: - type: "local" - local: - file_path: "[[ TEST_RESOURCES ]]/data/processed/write_kafka_messages.json" - file_type: "json" - options: - orient: "records" - CLOUD: - type: "kafka" - kafka: - kafka_server: "[[ KAFKA_SERVER ]]" - kafka_topic: "[[ KAFKA_TOPIC ]]" - -WRITE_TO_PG_PARQUET: - LOCAL: - type: "local" - local: - file_path: "[[ TEST_RESOURCES ]]/data/processed/write_kafka_messages.parquet" - file_type: "parquet" - CLOUD: - type: "postgres" - postgres: - db_host: "[[ DB_HOST ]]" - db_port: "[[ DB_PORT ]]" - db_name: "[[ DB_NAME ]]" - db_user: "[[ DB_USER ]]" - db_password: "[[ DB_PASS ]]" diff --git a/tests/resources/definitions/test_input.yaml b/tests/resources/definitions/test_input.yaml deleted file mode 100644 index 8ad2428..0000000 --- a/tests/resources/definitions/test_input.yaml +++ /dev/null @@ -1,129 +0,0 @@ ---- -READ_FROM_S3_CSV_ALT: - LOCAL: - type: "local" - local: - file_path: "[[ TEST_RESOURCES ]]/data/input/some_csv_to_read.csv" - file_type: "csv" - CLOUD: - type: "s3" - s3: - bucket: "[[ MOCK_BUCKET ]]" - file_path: "[[ MOCK_KEY ]]" - file_type: "csv" - -READ_FROM_S3_CSV: - LOCAL: - type: "local" - local: - file_path: "[[ TEST_RESOURCES ]]/data/input/some_csv_to_read.csv" - file_type: "csv" - CLOUD: - type: "s3" - s3: - bucket: "[[ MOCK_BUCKET ]]" - file_path: "[[ MOCK_KEY ]]" - file_type: "csv" - schema: - file_path: "[[ TEST_RESOURCES ]]/schemas/read_from_s3_csv.yaml" - -READ_FROM_S3_JSON: - LOCAL: - type: "local" - local: - file_path: "[[ TEST_RESOURCES ]]/data/input/some_json_to_read.json" - file_type: "json" - CLOUD: - type: "s3" - s3: - bucket: "[[ MOCK_BUCKET ]]" - file_path: "[[ MOCK_KEY ]]" - file_type: "json" - -READ_FROM_S3_HDF: - LOCAL: - type: "local" - local: - file_path: "[[ TEST_RESOURCES ]]/data/input/some_hdf_to_read.h5" - file_type: "hdf" - CLOUD: - type: "s3" - s3: - bucket: "[[ MOCK_BUCKET ]]" - file_path: "[[ MOCK_KEY ]]" - file_type: "hdf" - -READ_FROM_S3_PARQUET: - LOCAL: - type: "local" - local: - file_path: "[[ TEST_RESOURCES ]]/data/input/some_parquet_to_read.parquet" - file_type: "parquet" - CLOUD: - type: "s3" - s3: - bucket: "[[ MOCK_BUCKET ]]" - file_path: "s3:sample-prefix/[[ MOCK_KEY ]]" - file_type: "parquet" - -READ_FROM_POSTGRES: - LOCAL: - type: "local" - local: - file_path: "[[ TEST_RESOURCES ]]/data/input/some_pg_parquet_to_read.parquet" - file_type: "parquet" - CLOUD: - type: "postgres" - postgres: - db_host: "[[ DB_HOST ]]" - db_port: "[[ DB_PORT ]]" - db_name: "[[ DB_NAME ]]" - db_user: "[[ DB_USER ]]" - db_password: "[[ DB_PASS ]]" - -READ_FROM_KAFKA: - LOCAL: - type: "local" - local: - file_path: "[[ TEST_RESOURCES ]]/data/input/some_parquet_to_read.parquet" - file_type: "parquet" - CLOUD: - type: "kafka" - kafka: - kafka_server: "[[ KAFKA_SERVER ]]" - kafka_topic: "[[ KAFKA_TOPIC ]]" - -TEMPLATED_FILE_PATH: - LOCAL: - type: "local" - local: - file_path: "[[ TEST_RESOURCES ]]/data/input/{file_name_to_replace}.csv" - file_type: "csv" - CLOUD: - type: "s3" - s3: - bucket: "[[ MOCK_BUCKET ]]" - file_path: "path/to/{file_name_to_replace}.csv" - file_type: "csv" - -READ_FROM_PARQUET_TEMPLATED: - LOCAL: - type: "local" - local: - file_path: "[[ TEST_RESOURCES ]]/data/input/{file_name_to_replace}.parquet" - file_type: "parquet" - CLOUD: - type: "s3" - s3: - bucket: "[[ MOCK_BUCKET ]]" - file_path: "path/to/{file_name_to_replace}.parquet" - file_type: "parquet" - -REPLACE_SCHEMA_WITH_DYN_VARS: - LOCAL: - type: "local" - local: - file_path: "[[ TEST_RESOURCES ]]/data/input/{file_name_to_replace}.parquet" - file_type: "parquet" - schema: - file_path: "[[ TEST_RESOURCES ]]/schemas/bar.yaml" diff --git a/tests/resources/schemas/bar.yaml b/tests/resources/schemas/bar.yaml deleted file mode 100644 index 2a96769..0000000 --- a/tests/resources/schemas/bar.yaml +++ /dev/null @@ -1,48 +0,0 @@ ---- -name: bar -columns: - column_a: - type: "object" - validations: - has_unique_values: - apply: true - options: {} - metrics: - - Counts - column_b: - type: "object" - validations: - has_no_null_values: - apply: true - options: {} - metrics: - - CountsPerLabel - column_c: - type: float64 - validations: - is_greater_than: - apply: true - options: - threshold: 1000 - metrics: [] - column_d: - type: float64 - validations: - is_lower_than: - apply: true - options: - threshold: "[[ LOWER_THAN_LIMIT ]]" - metrics: - - Min - - Max - - Mean - - Std - - Variance - "0": - type: "object" - validations: {} - metrics: [] - 1: - type: "object" - validations: {} - metrics: [] diff --git a/tests/resources/schemas/foo.yaml b/tests/resources/schemas/foo.yaml deleted file mode 100644 index 8cce75a..0000000 --- a/tests/resources/schemas/foo.yaml +++ /dev/null @@ -1,46 +0,0 @@ ---- -name: foo -columns: - id: - type: "object" - validations: - has_unique_values: - apply: true - options: - name: - type: "objet" - validations: - has_no_null_values: - apply: true - options: - year: - type: "float64" - validations: - is_greater_than: - apply: true - options: - threshold: 1950 - amount: - type: "float64" - validations: - is_between: - apply: true - options: - lower: 0 - upper: 1000 - include_left: false - include_right: true # true by default - category: - type: "object" - validations: - is_in: - apply: true - options: - categorical_values: - - class_a - - class_b - - class_c - match_all: false # true by default, if false, then the column unique categoricals must be equal to the acceptable ones, else they must be a subset - has_no_null_values: - apply: true - options: diff --git a/tests/resources/schemas/pg.yaml b/tests/resources/schemas/pg.yaml deleted file mode 100644 index 17fadb5..0000000 --- a/tests/resources/schemas/pg.yaml +++ /dev/null @@ -1,40 +0,0 @@ ---- -name: pg -columns: - id: - type: "object" - validations: - has_no_null_values: - apply: true - options: {} - metrics: - - CountsPerLabel - foo: - type: "object" - validations: - has_no_null_values: - apply: true - options: {} - metrics: - - Max - - Min - bar: - type: "int64" - validations: - is_greater_than: - apply: true - options: - threshold: 1950 - metrics: [] - baz: - type: "object" - validations: - is_between: - apply: true - options: - lower: 0 - upper: 1000 - metrics: - - Min - - Max - - Mean diff --git a/tests/resources/schemas/read_from_s3_csv.yaml b/tests/resources/schemas/read_from_s3_csv.yaml deleted file mode 100644 index ec1f767..0000000 --- a/tests/resources/schemas/read_from_s3_csv.yaml +++ /dev/null @@ -1,50 +0,0 @@ ---- -name: read_from_s3_csv -columns: - id: - type: "int64" - validations: - has_unique_values: - apply: true - options: {} - has_no_null_values: - apply: true - options: {} - metrics: - - UniqueCounts - - Counts - foo_name: - type: "object" - validations: - has_no_null_values: - apply: true - options: {} - is_in: - apply: true - options: - categorical_values: - - class_a - - class_b - - class_c - metrics: - - CountsPerLabel - bar: - type: "int64" - validations: - has_no_null_values: - apply: true - options: {} - is_greater_than: - apply: true - options: - threshold: 1000 - is_lower_than: - apply: true - options: - threshold: 2000 - metrics: - - Min - - Max - - Mean - - Std - - Variance diff --git a/tests/resources/schemas/some_csv_to_read.yaml b/tests/resources/schemas/some_csv_to_read.yaml deleted file mode 100644 index 8487043..0000000 --- a/tests/resources/schemas/some_csv_to_read.yaml +++ /dev/null @@ -1,14 +0,0 @@ -columns: - id: - metrics: [] - type: int64 - validations: {} - bar: - metrics: [] - type: int64 - validations: {} - foo_name: - metrics: [] - type: object - validations: {} -name: some_csv_to_read diff --git a/tests/resources/schemas/some_hdf_to_read.yaml b/tests/resources/schemas/some_hdf_to_read.yaml deleted file mode 100644 index 33bc4c6..0000000 --- a/tests/resources/schemas/some_hdf_to_read.yaml +++ /dev/null @@ -1,14 +0,0 @@ -columns: - id: - metrics: [] - type: int64 - validations: {} - bar: - metrics: [] - type: int64 - validations: {} - foo_name: - metrics: [] - type: object - validations: {} -name: some_hdf_to_read diff --git a/tests/resources/schemas/some_json_to_read.yaml b/tests/resources/schemas/some_json_to_read.yaml deleted file mode 100644 index 3b496d0..0000000 --- a/tests/resources/schemas/some_json_to_read.yaml +++ /dev/null @@ -1,14 +0,0 @@ -columns: - id: - metrics: [] - type: int64 - validations: {} - bar: - metrics: [] - type: int64 - validations: {} - foo_name: - metrics: [] - type: object - validations: {} -name: some_json_to_read diff --git a/tests/resources/schemas/some_parquet_to_read.yaml b/tests/resources/schemas/some_parquet_to_read.yaml deleted file mode 100644 index f4c65cd..0000000 --- a/tests/resources/schemas/some_parquet_to_read.yaml +++ /dev/null @@ -1,14 +0,0 @@ -columns: - id: - metrics: [] - type: int64 - validations: {} - bar: - metrics: [] - type: int64 - validations: {} - foo_name: - metrics: [] - type: object - validations: {} -name: some_parquet_to_read diff --git a/tests/resources/schemas/some_pg_parquet_to_read.yaml b/tests/resources/schemas/some_pg_parquet_to_read.yaml deleted file mode 100644 index 9021ae9..0000000 --- a/tests/resources/schemas/some_pg_parquet_to_read.yaml +++ /dev/null @@ -1,18 +0,0 @@ -columns: - bar: - metrics: [] - type: int64 - validations: {} - baz: - metrics: [] - type: object - validations: {} - foo: - metrics: [] - type: object - validations: {} - id: - metrics: [] - type: object - validations: {} -name: some_pg_parquet_to_read diff --git a/tests/resources/schemas/write_to_s3_csv.yaml b/tests/resources/schemas/write_to_s3_csv.yaml deleted file mode 100644 index 8b5f3a1..0000000 --- a/tests/resources/schemas/write_to_s3_csv.yaml +++ /dev/null @@ -1,42 +0,0 @@ ---- -name: read_from_s3_csv -columns: - id: - type: "int64" - validations: - has_unique_values: - apply: true - options: {} - has_no_null_values: - apply: true - options: {} - metrics: null - foo_name: - type: "object" - validations: - has_no_null_values: - apply: true - options: {} - is_in: - apply: true - options: - categorical_values: - - class_a - - class_b - - class_c - metrics: null - bar: - type: "int64" - validations: - has_no_null_values: - apply: true - options: {} - is_greater_than: - apply: true - options: - threshold: 1000 - is_lower_than: - apply: true - options: - threshold: 2000 - metrics: null diff --git a/tests/test_cli.py b/tests/test_cli.py deleted file mode 100644 index 2ffe89e..0000000 --- a/tests/test_cli.py +++ /dev/null @@ -1,346 +0,0 @@ -# pylint: disable=missing-module-docstring, missing-class-docstring, missing-function-docstring, too-many-public-methods, too-few-public-methods, protected-access, C0103, C0302 -import argparse -import os -from unittest.mock import patch - -import pandas as pd -import pytest - -import dynamicio -from dynamicio import cli -from dynamicio.cli import parse_args -from dynamicio.errors import InvalidDatasetTypeError -from tests.conftest import DummyYaml -from tests.constants import TEST_RESOURCES - - -class TestCli: - @pytest.mark.unit - def test_entrypoint(self): - print() # Just makes the output more readable in the terminal - - # When - exit_status = os.system("python -m dynamicio --help") - - # Then - assert exit_status == 0 - - @pytest.mark.unit - @pytest.mark.parametrize( - ["args_pattern", "expected_args"], - [ - ( - ["-b", "-p", "path/to/datasets_dir", "-o", "output_dir"], - argparse.Namespace(batch=True, output="output_dir", path="path/to/datasets_dir", single=False), - ), - ( - ["-s", "-p", "path/to/datasets_dir/the_one.parquet", "-o", "output_dir"], - argparse.Namespace( - batch=False, - output="output_dir", - path="path/to/datasets_dir/the_one.parquet", - single=True, - ), - ), - ], - ) - def test_parser_can_take_one_out_of_two_valid_argument_patters(self, args_pattern, expected_args): - # When/Then - assert parse_args(args_pattern) == expected_args - - @pytest.mark.unit - @pytest.mark.parametrize( - "args_pattern", - [ - ["-p", "path/to/datasets_dir", "-o", "output_dir"], - ["-p", "path/to/datasets_dir/the_one.parquet", "-o", "output_dir"], - ], - ) - def test_parse_args_raises_system_exit_if_batch_or_single_flags_not_provided(self, args_pattern): - # When/Then - with pytest.raises(SystemExit): - parse_args(args_pattern) - - @pytest.mark.unit - @pytest.mark.parametrize( - "args_pattern", - [ - ["-b"], - ["-b", "-o", "output_dir"], - ["-b", "-p", "path/to/datasets_dir"], - ["-s"], - ["-s", "-o", "output_dir"], - ["-s", "-p", "path/to/datasets_dir"], - ], - ) - def test_parse_args_raises_system_exit_with_approved_flag_without_path_and_output(self, args_pattern): - # When/Then - with pytest.raises(SystemExit): - parse_args(args_pattern) - - @pytest.mark.unit - def test_when_single_flag_is_used__generate_schema_for__is_called_once(self): - with patch.object(cli.argparse.ArgumentParser, "parse_args") as mocked__parse_args, patch.object(cli, "generate_schema_for") as mocked__generate_schema_for, patch.object( - cli, "open" - ) as mocked__open, patch.object(cli.yaml, "safe_dump") as mocked__dump: - # Given - mocked__parse_args.return_value = argparse.Namespace(batch=False, single=True, path="the_one.parquet", output=".") - mocked__generate_schema_for.return_value = {"name": "the_one", "columns": {}} - mocked__open.return_value = DummyYaml(path="path/to/the_one.yaml") - mocked__dump.return_value = "The-Matrix" - # When - dynamicio.cli.run() - - # Then - assert mocked__generate_schema_for.called_once_with("the_one.parquet", ".") - - @pytest.mark.unit - def test_when_batch_flag_is_used__generate_schema_for__is_called_multiple_times_as_per_the_no_of_files_under_the_datasets_dir( - self, - ): - - with patch.object(cli.argparse.ArgumentParser, "parse_args") as mocked__parse_args: - with patch.object(cli, "generate_schema_for") as mocked__generate_schema_for: - with patch.object(cli.glob, "glob") as mocked__glob: - with patch.object(cli, "open") as mocked__open: - with patch.object(cli.yaml, "safe_dump") as mocked__dump: - # Given - mocked__parse_args.return_value = argparse.Namespace(batch=True, single=False, path="path/to/datasets_dir", output=".") - mocked__generate_schema_for.return_value = { - "name": "random", - "columns": {}, - } - mocked__glob.return_value = [ - "path/to/datasets_dir/agent_1.parquet", - "path/to/datasets_dir/agent_2.parquet", - ] - mocked__open.return_value = DummyYaml(path="path/to/the_oracle.yaml") - mocked__dump.return_value = "file_content" - # When - dynamicio.cli.run() - - # Then - assert mocked__generate_schema_for.call_count == 2 - - @pytest.mark.unit - @pytest.mark.parametrize( - ["dataset", "expected_reader"], - [ - ("path/to/dataset.parquet", "read_parquet"), - ("path/to/dataset.json", "read_json"), - ("path/to/dataset.csv", "read_csv"), - ("path/to/dataset.h5", "read_hdf"), - ], - ) - def test_generate_schema_for__uses_the_appropriate_pandas_reader_to_read_a_file(self, dataset, expected_reader): - # When - with patch.object(cli.pd, expected_reader) as mocked_reader: - mocked_reader.return_value = pd.DataFrame() - cli.generate_schema_for(dataset) - - # Then - mocked_reader.assert_called() - - @pytest.mark.unit - def test_generate_schema_for__throws_exception_InvalidDatasetTypeError(self): - # Given - dataset = "path/to/trinity.txt" - - # When/Then - with pytest.raises(InvalidDatasetTypeError): - cli.generate_schema_for(dataset) - - @pytest.mark.unit - def test_generate_schema_for__returns_a_json_schema_with_a_name_key_populated_with_the_dataset_name( - self, - ): - # Given - dataset = "path/to/the_matrix.parquet" - - # When - with patch.object(cli.pd, "read_parquet") as mocked_reader: - mocked_reader.return_value = pd.DataFrame.from_dict({"agents": [1, 2, 3], "zioners": [4, 5, 6]}) - json_schema = cli.generate_schema_for(dataset) - - # Then - assert json_schema["name"] == "the_matrix" - - @pytest.mark.unit - def test_generate_schema_for__returns_a_json_schema_with_all_columns_in_the_provided_dataset( - self, - ): - # Given - dataset = "path/to/the_matrix.parquet" - - # When - with patch.object(cli.pd, "read_parquet") as mocked_reader: - mocked_reader.return_value = pd.DataFrame.from_dict({"agents": [1, 2, 3], "zioners": [4, 5, 6]}) - json_schema = cli.generate_schema_for(dataset) - - # Then - assert list(json_schema["columns"].keys()) == ["agents", "zioners"] - - @pytest.mark.unit - def test_generate_schema_for__returns_a_json_schema_with_all_columns_in_the_provided_dataset_with_the_correct_data_types( - self, - ): - # Given - dataset = "path/to/the_matrix.parquet" - - # When - with patch.object(cli.pd, "read_parquet") as mocked_reader: - mocked_reader.return_value = pd.DataFrame.from_dict( - { - "agents": [1, 2, 3], - "zioners": ["4", "5", "6"], - "red_pill": [True, False, True], - "value": [1.0, 2.0, 3.0], - } - ) - json_schema = cli.generate_schema_for(dataset) - - # Then - assert {column["type"] for column in json_schema["columns"].values()} == { - "bool", - "object", - "int64", - "float64", - } - - @pytest.mark.unit - def test_generate_schema_for__returns_a_valid_json_schema_for_a_given_dataset(self): - # Given - dataset = "path/to/the_matrix.parquet" - - # When - with patch.object(cli.pd, "read_parquet") as mocked_reader: - mocked_reader.return_value = pd.DataFrame.from_dict( - { - "agents": [1, 2, 3], - "zioners": ["4", "5", "6"], - "red_pill": [True, False, True], - "value": [1.0, 2.0, 3.0], - } - ) - json_schema = cli.generate_schema_for(dataset) - - # Then - assert json_schema == { - "columns": { - "agents": {"metrics": [], "type": "int64", "validations": {}}, - "red_pill": {"metrics": [], "type": "bool", "validations": {}}, - "value": {"metrics": [], "type": "float64", "validations": {}}, - "zioners": {"metrics": [], "type": "object", "validations": {}}, - }, - "name": "the_matrix", - } - - @pytest.mark.unit - def test_cli_runner_raises_invalid_dataset_type_error_exception_message_when_invoked_with_single_flag_and_invalid_path( - self, - ): - # Given - dataset = "path/to/trinity.txt" - - # When/Then - with patch.object(cli.argparse.ArgumentParser, "parse_args") as mocked__parse_args: - mocked__parse_args.return_value = argparse.Namespace( - batch=False, - single=True, - path=dataset, - output=os.path.join(TEST_RESOURCES, "data/temp/"), - ) - with pytest.raises(InvalidDatasetTypeError): - cli.run() - - @pytest.mark.unit - def test_when_single_flag_is_used__the_cli_generates_a_schema_yaml_for_the_provided_dataset( - self, - ): - with patch.object(cli.argparse.ArgumentParser, "parse_args") as mocked__parse_args: - with patch.object(cli.pd, "read_parquet") as mocked_reader: - mocked__parse_args.return_value = argparse.Namespace( - batch=False, - single=True, - path="the_one.parquet", - output=os.path.join(TEST_RESOURCES, "data/temp/"), - ) - # Given - mocked_reader.return_value = pd.DataFrame.from_dict({"agents": [1, 2, 3], "zioners": [4, 5, 6]}) - - # When - dynamicio.cli.run() - - # Then - output_yaml = os.path.join(TEST_RESOURCES, "data/temp", "the_one.yaml") - try: - assert os.path.isfile(output_yaml) - finally: - os.remove(output_yaml) - - @pytest.mark.unit - def test_when_batch_flag_is_used__the_cli_generates_a_schema_yaml_for_each_dataset_in_the_provided_dir( - self, - ): - with patch.object(cli.argparse.ArgumentParser, "parse_args") as mocked__parse_args: - with patch.object(cli.pd, "read_parquet") as mocked_reader: - with patch.object(cli.glob, "glob") as mocked__glob: - # Given - mocked__parse_args.return_value = argparse.Namespace( - batch=True, - single=False, - path="path/to/datasets_dir", - output=os.path.join(TEST_RESOURCES, "data/temp/"), - ) - mocked__glob.return_value = [ - "path/to/datasets_dir/agent_1.parquet", - "path/to/datasets_dir/agent_2.parquet", - ] - mocked_reader.return_value = pd.DataFrame.from_dict({"skills": [1, 2, 3], "levels": [4, 5, 6]}) - - # When - dynamicio.cli.run() - - # Then - output_yaml_1 = os.path.join(TEST_RESOURCES, "data/temp", "agent_1.yaml") - output_yaml_2 = os.path.join(TEST_RESOURCES, "data/temp", "agent_2.yaml") - try: - assert os.path.isfile(output_yaml_1) & os.path.isfile(output_yaml_2) - finally: - os.remove(output_yaml_1) - os.remove(output_yaml_2) - - @pytest.mark.unit - def test_cli_runner_prints_an_invalid_dataset_type_warning_when_invoked_with_batch_flag_and_a_dir_with_an_invalid_path_but_is_not_interrupted(self, capsys): - - with patch.object(cli.argparse.ArgumentParser, "parse_args") as mocked__parse_args: - with patch.object(cli.glob, "glob") as mocked__glob: - with patch.object(cli.pd, "read_parquet") as mocked_reader: - with patch.object(cli, "open") as mocked__open: - with patch.object(cli.yaml, "safe_dump") as mocked__dump: - # Given - mocked__parse_args.return_value = argparse.Namespace( - batch=True, - single=False, - path="a/dummy/path/", - output=os.path.join(TEST_RESOURCES, "data/temp/"), - ) - mocked__glob.return_value = [ - "path/to/neo.parquet", - "path/to/trinity.txt", - "path/to/morpheus.parquet", - ] - mocked_reader.return_value = pd.DataFrame.from_dict({"column_1": [1, 2, 3], "column_2": [4, 5, 6]}) - mocked__open.return_value = DummyYaml(path="path/to/the_oracle.yaml") - mocked__dump.return_value = "file_content" - # When - cli.run() - captured = capsys.readouterr() - - # Then - std_out = captured.out.split("\n") - assert ( - (std_out[0] == "Generating schema for: path/to/neo.parquet") - and (std_out[1] == "Skipping path/to/trinity.txt! You may want to remove this file from the datasets directory") - and (std_out[2] == "Generating schema for: path/to/morpheus.parquet") - ) diff --git a/tests/test_config.py b/tests/test_config.py deleted file mode 100644 index 45da395..0000000 --- a/tests/test_config.py +++ /dev/null @@ -1,181 +0,0 @@ -# pylint: disable=missing-module-docstring, missing-class-docstring, missing-function-docstring, R0801 -import io -import os - -import pytest -import yaml - -from dynamicio.config.io_config import IOConfig, SafeDynamicResourceLoader, SafeDynamicSchemaLoader -from tests import constants - - -class TestIOConfig: - @pytest.mark.unit - def test_config_io_parser_returns_a_transformed_dict_version_of_the_yaml_input_with_dynamic_values_replaced(self, expected_input_yaml_dict): - # Given - input_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/test_input.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ) - - # When - yaml_dict = input_config.config.dict() - # Then - assert yaml_dict == expected_input_yaml_dict - - @pytest.mark.unit - def test_config_io_get_schema_definition_returns_a_schema_definition_from_a_source_config(self, expected_schema_definition): - # Given - input_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/test_input.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ) - - # When - schema_definition = input_config.config.bindings["READ_FROM_S3_CSV"].dict() - - # Then - assert schema_definition == expected_schema_definition - - @pytest.mark.unit - def test_config_io_sources_returns_all_available_sources(self): - # Given - input_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/test_input.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ) - - # When - sources = list(input_config.config.bindings.keys()) - - # Then - assert sources == [ - "READ_FROM_S3_CSV_ALT", - "READ_FROM_S3_CSV", - "READ_FROM_S3_JSON", - "READ_FROM_S3_HDF", - "READ_FROM_S3_PARQUET", - "READ_FROM_POSTGRES", - "READ_FROM_KAFKA", - "TEMPLATED_FILE_PATH", - "READ_FROM_PARQUET_TEMPLATED", - "REPLACE_SCHEMA_WITH_DYN_VARS", - ] - - @pytest.mark.unit - def test_get_for_config_io_set_for_a_local_env_returns_a_local_mapping_for_a_given_key(self, expected_s3_csv_local_mapping): - # Given - input_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/test_input.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ) - - # When - s3_csv_local_mapping = input_config.config.bindings["READ_FROM_S3_CSV"].dict() - - # Then - assert s3_csv_local_mapping == expected_s3_csv_local_mapping - - @pytest.mark.unit - def test_get_for_config_io_set_for_a_cloud_env_returns_a_cloud_mapping_for_an_s3_csv_key(self, expected_s3_csv_cloud_mapping): - # Given - input_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/test_input.yaml")), - env_identifier="CLOUD", - dynamic_vars=constants, - ) - - # When - s3_csv_cloud_mapping = input_config.get(source_key="READ_FROM_S3_CSV").dynamicio_schema.dict() - - # Then - assert s3_csv_cloud_mapping == expected_s3_csv_cloud_mapping - - @pytest.mark.unit - def test_get_for_config_io_set_for_a_cloud_env_returns_a_cloud_mapping_for_an_postgres_key(self, expected_postgres_cloud_mapping): - # Given - input_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/test_input.yaml")), - env_identifier="CLOUD", - dynamic_vars=constants, - ) - - # When - postgres_cloud_mapping = input_config.get(source_key="READ_FROM_POSTGRES").dict() - - # Then - assert postgres_cloud_mapping == expected_postgres_cloud_mapping - - @pytest.mark.unit - def test__get_schema_definition_dynamically_replaces_numerical_values_in_schemas(self): - # Given - input_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/test_input.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ) - - # When - my_config = input_config.get(source_key="REPLACE_SCHEMA_WITH_DYN_VARS") - - # Then - assert my_config._parent.dynamicio_schema.columns["column_c"].validations[0].dict() == { # pylint: disable=protected-access - "apply": True, - "name": "is_greater_than", - "options": {"threshold": 1000}, - } - - @pytest.mark.unit - def test__get_schema_definition_returns_float_only_in_case_of_replacements(self): - # Given - input_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/test_input.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ) - - # When - my_config = input_config.get(source_key="REPLACE_SCHEMA_WITH_DYN_VARS") - schema_dict = {} - for col in my_config._parent.dynamicio_schema.columns.values(): # pylint: disable=protected-access - schema_dict[col.name] = str(col.data_type) - - # Then - - assert schema_dict == { - "column_a": "ColumnType.object", - "column_b": "ColumnType.object", - "column_c": "ColumnType.float64", - "column_d": "ColumnType.float64", - "0": "ColumnType.object", - "1": "ColumnType.object", - } - - -class TestSafeDynamicLoader: # pylint: disable=R0903 - @pytest.mark.unit - def test_replaces_all_resource_template_instances(self): - file_contents = 'abc: "[[ VALUE_1 ]]/[[ VALUE_2 ]]"' - - class MockEnvironmentModule: # pylint: disable=R0903 - VALUE_1 = "abc" - VALUE_2 = "def" - - result = yaml.load(io.StringIO(file_contents), SafeDynamicResourceLoader.with_module(MockEnvironmentModule)) - - assert result == {"abc": "abc/def"} - - @pytest.mark.unit - def test_replaces_all_schema_template_instances(self): - file_contents = 'abc: "[[ VALUE_A ]]"' - - class MockEnvironmentModule: # pylint: disable=R0903 - VALUE_A = 100 - - result = yaml.load(io.StringIO(file_contents), SafeDynamicSchemaLoader.with_module(MockEnvironmentModule)) - - assert result == {"abc": 100} diff --git a/tests/test_core.py b/tests/test_core.py deleted file mode 100644 index 9118bcc..0000000 --- a/tests/test_core.py +++ /dev/null @@ -1,1002 +0,0 @@ -# pylint: disable=missing-module-docstring, missing-class-docstring, missing-function-docstring, too-many-public-methods, R0801 -import asyncio -import logging -import os -import time -from typing import Mapping, Tuple -from unittest.mock import patch - -import numpy as np -import pandas as pd -import pytest - -import dynamicio -from dynamicio.config import IOConfig -from dynamicio.core import CASTING_WARNING_MSG, DynamicDataIO -from dynamicio.errors import ColumnsDataTypeError, SchemaNotFoundError, SchemaValidationError -from dynamicio.mixins import WithS3File -from tests import constants -from tests.mocking.io import ( - CsvWithSomeBool, - HdfWithSomeBool, - JsonWithSomeBool, - ParquetWithCustomValidate, - ParquetWithSomeBool, - ReadMockS3CsvIO, - ReadS3CsvIO, - ReadS3DataWithFalseTypes, - ReadS3IO, - ReadS3ParquetIO, - WriteS3CsvIO, - WriteS3CsvWithSchema, - WriteS3ParquetExternalIO, -) - - -@pytest.fixture(autouse=True, scope="module") -def propagate_logger(): - # We need this because otherwise caplog can't capture the logs - logging.getLogger("dynamicio.metrics").propagate = True - yield - logging.getLogger("dynamicio.metrics").propagate = False - - -class TestCoreIO: - @pytest.mark.unit - def test_abstract_class_dynamic_data_io_cant_be_used_for_object_instantiation(self): - # Given - s3_csv_local_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="READ_FROM_S3_CSV") - - # When/Then - with pytest.raises(TypeError): - DynamicDataIO(source_config=s3_csv_local_config) - - @pytest.mark.unit - def test_objects_of_dynamic_data_io_subclasses_cant_be_instantiated_in_the_absence_of_a_non_empty_schema( - self, - ): - # Given - s3_csv_local_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="READ_FROM_S3_CSV") - - # When/Then - with pytest.raises(AssertionError): - - class AbsentSchemaIO(DynamicDataIO): - pass - - AbsentSchemaIO(source_config=s3_csv_local_config) - - @pytest.mark.unit - def test_objects_of_s3io_subclasses_cant_be_instantiated_in_the_presence_of_a_empty_dict_schema( - self, - ): - # Given - s3_csv_local_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="READ_FROM_S3_CSV") - - # Given/When/Then - with pytest.raises(ValueError): - - class EmptySchemaIO(WithS3File, DynamicDataIO): - dataset_name = "EmptySchema" - schema = {} - - EmptySchemaIO(source_config=s3_csv_local_config) - - @pytest.mark.unit - def test_objects_of_dynamic_data_io_subclasses_cant_be_instantiated_in_the_presence_of_a_schema_eq_to_none( - self, - ): - # Given - s3_csv_local_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="READ_FROM_S3_CSV") - - # When/Then - with pytest.raises(ValueError): - - class NoneSchemaIO(WithS3File, DynamicDataIO): - dataset_name = "NoneSchema" - schema = None - - NoneSchemaIO(source_config=s3_csv_local_config) - - @pytest.mark.unit - def test_dynamic_data_io_object_instantiation_is_only_possible_for_subclasses(self): - # Given - s3_csv_local_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="READ_FROM_S3_CSV") - - # When - s3_csv_io = ReadS3CsvIO(source_config=s3_csv_local_config) - - # Then - assert isinstance(s3_csv_io, ReadS3CsvIO) and isinstance(s3_csv_io, DynamicDataIO) - - @pytest.mark.unit - def test_subclasses_of_dynamic_data_io_need_to_define_a_schema(self): - # Given/When/Then - with pytest.raises(AssertionError): - - class S3CsvIONoSchema(DynamicDataIO): # pylint: disable=unused-variable - pass - - @pytest.mark.unit - def test_subclasses_of_dynamic_data_io_need_to_define_a_static_validate_function(self): - # Given - s3_csv_local_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="READ_FROM_S3_CSV") - - # When/Then - with pytest.raises(AssertionError): - - class CMVolumesIONoValidationFunction(DynamicDataIO): - schema = {"foo": "int64"} - - CMVolumesIONoValidationFunction(source_config=s3_csv_local_config) - - @pytest.mark.unit - def test_subclasses_of_dynamic_data_io_need_to_implement_private_reader_for_new_source_types( - self, - ): - # Given - athena_cloud_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/external.yaml")), - env_identifier="CLOUD", - dynamic_vars=constants, - ).get(source_key="READ_FROM_ATHENA") - - # When - with pytest.raises(AssertionError): - ReadS3IO(source_config=athena_cloud_config) - - @pytest.mark.unit - def test_key_error_is_thrown_for_missing_schema_if_unified_io_subclass_assigns_schema_from_file_but_file_is_missing( - self, - ): - # Given - read_mock_s3_cloud_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/external.yaml")), - env_identifier="CLOUD", - dynamic_vars=constants, - ).get(source_key="READ_MOCK_S3_CSV") - - # When - with pytest.raises(SchemaNotFoundError): - ReadMockS3CsvIO(source_config=read_mock_s3_cloud_config) - - @pytest.mark.integration - def test_schema_validations_are_applied_for_an_io_class_with_a_schema_definition(self, valid_dataframe): - # Given - df = valid_dataframe - s3_csv_cloud_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="CLOUD", - dynamic_vars=constants, - ).get(source_key="READ_FROM_S3_CSV") - io_instance = ReadS3CsvIO(source_config=s3_csv_cloud_config) - - # When - return_value = io_instance.validate_from_schema(df) - - # Then - assert io_instance == return_value - - @pytest.mark.integration - def test_log_metrics_from_schema_are_applied_for_an_io_class_with_a_schema_definition(self, caplog, valid_dataframe): - # Given - df = valid_dataframe - s3_csv_cloud_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="CLOUD", - dynamic_vars=constants, - ).get(source_key="READ_FROM_S3_CSV") - io_instance = ReadS3CsvIO(source_config=s3_csv_cloud_config) - - # When - with caplog.at_level(logging.INFO): - print() - return_value = io_instance.log_metrics_from_schema(df) - - # Then - assert ( - io_instance is return_value - and (len(caplog.records) == 10) - and (getattr(caplog.records[0], "message") == '{"message": "METRIC", "dataset": "READ_FROM_S3_CSV", "column": "id", "metric": "UniqueCounts", "value": 4.0}') - and (getattr(caplog.records[1], "message") == '{"message": "METRIC", "dataset": "READ_FROM_S3_CSV", "column": "id", "metric": "Counts", "value": 4.0}') - and (getattr(caplog.records[2], "message") == '{"message": "METRIC", "dataset": "READ_FROM_S3_CSV", "column": "foo_name-class_a", "metric": "CountsPerLabel", "value": 2.0}') - and (getattr(caplog.records[3], "message") == '{"message": "METRIC", "dataset": "READ_FROM_S3_CSV", "column": "foo_name-class_b", "metric": "CountsPerLabel", "value": 1.0}') - and (getattr(caplog.records[4], "message") == '{"message": "METRIC", "dataset": "READ_FROM_S3_CSV", "column": "foo_name-class_c", "metric": "CountsPerLabel", "value": 1.0}') - and (getattr(caplog.records[5], "message") == '{"message": "METRIC", "dataset": "READ_FROM_S3_CSV", "column": "bar", "metric": "Min", "value": 1500.0}') - and (getattr(caplog.records[6], "message") == '{"message": "METRIC", "dataset": "READ_FROM_S3_CSV", "column": "bar", "metric": "Max", "value": 1500.0}') - and (getattr(caplog.records[7], "message") == '{"message": "METRIC", "dataset": "READ_FROM_S3_CSV", "column": "bar", "metric": "Mean", "value": 1500.0}') - and (getattr(caplog.records[8], "message") == '{"message": "METRIC", "dataset": "READ_FROM_S3_CSV", "column": "bar", "metric": "Std", "value": 0.0}') - and (getattr(caplog.records[9], "message") == '{"message": "METRIC", "dataset": "READ_FROM_S3_CSV", "column": "bar", "metric": "Variance", "value": 0.0}') - ) - - @pytest.mark.integration - def test_schema_validations_errors_are_thrown_for_each_validation_if_df_does_not_map_to_schema_definition(self, invalid_dataframe): - # Given - df = invalid_dataframe - s3_csv_cloud_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="CLOUD", - dynamic_vars=constants, - ).get(source_key="READ_FROM_S3_CSV") - - # When - with pytest.raises(SchemaValidationError): - ReadS3CsvIO(source_config=s3_csv_cloud_config).validate_from_schema(df) - - @pytest.mark.integration - def test_schema_validations_exception_message_is_a_dict_with_all_violated_validations(self, invalid_dataframe, expected_messages): - # Given - df = invalid_dataframe - s3_csv_cloud_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="CLOUD", - dynamic_vars=constants, - ).get(source_key="READ_FROM_S3_CSV") - - # When - try: - ReadS3CsvIO(source_config=s3_csv_cloud_config).validate_from_schema(df) - except SchemaValidationError as _exception: - # Then - assert _exception.message.keys() == expected_messages # pylint: disable=no-member - - @pytest.mark.integration - def test_local_writers_only_write_out_castable_columns_according_to_the_io_schema_case_float64_to_int64_id(self, dataset_with_more_columns_than_dictated_in_schema): - - # Given - # Note col_1 will be interpreted with type float64 - input_df = dataset_with_more_columns_than_dictated_in_schema - - s3_parquet_local_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/external.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="WRITE_TO_S3_PARQUET") - - # When - # class WriteS3ParquetExternalIO(UnifiedIO): - # schema = { - # 'bar': 'int64', - # 'event_type': 'object', - # 'id': 'int64', - # 'end_odometer': 'int64', - # 'foo_name': 'object', - # } - write_s3_io = WriteS3ParquetExternalIO(source_config=s3_parquet_local_config) - write_s3_io.write(input_df) - - # # Then - try: - output_df = pd.read_parquet(s3_parquet_local_config.local.file_path) - assert output_df.columns.to_list() == [ - "id", - "foo_name", - "bar", - "end_odometer", - "event_type", - ] - finally: - os.remove(s3_parquet_local_config.local.file_path) - - @pytest.mark.unit - @patch.object(dynamicio.core.DynamicDataIO, "validate_from_schema") - def test_schema_validations_are_not_applied_on_read_if_validate_flag_is_false(self, mock_validate_from_schema): - # Given - s3_csv_local_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="READ_FROM_S3_CSV") - - # When - # ReadS3CsvIO(source_config=s3_csv_cloud_config, apply_schema_validations=False).read() - ReadS3CsvIO(source_config=s3_csv_local_config).read() # False is the default value - - # Then - mock_validate_from_schema.assert_not_called() - - @pytest.mark.unit - @patch.object(dynamicio.core.DynamicDataIO, "validate_from_schema") - def test_schema_validations_are_automatically_applied_on_read_if_validate_flag_is_true(self, mock_validate_from_schema): - # Given - s3_csv_local_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="READ_FROM_S3_CSV") - - # When - ReadS3CsvIO(source_config=s3_csv_local_config, apply_schema_validations=True).read() - - # Then - mock_validate_from_schema.assert_called() - - @pytest.mark.unit - @patch.object(dynamicio.core.DynamicDataIO, "validate_from_schema") - def test_schema_validations_are_automatically_applied_on_write_if_validate_flag_is_true(self, mock_validate_from_schema, valid_dataframe): - # Given - df = valid_dataframe - s3_csv_local_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/external.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="WRITE_TO_S3_CSV") - - # When - WriteS3CsvWithSchema(source_config=s3_csv_local_config, apply_schema_validations=True).write(df) - - # Then - try: - mock_validate_from_schema.assert_called() - finally: - os.remove(s3_csv_local_config.local.file_path) - - @pytest.mark.unit - @patch.object(dynamicio.core.DynamicDataIO, "validate_from_schema") - def test_schema_validations_are_not_applied_on_write_if_validate_flag_is_false(self, mock_validate_from_schema, valid_dataframe): - # Given - df = valid_dataframe - s3_csv_local_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/external.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="WRITE_TO_S3_CSV") - - # When - # WriteS3CsvWithSchema(source_config=s3_csv_cloud_config, apply_schema_validations=False).write(df) - WriteS3CsvWithSchema(source_config=s3_csv_local_config).write(df) # False is the default value - - # Then - try: - mock_validate_from_schema.assert_not_called() - finally: - os.remove(s3_csv_local_config.local.file_path) - - @pytest.mark.unit - @patch.object(dynamicio.core.DynamicDataIO, "log_metrics_from_schema") - def test_schema_metrics_are_not_logged_on_read_if_metrics_flag_is_false(self, mock_log_metrics_from_schema): - # Given - s3_csv_local_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="READ_FROM_S3_CSV") - - # When - # ReadS3CsvIO(source_config=s3_csv_cloud_config, log_schema_metrics=False).read() - ReadS3CsvIO(source_config=s3_csv_local_config).read() # False is the default value - - # Then - mock_log_metrics_from_schema.assert_not_called() - - @pytest.mark.unit - @patch.object(dynamicio.core.DynamicDataIO, "log_metrics_from_schema") - def test_schema_metrics_are_automatically_logged_on_read_if_validate_flag_is_true(self, mock_log_metrics_from_schema): - # Given - s3_csv_local_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="READ_FROM_S3_CSV") - - # When - ReadS3CsvIO(source_config=s3_csv_local_config, log_schema_metrics=True).read() - - # Then - mock_log_metrics_from_schema.assert_called() - - @pytest.mark.unit - @patch.object(dynamicio.core.DynamicDataIO, "log_metrics_from_schema") - def test_schema_metrics_are_automatically_logged_on_write_if_metrics_flag_is_true(self, mock_log_metrics_from_schema, valid_dataframe): - # Given - df = valid_dataframe - s3_csv_local_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/external.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="WRITE_TO_S3_CSV") - - # When - WriteS3CsvWithSchema(source_config=s3_csv_local_config, log_schema_metrics=True).write(df) - - # Then - try: - mock_log_metrics_from_schema.assert_called() - finally: - os.remove(s3_csv_local_config.local.file_path) - - @pytest.mark.unit - @patch.object(dynamicio.core.DynamicDataIO, "log_metrics_from_schema") - def test_schema_metrics_are_not_logged_on_write_if_metrics_flag_is_false(self, mock_log_metrics_from_schema, valid_dataframe): - # Given - df = valid_dataframe - s3_csv_local_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/external.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="WRITE_TO_S3_CSV") - - # When - # WriteS3CsvWithSchema(source_config=s3_csv_cloud_config, log_schema_metrics=False).write(df) - WriteS3CsvWithSchema(source_config=s3_csv_local_config).write(df) # False is the default value - - # Then - try: - mock_log_metrics_from_schema.assert_not_called() - finally: - os.remove(s3_csv_local_config.local.file_path) - - @pytest.mark.unit - @pytest.mark.parametrize( - "df, expected_dtype, expected_warning", - [ - ( - pd.DataFrame.from_records([{"id": 1, "foo_name": "A", "bar": 12, "bool_col": True}, {"id": 2, "foo_name": "B", "bar": 12, "bool_col": False}]), - "bool", - None, - ), - ( - pd.DataFrame.from_records([{"id": 1, "foo_name": "A", "bar": 12, "bool_col": True}, {"id": 2, "foo_name": "B", "bar": 12, "bool_col": None}]), - "bool", - CASTING_WARNING_MSG.format("bool_col", "bool", "object"), - ), - ( - pd.DataFrame.from_records([{"id": 1, "foo_name": "A", "bar": 12, "bool_col": True}, {"id": 2, "foo_name": "B", "bar": 12, "bool_col": np.NAN}]), - "bool", - CASTING_WARNING_MSG.format("bool_col", "bool", "object"), - ), - ( - pd.DataFrame.from_records([{"id": 1, "foo_name": "A", "bar": 12, "bool_col": True}, {"id": 2, "foo_name": "B", "bar": 12, "bool_col": pd.NaT}]), - "bool", - CASTING_WARNING_MSG.format("bool_col", "bool", "object"), - ), - ], - ) - def test__has_valid_dtypes_does_not_attempt_to_convert_object_type_to_other_type_unless_other_is_bool_and_column_has_no_non_boolean_values_when_writing_a_parquet( - self, caplog, df, expected_dtype, expected_warning - ): - # Note: In the presence of a boolean cell value in a column, if that column also has numbers or strings, df.to_parquet() will not write it out. - # It will try to convert it to a bool and it will fail throwing an `pyarrow.lib.ArrowInvalid:` error - # - # This makes parquet a safer option from the available filetypes. - - # Given - s3_parquet_with_some_bool_col_local_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="S3_PARQUET_WITH_BOOL") - - ParquetWithSomeBool(source_config=s3_parquet_with_some_bool_col_local_config).write(df) - - # Then - try: - if caplog.messages: - assert caplog.messages[0] == expected_warning - assert pd.read_parquet(s3_parquet_with_some_bool_col_local_config.local.file_path)["bool_col"].dtype.name == expected_dtype - finally: - os.remove(s3_parquet_with_some_bool_col_local_config.local.file_path) - - @pytest.mark.unit - @pytest.mark.parametrize( - "df, expected_dtype, expected_warning", - [ - ( - pd.DataFrame.from_records([{"id": 1, "foo_name": "A", "bar": 12, "bool_col": True}, {"id": 2, "foo_name": "B", "bar": 12, "bool_col": False}]), - "bool", - None, - ), - ( - pd.DataFrame.from_records([{"id": 1, "foo_name": "A", "bar": 12, "bool_col": True}, {"id": 2, "foo_name": "B", "bar": 12, "bool_col": None}]), - "bool", - CASTING_WARNING_MSG.format("bool_col", "bool", "object"), - ), - ( - pd.DataFrame.from_records([{"id": 1, "foo_name": "A", "bar": 12, "bool_col": True}, {"id": 2, "foo_name": "B", "bar": 12, "bool_col": np.NAN}]), - "bool", - CASTING_WARNING_MSG.format("bool_col", "bool", "object"), - ), - ( - pd.DataFrame.from_records([{"id": 1, "foo_name": "A", "bar": 12, "bool_col": True}, {"id": 2, "foo_name": "B", "bar": 12, "bool_col": 1}]), - "bool", - CASTING_WARNING_MSG.format("bool_col", "bool", "object"), - ), - ( - pd.DataFrame.from_records( - [ - {"id": 1, "foo_name": "A", "bar": 12, "bool_col": True}, - {"id": 2, "foo_name": "B", "bar": 12, "bool_col": "random"}, - ] - ), - "bool", - CASTING_WARNING_MSG.format("bool_col", "bool", "object"), - ), - ( - pd.DataFrame.from_records([{"id": 1, "foo_name": "A", "bar": 12, "bool_col": True}, {"id": 2, "foo_name": "B", "bar": 12, "bool_col": pd.NaT}]), - "bool", - CASTING_WARNING_MSG.format("bool_col", "bool", "object"), - ), - ], - ) - def test__has_valid_dtypes_does_not_attempt_to_convert_object_type_to_other_type_unless_other_is_bool_and_column_has_no_non_boolean_values_when_writing_a_csv( - self, caplog, df, expected_dtype, expected_warning - ): - - # Given - s3_csv_with_some_bool_col_local_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="S3_CSV_WITH_BOOL") - - CsvWithSomeBool(source_config=s3_csv_with_some_bool_col_local_config).write(df) - - # Then - try: - if caplog.messages: - assert caplog.messages[0] == expected_warning - assert pd.read_csv(s3_csv_with_some_bool_col_local_config.local.file_path)["bool_col"].dtype.name == expected_dtype - finally: - os.remove(s3_csv_with_some_bool_col_local_config.local.file_path) - - @pytest.mark.unit - @pytest.mark.parametrize( - "df, expected_dtype, expected_warning", - [ - ( - pd.DataFrame.from_records([{"id": 1, "foo_name": "A", "bar": 12, "bool_col": True}, {"id": 2, "foo_name": "B", "bar": 12, "bool_col": False}]), - "bool", - None, - ), - ( - pd.DataFrame.from_records([{"id": 1, "foo_name": "A", "bar": 12, "bool_col": True}, {"id": 2, "foo_name": "B", "bar": 12, "bool_col": None}]), - "bool", - CASTING_WARNING_MSG.format("bool_col", "bool", "object"), - ), - ( - pd.DataFrame.from_records([{"id": 1, "foo_name": "A", "bar": 12, "bool_col": True}, {"id": 2, "foo_name": "B", "bar": 12, "bool_col": np.NAN}]), - "bool", - CASTING_WARNING_MSG.format("bool_col", "bool", "object"), - ), - ( - pd.DataFrame.from_records([{"id": 1, "foo_name": "A", "bar": 12, "bool_col": True}, {"id": 2, "foo_name": "B", "bar": 12, "bool_col": 1}]), - "bool", - CASTING_WARNING_MSG.format("bool_col", "bool", "object"), - ), - ( - pd.DataFrame.from_records( - [ - {"id": 1, "foo_name": "A", "bar": 12, "bool_col": True}, - {"id": 2, "foo_name": "B", "bar": 12, "bool_col": "random"}, - ] - ), - "bool", - CASTING_WARNING_MSG.format("bool_col", "bool", "object"), - ), - ( - pd.DataFrame.from_records([{"id": 1, "foo_name": "A", "bar": 12, "bool_col": True}, {"id": 2, "foo_name": "B", "bar": 12, "bool_col": pd.NaT}]), - "bool", - CASTING_WARNING_MSG.format("bool_col", "bool", "object"), - ), - ], - ) - def test__has_valid_dtypes_does_not_attempt_to_convert_object_type_to_other_type_unless_other_is_bool_and_column_has_no_non_boolean_values_when_writing_a_hdf( - self, caplog, df, expected_dtype, expected_warning - ): - - # Given - s3_hdf_with_some_bool_col_local_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="S3_HDF_WITH_BOOL") - - HdfWithSomeBool(source_config=s3_hdf_with_some_bool_col_local_config).write(df) - - # Then - try: - if caplog.messages: - assert caplog.messages[0] == expected_warning - assert pd.read_hdf(s3_hdf_with_some_bool_col_local_config.local.file_path)["bool_col"].dtype.name == expected_dtype - finally: - os.remove(s3_hdf_with_some_bool_col_local_config.local.file_path) - - @pytest.mark.unit - @pytest.mark.parametrize( - "df, expected_dtype, expected_warning", - [ - ( - pd.DataFrame.from_records([{"id": 1, "foo_name": "A", "bar": 12, "bool_col": True}, {"id": 2, "foo_name": "B", "bar": 12, "bool_col": False}]), - "bool", - None, - ), - ( - pd.DataFrame.from_records([{"id": 1, "foo_name": "A", "bar": 12, "bool_col": True}, {"id": 2, "foo_name": "B", "bar": 12, "bool_col": None}]), - "bool", - CASTING_WARNING_MSG.format("bool_col", "bool", "object"), - ), - ( - pd.DataFrame.from_records([{"id": 1, "foo_name": "A", "bar": 12, "bool_col": True}, {"id": 2, "foo_name": "B", "bar": 12, "bool_col": np.NAN}]), - "bool", - CASTING_WARNING_MSG.format("bool_col", "bool", "object"), - ), - ( - pd.DataFrame.from_records([{"id": 1, "foo_name": "A", "bar": 12, "bool_col": True}, {"id": 2, "foo_name": "B", "bar": 12, "bool_col": 1}]), - "bool", - CASTING_WARNING_MSG.format("bool_col", "bool", "object"), - ), - ( - pd.DataFrame.from_records( - [ - {"id": 1, "foo_name": "A", "bar": 12, "bool_col": True}, - {"id": 2, "foo_name": "B", "bar": 12, "bool_col": "random"}, - ] - ), - "bool", - CASTING_WARNING_MSG.format("bool_col", "bool", "object"), - ), - ( - pd.DataFrame.from_records([{"id": 1, "foo_name": "A", "bar": 12, "bool_col": True}, {"id": 2, "foo_name": "B", "bar": 12, "bool_col": pd.NaT}]), - "bool", - CASTING_WARNING_MSG.format("bool_col", "bool", "object"), - ), - ], - ) - def test__has_valid_dtypes_does_not_attempt_to_convert_object_type_to_other_type_unless_other_is_bool_and_column_has_no_non_boolean_values_when_writing_a_json( - self, caplog, df, expected_dtype, expected_warning - ): - - # Note: In the presence of a boolean cell value in a column, but with additional values of ambiguous type, df.to_json() will try to convert the column - # to a type `int` or `float`, converting boolean values to numbers to `1.0 : True` and `0.0 : False`, and the rest to NaN. This can cause data corruption issues. - - # Given - s3_json_with_some_bool_col_local_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="S3_JSON_WITH_BOOL") - - JsonWithSomeBool(source_config=s3_json_with_some_bool_col_local_config).write(df) - - # Then - try: - if caplog.messages: - assert caplog.messages[0] == expected_warning - assert pd.read_json(s3_json_with_some_bool_col_local_config.local.file_path)["bool_col"].dtype.name == expected_dtype - finally: - os.remove(s3_json_with_some_bool_col_local_config.local.file_path) - - @pytest.mark.unit - @pytest.mark.parametrize( - "df", - [ - (pd.DataFrame.from_records([{"id": 1, "foo_name": "A", "bar": 12, "bool_col": True}, {"id": 2, "foo_name": "B", "bar": 12, "bool_col": pd.NA}])), - ( - pd.DataFrame.from_records( - [ - {"id": 1, "foo_name": "A", "bar": False, "bool_col": True}, - {"id": 2, "foo_name": "B", "bar": "BAD-VALUE", "bool_col": False}, - ] - ) - ), - ], - ) - def test__has_valid_dtypes_throws_columns_data_type_error_when_casting_fails(self, df): - - # Note: In the presence of a boolean cell value in a column, but with additional values of ambiguous type, df.to_json() will try to convert the column - # to a type `int` or `float`, converting boolean values to numbers to `1.0 : True` and `0.0 : False`, and the rest to NaN. This can cause data corruption issues. - - # Given - s3_parquet_with_some_bool_col_local_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="S3_PARQUET_WITH_BOOL") - - # Then - with pytest.raises(ColumnsDataTypeError): - ParquetWithSomeBool(source_config=s3_parquet_with_some_bool_col_local_config).write(df) - - @pytest.mark.unit - def test_a_custom_validate_method_can_be_used_to_override_the_default_abstract_one(self): - - # Given - df = pd.DataFrame.from_records([{"id": 1, "foo_name": "A", "bar": 12, "bool_col": True}, {"id": 2, "foo_name": "B", "bar": 13, "bool_col": False}]) - s3_parquet_with_some_bool_col_local_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="S3_PARQUET_WITH_CUSTOM_VALIDATE") - - # When - ParquetWithCustomValidate(source_config=s3_parquet_with_some_bool_col_local_config).write(df) - - # Then - try: - pd.testing.assert_frame_equal(pd.read_parquet(s3_parquet_with_some_bool_col_local_config.local.file_path), df) - finally: - os.remove(s3_parquet_with_some_bool_col_local_config.local.file_path) - - @pytest.mark.integration - def test_show_casting_warnings_flag_default_value_prevents_showing_casting_logs(self, caplog): - # Given - s3_csv_cloud_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="READ_FROM_S3_CSV") - io_instance = ReadS3DataWithFalseTypes(source_config=s3_csv_cloud_config) # i.e.show_casting_warnings=False - - # When - with caplog.at_level(logging.INFO): - io_instance.read() - - # Then - assert len(caplog.records) == 0 - - @pytest.mark.integration - def test_show_casting_warnings_flag_allows_casting_logs_to_be_printed_if_set_to_true(self, caplog): - # Given - s3_csv_cloud_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="READ_FROM_S3_CSV") - io_instance = ReadS3DataWithFalseTypes(source_config=s3_csv_cloud_config, show_casting_warnings=True) - - # When - with caplog.at_level(logging.INFO): - io_instance.read() - - # Then - assert getattr(caplog.records[0], "message") == "Expected: 'float64' dtype for READ_S3_DATA_WITH_FALSE_TYPES['id]', found 'int64'" - - @pytest.mark.unit - def test_options_are_read_from_code(self): - - # Given - s3_parquet_local_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="S3_PARQUET_WITH_OPTIONS_IN_CODE") - - # When - config_io = ReadS3ParquetIO(source_config=s3_parquet_local_config, option_1=False, option_2=True) - - # Then - assert config_io.options == {"option_1": False, "option_2": True} - - @pytest.mark.unit - def test_options_are_read_from_resource_definition(self): - # Given - s3_parquet_local_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="S3_PARQUET_WITH_OPTIONS_IN_DEFINITION") - - # When - config_io = ReadS3ParquetIO(source_config=s3_parquet_local_config) - - # Then - assert config_io.options == {"option_3": False, "option_4": True} - - @pytest.mark.unit - def test_options_are_that_are_read_from_both_resource_definition_and_code_but_with_no_conflicts_are_merged(self): - # Given - s3_parquet_local_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="S3_PARQUET_WITH_OPTIONS_IN_DEFINITION") - - # When - config_io = ReadS3ParquetIO(source_config=s3_parquet_local_config, option_1=False, option_2=True) - - # Then - assert config_io.options == {"option_1": False, "option_2": True, "option_3": False, "option_4": True} - - @pytest.mark.unit - def test_options_from_code_are_prioritized(self): - # Given - s3_parquet_local_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="S3_PARQUET_WITH_OPTIONS_IN_DEFINITION") - - # When - config_io = ReadS3ParquetIO(source_config=s3_parquet_local_config, option_1=False, option_2=True, option_3=True) # option_3 is conflicting - - # Then - assert config_io.options == {"option_1": False, "option_2": True, "option_3": True, "option_4": True} - - @pytest.mark.unit - @pytest.mark.parametrize( - "camel_case_string, expected_string", - [ - ("TestStringABC", "TEST_STRING_ABC"), - ("TestString", "TEST_STRING"), - ("ThisIsAnotherTest", "THIS_IS_ANOTHER_TEST"), - ("AbstractS3Test", "ABSTRACT_S3_TEST"), - ("YetAnotherGREATTest", "YET_ANOTHER_GREAT_TEST"), - ], - ) - def test_transform_class_names_to_dataset_names(self, camel_case_string, expected_string): - # Given/When - transformed_string = DynamicDataIO._transform_class_name_to_dataset_name(camel_case_string) # pylint: disable=W0212 - - assert transformed_string == expected_string - - @pytest.mark.unit - def test_no_options_at_all_are_provided_with_no_issues(self): - - # Given - s3_parquet_local_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="S3_PARQUET_WITH_OPTIONS_IN_CODE") - - # When - config_io = ReadS3ParquetIO(source_config=s3_parquet_local_config) - - # Then - assert config_io.options == {} - - @pytest.mark.unit - def test_dataset_name_is_defined_by_io_class_if_schema_from_file_is_not_provided(self): - - # Given - s3_parquet_local_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="READ_FROM_S3_PARQUET") - - # When - config_io = ReadS3ParquetIO(source_config=s3_parquet_local_config) - - # Then - assert config_io.name == "READ_S3_PARQUET_IO" - - @pytest.mark.unit - def test_dataset_name_is_inferred_from_schema_if_schema_from_file_is_provided(self): - - # Given - s3_read_from_csv_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="READ_FROM_S3_CSV") - - # When - config_io = ReadS3CsvIO(source_config=s3_read_from_csv_config) - - # Then - assert config_io.name == "READ_FROM_S3_CSV" - - -class TestAsyncCoreIO: - @pytest.mark.unit - def test_read_is_called_through_async_read(self): - # Given - s3_csv_local_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="READ_FROM_S3_CSV") - - # When - with patch.object(dynamicio.core.DynamicDataIO, "read") as mock_read: - mock_read.return_value = pd.DataFrame.from_records([[1, "name_a"]], columns=["id", "foo_name"]) - asyncio.run(ReadS3CsvIO(source_config=s3_csv_local_config).async_read()) - - # Then - mock_read.assert_called() - - @pytest.mark.unit - @pytest.mark.asyncio - async def test_write_is_called_through_async_write(self): - # Given - df = pd.DataFrame.from_dict({"id": [3, 2, 1, 0], "foo_name": ["a", "b", "c", "d"], "bar": [1, 2, 3, 4]}) - - s3_csv_local_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="WRITE_TO_S3_CSV") - - # When - with patch.object(dynamicio.core.DynamicDataIO, "write") as mock_write: - await asyncio.gather(WriteS3CsvIO(source_config=s3_csv_local_config).async_write(df)) - - # Then - mock_write.assert_called() - - @pytest.mark.unit - def test_async_read_does_indeed_operate_in_parallel(self): - # Given - s3_csv_local_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="READ_FROM_S3_CSV") - - def dummy_read(self) -> pd.DataFrame: # pylint: disable=unused-argument - time.sleep(0.1) - return pd.DataFrame.from_records([[1, "name_a"]], columns=["id", "foo_name"]) - - async def multi_read(config: Mapping[str, str]) -> Tuple: - return await asyncio.gather( - ReadS3CsvIO(source_config=config).async_read(), - ReadS3CsvIO(source_config=config).async_read(), - ReadS3CsvIO(source_config=config).async_read(), - ReadS3CsvIO(source_config=config).async_read(), - ) - - # When - with patch.object(dynamicio.core.DynamicDataIO, "read", new=dummy_read): - start_time = time.time() - asyncio.run(multi_read(s3_csv_local_config)) - duration = time.time() - start_time - - # Then - assert duration < 0.125 - - @pytest.mark.unit - def test_async_write_does_indeed_operate_in_parallel(self): - # Given - df = pd.DataFrame.from_dict({"id": [3, 2, 1, 0], "foo_name": ["a", "b", "c", "d"], "bar": [1, 2, 3, 4]}) - - s3_csv_local_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="WRITE_TO_S3_CSV") - - def dummy_write(self, _df: pd.DataFrame) -> bool: # pylint: disable=unused-argument - time.sleep(0.1) - return True - - async def multi_write(config: Mapping[str, str], _df: pd.DataFrame) -> Tuple: - return await asyncio.gather( - WriteS3CsvIO(source_config=config).async_write(_df), - WriteS3CsvIO(source_config=config).async_write(_df), - WriteS3CsvIO(source_config=config).async_write(_df), - WriteS3CsvIO(source_config=config).async_write(_df), - ) - - # When - with patch.object(dynamicio.core.DynamicDataIO, "read", new=dummy_write): - start_time = time.time() - asyncio.run(multi_write(s3_csv_local_config, df)) - duration = time.time() - start_time - - # Then - assert duration < 0.125 diff --git a/tests/test_metrics.py b/tests/test_metrics.py index 2a504d7..7f74c81 100644 --- a/tests/test_metrics.py +++ b/tests/test_metrics.py @@ -1,169 +1,51 @@ -# pylint: disable=missing-module-docstring, missing-class-docstring, missing-function-docstring, too-many-public-methods, too-few-public-methods -import logging - -import pytest - -from dynamicio.metrics import Counts, CountsPerLabel, log_metric, Max, Mean, Min, Std, UniqueCounts, Variance - - -@pytest.fixture(autouse=True, scope="module") -def propagate_logger(): - # We need this because otherwise caplog can't capture the logs - logging.getLogger("dynamicio.metrics").propagate = True - yield - logging.getLogger("dynamicio.metrics").propagate = False - - -class TestMetricsLogger: - @pytest.mark.unit - def test_metric_logging_works_even_if_value_is_nan(self, caplog): - # Given/ When - with caplog.at_level(logging.INFO): - log_metric(dataset="Test-DataSet", column="A", metric="B", value=float("nan")) - - # Then - assert getattr(caplog.records[0], "message") == '{"message": "METRIC", "dataset": "Test-DataSet", "column": "A", "metric": "B", "value": NaN}' - - @pytest.mark.unit - def test_metric_logging_works_even_if_value_is_inf(self, caplog): - # Given/ When - with caplog.at_level(logging.INFO): - log_metric(dataset="Test-DataSet", column="A", metric="B", value=float("inf")) - - # Then - assert getattr(caplog.records[0], "message") == '{"message": "METRIC", "dataset": "Test-DataSet", "column": "A", "metric": "B", "value": Infinity}' - - -class TestMin: - @pytest.mark.unit - def test_metric_generation_and_logging(self, caplog, input_df): - # Given - df = input_df - log_min = Min(dataset_name="Test-DataSet", df=df, column="weight_a") - - # When - with caplog.at_level(logging.INFO): - print() # keep this in for a better test output - log_min() - - # Then - assert getattr(caplog.records[0], "message") == '{"message": "METRIC", "dataset": "Test-DataSet", "column": "weight_a", "metric": "Min", "value": 5.0}' - - -class TestMax: - @pytest.mark.unit - def test_metric_generation_and_logging(self, caplog, input_df): - # Given - df = input_df - log_max = Max(dataset_name="Test-DataSet", df=df, column="weight_a") - - # When - with caplog.at_level(logging.INFO): - print() - log_max() - - # Then - assert getattr(caplog.records[0], "message") == '{"message": "METRIC", "dataset": "Test-DataSet", "column": "weight_a", "metric": "Max", "value": 9.0}' - - -class TestMean: - @pytest.mark.unit - def test_metric_generation_and_logging(self, caplog, input_df): - # Given - df = input_df - log_mean = Mean(dataset_name="Test-DataSet", df=df, column="weight_a") - - # When - with caplog.at_level(logging.INFO): - print() - log_mean() - - # Then - assert getattr(caplog.records[0], "message") == '{"message": "METRIC", "dataset": "Test-DataSet", "column": "weight_a", "metric": "Mean", "value": 6.6}' - - -class TestStd: - @pytest.mark.unit - def test_metric_generation_and_logging(self, caplog, input_df): - # Given - df = input_df - log_std = Std(dataset_name="Test-DataSet", df=df, column="weight_a") - - # When - with caplog.at_level(logging.INFO): - print() - log_std() - - # Then - assert getattr(caplog.records[0], "message") == '{"message": "METRIC", "dataset": "Test-DataSet", "column": "weight_a", "metric": "Std", "value": 1.429840705968481}' - - -class TestVariance: - @pytest.mark.unit - def test_metric_generation_and_logging(self, caplog, input_df): - # Given - df = input_df - log_var = Variance(dataset_name="Test-DataSet", df=df, column="weight_a") - - # When - with caplog.at_level(logging.INFO): - print() - log_var() - - # Then - assert getattr(caplog.records[0], "message") == '{"message": "METRIC", "dataset": "Test-DataSet", "column": "weight_a", "metric": "Variance", "value": 2.0444444444444443}' - - -class TestCounts: - @pytest.mark.unit - def test_metric_generation_and_logging(self, caplog, input_df): - # Given - df = input_df - log_counts = Counts(dataset_name="Test-DataSet", df=df, column="weight_a") - - # When - with caplog.at_level(logging.INFO): - print() - log_counts() - - # Then - assert getattr(caplog.records[0], "message") == '{"message": "METRIC", "dataset": "Test-DataSet", "column": "weight_a", "metric": "Counts", "value": 10.0}' - - -class TestUniqueCounts: - @pytest.mark.unit - def test_metric_generation_and_logging(self, caplog, input_df): - # Given - df = input_df - log_unique_counts = UniqueCounts(dataset_name="Test-DataSet", df=df, column="weight_a") - - # When - with caplog.at_level(logging.INFO): - print() - log_unique_counts() - - # Then - assert getattr(caplog.records[0], "message") == '{"message": "METRIC", "dataset": "Test-DataSet", "column": "weight_a", "metric": "UniqueCounts", "value": 5.0}' - - -class TestCountsPerLabel: - @pytest.mark.unit - def test_metric_generation_and_logging(self, caplog, input_df): - # Given - df = input_df - log_counts_per_label = CountsPerLabel(dataset_name="Test-DataSet", df=df, column="activity") - - # When - with caplog.at_level(logging.INFO): - print() - log_counts_per_label() - - # Then - assert ( - (len(caplog.records) == 3) - and (getattr(caplog.records[0], "message") == '{"message": "METRIC", "dataset": "Test-DataSet", "column": "activity-discharge", "metric": "CountsPerLabel", "value": 5.0}') - and (getattr(caplog.records[1], "message") == '{"message": "METRIC", "dataset": "Test-DataSet", "column": "activity-load", "metric": "CountsPerLabel", "value": 2.0}') - and ( - getattr(caplog.records[2], "message") == '{"message": "METRIC", "dataset": "Test-DataSet", "column": "activity-pass_through", "metric": "CountsPerLabel", "value": 3.0}' - ) +from unittest import mock +from unittest.mock import call + +from pandera import Field, SchemaModel +from pandera.typing import Series + +from dynamicio import ParquetResource +from dynamicio.metrics import Metric +from tests.constants import TEST_RESOURCES + + +class ParquetSampleSchema(SchemaModel): + """Schema for sample parquet file.""" + + id: Series[int] + foo_name: Series[str] = Field(log_statistics={"metrics": [Metric.COUNTS_PER_LABEL]}) + bar: Series[int] = Field( + log_statistics={ + "metrics": [ + Metric.MIN, + Metric.MAX, + Metric.MEAN, + Metric.STD, + Metric.VARIANCE, + Metric.COUNTS, + Metric.UNIQUE_COUNTS, + ] + } + ) + + +def test_metrics_logged_successfully(): + test_path = TEST_RESOURCES / "data/input/parquet_sample.parquet" + + resource = ParquetResource(path=test_path, pa_schema=ParquetSampleSchema) + + with mock.patch("dynamicio.metrics.log_metric") as log_metric: + _ = resource.read() + log_metric.assert_has_calls( + [ + call(column="foo_name", metric=Metric.COUNTS_PER_LABEL, value=8), + call(column="foo_name", metric=Metric.COUNTS_PER_LABEL, value=7), + call(column="bar", metric=Metric.MIN, value=1), + call(column="bar", metric=Metric.MAX, value=15), + call(column="bar", metric=Metric.MEAN, value=8), + call(column="bar", metric=Metric.STD, value=4.47213595499958), + call(column="bar", metric=Metric.VARIANCE, value=20), + call(column="bar", metric=Metric.COUNTS, value=15), + call(column="bar", metric=Metric.UNIQUE_COUNTS, value=15), + ] ) diff --git a/tests/test_mixins/test_kafka_mixins.py b/tests/test_mixins/test_kafka_mixins.py deleted file mode 100644 index 9fcb8fe..0000000 --- a/tests/test_mixins/test_kafka_mixins.py +++ /dev/null @@ -1,310 +0,0 @@ -# pylint: disable=no-member, missing-module-docstring, missing-class-docstring, missing-function-docstring, too-many-public-methods, too-few-public-methods, protected-access, C0103, C0302, R0801 -import os -from unittest.mock import MagicMock, patch - -import pandas as pd -import pytest -from kafka import KafkaProducer - -import dynamicio.mixins.with_kafka - -from dynamicio.config import IOConfig -from dynamicio.mixins import WithKafka -from tests import constants -from tests.mocking.io import ( - MockKafkaProducer, - WriteKafkaIO, -) - - -class TestKafkaIO: - @pytest.mark.unit - @patch.object(dynamicio.mixins.with_kafka, "KafkaProducer") - @patch.object(MockKafkaProducer, "send") - def test_write_to_kafka_is_called_for_writing_an_iterable_of_dicts_with_env_as_cloud_kafka(self, mock__kafka_producer, mock__kafka_producer_send, input_messages_df): - # Given - def rows_generator(_df, chunk_size): - _chunk = [] - for _, row in df.iterrows(): - _chunk.append(row.to_dict()) - if len(_chunk) == chunk_size: - yield pd.DataFrame(_chunk) - _chunk.clear() - - df = input_messages_df - - mock__kafka_producer.return_value = MockKafkaProducer() - mock__kafka_producer_send.return_value = MagicMock() - - kafka_cloud_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")), - env_identifier="CLOUD", - dynamic_vars=constants, - ).get(source_key="WRITE_TO_KAFKA_JSON") - - # When - for chunk in rows_generator(_df=df, chunk_size=2): - WriteKafkaIO(kafka_cloud_config).write(chunk) - # Then - assert mock__kafka_producer_send.call_count == 1 - - @pytest.mark.unit - @patch.object(dynamicio.mixins.with_kafka, "KafkaProducer") - @patch.object(MockKafkaProducer, "send") - def test_write_to_kafka_is_called_with_document_transformer_if_provided_for_writing_an_iterable_of_dicts_with_env_as_cloud_kafka( - self, mock__kafka_producer, mock__kafka_producer_send, input_messages_df - ): - # Given - def rows_generator(_df, chunk_size): - _chunk = [] - for _, row in df.iterrows(): - _chunk.append(row.to_dict()) - if len(_chunk) == chunk_size: - yield pd.DataFrame(_chunk) - _chunk.clear() - - df = input_messages_df.iloc[[0]] - - mock__kafka_producer.return_value = MockKafkaProducer() - mock__kafka_producer_send.return_value = MagicMock() - - kafka_cloud_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")), - env_identifier="CLOUD", - dynamic_vars=constants, - ).get(source_key="WRITE_TO_KAFKA_JSON") - - # When - for chunk in rows_generator(_df=df, chunk_size=2): - WriteKafkaIO(kafka_cloud_config, document_transformer=lambda v: dict(**v, worked=True)).write(chunk) - # Then - mock__kafka_producer_send.assert_called_once_with( - { - "id": "message01", - "foo": "xxxxxxxx", - "bar": 0, - "baz": ["a", "b", "c"], - "worked": True, - } - ) - - @pytest.mark.unit - def test_kafka_producer_default_value_serialiser_is_used_unless_alternative_is_given(self, test_df): - # Given - kafka_cloud_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")), - env_identifier="CLOUD", - dynamic_vars=constants, - ).get(source_key="WRITE_TO_KAFKA_JSON") - write_kafka_io = WriteKafkaIO(kafka_cloud_config) - - # When - with patch.object(dynamicio.mixins.with_kafka, "KafkaProducer") as mock__kafka_producer, patch.object(MockKafkaProducer, "send") as mock__kafka_producer_send: - mock__kafka_producer.DEFAULT_CONFIG = KafkaProducer.DEFAULT_CONFIG - mock__kafka_producer.return_value = MockKafkaProducer() - mock__kafka_producer_send.return_value = MagicMock() - write_kafka_io.write(test_df) - - # Then - value_serializer = write_kafka_io._WithKafka__kafka_config.pop("value_serializer") - assert "WithKafka._default_value_serializer" in str(value_serializer) - - @pytest.mark.unit - def test_kafka_producer_default_key_serialiser_is_used_unless_alternative_is_given(self, test_df): - # Given - kafka_cloud_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")), - env_identifier="CLOUD", - dynamic_vars=constants, - ).get(source_key="WRITE_TO_KAFKA_JSON") - write_kafka_io = WriteKafkaIO(kafka_cloud_config) - - # When - with patch.object(dynamicio.mixins.with_kafka, "KafkaProducer") as mock__kafka_producer, patch.object(MockKafkaProducer, "send") as mock__kafka_producer_send: - mock__kafka_producer.DEFAULT_CONFIG = KafkaProducer.DEFAULT_CONFIG - mock__kafka_producer.return_value = MockKafkaProducer() - mock__kafka_producer_send.return_value = MagicMock() - write_kafka_io.write(test_df) - - # Then - key_serializer = write_kafka_io._WithKafka__kafka_config.pop("key_serializer") - assert "WithKafka._default_key_serializer" in str(key_serializer) - - @pytest.mark.unit - @patch.object(MockKafkaProducer, "send") - @patch.object(dynamicio.mixins.with_kafka, "KafkaProducer") - def test_kafka_producer_default_compression_type_is_snappy(self, mock__kafka_producer, mock__kafka_producer_send, test_df): - # Given - mock__kafka_producer.DEFAULT_CONFIG = KafkaProducer.DEFAULT_CONFIG - mock__kafka_producer.return_value = MockKafkaProducer() - mock__kafka_producer_send.return_value = MagicMock() - kafka_cloud_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")), - env_identifier="CLOUD", - dynamic_vars=constants, - ).get(source_key="WRITE_TO_KAFKA_JSON") - write_kafka_io = WriteKafkaIO(kafka_cloud_config) - - # When - write_kafka_io.write(test_df) - - # Then - write_kafka_io._WithKafka__kafka_config.pop("value_serializer") # Removed as it returns a unique function identifier - write_kafka_io._WithKafka__kafka_config.pop("key_serializer") # Removed as it returns a unique function identifier - assert write_kafka_io._WithKafka__kafka_config == {"bootstrap_servers": "mock-kafka-server", "compression_type": "snappy"} - - @pytest.mark.unit - @patch.object(MockKafkaProducer, "send") - @patch.object(dynamicio.mixins.with_kafka, "KafkaProducer") - def test_kafka_producer_options_are_replaced_by_the_user_options(self, mock__kafka_producer, mock__kafka_producer_send, test_df): - # Given - mock__kafka_producer.DEFAULT_CONFIG = KafkaProducer.DEFAULT_CONFIG - mock__kafka_producer.return_value = MockKafkaProducer() - mock__kafka_producer_send.return_value = MagicMock() - kafka_cloud_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")), - env_identifier="CLOUD", - dynamic_vars=constants, - ).get(source_key="WRITE_TO_KAFKA_JSON") - write_kafka_io = WriteKafkaIO(kafka_cloud_config, compression_type="lz4", acks=2) - - # When - write_kafka_io.write(test_df) - - # Then - value_serializer = write_kafka_io._WithKafka__kafka_config.pop("value_serializer") # Removed as it returns a unique function identifier - write_kafka_io._WithKafka__kafka_config.pop("key_serializer") # Removed as it returns a unique function identifier - assert write_kafka_io._WithKafka__kafka_config == { - "acks": 2, - "bootstrap_servers": "mock-kafka-server", - "compression_type": "lz4", - } and "WithKafka._default_value_serializer" in str(value_serializer) - - @pytest.mark.unit - def test_producer_send_method_sends_messages_with_index_as_key_by_default_if_a_keygen_is_not_provided(self, test_df): - # Given - kafka_cloud_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")), - env_identifier="CLOUD", - dynamic_vars=constants, - ).get(source_key="WRITE_TO_KAFKA_JSON") - write_kafka_io = WriteKafkaIO(kafka_cloud_config) - - # When - with patch.object(dynamicio.mixins.with_kafka, "KafkaProducer") as mock__kafka_producer: - mock__kafka_producer.DEFAULT_CONFIG = KafkaProducer.DEFAULT_CONFIG - mock_producer = MockKafkaProducer() - mock__kafka_producer.return_value = mock_producer - write_kafka_io.write(test_df) - - # Then - assert mock_producer.my_stream == [ - {"key": 0, "value": {"bar": 1000, "baz": "ABC", "foo": "id_1", "id": "cm_1"}}, - {"key": 1, "value": {"bar": 1000, "baz": "ABC", "foo": "id_2", "id": "cm_2"}}, - {"key": 2, "value": {"bar": 1000, "baz": "ABC", "foo": "id_3", "id": "cm_3"}}, - ] - - @pytest.mark.unit - def test_producer_send_method_can_send_keyed_messages_using_a_custom_key_generator(self, test_df): - # Given - kafka_cloud_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")), - env_identifier="CLOUD", - dynamic_vars=constants, - ).get(source_key="WRITE_TO_KAFKA_JSON") - write_kafka_io = WriteKafkaIO(kafka_cloud_config, key_generator=lambda _, message: "XXX") - - # When - with patch.object(dynamicio.mixins.with_kafka, "KafkaProducer") as mock__kafka_producer: - mock__kafka_producer.DEFAULT_CONFIG = KafkaProducer.DEFAULT_CONFIG - mock_producer = MockKafkaProducer() - mock__kafka_producer.return_value = mock_producer - write_kafka_io.write(test_df) - - # Then - assert mock_producer.my_stream == [ - {"key": "XXX", "value": {"bar": 1000, "baz": "ABC", "foo": "id_1", "id": "cm_1"}}, - {"key": "XXX", "value": {"bar": 1000, "baz": "ABC", "foo": "id_2", "id": "cm_2"}}, - {"key": "XXX", "value": {"bar": 1000, "baz": "ABC", "foo": "id_3", "id": "cm_3"}}, - ] - - @pytest.mark.unit - @pytest.mark.parametrize( - "key, encoded_key", - [ - (None, None), - ("cacik", b"cacik"), - ], - ) - def test_default_key_serialiser_returns_none_if_key_is_not_provided_and_an_encoded_string_otherwise(self, key, encoded_key): - # Given/When/Then - assert encoded_key == WithKafka._default_key_serializer(key) - - @pytest.mark.unit - @pytest.mark.parametrize( - "value, encoded_value", - [ - (None, b"null"), - ({"a": 1, "b": "cacik"}, b'{"a": 1, "b": "cacik"}'), - ({"a": 1, "b": None}, b'{"a": 1, "b": null}'), - ], - ) - def test_default_value_serialiser_returns_encoded_mapping_if_key_is_not_provided_and_an_encoded_string_otherwise(self, value, encoded_value): - # Given/When/Then - assert encoded_value == WithKafka._default_value_serializer(value) - - @pytest.mark.unit - def test_default_key_generator_and_transformer_are_used_if_none_are_provided_by_the_user(self): - # Given - keyed_test_df = pd.DataFrame.from_records( - [ - ["key-01", "cm_1", "id_1", 1000, "ABC"], - ["key-01", "cm_2", "id_2", 1000, "ABC"], # <-- index is non-unique - ["key-02", "cm_3", "id_3", 1000, "ABC"], - ], - columns=["key", "id", "foo", "bar", "baz"], - ).set_index("key") - kafka_cloud_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")), - env_identifier="CLOUD", - dynamic_vars=constants, - ).get(source_key="WRITE_TO_KAFKA_JSON") - write_kafka_io = WriteKafkaIO(kafka_cloud_config) - - # When - with patch.object(dynamicio.mixins.with_kafka, "KafkaProducer") as mock__kafka_producer: - mock__kafka_producer.DEFAULT_CONFIG = KafkaProducer.DEFAULT_CONFIG - mock_producer = MockKafkaProducer() - mock__kafka_producer.return_value = mock_producer - - # When - write_kafka_io.write(keyed_test_df) - assert (write_kafka_io._WithKafka__key_generator("idx", "value") == "idx") and (write_kafka_io._WithKafka__document_transformer("value") == "value") - - @pytest.mark.unit - def test_custom_key_generator_and_transformer_are_used_if_they_are_provided_by_the_user(self): - # Given - keyed_test_df = pd.DataFrame.from_records( - [ - ["key-01", "cm_1", "id_1", 1000, "ABC"], - ["key-01", "cm_2", "id_2", 1000, "ABC"], - ["key-02", "cm_3", "id_3", 1000, "ABC"], - ], - columns=["key", "id", "foo", "bar", "baz"], - ).set_index("key") - kafka_cloud_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")), - env_identifier="CLOUD", - dynamic_vars=constants, - ).get(source_key="WRITE_TO_KAFKA_JSON") - write_kafka_io = WriteKafkaIO(kafka_cloud_config, key_generator=lambda idx, _: "xxx", document_transformer=lambda _: "xxx") - - # When - with patch.object(dynamicio.mixins.with_kafka, "KafkaProducer") as mock__kafka_producer: - mock__kafka_producer.DEFAULT_CONFIG = KafkaProducer.DEFAULT_CONFIG - mock_producer = MockKafkaProducer() - mock__kafka_producer.return_value = mock_producer - - # When - write_kafka_io.write(keyed_test_df) - assert (write_kafka_io._WithKafka__key_generator("idx", "value") == "xxx") and (write_kafka_io._WithKafka__document_transformer("value") == "xxx") diff --git a/tests/test_mixins/test_local_mixins.py b/tests/test_mixins/test_local_mixins.py deleted file mode 100644 index 39732b6..0000000 --- a/tests/test_mixins/test_local_mixins.py +++ /dev/null @@ -1,795 +0,0 @@ -# pylint: disable=no-member, missing-module-docstring, missing-class-docstring, missing-function-docstring, too-many-public-methods, too-few-public-methods, protected-access, C0103, C0302, R0801 -import asyncio -import os -import time -from typing import Mapping, Tuple -from unittest.mock import patch - -import numpy as np -import pandas as pd -import pytest - -import dynamicio.mixins.utils -import dynamicio.mixins.with_local - -from dynamicio.config import IOConfig -from tests import constants -from tests.conftest import max_pklproto_hdf -from tests.constants import TEST_RESOURCES -from tests.mocking.io import ( - AsyncReadS3HdfIO, - ReadFromBatchLocalHdf, - ReadFromBatchLocalParquet, - ReadPostgresIO, - ReadS3CsvIO, - ReadS3DataWithLessColumnsAndMessedOrderOfColumnsIO, - ReadS3DataWithLessColumnsIO, - ReadS3HdfIO, - ReadS3JsonIO, - ReadS3ParquetIO, - TemplatedFile, - WriteKafkaIO, - WritePostgresIO, - WriteS3CsvIO, - WriteS3HdfIO, - WriteS3ParquetIO, -) -from tests.mocking.models import ERModel - - -class TestLocalIO: - @pytest.mark.unit - def test_read_parquet_pandas_reader_will_only_load_columns_in_schema(self, expected_df_with_less_columns): - # Given - # source data read from: "[[ TEST_RESOURCES ]]/data/input/some_parquet_to_read.parquet" - s3_parquet_local_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="READ_FROM_S3_PARQUET") - - # When - s3_parquet_df = ReadS3DataWithLessColumnsIO(source_config=s3_parquet_local_config).read() - - # Then - assert expected_df_with_less_columns.equals(s3_parquet_df) - - @pytest.mark.unit - def test_read_json_pandas_reader_will_maintain_columns_order_of_the_original_dataset_when_filtering_out_columns( - self, - ): - # Given - # source data read from: "[[ TEST_RESOURCES ]]/data/definitions/external.yaml/json_with_more_columns.json" - s3_json_local_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/external.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="READ_FROM_S3_JSON") - - # When - s3_json_df = ReadS3DataWithLessColumnsAndMessedOrderOfColumnsIO(source_config=s3_json_local_config).read() - - # Then - assert s3_json_df.columns.to_list() == ["foo_name", "bar", "bar_type", "a_number", "b_number"] - - @pytest.mark.unit - def test_read_hdf_pandas_reader_will_maintain_columns_order_of_the_original_dataset_when_filtering_out_columns( - self, - ): - # Given - # source data read from: "[[ TEST_RESOURCES ]]/data/definitions/external.yaml/h5_with_more_columns.h5" - s3_hdf_local_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/external.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="READ_FROM_S3_HDF") - - # When - s3_hdf_df = ReadS3DataWithLessColumnsAndMessedOrderOfColumnsIO(source_config=s3_hdf_local_config).read() - - # Then - assert s3_hdf_df.columns.to_list() == ["foo_name", "bar", "bar_type", "a_number", "b_number"] - - @pytest.mark.unit - def test_read_csv_pandas_reader_will_only_load_columns_in_schema(self, expected_df_with_less_columns): - # Given - # source data read from: "[[ TEST_RESOURCES ]]/data/input/some_csv_to_read.csv" - s3_csv_local_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="READ_FROM_S3_CSV_ALT") - - # When - s3_csv_df = ReadS3DataWithLessColumnsIO(source_config=s3_csv_local_config).read() - - # Then - assert expected_df_with_less_columns.equals(s3_csv_df) - - @pytest.mark.unit - def test_read_h5_pandas_reader_will_only_load_columns_in_schema(self, expected_df_with_less_columns): - # Given - # source data read from: "[[ TEST_RESOURCES ]]/data/input/some_hdf_to_read.h5" - s3_parquet_local_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="READ_FROM_S3_HDF") - - # When - s3_hdf_df = ReadS3DataWithLessColumnsIO(source_config=s3_parquet_local_config).read() - - # Then - assert expected_df_with_less_columns.equals(s3_hdf_df) - - @pytest.mark.unit - def test_read_json_pandas_reader_will_only_load_columns_in_schema(self, expected_df_with_less_columns): - # Given - # source data read from: "[[ TEST_RESOURCES ]]/data/input/some_json_to_read.json" - s3_json_local_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="READ_FROM_S3_JSON") - - # When - s3_json_df = ReadS3DataWithLessColumnsIO(source_config=s3_json_local_config).read() - - # Then - assert expected_df_with_less_columns.equals(s3_json_df) - - @pytest.mark.unit - def test_read_json_pandas_reader_will_only_filter_out_columns_not_in_schema(self, expected_df_with_less_columns): - # Given - s3_json_local_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="READ_FROM_S3_JSON") - - # When - s3_json_df = ReadS3DataWithLessColumnsIO(source_config=s3_json_local_config).read() - - # Then - assert expected_df_with_less_columns.equals(s3_json_df) - - @pytest.mark.unit - def test_read_hdf_pandas_reader_will_only_filter_out_columns_not_in_schema(self, expected_df_with_less_columns): - # Given - s3_hdf_local_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="READ_FROM_S3_HDF") - - # When - s3_hdf_df = ReadS3DataWithLessColumnsIO(source_config=s3_hdf_local_config).read() - - # Then - assert expected_df_with_less_columns.equals(s3_hdf_df) - - @pytest.mark.unit - @patch.object(dynamicio.mixins.with_local.WithLocal, "_read_from_local") - def test_local_reader_is_called_for_loading_any_file_when_env_is_set_to_local(self, mock__read_from_local, expected_s3_csv_df): - # Given - s3_csv_local_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="READ_FROM_S3_CSV") - mock__read_from_local.return_value = expected_s3_csv_df - - # When - ReadS3CsvIO(source_config=s3_csv_local_config).read() - - # Then - mock__read_from_local.assert_called() - - @pytest.mark.unit - def test_a_local_parquet_file_is_loaded_when_io_config_is_initialised_with_local_env_and_parquet_file_type(self, test_df): - # Given - pg_parquet_local_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="READ_FROM_POSTGRES") - - # When - pg_parquet_df = ReadPostgresIO(source_config=pg_parquet_local_config).read() - - # Then - assert test_df.equals(pg_parquet_df) - - @pytest.mark.unit - def test_a_local_h5_file_is_loaded_when_io_config_is_initialised_with_local_env_and_hdf_file_type(self, expected_s3_hdf_df): - # Given - s3_hdf_local_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="READ_FROM_S3_HDF") - - # When - s3_hdf_df = ReadS3HdfIO(source_config=s3_hdf_local_config).read() - - # Then - assert expected_s3_hdf_df.equals(s3_hdf_df) - - @pytest.mark.unit - def test_a_local_json_file_is_loaded_when_io_config_is_initialised_with_local_env_and_json_file_type(self, expected_s3_json_df): - # Given - s3_json_local_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="READ_FROM_S3_JSON") - - # When - options = {"orient": "columns"} - s3_json_df = ReadS3JsonIO(source_config=s3_json_local_config, **options).read() - - # Then - assert expected_s3_json_df.equals(s3_json_df) - - @pytest.mark.unit - def test_a_local_csv_file_is_loaded_when_io_config_is_initialised_with_local_env_and_csv_file_type(self, expected_s3_csv_df): - # Given - s3_csv_local_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="READ_FROM_S3_CSV") - - # When - s3_csv_df = ReadS3CsvIO(source_config=s3_csv_local_config).read() - - # Then - assert expected_s3_csv_df.equals(s3_csv_df) - - @pytest.mark.unit - def test_a_local_parquet_file_is_loaded_when_io_config_is_set_with_local_env_a_parquet_file_type_for_postgres(self, test_df): - # Given - pg_local_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="READ_FROM_POSTGRES") - - # When - pg_df = ReadPostgresIO(source_config=pg_local_config, model=ERModel).read() - - # Then - assert test_df.equals(pg_df) - - @pytest.mark.unit - @patch.object(dynamicio.mixins.with_local.WithLocal, "_write_to_local") - def test_local_writer_is_called_for_writing_any_file_when_env_is_set_to_local(self, mock__write_to_local): - # Given - df = pd.DataFrame.from_dict({"id": [3, 2, 1, 0], "foo_name": ["a", "b", "c", "d"], "bar": [1, 2, 3, 4]}) - - s3_csv_local_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="WRITE_TO_S3_CSV") - - # When - WriteS3CsvIO(source_config=s3_csv_local_config).write(df) - - # Then - mock__write_to_local.assert_called() - - @pytest.mark.unit - def test_a_df_is_written_locally_as_parquet_when_io_config_is_initialised_with_local_env_value_and_parquet_file_type( - self, - test_df, - ): - # Given - df = test_df - - pg_parquet_local_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="WRITE_TO_PG_PARQUET") - - # When - WritePostgresIO(source_config=pg_parquet_local_config).write(df) - - # Then - try: - assert os.path.isfile(pg_parquet_local_config.local.file_path) - finally: - os.remove(pg_parquet_local_config.local.file_path) - - @pytest.mark.unit - def test_a_df_is_written_locally_as_csv_when_io_config_is_initialised_with_local_env_value_and_csv_file_type( - self, - ): - # Given - df = pd.DataFrame.from_dict({"id": [3, 2, 1, 0], "foo_name": ["a", "b", "c", "d"], "bar": [1, 2, 3, 4]}) - - s3_csv_local_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="WRITE_TO_S3_CSV") - - # When - WriteS3CsvIO(source_config=s3_csv_local_config).write(df) - - # Then - try: - assert os.path.isfile(s3_csv_local_config.local.file_path) - finally: - os.remove(s3_csv_local_config.local.file_path) - - @pytest.mark.unit - def test_a_df_is_written_locally_as_json_when_io_config_is_initialised_with_local_env_value_and_json_file_type(self, input_messages_df): - # Given - df = input_messages_df - - kafka_json_local_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="WRITE_TO_KAFKA_JSON") - - # When - WriteKafkaIO(source_config=kafka_json_local_config).write(df) - - # Then - try: - assert os.path.isfile(kafka_json_local_config.local.file_path) - finally: - os.remove(kafka_json_local_config.local.file_path) - - @pytest.mark.unit - def test_a_df_is_written_locally_as_h5_when_io_config_is_initialised_with_local_env_value_and_hdf_file_type( - self, - ): - # Given - df = pd.DataFrame.from_dict({"col_1": [3, 2, 1, 0], "col_2": ["a", "b", "c", "d"]}) - - s3_hdf_local_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="WRITE_TO_S3_HDF") - - # When - WriteS3HdfIO(source_config=s3_hdf_local_config).write(df) - - # Then - try: - assert os.path.isfile(s3_hdf_local_config.local.file_path) - finally: - os.remove(s3_hdf_local_config.local.file_path) - - @pytest.mark.unit - def test_dynamicio_default_pickle_protocol_is_4( - self, - ): - # Given - df = pd.DataFrame.from_dict({"col_1": [3, 2, 1, 0], "col_2": ["a", "b", "c", "d"]}) - - s3_hdf_local_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="WRITE_TO_S3_HDF") - - # When - WriteS3HdfIO(source_config=s3_hdf_local_config).write(df) - - # Then - try: - assert max_pklproto_hdf(s3_hdf_local_config.local.file_path) == 4 - finally: - os.remove(s3_hdf_local_config.local.file_path) - - @pytest.mark.unit - def test_dynamicio_default_pickle_protocol_is_bypassed_by_user_input( - self, - ): - # Given - df = pd.DataFrame.from_dict({"col_1": [3, 2, 1, 0], "col_2": ["a", "b", "c", "d"]}) - - s3_hdf_local_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="WRITE_TO_S3_HDF") - - # When - WriteS3HdfIO(source_config=s3_hdf_local_config, protocol=5).write(df) - - # Then - try: - assert max_pklproto_hdf(s3_hdf_local_config.local.file_path) == 5 - finally: - os.remove(s3_hdf_local_config.local.file_path) - - @pytest.mark.unit - def test_read_resolves_file_path_if_templated_for_some_input_data(self): - # source data read from: "[[ TEST_RESOURCES ]]/data/input/some_csv_to_read.parquet" - config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="TEMPLATED_FILE_PATH") - - io_object = TemplatedFile(source_config=config, file_name_to_replace="some_csv_to_read") - - with patch.object(io_object, "_read_csv_file") as mocked__read_csv_file: - mocked__read_csv_file.return_value = pd.read_csv(os.path.join(TEST_RESOURCES, "data/input/some_csv_to_read.csv")) - io_object.read() - - mocked__read_csv_file.assert_called_once_with( - config.local.file_path.format(file_name_to_replace="some_csv_to_read"), - io_object.schema, - ) - - @pytest.mark.unit - def test_write_resolves_file_path_if_templated_for_some_output_data(self): - # source data read from: "[[ TEST_RESOURCES ]]/data/input/some_csv_to_read.parquet" - config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="TEMPLATED_FILE_PATH") - - io_object = TemplatedFile(source_config=config, file_name_to_replace="some_csv_to_read") - - df = pd.read_csv(os.path.join(TEST_RESOURCES, "data/input/some_csv_to_read.csv")) - with patch.object(io_object, "_write_csv_file") as mocked__write_csv_file: - io_object.write(df) - - mocked__write_csv_file.assert_called_once() - (called_with_df, called_with_file_path) = mocked__write_csv_file.call_args[0] - pd.testing.assert_frame_equal(df, called_with_df) - assert called_with_file_path == config.local.file_path.format(file_name_to_replace="some_csv_to_read") - - @pytest.mark.integration - def test_local_writers_only_write_out_castable_columns_according_to_the_io_schema_case_float64_to_int64_id( - self, - ): - - # Given - # Note col_1 will be interpreted with type float64 - input_df = pd.DataFrame.from_dict({"col_1": [3.0, 2.0, 1.0], "col_2": ["a", "b", "c"], "col_3": ["a", "b", "c"]}) - - s3_parquet_local_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="WRITE_TO_S3_PARQUET") - - # When - # class WriteS3ParquetIO(DynamicDataIO): - # schema = {"col_1": "int64", "col_2": "object"} - # - # @staticmethod - # def validate(df: pd.DataFrame): - # pass - write_s3_io = WriteS3ParquetIO(source_config=s3_parquet_local_config) - write_s3_io.write(input_df) - - # # Then - try: - output_df = pd.read_parquet(s3_parquet_local_config.local.file_path) - assert list(output_df.dtypes) == [ - np.dtype("int64"), - np.dtype("O"), - ] # order of the list matters - finally: - os.remove(s3_parquet_local_config.local.file_path) - - @pytest.mark.integration - def test_local_writers_only_write_out_columns_in_a_provided_io_schema(self): - - # Given - input_df = pd.DataFrame.from_dict({"col_1": [3, 2, 1], "col_2": ["a", "b", "c"], "col_3": ["a", "b", "c"]}) - - s3_parquet_local_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="WRITE_TO_S3_PARQUET") - - # When - # class WriteS3ParquetIO(DynamicDataIO): - # schema = {"col_1": "int64", "col_2": "object"} - # - # @staticmethod - # def validate(df: pd.DataFrame): - # pass - write_s3_io = WriteS3ParquetIO(source_config=s3_parquet_local_config) - write_s3_io.write(input_df) - - # Then - try: - output_df = pd.read_parquet(s3_parquet_local_config.local.file_path) - no_of_columns_of_output_df = len(list(output_df.columns)) - no_of_columns_of_input_df = len(list(input_df.columns)) - assert (no_of_columns_of_input_df - no_of_columns_of_output_df == 1) and (set(output_df.columns) == {*write_s3_io.schema.columns.keys()}) # pylint: disable=no-member - finally: - os.remove(s3_parquet_local_config.local.file_path) - - @pytest.mark.unit - def test_pyarrow_is_used_as_backend_parquet(self): - - # When - implementation = dynamicio.mixins.with_local.pd.io.parquet.get_engine("auto") - - # Then - assert implementation.__class__.__name__ == "PyArrowImpl" - - @pytest.mark.integration - def test_write_parquet_file_is_called_with_additional_pyarrow_args(self): - - # Given - input_df = pd.DataFrame.from_dict({"col_1": [3, 2, 1], "col_2": ["a", "b", "c"], "col_3": ["a", "b", "c"]}) - - s3_parquet_local_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="WRITE_TO_S3_PARQUET") - - to_parquet_kwargs = { - "use_deprecated_int96_timestamps": False, - "coerce_timestamps": "ms", - "allow_truncated_timestamps": True, - "row_group_size": 1000000, - } - - # When - with patch.object(dynamicio.mixins.with_local.pd.DataFrame, "to_parquet") as mocked__to_parquet: - write_s3_io = WriteS3ParquetIO(source_config=s3_parquet_local_config, **to_parquet_kwargs) - write_s3_io.write(input_df) - - # Then - mocked__to_parquet.assert_called_once_with(os.path.join(constants.TEST_RESOURCES, "data/processed/write_some_parquet.parquet"), **to_parquet_kwargs) - - @pytest.mark.integration - @patch.object(dynamicio.mixins.with_local.pd, "read_parquet") - def test_read_parquet_file_is_called_with_additional_pyarrow_args(self, mock__read_parquet): - - # Given - config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="S3_PARQUET_WITH_OPTIONS_IN_CODE") - - read_parquet_kwargs = {"filters": [("a", "<", "2")]} - - # When - ReadFromBatchLocalParquet(config, **read_parquet_kwargs).read() - # Then - mock__read_parquet.assert_called_once_with(config.local.file_path, columns=["id", "foo_name", "bar"], **read_parquet_kwargs) - - @pytest.mark.unit - def test_read_with_pyarrow_is_called_as_default_when_no_engine_option_is_provided(self): - # Given - config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="READ_FROM_S3_PARQUET") - - # When - with patch.object(dynamicio.mixins.with_local.WithLocal, "_WithLocal__read_with_pyarrow") as mocked__read_with_pyarrow: - ReadS3ParquetIO(config).read() - - # Then - mocked__read_with_pyarrow.assert_called_once_with(config.local.file_path, columns=["id", "foo_name", "bar"]) - - @pytest.mark.unit - def test_read_with_pyarrow_is_called_when_engine_option_is_set_to_pyarrow(self): - # Given - config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="READ_FROM_S3_PARQUET") - - # When - with patch.object(dynamicio.mixins.with_local.WithLocal, "_WithLocal__read_with_pyarrow") as mocked__read_with_pyarrow: - ReadS3ParquetIO(config, engine="pyarrow").read() - - # Then - mocked__read_with_pyarrow.assert_called_once_with(config.local.file_path, engine="pyarrow", columns=["id", "foo_name", "bar"]) - - @pytest.mark.unit - def test_read_with_fastparquet_is_called_when_engine_option_is_set_to_fastparquet(self): - # Given - config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="READ_FROM_S3_PARQUET") - - # When - with patch.object(dynamicio.mixins.with_local.WithLocal, "_WithLocal__read_with_fastparquet") as mocked__read_with_fastparquet: - ReadS3ParquetIO(config, engine="fastparquet").read() - - # Then - mocked__read_with_fastparquet.assert_called_once_with(config.local.file_path, engine="fastparquet", columns=["id", "foo_name", "bar"]) - - @pytest.mark.unit - def test_write_with_pyarrow_is_called_as_default_when_no_engine_option_is_provided(self): - # Given - input_df = pd.DataFrame.from_dict({"col_1": [3, 2, 1], "col_2": ["a", "b", "c"], "col_3": ["a", "b", "c"]}) - - config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="WRITE_TO_S3_PARQUET") - - # When - with patch.object(dynamicio.mixins.with_local.WithLocal, "_WithLocal__write_with_pyarrow") as mocked__write_with_pyarrow: - WriteS3ParquetIO(config).write(input_df) - - # Then - mocked__write_with_pyarrow.assert_called() - - @pytest.mark.unit - def test_write_with_pyarrow_is_called_when_engine_option_is_set_to_pyarrow(self): - # Given - input_df = pd.DataFrame.from_dict({"col_1": [3, 2, 1], "col_2": ["a", "b", "c"], "col_3": ["a", "b", "c"]}) - - config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="WRITE_TO_S3_PARQUET") - - # When - with patch.object(dynamicio.mixins.with_local.WithLocal, "_WithLocal__write_with_pyarrow") as mocked__write_with_pyarrow: - WriteS3ParquetIO(config, engine="pyarrow").write(input_df) - - # Then - mocked__write_with_pyarrow.assert_called() - - @pytest.mark.unit - def test_write_with_fastparquet_is_called_when_engine_option_is_set_to_fastparquet(self): - # Given - input_df = pd.DataFrame.from_dict({"col_1": [3, 2, 1], "col_2": ["a", "b", "c"], "col_3": ["a", "b", "c"]}) - - config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="WRITE_TO_S3_PARQUET") - - # When - with patch.object(dynamicio.mixins.with_local.WithLocal, "_WithLocal__write_with_fastparquet") as mocked__write_with_fastparquet: - WriteS3ParquetIO(config, engine="fastparquet").write(input_df) - - # Then - mocked__write_with_fastparquet.assert_called() - - @pytest.mark.unit - def test_async_read_does_not_operate_in_parallel_for_hdf_files(self): - - # Given - s3_hdf_cloud_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="READ_FROM_S3_HDF") - - async def multi_read(config: Mapping[str, str]) -> Tuple: - return await asyncio.gather( - AsyncReadS3HdfIO(source_config=config).async_read(), - AsyncReadS3HdfIO(source_config=config).async_read(), - ) - - def dummy_read_hdf(*args, **kwargs) -> pd.DataFrame: # pylint: disable=unused-argument - time.sleep(0.1) - return pd.DataFrame.from_dict({"col_1": [3, 2, 1, 0], "col_2": ["a", "b", "c", "d"]}) - - # When - with patch.object(dynamicio.mixins.with_local.pd, "read_hdf", new=dummy_read_hdf): - start_time = time.time() - asyncio.run(multi_read(s3_hdf_cloud_config)) - duration = time.time() - start_time - - # Then - assert duration >= 0.2 - - @pytest.mark.unit - def test_async_write_does_not_operate_in_parallel_for_hdf_files(self): - - # Given - df = pd.DataFrame.from_dict({"col_1": [3, 2, 1, 0], "col_2": ["a", "b", "c", "d"]}) - s3_hdf_local_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="WRITE_TO_S3_HDF") - - async def multi_write(config: Mapping[str, str], _df: pd.DataFrame) -> Tuple: - return await asyncio.gather(WriteS3HdfIO(source_config=config).async_write(_df), WriteS3HdfIO(source_config=config).async_write(_df)) - - @dynamicio.mixins.utils.allow_options([*dynamicio.mixins.utils.args_of(pd.DataFrame.to_hdf), *["protocol"]]) - def dummy_to_hdf(*args, **kwargs): # pylint: disable=unused-argument - time.sleep(0.1) - - # When - with patch.object(dynamicio.mixins.with_local.pd.DataFrame, "to_hdf", new=dummy_to_hdf): - start_time = time.time() - asyncio.run(multi_write(s3_hdf_local_config, df)) - duration = time.time() - start_time - - # Then - assert duration >= 0.2 - - -class TestBatchLocal: - @pytest.mark.unit - def test_multiple_files_are_loaded_when_batch_local_type_is_used_for_parquet(self, expected_s3_parquet_df): - # Given - parquet_local_batch_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="READ_FROM_BATCH_LOCAL_PARQUET") - expected_concatenated_df = expected_s3_parquet_df - - # When - concatenated_df = ReadFromBatchLocalParquet(source_config=parquet_local_batch_config).read() - - # Then - pd.testing.assert_frame_equal(expected_concatenated_df, concatenated_df) - - @pytest.mark.unit - def test_files_that_dont_comply_to_the_provided_file_type_are_ignored(self, expected_s3_parquet_df): - # Given - parquet_local_batch_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="READ_FROM_BATCH_LOCAL_NOT_JUST_PARQUET") - expected_concatenated_df = expected_s3_parquet_df - - # When - concatenated_df = ReadFromBatchLocalParquet(source_config=parquet_local_batch_config).read() - - # Then - pd.testing.assert_frame_equal(expected_concatenated_df, concatenated_df) - - @pytest.mark.unit - def test_if_hdf_file_is_chosen_then_file_type_is_converted_to_h5_for_filtering(self, expected_s3_parquet_df): - # Given - parquet_local_batch_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="READ_FROM_BATCH_LOCAL_NOT_JUST_PARQUET") - expected_concatenated_df = expected_s3_parquet_df - - # When - concatenated_df = ReadFromBatchLocalParquet(source_config=parquet_local_batch_config).read() - - # Then - pd.testing.assert_frame_equal(expected_concatenated_df, concatenated_df) - - @pytest.mark.unit - def test_multiple_files_are_loaded_when_batch_local_type_is_used_for_hdf(self, expected_s3_hdf_df): - # Given - parquet_local_batch_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="READ_FROM_BATCH_LOCAL_HDF") - expected_concatenated_df = expected_s3_hdf_df - - # When - concatenated_df = ReadFromBatchLocalHdf(source_config=parquet_local_batch_config).read() - - # Then - pd.testing.assert_frame_equal(expected_concatenated_df, concatenated_df.sort_values(by="id").reset_index(drop=True)) diff --git a/tests/test_mixins/test_mixin_utils.py b/tests/test_mixins/test_mixin_utils.py deleted file mode 100644 index c79c224..0000000 --- a/tests/test_mixins/test_mixin_utils.py +++ /dev/null @@ -1,159 +0,0 @@ -# pylint: disable=no-member, missing-module-docstring, missing-class-docstring, missing-function-docstring, too-many-public-methods, too-few-public-methods, protected-access, C0103, C0302, R0801 -import os -from typing import Any - -import pytest - -from dynamicio.config import IOConfig -from dynamicio.mixins.utils import allow_options, args_of, get_string_template_field_names, resolve_template -from tests import constants -from tests.mocking.io import ( - ReadS3CsvIO, -) - - -class TestGetStringTemplateFieldNames: - @pytest.mark.unit - @pytest.mark.parametrize( - ["s", "expected_result"], - [ - ("", []), - ("abc", []), - ("{abc}", ["abc"]), - ("a{abc}d{def}", ["abc", "def"]), - ("a{0}b{1}", ["0", "1"]), - ("{abc:.2f}", ["abc"]), - ], - ) - def test_returns_correct_result(self, s, expected_result): - result = get_string_template_field_names(s) - assert result == expected_result - - -class TestResolveTemplate: - @pytest.mark.unit - @pytest.mark.parametrize( - ["s", "options", "expected_result"], - [ - ("{abc}d{def}", {"abc": "100", "def": "hello"}, "100dhello"), - ("{hello}", {"world": "100", "hello": "world"}, "world"), - ], - ) - def test_returns_correct_result(self, s, options, expected_result): - result = resolve_template(s, options) - assert result == expected_result - - @pytest.mark.unit - @pytest.mark.parametrize(["s"], [("abc{0}",), ("{1def}def",)]) - def test_raises_value_error_if_s_has_fields_which_are_not_valid_identifiers(self, s): - with pytest.raises(ValueError): - resolve_template(s, None) - - @pytest.mark.unit - @pytest.mark.parametrize( - ["s", "options"], - [("{abc}", {}), ("{abc}", {"def": "something"}), ("{abc}{def}", {"def": "700"})], - ) - def test_raises_value_error_if_template_field_cannot_be_resolved_to_options(self, s, options): - with pytest.raises(ValueError): - resolve_template(s, options) - - -class TestAllowedOptions: - @pytest.fixture(autouse=True) - def _pass_fixtures(self, capsys): - self.capsys = capsys # pylint: disable=attribute-defined-outside-init - - @pytest.mark.unit - def test_args_of_returns_valid_set_of_allowed_kwargs_for_a_given_function(self): - # Given - def magic_function(arg_a: str, arg_b: int, arg_c: bool) -> bool: - print(f"{arg_a}: {arg_b}") - return arg_c - - func = magic_function - - # When - options = args_of(func) - - # Then - assert options == {"arg_a", "arg_b", "arg_c"} - - @pytest.mark.integration - def test_allow_options_can_use_iterable_returned_from_args_of_to_filter_out_invalid_options( - self, - ): - # Given - def magic_function(arg_a: str, arg_b: int, arg_c: bool) -> bool: - print(f"{arg_a}: {arg_b}") - return arg_c - - func = magic_function - - @allow_options(args_of(func)) - def mock_method(**options: Any): - return [*options] - - # When - options = mock_method(arg_a="A", arg_b=1, arg_c=True, invalid_option="I SHOULDN'T BE HERE") - - # Then - assert options == ["arg_a", "arg_b", "arg_c"] - - @pytest.mark.integration - def test_allow_options_does_not_filter_out_valid_args_when_they_are_passed_as_args_and_not_as_kwargs( - self, - ): - # Given - def magic_function(arg_a: str, arg_b: int, arg_c: bool) -> bool: - return [arg_a, arg_b, arg_c] - - func = magic_function - - @allow_options(args_of(func)) - def mock_method(schema: "str", **options: Any): - print(schema) - return magic_function(**options) - - # When - # options = mock_method(schema="schema", **{"arg_a": "A", "arg_b": 1, "arg_c": True, "invalid_option": "I SHOULDN'T BE HERE"}) # THIS WOULD FAIL! - options = mock_method( - "schema", - **{"arg_a": "A", "arg_b": 1, "arg_c": True, "invalid_option": "I SHOULDN'T BE HERE"}, - ) - - # Then - captured = self.capsys.readouterr() - assert (captured.out == "schema\n") and (options == ["A", 1, True]) - - @pytest.mark.integration # This is an integration test as it uses `allow_options()` after `args_of()` - def test_when_reading_locally_or_from_s3_invalid_options_are_ignored(self, expected_s3_csv_df): - # Given - invalid_option = "INVALID_OPTION" - s3_csv_local_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="READ_FROM_S3_CSV") - - # When - s3_csv_df = ReadS3CsvIO(source_config=s3_csv_local_config, foo=invalid_option).read() - - # Then - assert expected_s3_csv_df.equals(s3_csv_df) - - @pytest.mark.integration - def test_when_reading_locally_or_from_s3_valid_options_are_considered(self, expected_s3_csv_df): - # Given - # VALID OPTION: dtype=None - s3_csv_local_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="LOCAL", - dynamic_vars=constants, - ).get(source_key="READ_FROM_S3_CSV") - - # When - s3_csv_df = ReadS3CsvIO(source_config=s3_csv_local_config, dtype=None).read() - - # Then - assert expected_s3_csv_df.equals(s3_csv_df) diff --git a/tests/test_mixins/test_postgres_mixins.py b/tests/test_mixins/test_postgres_mixins.py deleted file mode 100644 index 62447db..0000000 --- a/tests/test_mixins/test_postgres_mixins.py +++ /dev/null @@ -1,214 +0,0 @@ -# pylint: disable=no-member, missing-module-docstring, missing-class-docstring, missing-function-docstring, too-many-public-methods, too-few-public-methods, protected-access, C0103, C0302, R0801 -import os -from unittest.mock import ANY, patch - -import pandas as pd -import pytest -from sqlalchemy.sql.base import ImmutableColumnCollection - -from dynamicio import WithPostgres -from dynamicio.config import IOConfig -from tests import constants -from tests.mocking.io import ( - ReadPostgresIO, - WriteExtendedPostgresIO, - WritePostgresIO, -) -from tests.mocking.models import ERModel, PgModel - - -class TestPostgresIO: - @pytest.mark.unit - def test_when_reading_from_postgres_with_env_as_cloud_get_table_columns_returns_valid_list_of_columns_for_a_model(self, expected_columns): - # Given - pg_cloud_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="CLOUD", - dynamic_vars=constants, - ).get(source_key="READ_FROM_POSTGRES") - - # When - columns = ReadPostgresIO(source_config=pg_cloud_config)._get_table_columns(ERModel) # pylint: disable=protected-access - # Then - assert columns == expected_columns - - @pytest.mark.unit - @patch.object(WithPostgres, "_read_from_postgres") - def test_read_from_postgres_is_called_for_loading_a_table_with_columns_with_env_as_cloud_and_type_as_postgres(self, mock__read_from_postgres, test_df): - # Given - mock__read_from_postgres.return_value = test_df - postgres_cloud_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="CLOUD", - dynamic_vars=constants, - ).get(source_key="READ_FROM_POSTGRES") - - # When - ReadPostgresIO(source_config=postgres_cloud_config).read() - - # Then - mock__read_from_postgres.assert_called() - - @pytest.mark.unit - @patch.object(WithPostgres, "_write_to_postgres") - def test_write_to_postgres_is_called_for_uploading_a_table_with_columns_with_env_as_cloud_and_type_as_postgres(self, mock__write_to_postgres, test_df): - # Given - df = test_df - postgres_cloud_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")), - env_identifier="CLOUD", - dynamic_vars=constants, - ).get(source_key="WRITE_TO_PG_PARQUET") - - # When - WritePostgresIO(source_config=postgres_cloud_config).write(df) - - # Then - mock__write_to_postgres.assert_called() - - @pytest.mark.unit - @patch.object(WithPostgres, "_write_to_postgres") - def test_write_to_postgres_is_called_with_truncate_and_append_option(self, mock__write_to_postgres, test_df): - # Given - df = test_df - postgres_cloud_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")), - env_identifier="CLOUD", - dynamic_vars=constants, - ).get( - source_key="WRITE_TO_PG_PARQUET", - ) - - # When - write_config = WritePostgresIO(source_config=postgres_cloud_config, truncate_and_append=True) - - write_config.write(df) - - # Then - mock__write_to_postgres.assert_called_once() - (called_with_df,) = mock__write_to_postgres.call_args[0] - pd.testing.assert_frame_equal(test_df, called_with_df) - assert "truncate_and_append" in write_config.options - - @pytest.mark.unit - @patch.object(WithPostgres, "_read_from_postgres") - def test_read_from_postgres_by_implicitly_generating_datamodel_from_schema(self, mock__read_from_postgres, test_df): - # Given - mock__read_from_postgres.return_value = test_df - postgres_cloud_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="CLOUD", - dynamic_vars=constants, - ).get(source_key="READ_FROM_POSTGRES") - - # When / Then - ReadPostgresIO(source_config=postgres_cloud_config).read() - mock__read_from_postgres.assert_called() - - @pytest.mark.unit - @patch.object(WithPostgres, "_read_database") - def test_read_from_postgres_with_query(self, mock__read_database): - # Given - postgres_cloud_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="CLOUD", - dynamic_vars=constants, - ).get(source_key="READ_FROM_POSTGRES") - - # When - ReadPostgresIO(source_config=postgres_cloud_config, sql_query="SELECT * FROM example").read() - - # Then - mock__read_database.assert_called_with(ANY, "SELECT * FROM example") - - @pytest.mark.unit - @patch.object(WithPostgres, "_read_database") - def test_read_from_postgres_with_query_in_options(self, mock__read_database): - # Given - postgres_cloud_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="CLOUD", - dynamic_vars=constants, - ).get(source_key="READ_FROM_POSTGRES_WITH_QUERY_IN_OPTIONS") - - # When - ReadPostgresIO(source_config=postgres_cloud_config).read() - - # Then - mock__read_database.assert_called_with(ANY, "SELECT * FROM table_name_from_yaml_options") - - @pytest.mark.unit - @patch.object(pd, "read_sql") - def test_read_from_postgres_with_query_and_options(self, mock__read_sql): - # Given - postgres_cloud_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="CLOUD", - dynamic_vars=constants, - ).get(source_key="READ_FROM_POSTGRES") - - # When - ReadPostgresIO(source_config=postgres_cloud_config, sql_query="SELECT * FROM example", parse_dates=["date"], wrong_arg="whatever").read() - - # Then - mock__read_sql.assert_called_with(sql="SELECT * FROM example", con=ANY, parse_dates=["date"]) - - @pytest.mark.unit - def test_generate_model_from_schema_returns_model(self): - # Given - postgres_cloud_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="CLOUD", - dynamic_vars=constants, - ).get(source_key="READ_FROM_POSTGRES") - - # When - schema = postgres_cloud_config.dynamicio_schema - schema_name = postgres_cloud_config.dynamicio_schema.name - model = ReadPostgresIO(source_config=postgres_cloud_config)._generate_model_from_schema(schema) - - # Then - assert len(model.__table__.columns) == len(schema.columns) and model.__tablename__ == schema_name - - @pytest.mark.unit - def test_get_table_columns_from_generated_model_returns_valid_list_of_columns(self): - # Given - pg_cloud_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="CLOUD", - dynamic_vars=constants, - ).get(source_key="READ_FROM_POSTGRES") - - # When - schema = pg_cloud_config.dynamicio_schema - model = ReadPostgresIO(source_config=pg_cloud_config)._generate_model_from_schema(schema) # pylint: disable=protected-access - columns = ReadPostgresIO(source_config=pg_cloud_config)._get_table_columns(model) # pylint: disable=protected-access - - # Then - assert isinstance(model.__table__.columns, ImmutableColumnCollection) - for x, y in zip(columns, [PgModel.id, PgModel.foo, PgModel.bar, PgModel.baz]): - assert str(x) == str(y) - - @pytest.mark.unit - def test_to_check_if_dataframe_has_valid_data_types(self): - # Given - postgres_cloud_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")), - env_identifier="CLOUD", - dynamic_vars=constants, - ).get(source_key="WRITE_TO_PG_PARQUET") - - df = pd.DataFrame.from_records( - [ - ["cm_1", "id_1", 1000, "12/12/2000", True, 12.76], - ["cm_2", "id_2", 1000, "01/02/1990", False, 199.76], - ["cm_3", "id_3", 1000, "01/05/1990", False, 12.76], - ], - columns=["id", "foo", "bar", "start_date", "active", "net"], - ) - - # When - is_valid = WriteExtendedPostgresIO(source_config=postgres_cloud_config, show_casting_warnings=True)._has_valid_dtypes(df) - - # Then - assert is_valid is True diff --git a/tests/test_mixins/test_s3_mixins.py b/tests/test_mixins/test_s3_mixins.py deleted file mode 100644 index 4a98897..0000000 --- a/tests/test_mixins/test_s3_mixins.py +++ /dev/null @@ -1,831 +0,0 @@ -# pylint: disable=no-member, missing-module-docstring, missing-class-docstring, missing-function-docstring, too-many-public-methods, too-few-public-methods, protected-access, C0103, C0302, R0801 -import os -import shutil -from tempfile import NamedTemporaryFile -from unittest import mock -from unittest.mock import patch - -import pandas as pd -import pydantic -import pytest -import yaml - - -import dynamicio.mixins.with_local -import dynamicio.mixins.with_s3 - -from dynamicio.config import IOConfig -from dynamicio.errors import ColumnsDataTypeError -from tests import constants -from tests.constants import TEST_RESOURCES -from tests.mocking.io import ( - ReadS3CsvIO, - ReadS3HdfIO, - ReadS3JsonIO, - ReadS3ParquetIO, - ReadS3ParquetWEmptyFilesIO, - ReadS3ParquetWithDifferentCastableDTypeIO, - ReadS3ParquetWithDifferentNonCastableDTypeIO, - ReadS3ParquetWithLessColumnsIO, - TemplatedFile, - WriteS3CsvIO, - WriteS3HdfIO, - WriteS3JsonIO, - WriteS3ParquetIO, -) - - -class TestS3FileIO: - @pytest.mark.unit - def test_read_resolves_file_path_if_templated(self): - # source data read from: "[[ TEST_RESOURCES ]]/data/input/some_csv_to_read.parquet" - config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="CLOUD", - dynamic_vars=constants, - ).get(source_key="TEMPLATED_FILE_PATH") - - file_path = f"{constants.TEST_RESOURCES}/data/input/some_csv_to_read.csv" - - # When - with patch.object(dynamicio.mixins.with_local.WithLocal, "_read_csv_file") as mock__read_csv_file, patch.object( - dynamicio.mixins.with_s3.WithS3File, "_s3_named_file_reader" - ) as mock_s3_reader: - with open(file_path, "r") as file: # pylint: disable=unspecified-encoding - mock_s3_reader.return_value = file - io_obj = TemplatedFile(source_config=config, file_name_to_replace="some_csv_to_read") - final_schema = io_obj.schema - io_obj.read() - - mock__read_csv_file.assert_called_once_with(file_path, final_schema) - - @pytest.mark.unit - def test_write_resolves_file_path_if_templated(self): - # Given - # source data read from: "[[ TEST_RESOURCES ]]/data/input/some_csv_to_read.parquet" - config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="CLOUD", - dynamic_vars=constants, - ).get(source_key="TEMPLATED_FILE_PATH") - - # When - with patch.object(dynamicio.mixins.with_local.WithLocal, "_write_csv_file") as mock__write_csv_file: - df = pd.read_csv(os.path.join(TEST_RESOURCES, "data/input/some_csv_to_read.csv")) - TemplatedFile(source_config=config, file_name_to_replace="some_csv_to_read").write(df) - - # Then - args, _ = mock__write_csv_file.call_args - assert "s3://mock-bucket/path/to/some_csv_to_read.csv" == args[1] - - @pytest.mark.unit - @patch.object(dynamicio.mixins.with_s3.WithS3File, "_read_from_s3_file") - def test_read_from_s3_file_is_called_for_loading_a_file_with_env_as_cloud_s3(self, mock__read_from_s3_file, expected_s3_csv_df): - # Given - mock__read_from_s3_file.return_value = expected_s3_csv_df - s3_csv_cloud_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="CLOUD", - dynamic_vars=constants, - ).get(source_key="READ_FROM_S3_CSV") - - # When - ReadS3CsvIO(source_config=s3_csv_cloud_config).read() - - # Then - mock__read_from_s3_file.assert_called() - - @pytest.mark.unit - def test_s3_reader_is_not_called_for_loading_a_parquet_with_env_as_cloud_s3_and_type_as_parquet_and_no_disk_space_flag(self): - # Given - s3_parquet_cloud_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="CLOUD", - dynamic_vars=constants, - ).get(source_key="READ_FROM_S3_PARQUET") - - file_path = f"{constants.TEST_RESOURCES}/data/input/some_csv_to_read.csv" - - # When - with patch.object(dynamicio.mixins.with_s3.WithS3File, "_s3_reader") as mock_s3_reader, patch.object( - dynamicio.mixins.with_s3.WithS3File, "_read_parquet_file" - ) as mock_read_parquet_file: - with open(file_path, "r") as file: # pylint: disable=unspecified-encoding - mock_s3_reader.return_value = file - ReadS3ParquetIO(source_config=s3_parquet_cloud_config, no_disk_space=True).read() - - # Then - mock_s3_reader.assert_not_called() - mock_read_parquet_file.assert_called() - - @pytest.mark.unit - def test_s3_reader_is_called_for_loading_a_hdf_with_env_as_cloud_s3_and_type_as_hdf(self, expected_s3_hdf_file_path, expected_s3_hdf_df): - # Given - s3_hdf_cloud_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="CLOUD", - dynamic_vars=constants, - ).get(source_key="READ_FROM_S3_HDF") - - # When - with patch.object(dynamicio.mixins.with_s3.WithS3File, "boto3_client") as mock__boto3_client: - - def mock_download_fobj(s3_bucket, s3_key, target_file): - with open(expected_s3_hdf_file_path, "rb") as fin: - shutil.copyfileobj(fin, target_file) - - mock__boto3_client.download_fileobj.side_effect = mock_download_fobj - loaded_hdf_pd = ReadS3HdfIO(source_config=s3_hdf_cloud_config, no_disk_space=True).read() - - # Then - pd.testing.assert_frame_equal(loaded_hdf_pd, expected_s3_hdf_df) - - @pytest.mark.unit - def test_s3_reader_is_not_called_for_loading_a_json_with_env_as_cloud_s3_and_type_as_json_and_no_disk_space_flag(self): - # Given - s3_json_cloud_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="CLOUD", - dynamic_vars=constants, - ).get(source_key="READ_FROM_S3_JSON") - - # When - with patch.object(dynamicio.mixins.with_s3.WithS3File, "_s3_reader") as mock__s3_reader, patch.object( - dynamicio.mixins.with_s3.WithS3File, "_read_json_file" - ) as mock__read_json_file: - ReadS3JsonIO(source_config=s3_json_cloud_config, no_disk_space=True).read() - - # Then - mock__s3_reader.assert_not_called() - mock__read_json_file.assert_called() - - @pytest.mark.unit - def test_s3_reader_is_not_called_for_loading_a_csv_with_env_as_cloud_s3_and_type_as_csv_and_no_disk_space_flag(self): - # Given - s3_csv_cloud_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="CLOUD", - dynamic_vars=constants, - ).get(source_key="READ_FROM_S3_CSV") - - # When - with patch.object(dynamicio.mixins.with_s3.WithS3File, "_s3_reader") as mock__s3_reader, patch.object( - dynamicio.mixins.with_s3.WithS3File, "_read_csv_file" - ) as mock__read_csv_file: - ReadS3CsvIO(source_config=s3_csv_cloud_config, no_disk_space=True).read() - - # Then - mock__s3_reader.assert_not_called() - mock__read_csv_file.assert_called() - - @pytest.mark.unit - def test_ValueError_is_raised_if_file_path_missing_from_config(self, tmp_path): - tmp_yaml = tmp_path / "test.yaml" - with open(tmp_yaml, "w") as fout: - yaml.safe_dump( - { - "READ_FROM_S3_MISSING_FILE_PATH": { - "LOCAL": { - "type": "local", - "local": { - "file_path": "[[ TEST_RESOURCES ]]/data/input/some_csv_to_read.csv", - "file_type": "csv", - }, - }, - "CLOUD": { - "type": "s3_file", - "s3": {"bucket": "[[ MOCK_BUCKET ]]", "file_type": "csv"}, - }, - "schema": {"file_path": "[[ TEST_RESOURCES ]]/schemas/read_from_s3_csv.yaml"}, - } - }, - fout, - ) - - with pytest.raises(pydantic.ValidationError): - IOConfig( - path_to_source_yaml=str(tmp_yaml), - env_identifier="CLOUD", - dynamic_vars=constants, - ) - - @pytest.mark.unit - def test_s3_writers_only_validate_schema_prior_writing_out_the_dataframe(self): - # Given - input_df = pd.DataFrame.from_dict({"col_1": [3, 2, 1], "col_2": ["a", "b", "c"], "col_3": ["a", "b", "c"]}) - - s3_parquet_cloud_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")), - env_identifier="CLOUD", - dynamic_vars=constants, - ).get(source_key="WRITE_TO_S3_PARQUET") - - # When - # class WriteS3ParquetIO(DynamicDataIO): - # schema = {"col_1": "int64", "col_2": "object"} - # - # @staticmethod - # def validate(df: pd.DataFrame): - # pass - with patch.object(dynamicio.mixins.with_s3.WithS3File, "_s3_writer") as mock__s3_writer, patch.object(WriteS3ParquetIO, "_apply_schema") as mock__apply_schema, patch.object( - WriteS3ParquetIO, "_write_parquet_file" - ) as mock__write_parquet_file: - with NamedTemporaryFile(delete=False) as temp_file: - mock__s3_writer.return_value = temp_file - WriteS3ParquetIO(source_config=s3_parquet_cloud_config).write(input_df) - - # Then - mock__apply_schema.assert_called() - mock__write_parquet_file.assert_called() - - @pytest.mark.unit - def test_columns_data_type_error_exception_is_not_generated_if_column_dtypes_can_be_casted_to_the_expected_dtypes(self, expected_s3_parquet_df): - # Given - s3_parquet_cloud_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="CLOUD", - dynamic_vars=constants, - ).get(source_key="READ_FROM_S3_PARQUET") - - # When - with patch.object(dynamicio.mixins.with_s3.WithS3File, "_read_parquet_file") as mock__read_parquet_file, patch.object( - dynamicio.mixins.with_s3.WithS3File, "_s3_named_file_reader" - ): - mock__read_parquet_file.return_value = expected_s3_parquet_df - ReadS3ParquetWithDifferentCastableDTypeIO(source_config=s3_parquet_cloud_config).read() - - assert True, "No exception was raised" - - @pytest.mark.unit - @patch.object(dynamicio.mixins.with_s3.WithS3File, "_s3_named_file_reader") - @patch.object(dynamicio.mixins.with_s3.WithS3File, "_read_parquet_file") - def test_columns_data_type_error_exception_is_generated_if_column_dtypes_dont_map_to_the_expected_dtypes(self, mock__s3_reader, moc__read_parquet_file, expected_s3_parquet_df): - """ - ------------------------------ Captured log call ------------------------------- - - WARNING ...:dataio.py:273 Expected: 'float64' dtype for column: 'id', found: 'int64' instead. - WARNING ...:dataio.py:273 Expected: 'int64' dtype for column: 'foo_name', found: 'object' instead. - ERROR ...:dataio.py:277 Tried casting column: 'foo_name' to 'int64' from 'object', but failed. - - =========================== short test summary info ============================ - - FAILED ...:test_columns_data_type_error_exception_is_generated_if_column_dtypes_dont_map_to_the_expected_dtypes - - ============================== 1 failed in 0.48s =============================== - - """ - # Given - dataframe_returned = expected_s3_parquet_df - mock__s3_reader.return_value = dataframe_returned - - s3_parquet_cloud_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="CLOUD", - dynamic_vars=constants, - ).get(source_key="READ_FROM_S3_PARQUET") - - # When/Then - with pytest.raises(ColumnsDataTypeError): - ReadS3ParquetWithDifferentNonCastableDTypeIO(source_config=s3_parquet_cloud_config).read() - moc__read_parquet_file.assert_called() - - @pytest.mark.unit - def test_read_parquet_file_is_called_while_s3_reader_is_not_for_loading_a_parquet_with_env_as_cloud_s3_and_type_as_parquet_with_no_disk_space_option( - self, - ): - # Given - s3_parquet_cloud_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="CLOUD", - dynamic_vars=constants, - ).get(source_key="READ_FROM_S3_PARQUET") - - # When - with patch.object(dynamicio.mixins.with_s3.WithS3File, "_s3_reader") as mock__s3_reader, patch.object( - dynamicio.mixins.with_local.WithLocal, "_read_parquet_file" - ) as mock__read_parquet_file: - ReadS3ParquetIO(source_config=s3_parquet_cloud_config, no_disk_space=True).read() - - # Then - mock__s3_reader.assert_not_called() - mock__read_parquet_file.assert_called() - - @pytest.mark.unit - @patch.object(dynamicio.mixins.with_s3.WithS3File, "_write_to_s3_file") - def test_s3_writer_is_called_for_writing_a_file_with_env_is_set_to_cloud_s3(self, mock__write_to_s3_file): - # Given - df = pd.DataFrame.from_dict({"id": [3, 2, 1, 0], "foo_name": ["a", "b", "c", "d"], "bar": [1, 2, 3, 4]}) - - s3_json_local_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")), - env_identifier="CLOUD", - dynamic_vars=constants, - ).get(source_key="WRITE_TO_S3_JSON") - - # When - ReadS3HdfIO(source_config=s3_json_local_config).write(df) - - # Then - mock__write_to_s3_file.assert_called() - - @pytest.mark.unit - def test_write_parquet_file_is_called_for_writing_a_parquet_with_env_as_cloud_s3_and_type_as_s3(self): - # Given - df = pd.DataFrame.from_dict({"col_1": [3, 2, 1, 0], "col_2": ["a", "b", "c", "d"]}) - - s3_parquet_local_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")), - env_identifier="CLOUD", - dynamic_vars=constants, - ).get(source_key="WRITE_TO_S3_PARQUET") - - # When - with patch.object(dynamicio.mixins.with_s3.WithS3File, "_s3_writer") as mock__s3_writer, patch.object( - dynamicio.mixins.with_local.WithLocal, "_write_parquet_file" - ) as mock__write_parquet_file: - with NamedTemporaryFile(delete=False) as temp_file: - mock__s3_writer.return_value = temp_file - WriteS3ParquetIO(source_config=s3_parquet_local_config).write(df) - - # Then - mock__write_parquet_file.assert_called() - - @pytest.mark.unit - def test_write_csv_file_is_called_for_writing_a_parquet_with_env_as_cloud_s3_and_type_as_csv(self): - # Given - df = pd.DataFrame.from_dict({"id": [3, 2, 1, 0], "foo_name": ["a", "b", "c", "d"], "bar": [1, 2, 3, 4]}) - - s3_csv_local_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")), - env_identifier="CLOUD", - dynamic_vars=constants, - ).get(source_key="WRITE_TO_S3_CSV") - - # When - with patch.object(dynamicio.mixins.with_s3.WithS3File, "_s3_writer") as mock__s3_writer, patch.object( - dynamicio.mixins.with_local.WithLocal, "_write_csv_file" - ) as mock__write_csv_file: - with NamedTemporaryFile(delete=False) as temp_file: - mock__s3_writer.return_value = temp_file - WriteS3CsvIO(source_config=s3_csv_local_config).write(df) - - # Then - mock__write_csv_file.assert_called() - - @pytest.mark.unit - def test_write_json_file_is_called_for_writing_a_parquet_with_env_as_cloud_s3_and_type_as_json(self): - # Given - df = pd.DataFrame.from_dict({"col_1": [3, 2, 1, 0], "col_2": ["a", "b", "c", "d"]}) - - s3_json_local_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")), - env_identifier="CLOUD", - dynamic_vars=constants, - ).get(source_key="WRITE_TO_S3_JSON") - - # When - with patch.object(dynamicio.mixins.with_s3.WithS3File, "_s3_writer") as mock__s3_writer, patch.object( - dynamicio.mixins.with_local.WithLocal, "_write_json_file" - ) as mock__write_json_file: - with NamedTemporaryFile(delete=False) as temp_file: - mock__s3_writer.return_value = temp_file - WriteS3JsonIO(source_config=s3_json_local_config).write(df) - - # Then - mock__write_json_file.assert_called() - - @pytest.mark.unit - def test_write_hdf_file_is_called_for_writing_a_parquet_with_env_as_cloud_s3_and_type_as_hdf(self): - # Given - df = pd.DataFrame.from_dict({"col_1": [3, 2, 1, 0], "col_2": ["a", "b", "c", "d"]}) - s3_hdf_local_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/processed.yaml")), - env_identifier="CLOUD", - dynamic_vars=constants, - ).get(source_key="WRITE_TO_S3_HDF") - - # When - with patch.object(dynamicio.mixins.with_s3.WithS3File, "_s3_writer") as mock__s3_writer: - with NamedTemporaryFile(delete=False) as temp_file: - mock__s3_writer.return_value = temp_file - WriteS3HdfIO(source_config=s3_hdf_local_config).write(df) - - # Then - assert os.stat(temp_file.name).st_size == 1064192, "Confirm that the output file size did not change" - - -class TestS3PathPrefixIO: - @pytest.mark.unit - def test_error_is_raised_if_path_prefix_missing_from_config(self, tmp_path): - - tmp_yaml = tmp_path / "test.yaml" - with open(tmp_yaml, "w") as fout: - yaml.safe_dump( - { - "READ_FROM_S3_MISSING_PATH_PREFIX": { - "LOCAL": { - "type": "local", - "local": { - "file_path": "[[ TEST_RESOURCES ]]/data/input/some_csv_to_read.csv", - "file_type": "csv", - }, - }, - "CLOUD": { - "type": "s3_path_prefix", - "s3": {"bucket": "[[ MOCK_BUCKET ]]", "file_type": "csv"}, - }, - "schema": {"file_path": "[[ TEST_RESOURCES ]]/schemas/read_from_s3_csv.yaml"}, - } - }, - fout, - ) - - with pytest.raises(pydantic.ValidationError): - IOConfig( - path_to_source_yaml=str(tmp_yaml), - env_identifier="CLOUD", - dynamic_vars=constants, - ) - - @pytest.mark.unit - def test_ValueError_is_raised_if_partition_cols_missing_from_options_when_uploading(self): - # Given - input_df = pd.DataFrame.from_dict({"col_1": [3, 2, 1], "col_2": ["a", "b", "c"], "col_3": ["a", "b", "c"]}) - s3_parquet_cloud_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="CLOUD", - dynamic_vars=constants, - ).get(source_key="WRITE_TO_S3_PATH_PREFIX_PARQUET") - - # When / Then - with pytest.raises(ValueError): - WriteS3ParquetIO(source_config=s3_parquet_cloud_config).write(input_df) - - @pytest.mark.unit - def test_error_is_raised_if_file_type_not_parquet_when_uploading(self, tmp_path): - - tmp_yaml = tmp_path / "test.yaml" - with open(tmp_yaml, "w") as fout: - yaml.safe_dump( - { - "WRITE_TO_S3_PATH_PREFIX_NOT_PARQUET": { - "CLOUD": { - "type": "s3_path_prefix", - "s3": { - "bucket": "[[ MOCK_BUCKET ]]", - "path_prefix": "[[ MOCK_KEY ]]", - "file_type": "not_parquet", - }, - } - } - }, - fout, - ) - - with pytest.raises(pydantic.ValidationError): - IOConfig( - path_to_source_yaml=str(tmp_yaml), - env_identifier="CLOUD", - dynamic_vars=constants, - ) - - @pytest.mark.unit - @patch.object(dynamicio.mixins.with_s3.WithS3PathPrefix, "_read_from_s3_path_prefix") - def test_read_from_s3_path_prefix_is_called_for_loading_a_path_prefix_with_env_as_cloud_s3(self, mock__read_from_s3_path_prefix, expected_s3_csv_df): - # Given - mock__read_from_s3_path_prefix.return_value = expected_s3_csv_df - s3_csv_cloud_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="CLOUD", - dynamic_vars=constants, - ).get(source_key="READ_FROM_S3_PATH_PREFIX_CSV") - - # When - ReadS3CsvIO(source_config=s3_csv_cloud_config).read() - - # Then - mock__read_from_s3_path_prefix.assert_called() - - @pytest.mark.unit - @patch.object(dynamicio.mixins.with_s3.WithS3PathPrefix, "_write_to_s3_path_prefix") - def test_write_to_s3_path_prefix_is_called_for_uploading_to_a_path_prefix_with_env_as_cloud_s3(self, mock__write_to_s3_path_prefix): - # Given - input_df = pd.DataFrame.from_dict({"col_1": [3, 2, 1], "col_2": ["a", "b", "c"], "col_3": ["a", "b", "c"]}) - - s3_parquet_cloud_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="CLOUD", - dynamic_vars=constants, - ).get(source_key="WRITE_TO_S3_PATH_PREFIX_PARQUET") - - # When - WriteS3ParquetIO(source_config=s3_parquet_cloud_config).write(input_df) - - # Then - mock__write_to_s3_path_prefix.assert_called() - - @pytest.mark.unit - @patch.object(WriteS3ParquetIO, "_write_parquet_file") - # pylint: disable=unused-argument - def test_awscli_runner_is_called_with_correct_s3_path_and_aws_command_when_uploading_a_path_prefix_with_env_as_cloud_s3(self, mock__write_parquet_file, mock_temporary_directory): - # Given - input_df = pd.DataFrame.from_dict({"col_1": [3, 2, 1], "col_2": ["a", "b", "c"], "col_3": ["a", "b", "c"]}) - s3_parquet_cloud_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="CLOUD", - dynamic_vars=constants, - ).get(source_key="WRITE_TO_S3_PATH_PREFIX_PARQUET") - - # When - with patch.object(dynamicio.mixins.with_s3, "awscli_runner") as mocked__awscli_runner: - WriteS3ParquetIO(source_config=s3_parquet_cloud_config, partition_cols="col_2").write(input_df) - - # Then - mocked__awscli_runner.assert_called_with("s3", "sync", "temp", "s3://mock-bucket/mock-key", "--acl", "bucket-owner-full-control", "--only-show-errors", "--exact-timestamps") - - @pytest.mark.unit - # pylint: disable=unused-argument - def test_awscli_runner_is_called_with_correct_s3_path_and_aws_command_when_loading_a_path_prefix_with_env_as_cloud_s3( - self, mock_listdir, mock_temporary_directory, mock__read_hdf_file - ): - # Given - s3_hdf_cloud_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="CLOUD", - dynamic_vars=constants, - ).get(source_key="READ_FROM_S3_PATH_PREFIX_HDF") - - # When - with patch.object(dynamicio.mixins.with_s3, "awscli_runner") as mocked__awscli_runner: - ReadS3HdfIO(source_config=s3_hdf_cloud_config).read() - - # Then - mocked__awscli_runner.assert_called_with("s3", "sync", "s3://mock-bucket/mock-key", "temp", "--acl", "bucket-owner-full-control", "--only-show-errors", "--exact-timestamps") - - @pytest.mark.unit - # pylint: disable=unused-argument - def test__read_hdf_file_is_called_with_correct_local_file_path_when_loading_a_path_prefix_with_env_as_cloud_s3_and_type_as_hdf( - self, mock_listdir, mock_temporary_directory, mock__read_hdf_file - ): - # Given - s3_hdf_cloud_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="CLOUD", - dynamic_vars=constants, - ).get(source_key="READ_FROM_S3_PATH_PREFIX_HDF") - - # When - with patch.object(dynamicio.mixins.with_s3, "awscli_runner") as mocked__awscli_runner: - mocked__awscli_runner.return_value = True - read_obj = ReadS3HdfIO(source_config=s3_hdf_cloud_config) - actual_schema = read_obj.schema - read_obj.read() - - # Then - assert len(mock__read_hdf_file.mock_calls) == 3 - mock__read_hdf_file.assert_has_calls( - [ - mock.call("temp/obj_1.h5", actual_schema), - mock.call("temp/obj_2.h5", actual_schema), - mock.call("temp/obj_3.h5", actual_schema), - ] - ) - - @pytest.mark.unit - # pylint: disable=unused-argument - def test__read_parquet_file_is_called_with_correct_local_file_path_when_loading_a_path_prefix_with_env_as_cloud_s3_and_type_as_parquet( - self, mock_listdir, mock_temporary_directory, mock__read_parquet_file - ): - # Given - s3_parquet_cloud_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="CLOUD", - dynamic_vars=constants, - ).get(source_key="READ_FROM_S3_PATH_PREFIX_PARQUET") - - # When - with patch.object(dynamicio.mixins.with_s3, "awscli_runner") as mocked__awscli_runner: - mocked__awscli_runner.return_value = True - read_obj = ReadS3ParquetIO(source_config=s3_parquet_cloud_config) - actual_schema = read_obj.schema - read_obj.read() - - # Then - assert len(mock__read_parquet_file.mock_calls) == 3 - mock__read_parquet_file.assert_has_calls( - [ - mock.call("temp/obj_1.h5", actual_schema), - mock.call("temp/obj_2.h5", actual_schema), - mock.call("temp/obj_3.h5", actual_schema), - ] - ) - - @pytest.mark.unit - def test_read_parquet_file_is_called_while_awscli_runner_is_not_for_loading_a_parquet_with_env_as_cloud_s3_and_type_as_parquet_with_no_disk_space_option( - self, - ): - # Given - s3_parquet_cloud_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="CLOUD", - dynamic_vars=constants, - ).get(source_key="READ_FROM_S3_PATH_PREFIX_PARQUET") - - # When - with patch.object(dynamicio.mixins.with_s3, "awscli_runner") as mock__awscli_runner, patch.object( - dynamicio.mixins.with_local.WithLocal, "_read_parquet_file" - ) as mock__read_parquet_file: - ReadS3ParquetIO(source_config=s3_parquet_cloud_config, no_disk_space=True).read() - - # Then - mock__read_parquet_file.assert_called() - mock__awscli_runner.assert_not_called() - - @pytest.mark.unit - # pylint: disable=unused-argument - def test__read_parquet_file_can_read_directory_of_parquet_files_loading_only_necessary_columns(self, mock_parquet_temporary_directory): - # Given - s3_parquet_cloud_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="CLOUD", - dynamic_vars=constants, - ).get(source_key="READ_FROM_S3_PATH_PREFIX_PARQUET") - - # When - with patch.object(dynamicio.mixins.with_s3, "awscli_runner") as mocked__awscli_runner: - mocked__awscli_runner.return_value = True - df = ReadS3ParquetWithLessColumnsIO(source_config=s3_parquet_cloud_config).read() - - # Then - assert df.shape == (15, 2) and df.columns.tolist() == ["id", "foo_name"] - - @pytest.mark.unit - # pylint: disable=unused-argument - def test__read_parquet_file_can_filter_out_rows_using_appropriate_options(self, mock_parquet_temporary_directory): - # Given - s3_parquet_cloud_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="CLOUD", - dynamic_vars=constants, - ).get(source_key="READ_FROM_S3_PATH_PREFIX_PARQUET") - - # When - with patch.object(dynamicio.mixins.with_s3, "awscli_runner") as mocked__awscli_runner: - mocked__awscli_runner.return_value = True - df = ReadS3ParquetIO(source_config=s3_parquet_cloud_config, filters=[[("foo_name", "==", "name_a")]]).read() - - # Then - assert df.shape == (8, 3) and df.columns.tolist() == ["id", "foo_name", "bar"] and df.foo_name.unique() == ["name_a"] - - @pytest.mark.unit - # pylint: disable=unused-argument - def test__read_csv_file_is_called_with_correct_local_file_path_when_loading_a_path_prefix_with_env_as_cloud_s3_and_type_as_csv( - self, - mock_listdir, - mock_temporary_directory, - mock__read_csv_file, - ): - # Given - s3_csv_cloud_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="CLOUD", - dynamic_vars=constants, - ).get(source_key="READ_FROM_S3_PATH_PREFIX_CSV") - - # When - with patch.object(dynamicio.mixins.with_s3, "awscli_runner") as mocked__awscli_runner: - mocked__awscli_runner.return_value = True - read_obj = ReadS3ParquetIO(source_config=s3_csv_cloud_config) - actual_schema = read_obj.schema - read_obj.read() - - # Then - assert len(mock__read_csv_file.mock_calls) == 3 - mock__read_csv_file.assert_has_calls( - [ - mock.call("temp/obj_1.h5", actual_schema), - mock.call("temp/obj_2.h5", actual_schema), - mock.call("temp/obj_3.h5", actual_schema), - ] - ) - - @pytest.mark.unit - # pylint: disable=unused-argument - def test__read_json_file_is_called_with_correct_local_file_path_when_loading_a_path_prefix_with_env_as_cloud_s3_and_type_as_json( - self, - mock_listdir, - mock_temporary_directory, - mock__read_json_file, - ): - # Given - s3_csv_cloud_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="CLOUD", - dynamic_vars=constants, - ).get(source_key="READ_FROM_S3_PATH_PREFIX_JSON") - - # When - with patch.object(dynamicio.mixins.with_s3, "awscli_runner") as mocked__awscli_runner: - mocked__awscli_runner.return_value = True - read_obj = ReadS3ParquetIO(source_config=s3_csv_cloud_config) - actual_schema = read_obj.schema - read_obj.read() - - # Then - assert len(mock__read_json_file.mock_calls) == 3 - mock__read_json_file.assert_has_calls( - [ - mock.call("temp/obj_1.h5", actual_schema), - mock.call("temp/obj_2.h5", actual_schema), - mock.call("temp/obj_3.h5", actual_schema), - ] - ) - - @pytest.mark.unit - # pylint: disable=unused-argument - def test_a_concatenated_hdf_file_is_returned_with_schema_columns_when_loading_a_path_prefix_with_env_as_cloud_s3_and_type_as_hdf( - self, - mock_listdir, - mock_temporary_directory, - mock__read_hdf_file, - ): - # Given - s3_hdf_cloud_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="CLOUD", - dynamic_vars=constants, - ).get(source_key="READ_FROM_S3_PATH_PREFIX_HDF") - - # When - with patch.object(dynamicio.mixins.with_s3, "awscli_runner") as mocked__awscli_runner: - mocked__awscli_runner.return_Value = True - h5_df = ReadS3HdfIO(source_config=s3_hdf_cloud_config).read() - - # Then - pd.testing.assert_frame_equal( - h5_df, - pd.DataFrame( - { - "id": [1, 2, 3], - "foo_name": ["class_a", "class_a", "class_a"], - "bar": [1001, 1001, 1001], - } - ), - ) - - @pytest.mark.unit - # pylint: disable=unused-argument - def test_a_ValueError_is_raised_if_file_type_is_not_supported_when_loading_a_path_prefix_with_env_as_cloud_s3( - self, - tmp_path, - mock_listdir, - mock_temporary_directory, - mock__read_hdf_file, - ): - # Given - test_yaml_file = tmp_path / "mytest.yml" - with open(test_yaml_file, "w") as fout: - yaml.dump( - { - "READ_FROM_S3_PATH_PREFIX_TXT": { - "CLOUD": { - "type": "s3_path_prefix", - "s3": { - "bucket": "test-bucket", - "path_prefix": "[[ MOCK_KEY ]]", - "file_type": "txt", - }, - } - } - }, - fout, - ) - - # When & Then - with pytest.raises(pydantic.ValidationError): - IOConfig( - path_to_source_yaml=test_yaml_file, - env_identifier="CLOUD", - dynamic_vars=constants, - ) - - @pytest.mark.unit - # pylint: disable=unused-argument - def test__read_parquet_file_can_read_directory_of_parquet_files_containing_empty_files( - self, - mock_parquet_temporary_directory_w_empty_files, - ): - # Given - s3_parquet_cloud_config = IOConfig( - path_to_source_yaml=(os.path.join(constants.TEST_RESOURCES, "definitions/input.yaml")), - env_identifier="CLOUD", - dynamic_vars=constants, - ).get(source_key="READ_FROM_S3_PATH_PREFIX_PARQUET") - - # When - with patch.object(dynamicio.mixins.with_s3, "awscli_runner") as mocked__awscli_runner: - mocked__awscli_runner.return_value = True - df = ReadS3ParquetWEmptyFilesIO(source_config=s3_parquet_cloud_config).read() - - # Then - assert df.shape == (10, 2) and df.columns.tolist() == ["id", "bar"] diff --git a/tests/test_regressions/conftest.py b/tests/test_regressions/conftest.py deleted file mode 100644 index 8c192b4..0000000 --- a/tests/test_regressions/conftest.py +++ /dev/null @@ -1,26 +0,0 @@ -import imp -import pathlib - -import pytest - - -@pytest.fixture -def regressions_resources_dir() -> pathlib.Path: - return (pathlib.Path(__file__).parent / "resources").resolve() - - -@pytest.fixture -def tests_resources_dir(regressions_resources_dir): - return regressions_resources_dir.parent.parent / "resources" - - -@pytest.fixture -def regressions_constants_module(regressions_resources_dir, tests_resources_dir): - mod = imp.new_module("regressions_constants_module") - mod.__dict__.update( - { - "REGRESSIONS_RESOURCES_DIR": str(regressions_resources_dir), - "TEST_RESOURCES_DIR": str(tests_resources_dir), - } - ) - return mod diff --git a/tests/test_regressions/resources/missing_v430_validations.yaml b/tests/test_regressions/resources/missing_v430_validations.yaml deleted file mode 100644 index 0669e30..0000000 --- a/tests/test_regressions/resources/missing_v430_validations.yaml +++ /dev/null @@ -1,13 +0,0 @@ -PRODUCTS: - LOCAL: - type: "local" - local: - file_path: "[[ TEST_RESOURCES_DIR ]]/data/input/some_csv_to_read.csv" - file_type: "csv" - schema: - name: products - columns: - id: - type: "object" - validations: {} - metrics: [] \ No newline at end of file diff --git a/tests/test_regressions/test_v430.py b/tests/test_regressions/test_v430.py deleted file mode 100644 index e0d1d11..0000000 --- a/tests/test_regressions/test_v430.py +++ /dev/null @@ -1,26 +0,0 @@ -"""Test regressions discovered in v4.3.0 release""" - -from dynamicio import UnifiedIO -from dynamicio.config import IOConfig -from dynamicio.core import SCHEMA_FROM_FILE - - -class IO(UnifiedIO): - schema = SCHEMA_FROM_FILE - - -def test_missing_validations_and_metrics(regressions_resources_dir, regressions_constants_module): - """Dynamicio was refusing to work with schemas that did not have any validations specified.""" - # Given - input_config = IOConfig( - path_to_source_yaml=regressions_resources_dir / "missing_v430_validations.yaml", - env_identifier="LOCAL", - dynamic_vars=regressions_constants_module, - ) - io_instance = IO(source_config=input_config.get(source_key="PRODUCTS"), apply_schema_validations=True, log_schema_metrics=True) - - # When - data = io_instance.read() - - # Then - assert data.to_dict() == {"id": {0: 1, 1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 12, 12: 13, 13: 14, 14: 15}} diff --git a/tests/test_resource_inject.py b/tests/test_resource_inject.py new file mode 100644 index 0000000..c4052c5 --- /dev/null +++ b/tests/test_resource_inject.py @@ -0,0 +1,12 @@ +from pathlib import Path + +from dynamicio import LocalFileResource + + +def test_file_resource_inject_success(injectable_string, passing_injections, test_df, tmpdir, file_name): + file_resource = LocalFileResource( + path=Path(tmpdir / injectable_string) / file_name, + ) + file_resource = file_resource.inject(**passing_injections) + file_resource.write(test_df) + file_resource.read() diff --git a/tests/test_resource_test_path_inject.py b/tests/test_resource_test_path_inject.py new file mode 100644 index 0000000..9c52376 --- /dev/null +++ b/tests/test_resource_test_path_inject.py @@ -0,0 +1,15 @@ +# import all resources +import pytest + +from dynamicio import LocalFileResource, S3Resource + + +@pytest.fixture(params=[LocalFileResource, S3Resource]) +def resource_instance(request, file_name): + return request.param(bucket="bucket", path="some_file.extension", test_path="{var1}") + + +def test_resource_test_path_inject(resource_instance): + assert str(resource_instance.test_path) == "{var1}" + resource_instance = resource_instance.inject(var1="aoeu") + assert str(resource_instance.test_path) == "aoeu" diff --git a/tests/test_serde.py b/tests/test_serde.py new file mode 100644 index 0000000..48c6de8 --- /dev/null +++ b/tests/test_serde.py @@ -0,0 +1,33 @@ +from unittest.mock import MagicMock + +import pandas as pd +import pytest + +from dynamicio.io.serde import CsvSerde, HdfSerde, JsonSerde, ParquetSerde + + +@pytest.fixture(params=[CsvSerde, JsonSerde, ParquetSerde, HdfSerde]) +def serde_class(request): + return request.param + + +@pytest.fixture +def serde_instance(serde_class): + _serde_instance = serde_class() + return _serde_instance + + +def test_serde_read_write(serde_instance, test_df, tmp_path): + serde_instance.write_to_file(tmp_path / "file", test_df) + read_write_df = serde_instance.read_from_file(tmp_path / "file") + pd.testing.assert_frame_equal(read_write_df, test_df) + + +def test_serde_validation_callback_called(serde_class, tmp_path, test_df): + validation_callback = MagicMock() + validation_callback.return_value = test_df + serde_instance = serde_class(validations=[validation_callback]) + serde_instance.write_to_file(tmp_path / "file", test_df) + validation_callback.assert_not_called() + serde_instance.read_from_file(tmp_path / "file") + validation_callback.assert_called_once() diff --git a/tests/test_uhura.py b/tests/test_uhura.py new file mode 100644 index 0000000..872e964 --- /dev/null +++ b/tests/test_uhura.py @@ -0,0 +1,82 @@ +from pathlib import Path + +import pandas as pd +import pytest +from uhura.modes import fixture_builder_mode, task_test_mode + +from dynamicio import KafkaResource, LocalFileResource, PostgresResource, S3Resource + + +@pytest.fixture() +def resources(file_name, tmpdir): + file_resource = LocalFileResource(path=tmpdir / "actual" / file_name) + s3_resource = S3Resource(bucket="bucket", path=file_name) + return file_resource, s3_resource + + +@pytest.fixture +def file_resource(resources): + return resources[0] + + +@pytest.fixture +def s3_resource(resources): + return resources[1] + + +def test_uhura_file(test_df, tmpdir, file_name): + file_resource = LocalFileResource(path=tmpdir / "actual" / file_name) + file_resource.write(test_df) + pd.testing.assert_frame_equal(file_resource.read(), test_df) + + with fixture_builder_mode(input_path=tmpdir / "uhura" / "input", known_good_path=tmpdir / "uhura" / "output"): + file_resource.read() + file_resource.write(test_df) + + with task_test_mode(input_path=tmpdir / "uhura" / "input", known_good_path=tmpdir / "uhura" / "output"): + df = file_resource.read() + file_resource.write(df) + with pytest.raises(AssertionError): + file_resource.write(df.drop("a", axis=1)) + + +@pytest.fixture +def s3_fixtures(file_name, tmpdir, test_df): + # Fixtures setup for s3 test + file_resource = LocalFileResource(path=tmpdir / "actual" / file_name) + file_resource.path = Path(tmpdir / "uhura" / "input" / "s3" / "bucket" / file_name) + file_resource.write(test_df) + file_resource.path = Path(tmpdir / "uhura" / "output" / "s3" / "bucket" / file_name) + file_resource.write(test_df) + + +def test_uhura_s3(test_df, tmpdir, file_name, s3_fixtures): + # Actual test + s3_resource = S3Resource(bucket="bucket", path=file_name) + with task_test_mode(input_path=tmpdir / "uhura" / "input", known_good_path=tmpdir / "uhura" / "output"): + df = s3_resource.read() + s3_resource.write(df) + + # Check that, in test mode, the dfs are being compared and if not the same -> fail. + with pytest.raises(AssertionError): + s3_resource.write(df.drop("a", axis=1)) + + +def test_postgres_uhura(tmpdir, test_df): + postgres_resource = PostgresResource(db_user="asdf", db_host="asdf", db_name="asdf", table_name="tabular_table") + LocalFileResource(path=tmpdir / "uhura" / "input" / "postgres" / "public.tabular_table.parquet").write(test_df) + LocalFileResource(path=tmpdir / "uhura" / "output" / "postgres" / "public.tabular_table.parquet").write(test_df) + with task_test_mode(input_path=tmpdir / "uhura" / "input", known_good_path=tmpdir / "uhura" / "output"): + postgres_resource.read() + postgres_resource.write(test_df) + with pytest.raises(AssertionError): + postgres_resource.write(test_df.drop("a", axis=1)) + + +def test_kafka_uhura(tmpdir, test_df): + kafka_resource = KafkaResource(topic="tropico", server="asdf") + LocalFileResource(path=tmpdir / "uhura" / "output" / "kafka" / "tropico.json").write(test_df) + with task_test_mode(input_path=tmpdir / "uhura" / "input", known_good_path=tmpdir / "uhura" / "output"): + kafka_resource.write(test_df) + with pytest.raises(AssertionError): + kafka_resource.write(test_df.drop("a", axis=1)) diff --git a/tests/test_validations.py b/tests/test_validations.py index a7137b1..e05907e 100644 --- a/tests/test_validations.py +++ b/tests/test_validations.py @@ -1,450 +1,52 @@ -# pylint: disable=missing-module-docstring, missing-class-docstring, missing-function-docstring, too-many-public-methods +import pandas as pd import pytest +from pandera import SchemaModel +from pandera.errors import SchemaError +from pandera.typing import Series -from dynamicio.validations import ( - has_acceptable_percentage_of_nulls, - has_no_null_values, - has_unique_values, - is_between, - is_greater_than, - is_greater_than_or_equal, - is_in, - is_lower_than, - is_lower_than_or_equal, -) +from dynamicio import LocalFileResource +import tests.constants as constants +from tests.fixtures.schemas import SampleSchema +file_path = constants.TEST_FIXTURES / "sample.parquet" -class TestHasUniqueValues: - @pytest.mark.unit - def test_returns_true_if_column_has_no_duplicate_values(self, input_df): - # Given - df = input_df - # When - validation = has_unique_values("TEST", df, column="id") +def test_parquet_resource_read_with_schema(): + resource = LocalFileResource(path=file_path, pa_schema=SampleSchema) + df = resource.read() - # Then - assert validation.valid is True and validation.value == 0 and validation.message == "TEST[id] has unique values" + target_df = pd.read_parquet(file_path) + pd.testing.assert_frame_equal(df, target_df) - @pytest.mark.unit - def test_returns_false_if_column_has_duplicate_values(self, input_df): - # Given - df = input_df - # When - validation = has_unique_values("TEST", df, column="activity") +def test_parquet_resource_write_with_schema(tmpdir): + output_path = tmpdir / "test_parquet_resource_write.parquet" + in_memory_df = pd.read_parquet(file_path) - # Then - assert not validation.valid and validation.value == 3 and validation.message == "Values ['discharge', 'pass_through', 'load'] for TEST[activity] are duplicated!" + resource = LocalFileResource(path=output_path, pa_schema=SampleSchema) + resource.write(in_memory_df) + target_df = pd.read_parquet(output_path) + pd.testing.assert_frame_equal(in_memory_df, target_df) -class TestHasNoNullValues: - @pytest.mark.unit - def test_returns_true_if_column_in_df_has_no_nulls(self, input_df): - # Given - df = input_df - # When - validation = has_no_null_values("TEST", df, column="activity") +def test_parquet_resource_read_with_schema_fails_validation(): + class FailingSchema(SchemaModel): + z: Series[int] - # Then - assert validation.valid is True and validation.value == 0 and validation.message == "TEST[activity] has 0 nulls" + resource = LocalFileResource(path=file_path, pa_schema=FailingSchema) + with pytest.raises(SchemaError): + resource.read() - @pytest.mark.unit - def test_returns_false_if_column_in_df_has_none_values(self, input_df): - # Given - df = input_df - # When - validation = has_no_null_values("TEST", df, column="duration_a") +def test_parquet_resource_read_with_schema_pandera_config_is_applied(): + class FailingSchema(SchemaModel): + a: Series[int] + b: Series[str] - # Then - assert not validation.valid and validation.value == 1 and validation.message == "TEST[duration_a] has 1 nulls" + class Config: + strict = True - @pytest.mark.unit - def test_returns_false_if_column_in_df_has_nat_values(self, input_df): - # Given - df = input_df - - # When - validation = has_no_null_values("TEST", df, column="start_time") - - # Then - assert not validation.valid and validation.value == 1 and validation.message == "TEST[start_time] has 1 nulls" - - -class TestHasAcceptablePercentageOfNulls: - @pytest.mark.unit - def test_throws_exception_if_threshold_is_greater_than_1(self, input_df): - # Given - df = input_df - - # When/Then - with pytest.raises(ValueError): - has_acceptable_percentage_of_nulls("TEST", df, column="duration_a", threshold=1.2) - - @pytest.mark.unit - def test_throws_exception_if_threshold_is_lower_than_0(self, input_df): - # Given - df = input_df - - # When/Then - with pytest.raises(ValueError): - has_acceptable_percentage_of_nulls("TEST", df, column="duration_a", threshold=-0.1) - - @pytest.mark.unit - def test_returns_true_if_percentage_threshold_is_not_exceeded(self, input_df): - # Given - df = input_df - - # When - validation = has_acceptable_percentage_of_nulls("TEST", df, column="duration_a", threshold=0.11) - - # Then - assert validation.valid is True and validation.value == 0.1 and validation.message == "Percentage of nulls of for TEST[duration_a] is 0.1" - - @pytest.mark.unit - def test_returns_true_if_inpu_df_is_empty(self, empty_df): - # Given - df = empty_df - - # When - validation = has_acceptable_percentage_of_nulls("TEST", df, column="duration_a", threshold=0.11) - - # Then - assert validation.valid is True and validation.value == 0 and validation.message == "Percentage of nulls of for TEST[duration_a] is 0" - - @pytest.mark.unit - def test_returns_true_if_threshold_is_not_exceeded_for_any_null_type_value(self, input_df): - # Given - df = input_df - - # When - validation = has_acceptable_percentage_of_nulls("TEST", df, column="duration_b", threshold=0.2) - - # Then - assert not validation.valid and validation.value == 0.3 and validation.message == "Percentage of nulls of for TEST[duration_b] is 0.3 which exceeds threshold: 0.2" - - @pytest.mark.unit - def test_returns_false_if_threshold_is_exceeded(self, input_df): - # Given - df = input_df - - # When - validation = has_acceptable_percentage_of_nulls("TEST", df, column="duration_a", threshold=0.09) - - # Then - assert not validation.valid and validation.value == 0.1 and validation.message == "Percentage of nulls of for TEST[duration_a] is 0.1 which exceeds threshold: 0.09" - - -class TestHasAcceptableCategoricalValues: - @pytest.mark.unit - def test_returns_true_if_columns_unique_values_are_a_subset_of_input_set(self, input_df): - # Given - df = input_df - - # When - validation = is_in( - "TEST", - df, - column="activity", - categorical_values={"load", "discharge", "pass_through", "one_more"}, - ) - - # Then - assert validation.valid is True and validation.value == 0 and validation.message == "Categorical values for TEST[activity] are acceptable" - - @pytest.mark.unit - def test_returns_true_only_if_columns_unique_vals_are_an_exact_match_of_the_input_set_when_match_all_is_set_to_false(self, input_df): - # Given - df = input_df - - # When - validation = is_in("TEST", df, column="activity", categorical_values={"load", "discharge", "pass_through"}, match_all=False) - - # Then - assert validation.valid is True and validation.value == 0 and validation.message == "All acceptable categorical values for TEST[activity] are present" - - @pytest.mark.unit - def test_returns_false_if_columns_unique_vals_are_less_than_the_acceptable_categoricals_when_match_all_is_set_to_false(self, input_df): - # Given - df = input_df - - # When - validation = is_in("TEST", df, column="activity", categorical_values={"load", "discharge", "pass_through", "one_more"}, match_all=False) - - # Then - assert validation.valid is False and validation.value == 1 and validation.message == "Missing categorical values for TEST[activity]: {'one_more'}" - - @pytest.mark.unit - def test_returns_false_if_columns_unique_vals_are_more_than_the_acceptable_categoricals_when_match_all_is_set_to_false(self, input_df): - # Given - df = input_df - - # When - validation = is_in("TEST", df, column="activity", categorical_values={"load", "discharge"}, match_all=False) - - # Then - assert validation.valid is False and validation.value == 3 and validation.message == "Values {'pass_through'} for TEST[activity] are not acceptable for 3 cells" - - @pytest.mark.unit - def test_returns_true_if_columns_unique_vals_are_an_exact_match_of_the_input_set(self, input_df): - # Given - df = input_df - - # When/Then - validation = is_in("TEST", df, column="activity", categorical_values={"load", "discharge", "pass_through"}) - - # Then - assert validation.valid is True and validation.value == 0 and validation.message == "Categorical values for TEST[activity] are acceptable" - - @pytest.mark.unit - def test_returns_false_if_columns_unique_values_are_not_a_subset_of_input_set(self, input_df): - # Given - df = input_df - - # When/Then - validation = is_in("TEST", df, column="activity", categorical_values={"load", "pass_through"}) - - # Then - assert not validation.valid and validation.value == 5 and validation.message == "Values {'discharge'} for TEST[activity] are not acceptable for 5 cells" - - @pytest.mark.unit - def test_returns_true_if_nulls_are_an_allowed_categorical_value(self, input_df): - # Given - df = input_df - - # When - validation = is_in("TEST", df, column="category_a", categorical_values={"A", "B", "C", None}) - - # Then - assert validation.valid is True and validation.value == 0 and validation.message == "Categorical values for TEST[category_a] are acceptable" - - @pytest.mark.unit - def test_ignores_the_existence_of_null_values(self, input_df): - # Given - df = input_df - - # When - validation = is_in("TEST", df, column="category_a", categorical_values={"A", "B", "C"}) - - # Then - assert validation.valid is True and validation.value == 0 and validation.message == "Categorical values for TEST[category_a] are acceptable" - - @pytest.mark.unit - def test_treats_nan_and_na_values_as_nulls_and_returns_true_if_null_is_acceptable(self, input_df): - # Given - df = input_df # where category_b has None, pd.NA and np.nan values - - # When - validation = is_in("TEST", df, column="category_b", categorical_values={"A", "B", "C", None}) - - # Then - assert validation.valid is True and validation.value == 0 and validation.message == "Categorical values for TEST[category_b] are acceptable" - - @pytest.mark.unit - def test_ignores_nan_and_na_values_as_it_does_with_nulls(self, input_df): - # Given - df = input_df # where category_b has None, pd.NA and np.nan values - - # When - validation = is_in("TEST", df, column="category_b", categorical_values={"A", "B", "C"}) - - # Then - assert validation.valid is True and validation.value == 0 and validation.message == "Categorical values for TEST[category_b] are acceptable" - - -class TestIsGreaterThan: - @pytest.mark.unit - def test_returns_true_if_all_column_values_are_above_threshold(self, input_df): - # Given - df = input_df - - # When - validation = is_greater_than("TEST", df, column="weight_a", threshold=4) - - # Then - assert validation.valid is True and validation.value == 0 and validation.message == "All values of TEST[weight_a] are above 4" - - @pytest.mark.unit - def test_returns_false_if_any_column_values_are_below_threshold(self, input_df): - # Given - df = input_df - - # When - validation = is_greater_than("TEST", df, column="weight_a", threshold=6) - - # Then - assert not validation.valid and validation.value == 0.5 and validation.message == "5 cell values for TEST[weight_a] are below 6" - - @pytest.mark.unit - def test_returns_false_if_any_column_values_are_below_or_equal_to_threshold(self, input_df): - # Given - df = input_df - - # When/Then - validation = is_greater_than("TEST", df, column="weight_a", threshold=5) - - # Then - assert not validation.valid and validation.value == 0.3 and validation.message == "3 cell values for TEST[weight_a] are below 5" - - @pytest.mark.unit - def test_is_greater_than_returns_true_if_all_column_values_are_below_threshold_irrespective_of_nulls(self, input_df): - # Given - df = input_df - - # When - validation = is_greater_than("TEST", df, column="weight_b", threshold=4) - - # Then - assert validation.valid is True and validation.value == 0 and validation.message == "All values of TEST[weight_b] are above 4" - - -class TestIsGreaterThanOrEqual: - @pytest.mark.unit - def test_returns_true_if_all_column_values_are_above_or_equal_to_threshold(self, input_df): - # Given - df = input_df - - # When/Then - validation = is_greater_than_or_equal("TEST", df, column="weight_a", threshold=5) - - # Then - assert validation.valid is True and validation.value == 0 and validation.message == "All values of TEST[weight_a] are above 5" - - def test_returns_false_if_any_column_values_are_below_the_threshold(self, input_df): - # Given - df = input_df - - # When/Then - validation = is_greater_than_or_equal("TEST", df, column="weight_a", threshold=6) - - # Then - assert validation.valid is False and validation.value == 0.3 and validation.message == "3 cell values for TEST[weight_a] are below 6" - - -class TestIsLowerThan: - @pytest.mark.unit - def test_returns_true_if_all_column_values_are_below_threshold(self, input_df): - # Given - df = input_df - - # When - validation = is_lower_than("TEST", df, column="weight_a", threshold=10) - - # Then - assert validation.valid is True and validation.value == 0 and validation.message == "All values of TEST[weight_a] are below 10" - - @pytest.mark.unit - def test_returns_false_if_any_column_values_are_above_threshold(self, input_df): - # Given - df = input_df - - # When/Then - validation = is_lower_than("TEST", df, column="weight_a", threshold=8) - - # Then - assert not validation.valid and validation.value == 0.3 and validation.message == "3 cell values for TEST[weight_a] are above 8" - - @pytest.mark.unit - def test_is_lower_than_returns_false_if_any_column_values_are_below_or_equal_to_threshold(self, input_df): - # Given - df = input_df - - # When/Then - validation = is_lower_than("TEST", df, column="weight_a", threshold=9) - - # Then - assert not validation.valid and validation.value == 0.1 and validation.message == "1 cell values for TEST[weight_a] are above 9" - - @pytest.mark.unit - def test_is_lower_than_returns_true_if_all_columns_values_are_below_threshold_irrespective_of_nulls(self, input_df): - # Given - df = input_df - - # When/Then - validation = is_lower_than("TEST", df, column="weight_b", threshold=10) - - # Then - assert validation.valid is True and validation.value == 0 and validation.message == "All values of TEST[weight_b] are below 10" - - -class TestIsLowerThanOrEqual: - @pytest.mark.unit - def test_returns_true_if_all_column_values_are_below_or_equal_to_threshold(self, input_df): - # Given - df = input_df - - # When/Then - validation = is_lower_than_or_equal("TEST", df, column="weight_a", threshold=9) - - # Then - assert validation.valid is True and validation.value == 0 and validation.message == "All values of TEST[weight_a] are below 9" - - @pytest.mark.unit - def test_returns_false_if_any_column_values_are_above_the_threshold(self, input_df): - # Given - df = input_df - - # When/Then - validation = is_lower_than_or_equal("TEST", df, column="weight_a", threshold=8) - - # Then - assert not validation.valid and validation.value == 0.1 and validation.message == "1 cell values for TEST[weight_a] are above 8" - - -class TestIsBetween: - @pytest.mark.integration - def test_returns_true_if_all_column_values_are_between_upper_and_lower_bounds(self, input_df): - # Given - df = input_df - - # When - validation = is_between("TEST", df, column="weight_a", lower=4, upper=10) - - # Then - assert validation.valid is True and validation.value == 0 and validation.message == "All values of TEST[weight_a] is between 4 and 10 thresholds" - - @pytest.mark.integration - def test_returns_false_if_any_column_values_are_below_the_lower_bound(self, input_df): - # Given - df = input_df - - # When - validation = is_between("TEST", df, column="weight_a", lower=6, upper=10) - - # Then - assert not validation.valid and validation.value == 0.5 and validation.message == "5 cell values for TEST[weight_a] are either below 6 or above 10" - - @pytest.mark.integration - def test_returns_false_if_any_column_values_are_above_the_upper_bound(self, input_df): - # Given - df = input_df - - # When - validation = is_between("TEST", df, column="weight_a", lower=4, upper=8) - - # Then - assert not validation.valid and validation.value == 0.3 and validation.message == "3 cell values for TEST[weight_a] are either below 4 or above 8" - - @pytest.mark.integration - def test_returns_true_if_all_column_values_are_within_bounds_bounds_included(self, input_df): - # Given - df = input_df - - # When - validation = is_between("TEST", df, column="weight_a", lower=5, upper=9, include_left=True, include_right=True) - - # Then - assert validation.valid is True and validation.value == 0 and validation.message == "All values of TEST[weight_a] is between 5 and 9 thresholds" - - @pytest.mark.integration - def test_returns_true_if_all_column_values_are_between_upper_and_lower_bounds_irrespective_of_nulls(self, input_df): - # Given - df = input_df - - # When - validation = is_between("TEST", df, column="weight_b", lower=4, upper=10) - - # Then - assert validation.valid is True and validation.value == 0 and validation.message == "All values of TEST[weight_b] is between 4 and 10 thresholds" + resource = LocalFileResource(path=file_path, pa_schema=FailingSchema) + with pytest.raises(SchemaError): + resource.read()