From 8f0d52570047b697f7ac321e2a900d45b33c2c26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Wed, 29 May 2024 08:11:59 +0000 Subject: [PATCH 01/64] [DOP-16270] Bump version --- onetl/VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onetl/VERSION b/onetl/VERSION index af88ba82..bc859cbd 100644 --- a/onetl/VERSION +++ b/onetl/VERSION @@ -1 +1 @@ -0.11.1 +0.11.2 From f50f008a132ed9619a83c0368815aebf1125ef4a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Tue, 4 Jun 2024 12:19:41 +0000 Subject: [PATCH 02/64] [DOP-13853] Update MongoDB example --- .../db_connection/mongodb/types.rst | 86 +++++++++---------- 1 file changed, 43 insertions(+), 43 deletions(-) diff --git a/docs/connection/db_connection/mongodb/types.rst b/docs/connection/db_connection/mongodb/types.rst index 2a023164..4b22b7cb 100644 --- a/docs/connection/db_connection/mongodb/types.rst +++ b/docs/connection/db_connection/mongodb/types.rst @@ -14,49 +14,49 @@ MongoDB is, by design, __schemaless__. So there are 2 ways how this can be handl .. dropdown:: See example - .. code-block:: python - - from onetl.connection import MongoDB - from onetl.db import DBReader - - from pyspark.sql.types import ( - StructType, - StructField, - IntegerType, - StringType, - TimestampType, - ) - - mongodb = MongoDB(...) - - df_schema = StructType( - [ - StructField("_id", StringType()), - StructField("some", StringType()), - StructField( - "field", - StructType( - [ - StructField("nested", IntegerType()), - ] - ), - ), - ] - ) - - reader = DBReader( - connection=mongodb, - source="some_collection", - df_schema=df_schema, - ) - df = reader.run() - - # or - - df = mongodb.pipeline( - collection="some_collection", - df_schema=df_schema, - ) + .. code-block:: python + + from onetl.connection import MongoDB + from onetl.db import DBReader + + from pyspark.sql.types import ( + StructType, + StructField, + IntegerType, + StringType, + TimestampType, + ) + + mongodb = MongoDB(...) 
+ + df_schema = StructType( + [ + StructField("_id", StringType()), + StructField("some", StringType()), + StructField( + "field", + StructType( + [ + StructField("nested", IntegerType()), + ] + ), + ), + ] + ) + + reader = DBReader( + connection=mongodb, + source="some_collection", + df_schema=df_schema, + ) + df = reader.run() + + # or + + df = mongodb.pipeline( + collection="some_collection", + df_schema=df_schema, + ) * Rely on MongoDB connector schema infer: From a015b80e851a2395c08d864dd03021f3410f7336 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Tue, 4 Jun 2024 12:36:39 +0000 Subject: [PATCH 03/64] [DOP-14063] Drop onetl._internal module --- .github/workflows/data/core/tracked.txt | 1 - .github/workflows/test-core.yml | 2 +- onetl/_internal.py | 172 ------------------ onetl/_util/file.py | 34 +++- onetl/_util/spark.py | 63 +++++++ onetl/_util/sql.py | 25 +++ .../db_connection/hive/connection.py | 2 +- .../jdbc_connection/connection.py | 2 +- .../db_connection/jdbc_connection/options.py | 26 ++- .../db_connection/jdbc_mixin/connection.py | 4 +- .../db_connection/kafka/connection.py | 3 +- .../kafka/kafka_kerberos_auth.py | 2 +- .../db_connection/kafka/kafka_scram_auth.py | 2 +- .../db_connection/kafka/kafka_ssl_protocol.py | 2 +- .../db_connection/oracle/connection.py | 2 +- .../db_connection/teradata/connection.py | 2 +- .../file_df_connection/spark_s3/connection.py | 3 +- onetl/file/file_downloader/file_downloader.py | 2 +- onetl/file/file_uploader/file_uploader.py | 2 +- onetl/file/format/csv.py | 3 +- onetl/file/format/json.py | 2 +- .../test_generate_temp_path.py | 5 +- .../test_jdbc_options_unit.py | 4 +- 23 files changed, 153 insertions(+), 212 deletions(-) delete mode 100644 onetl/_internal.py create mode 100644 onetl/_util/sql.py diff --git a/.github/workflows/data/core/tracked.txt b/.github/workflows/data/core/tracked.txt index 5b2a3ca4..855cb884 100644 --- a/.github/workflows/data/core/tracked.txt +++ b/.github/workflows/data/core/tracked.txt @@ -3,6 +3,5 @@ onetl/plugins/** onetl/impl/** onetl/hwm/** onetl/_util/** -onetl/_internal.py onetl/log.py .github/workflows/data/core/** diff --git a/.github/workflows/test-core.yml b/.github/workflows/test-core.yml index 65d681dc..6008f925 100644 --- a/.github/workflows/test-core.yml +++ b/.github/workflows/test-core.yml @@ -72,7 +72,7 @@ jobs: - name: Run tests run: | ./run_tests.sh -m 'not connection' - ./run_tests.sh onetl/_util onetl/_internal.py onetl/hooks onetl/file/filter onetl/file/limit onetl/hwm/store/hwm_class_registry.py + ./run_tests.sh onetl/_util onetl/hooks onetl/file/filter onetl/file/limit onetl/hwm/store/hwm_class_registry.py - name: Upload coverage results uses: actions/upload-artifact@v4 diff --git a/onetl/_internal.py b/onetl/_internal.py deleted file mode 100644 index 361bb3e8..00000000 --- a/onetl/_internal.py +++ /dev/null @@ -1,172 +0,0 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) -# SPDX-License-Identifier: Apache-2.0 -""" - Helpers -""" - -from __future__ import annotations - -import os -from datetime import datetime -from typing import TYPE_CHECKING, Any - -try: - from pydantic.v1 import SecretStr -except (ImportError, AttributeError): - from pydantic import SecretStr # type: ignore[no-redef, assignment] - -if TYPE_CHECKING: - from pathlib import PurePath - -# e.g. 
20230524122150 -DATETIME_FORMAT = "%Y%m%d%H%M%S" - - -def clear_statement(statement: str) -> str: - """ - Clear unnecessary spaces and semicolons at the statement end. - - Oracle-specific: adds semicolon after END statement. - - Examples - -------- - - >>> clear_statement("SELECT * FROM mytable") - 'SELECT * FROM mytable' - >>> clear_statement("SELECT * FROM mytable ; ") - 'SELECT * FROM mytable' - >>> clear_statement("CREATE TABLE mytable (id NUMBER)") - 'CREATE TABLE mytable (id NUMBER)' - >>> clear_statement("BEGIN ... END") - 'BEGIN ... END;' - """ - - statement = statement.rstrip().lstrip("\n\r").rstrip(";").rstrip() - if statement.lower().endswith("end"): - statement += ";" - return statement - - -def uniq_ignore_case(orig_list: list[str]) -> list[str]: - """ - Return only uniq values from a list, case ignore. - - Examples - -------- - - >>> uniq_ignore_case(["a", "c"]) - ['a', 'c'] - >>> uniq_ignore_case(["A", "a", "c"]) - ['A', 'c'] - >>> uniq_ignore_case(["a", "A", "c"]) - ['a', 'c'] - """ - - result: list[str] = [] - already_visited: set[str] = set() - - for orig_value in orig_list: - if orig_value.casefold() not in already_visited: - result.append(orig_value) - already_visited.add(orig_value.casefold()) - - return result - - -def stringify(value: Any, quote: bool = False) -> Any: # noqa: WPS212 - """ - Convert values to strings. - - Values ``True``, ``False`` and ``None`` become ``"true"``, ``"false"`` and ``"null"``. - - If input is dict, return dict with stringified values and keys (recursive). - - If input is list, return list with stringified values (recursive). - - If ``quote=True``, wrap string values with double quotes. - - Examples - -------- - - >>> stringify(1) - '1' - >>> stringify(True) - 'true' - >>> stringify(False) - 'false' - >>> stringify(None) - 'null' - >>> stringify("string") - 'string' - >>> stringify("string", quote=True) - '"string"' - >>> stringify({"abc": 1}) - {'abc': '1'} - >>> stringify([1, True, False, None, "string"]) - ['1', 'true', 'false', 'null', 'string'] - """ - - if isinstance(value, dict): - return {stringify(k): stringify(v, quote) for k, v in value.items()} - - if isinstance(value, list): - return [stringify(v, quote) for v in value] - - if value is None: - return "null" - - if isinstance(value, bool): - return "true" if value else "false" - - if isinstance(value, SecretStr): - value = value.get_secret_value() - - if isinstance(value, os.PathLike): - value = os.fspath(value) - - if isinstance(value, str): - return f'"{value}"' if quote else value - - return str(value) - - -def to_camel(string: str) -> str: - """ - Convert ``snake_case`` strings to ``camelCase`` (with first symbol in lowercase) - - Examples - -------- - - >>> to_camel("some_value") - 'someValue' - """ - - return "".join(word.capitalize() if index > 0 else word for index, word in enumerate(string.split("_"))) - - -def generate_temp_path(root: PurePath) -> PurePath: - """ - Returns prefix which will be used for creating temp directory - - Returns - ------- - RemotePath - Temp path, containing current host name, process name and datetime - - Examples - -------- - - >>> from etl_entities.process import Process - >>> from pathlib import Path - >>> generate_temp_path(Path("/tmp")) # doctest: +SKIP - Path("/tmp/onetl/currenthost/myprocess/20230524122150") - >>> with Process(dag="mydag", task="mytask"): # doctest: +SKIP - ... 
generate_temp_path(Path("/abc")) - Path("/abc/onetl/currenthost/mydag.mytask.myprocess/20230524122150") - """ - - from etl_entities.process import ProcessStackManager - - current_process = ProcessStackManager.get_current() - current_dt = datetime.now().strftime(DATETIME_FORMAT) - return root / "onetl" / current_process.host / current_process.full_name / current_dt diff --git a/onetl/_util/file.py b/onetl/_util/file.py index 06e6ef04..ee27c57f 100644 --- a/onetl/_util/file.py +++ b/onetl/_util/file.py @@ -5,11 +5,15 @@ import hashlib import io import os -from pathlib import Path +from datetime import datetime +from pathlib import Path, PurePath from onetl.exception import NotAFileError from onetl.impl import path_repr +# e.g. 20230524122150 +DATETIME_FORMAT = "%Y%m%d%H%M%S" + def get_file_hash( path: os.PathLike | str, @@ -41,3 +45,31 @@ def is_file_readable(path: str | os.PathLike) -> Path: raise OSError(f"No read access to file {path_repr(path)}") return path + + +def generate_temp_path(root: PurePath) -> PurePath: + """ + Returns prefix which will be used for creating temp directory + + Returns + ------- + RemotePath + Temp path, containing current host name, process name and datetime + + Examples + -------- + + >>> from etl_entities.process import Process + >>> from pathlib import Path + >>> generate_temp_path(Path("/tmp")) # doctest: +SKIP + Path("/tmp/onetl/currenthost/myprocess/20230524122150") + >>> with Process(dag="mydag", task="mytask"): # doctest: +SKIP + ... generate_temp_path(Path("/abc")) + Path("/abc/onetl/currenthost/mydag.mytask.myprocess/20230524122150") + """ + + from etl_entities.process import ProcessStackManager + + current_process = ProcessStackManager.get_current() + current_dt = datetime.now().strftime(DATETIME_FORMAT) + return root / "onetl" / current_process.host / current_process.full_name / current_dt diff --git a/onetl/_util/spark.py b/onetl/_util/spark.py index 230abe80..f172b1c9 100644 --- a/onetl/_util/spark.py +++ b/onetl/_util/spark.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations +import os import textwrap from contextlib import contextmanager from math import inf @@ -9,11 +10,73 @@ from onetl._util.version import Version +try: + from pydantic.v1 import SecretStr +except (ImportError, AttributeError): + from pydantic import SecretStr # type: ignore[no-redef, assignment] + if TYPE_CHECKING: from pyspark.sql import SparkSession from pyspark.sql.conf import RuntimeConfig +def stringify(value: Any, quote: bool = False) -> Any: # noqa: WPS212 + """ + Convert values to strings. + + Values ``True``, ``False`` and ``None`` become ``"true"``, ``"false"`` and ``"null"``. + + If input is dict, return dict with stringified values and keys (recursive). + + If input is list, return list with stringified values (recursive). + + If ``quote=True``, wrap string values with double quotes. 
+ + Examples + -------- + + >>> stringify(1) + '1' + >>> stringify(True) + 'true' + >>> stringify(False) + 'false' + >>> stringify(None) + 'null' + >>> stringify("string") + 'string' + >>> stringify("string", quote=True) + '"string"' + >>> stringify({"abc": 1}) + {'abc': '1'} + >>> stringify([1, True, False, None, "string"]) + ['1', 'true', 'false', 'null', 'string'] + """ + + if isinstance(value, dict): + return {stringify(k): stringify(v, quote) for k, v in value.items()} + + if isinstance(value, list): + return [stringify(v, quote) for v in value] + + if value is None: + return "null" + + if isinstance(value, bool): + return "true" if value else "false" + + if isinstance(value, SecretStr): + value = value.get_secret_value() + + if isinstance(value, os.PathLike): + value = os.fspath(value) + + if isinstance(value, str): + return f'"{value}"' if quote else value + + return str(value) + + @contextmanager def inject_spark_param(conf: RuntimeConfig, name: str, value: Any): """ diff --git a/onetl/_util/sql.py b/onetl/_util/sql.py new file mode 100644 index 00000000..37aa09a7 --- /dev/null +++ b/onetl/_util/sql.py @@ -0,0 +1,25 @@ +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 +def clear_statement(statement: str) -> str: + """ + Clear unnecessary spaces and semicolons at the statement end. + + Oracle-specific: adds semicolon after END statement. + + Examples + -------- + + >>> clear_statement("SELECT * FROM mytable") + 'SELECT * FROM mytable' + >>> clear_statement("SELECT * FROM mytable ; ") + 'SELECT * FROM mytable' + >>> clear_statement("CREATE TABLE mytable (id NUMBER)") + 'CREATE TABLE mytable (id NUMBER)' + >>> clear_statement("BEGIN ... END") + 'BEGIN ... END;' + """ + + statement = statement.rstrip().lstrip("\n\r").rstrip(";").rstrip() + if statement.lower().endswith("end"): + statement += ";" + return statement diff --git a/onetl/connection/db_connection/hive/connection.py b/onetl/connection/db_connection/hive/connection.py index fbedebef..7fcb4dce 100644 --- a/onetl/connection/db_connection/hive/connection.py +++ b/onetl/connection/db_connection/hive/connection.py @@ -13,8 +13,8 @@ except (ImportError, AttributeError): from pydantic import validator # type: ignore[no-redef, assignment] -from onetl._internal import clear_statement from onetl._util.spark import inject_spark_param +from onetl._util.sql import clear_statement from onetl.connection.db_connection.db_connection import DBConnection from onetl.connection.db_connection.hive.dialect import HiveDialect from onetl.connection.db_connection.hive.options import ( diff --git a/onetl/connection/db_connection/jdbc_connection/connection.py b/onetl/connection/db_connection/jdbc_connection/connection.py index e6716ae5..5b0aebeb 100644 --- a/onetl/connection/db_connection/jdbc_connection/connection.py +++ b/onetl/connection/db_connection/jdbc_connection/connection.py @@ -7,7 +7,7 @@ import warnings from typing import TYPE_CHECKING, Any -from onetl._internal import clear_statement +from onetl._util.sql import clear_statement from onetl.connection.db_connection.db_connection import DBConnection from onetl.connection.db_connection.jdbc_connection.dialect import JDBCDialect from onetl.connection.db_connection.jdbc_connection.options import ( diff --git a/onetl/connection/db_connection/jdbc_connection/options.py b/onetl/connection/db_connection/jdbc_connection/options.py index cd4538f2..a2aa39ad 100644 --- a/onetl/connection/db_connection/jdbc_connection/options.py +++ 
b/onetl/connection/db_connection/jdbc_connection/options.py @@ -15,7 +15,6 @@ from typing_extensions import deprecated -from onetl._internal import to_camel from onetl.impl import GenericOptions # options from spark.read.jdbc which are populated by JDBCConnection methods @@ -144,10 +143,9 @@ class Config: known_options = READ_OPTIONS | READ_WRITE_OPTIONS prohibited_options = GENERIC_PROHIBITED_OPTIONS | WRITE_OPTIONS extra = "allow" - alias_generator = to_camel # Options in DataFrameWriter.jdbc() method - partition_column: Optional[str] = None + partition_column: Optional[str] = Field(default=None, alias="partitionColumn") """Column used to parallelize reading from a table. .. warning:: @@ -164,17 +162,17 @@ class Config: See documentation for :obj:`~partitioning_mode` for more details""" - num_partitions: PositiveInt = 1 + num_partitions: PositiveInt = Field(default=1, alias="numPartitions") """Number of jobs created by Spark to read the table content in parallel. See documentation for :obj:`~partitioning_mode` for more details""" - lower_bound: Optional[int] = None + lower_bound: Optional[int] = Field(default=None, alias="lowerBound") """See documentation for :obj:`~partitioning_mode` for more details""" # noqa: WPS322 - upper_bound: Optional[int] = None + upper_bound: Optional[int] = Field(default=None, alias="upperBound") """See documentation for :obj:`~partitioning_mode` for more details""" # noqa: WPS322 - session_init_statement: Optional[str] = None + session_init_statement: Optional[str] = Field(default=None, alias="sessionInitStatement") '''After each database session is opened to the remote DB and before starting to read data, this option executes a custom SQL statement (or a PL/SQL block). @@ -423,7 +421,6 @@ class Config: known_options = WRITE_OPTIONS | READ_WRITE_OPTIONS prohibited_options = GENERIC_PROHIBITED_OPTIONS | READ_OPTIONS extra = "allow" - alias_generator = to_camel if_exists: JDBCTableExistBehavior = Field(default=JDBCTableExistBehavior.APPEND, alias="mode") """Behavior of writing data into existing table. @@ -528,7 +525,7 @@ class Config: Changed default value from 1000 to 20_000 """ - isolation_level: str = "READ_UNCOMMITTED" + isolation_level: str = Field(default="READ_UNCOMMITTED", alias="isolationLevel") """The transaction isolation level, which applies to current connection. Possible values: @@ -571,7 +568,7 @@ class JDBCSQLOptions(GenericOptions): Split up ``ReadOptions`` to ``SQLOptions`` """ - partition_column: Optional[str] = None + partition_column: Optional[str] = Field(default=None, alias="partitionColumn") """Column used to partition data across multiple executors for parallel query processing. .. warning:: @@ -600,16 +597,16 @@ class JDBCSQLOptions(GenericOptions): -- Where ``stride`` is calculated as ``(upper_bound - lower_bound) / num_partitions``. """ - num_partitions: Optional[int] = None + num_partitions: Optional[int] = Field(default=None, alias="numPartitions") """Number of jobs created by Spark to read the table content in parallel.""" # noqa: WPS322 - lower_bound: Optional[int] = None + lower_bound: Optional[int] = Field(default=None, alias="lowerBound") """Defines the starting boundary for partitioning the query's data. Mandatory if :obj:`~partition_column` is set""" # noqa: WPS322 - upper_bound: Optional[int] = None + upper_bound: Optional[int] = Field(default=None, alias="upperBound") """Sets the ending boundary for data partitioning. 
Mandatory if :obj:`~partition_column` is set""" # noqa: WPS322 - session_init_statement: Optional[str] = None + session_init_statement: Optional[str] = Field(default=None, alias="sessionInitStatement") '''After each database session is opened to the remote DB and before starting to read data, this option executes a custom SQL statement (or a PL/SQL block). @@ -658,7 +655,6 @@ class Config: known_options = READ_OPTIONS - {"partitioning_mode"} prohibited_options = GENERIC_PROHIBITED_OPTIONS | WRITE_OPTIONS | {"partitioning_mode"} extra = "allow" - alias_generator = to_camel @root_validator(pre=True) def _check_partition_fields(cls, values): diff --git a/onetl/connection/db_connection/jdbc_mixin/connection.py b/onetl/connection/db_connection/jdbc_mixin/connection.py index 8ab43075..e8c19e38 100644 --- a/onetl/connection/db_connection/jdbc_mixin/connection.py +++ b/onetl/connection/db_connection/jdbc_mixin/connection.py @@ -16,9 +16,9 @@ except (ImportError, AttributeError): from pydantic import Field, PrivateAttr, SecretStr, validator # type: ignore[no-redef, assignment] -from onetl._internal import clear_statement, stringify from onetl._util.java import get_java_gateway, try_import_java_class -from onetl._util.spark import get_spark_version +from onetl._util.spark import get_spark_version, stringify +from onetl._util.sql import clear_statement from onetl._util.version import Version from onetl.connection.db_connection.jdbc_mixin.options import ( JDBCExecuteOptions, diff --git a/onetl/connection/db_connection/kafka/connection.py b/onetl/connection/db_connection/kafka/connection.py index b64fff14..ce3829e4 100644 --- a/onetl/connection/db_connection/kafka/connection.py +++ b/onetl/connection/db_connection/kafka/connection.py @@ -14,10 +14,9 @@ except (ImportError, AttributeError): from pydantic import root_validator, validator # type: ignore[no-redef, assignment] -from onetl._internal import stringify from onetl._util.java import try_import_java_class from onetl._util.scala import get_default_scala_version -from onetl._util.spark import get_spark_version +from onetl._util.spark import get_spark_version, stringify from onetl._util.version import Version from onetl.connection.db_connection.db_connection import DBConnection from onetl.connection.db_connection.kafka.dialect import KafkaDialect diff --git a/onetl/connection/db_connection/kafka/kafka_kerberos_auth.py b/onetl/connection/db_connection/kafka/kafka_kerberos_auth.py index 6a20a31a..40e9aa55 100644 --- a/onetl/connection/db_connection/kafka/kafka_kerberos_auth.py +++ b/onetl/connection/db_connection/kafka/kafka_kerberos_auth.py @@ -12,8 +12,8 @@ except (ImportError, AttributeError): from pydantic import Field, PrivateAttr, root_validator, validator # type: ignore[no-redef, assignment] -from onetl._internal import stringify from onetl._util.file import get_file_hash, is_file_readable +from onetl._util.spark import stringify from onetl.connection.db_connection.kafka.kafka_auth import KafkaAuth from onetl.impl import GenericOptions, LocalPath, path_repr diff --git a/onetl/connection/db_connection/kafka/kafka_scram_auth.py b/onetl/connection/db_connection/kafka/kafka_scram_auth.py index add09f34..823d0f82 100644 --- a/onetl/connection/db_connection/kafka/kafka_scram_auth.py +++ b/onetl/connection/db_connection/kafka/kafka_scram_auth.py @@ -11,7 +11,7 @@ from typing_extensions import Literal -from onetl._internal import stringify +from onetl._util.spark import stringify from onetl.connection.db_connection.kafka.kafka_auth import KafkaAuth from 
onetl.impl import GenericOptions diff --git a/onetl/connection/db_connection/kafka/kafka_ssl_protocol.py b/onetl/connection/db_connection/kafka/kafka_ssl_protocol.py index 6149f5aa..24dd52f6 100644 --- a/onetl/connection/db_connection/kafka/kafka_ssl_protocol.py +++ b/onetl/connection/db_connection/kafka/kafka_ssl_protocol.py @@ -10,8 +10,8 @@ except (ImportError, AttributeError): from pydantic import Field, SecretStr, validator # type: ignore[no-redef, assignment] -from onetl._internal import stringify from onetl._util.file import is_file_readable +from onetl._util.spark import stringify from onetl.impl import GenericOptions, LocalPath if TYPE_CHECKING: diff --git a/onetl/connection/db_connection/oracle/connection.py b/onetl/connection/db_connection/oracle/connection.py index 8ca1b6ef..04398950 100644 --- a/onetl/connection/db_connection/oracle/connection.py +++ b/onetl/connection/db_connection/oracle/connection.py @@ -19,8 +19,8 @@ from etl_entities.instance import Host -from onetl._internal import clear_statement from onetl._util.classproperty import classproperty +from onetl._util.sql import clear_statement from onetl._util.version import Version from onetl.connection.db_connection.jdbc_connection import JDBCConnection from onetl.connection.db_connection.jdbc_connection.options import JDBCReadOptions diff --git a/onetl/connection/db_connection/teradata/connection.py b/onetl/connection/db_connection/teradata/connection.py index d6ea76ac..6ef2637b 100644 --- a/onetl/connection/db_connection/teradata/connection.py +++ b/onetl/connection/db_connection/teradata/connection.py @@ -7,8 +7,8 @@ from etl_entities.instance import Host -from onetl._internal import stringify from onetl._util.classproperty import classproperty +from onetl._util.spark import stringify from onetl._util.version import Version from onetl.connection.db_connection.jdbc_connection import JDBCConnection from onetl.connection.db_connection.teradata.dialect import TeradataDialect diff --git a/onetl/connection/file_df_connection/spark_s3/connection.py b/onetl/connection/file_df_connection/spark_s3/connection.py index 04da89e0..1efe39d4 100644 --- a/onetl/connection/file_df_connection/spark_s3/connection.py +++ b/onetl/connection/file_df_connection/spark_s3/connection.py @@ -16,11 +16,10 @@ from typing_extensions import Literal -from onetl._internal import stringify from onetl._util.hadoop import get_hadoop_config, get_hadoop_version from onetl._util.java import try_import_java_class from onetl._util.scala import get_default_scala_version -from onetl._util.spark import get_spark_version +from onetl._util.spark import get_spark_version, stringify from onetl._util.version import Version from onetl.base import ( BaseReadableFileFormat, diff --git a/onetl/file/file_downloader/file_downloader.py b/onetl/file/file_downloader/file_downloader.py index 3fe45ff4..069f8c69 100644 --- a/onetl/file/file_downloader/file_downloader.py +++ b/onetl/file/file_downloader/file_downloader.py @@ -22,7 +22,7 @@ except (ImportError, AttributeError): from pydantic import Field, PrivateAttr, root_validator, validator # type: ignore[no-redef, assignment] -from onetl._internal import generate_temp_path +from onetl._util.file import generate_temp_path from onetl.base import BaseFileConnection, BaseFileFilter, BaseFileLimit from onetl.base.path_protocol import PathProtocol from onetl.file.file_downloader.options import FileDownloaderOptions diff --git a/onetl/file/file_uploader/file_uploader.py b/onetl/file/file_uploader/file_uploader.py index 
9ab5f088..fc6709ce 100644 --- a/onetl/file/file_uploader/file_uploader.py +++ b/onetl/file/file_uploader/file_uploader.py @@ -15,7 +15,7 @@ except (ImportError, AttributeError): from pydantic import PrivateAttr, validator # type: ignore[no-redef, assignment] -from onetl._internal import generate_temp_path +from onetl._util.file import generate_temp_path from onetl.base import BaseFileConnection from onetl.base.path_protocol import PathWithStatsProtocol from onetl.base.pure_path_protocol import PurePathProtocol diff --git a/onetl/file/format/csv.py b/onetl/file/format/csv.py index 353a8e98..1c4442fd 100644 --- a/onetl/file/format/csv.py +++ b/onetl/file/format/csv.py @@ -10,8 +10,7 @@ except (ImportError, AttributeError): from pydantic import Field # type: ignore[no-redef, assignment] -from onetl._internal import stringify -from onetl._util.spark import get_spark_version +from onetl._util.spark import get_spark_version, stringify from onetl.file.format.file_format import ReadWriteFileFormat from onetl.hooks import slot, support_hooks diff --git a/onetl/file/format/json.py b/onetl/file/format/json.py index 69887442..085d125e 100644 --- a/onetl/file/format/json.py +++ b/onetl/file/format/json.py @@ -6,7 +6,7 @@ from typing_extensions import Literal -from onetl._internal import stringify +from onetl._util.spark import stringify from onetl.file.format.file_format import ReadOnlyFileFormat from onetl.hooks import slot, support_hooks diff --git a/tests/tests_unit/test_internal_unit/test_generate_temp_path.py b/tests/tests_unit/test_internal_unit/test_generate_temp_path.py index faad170f..0b8f9885 100644 --- a/tests/tests_unit/test_internal_unit/test_generate_temp_path.py +++ b/tests/tests_unit/test_internal_unit/test_generate_temp_path.py @@ -3,13 +3,14 @@ from pathlib import PurePath import pytest -from etl_entities.process import Process -from onetl._internal import generate_temp_path +from onetl._util.file import generate_temp_path @pytest.mark.flaky(reruns=5) def test_generate_temp_path(): + from etl_entities.process import Process + root = PurePath("/path") dt_prefix = datetime.now().strftime("%Y%m%d%H%M") # up to minutes, not seconds diff --git a/tests/tests_unit/tests_db_connection_unit/test_jdbc_options_unit.py b/tests/tests_unit/tests_db_connection_unit/test_jdbc_options_unit.py index 69983888..6d3ff132 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_jdbc_options_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_jdbc_options_unit.py @@ -2,7 +2,6 @@ import pytest -from onetl._internal import to_camel from onetl.connection import MSSQL, Clickhouse, MySQL, Oracle, Postgres, Teradata from onetl.connection.db_connection.jdbc_connection import JDBCTableExistBehavior @@ -181,7 +180,8 @@ def test_jdbc_old_options_allowed_but_deprecated(arg, value): with pytest.warns(UserWarning, match=warning_msg): options = Postgres.Options.parse({arg: value}) - assert options.dict(by_alias=True)[to_camel(arg)] == value + parsed_value = options.dict().get(arg) or options.dict(by_alias=True).get(arg) + assert parsed_value == value @pytest.mark.parametrize( From 0fff2b5e1c46ac59705f8d94fafbd6d787c326b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Thu, 6 Jun 2024 08:28:39 +0000 Subject: [PATCH 04/64] [DOP-16555] Add notes about local Greenplum tests --- CONTRIBUTING.rst | 8 ++++++++ docker-compose.yml | 4 +++- 
tests/fixtures/spark.py | 7 +++---- .../test_clickhouse_writer_integration.py | 2 +- .../test_greenplum_writer_integration.py | 2 +- .../test_kafka_writer_integration.py | 2 +- .../test_mongodb_writer_integration.py | 2 +- .../test_mssql_writer_integration.py | 2 +- .../test_mysql_writer_integration.py | 2 +- .../test_oracle_writer_integration.py | 2 +- .../test_postgres_writer_integration.py | 6 +++--- 11 files changed, 24 insertions(+), 15 deletions(-) diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index e7a60fc1..7a70dbac 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -176,6 +176,14 @@ Without docker-compose * Download `VMware Greenplum connector for Spark `_ * Either move it to ``~/.ivy2/jars/``, or pass file path to ``CLASSPATH`` + * Set environment variable ``ONETL_GP_PACKAGE_VERSION=local``. + * On Linux, you may have to set environment variable ``SPARK_EXTERNAL_IP`` to IP of ``onetl_onetl`` network gateway: + + .. code:: bash + + export SPARK_EXTERNAL_IP=$(docker network inspect onetl_onetl --format '{{ (index .IPAM.Config 0).Gateway }}') + + This is because in some cases Spark does not properly detect hsot machine IP address, so Greenplum segments cannot connect to Spark executors. Start all containers with dependencies: diff --git a/docker-compose.yml b/docker-compose.yml index 3a61170e..34f2c4fe 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -24,13 +24,15 @@ services: # no dependencies from other containers to allow running limited set of tests instead of all greenplum: - image: ${GREENPLUM_IMAGE:-andruche/greenplum:7.0.0} + image: ${GREENPLUM_IMAGE:-andruche/greenplum:6} restart: unless-stopped env_file: .env.dependencies ports: - 5433:5432 networks: - onetl + extra_hosts: + - host.docker.internal:host-gateway sysctls: - net.ipv6.conf.all.disable_ipv6=1 diff --git a/tests/fixtures/spark.py b/tests/fixtures/spark.py index dbc03ba3..e7248e84 100644 --- a/tests/fixtures/spark.py +++ b/tests/fixtures/spark.py @@ -73,11 +73,12 @@ def maven_packages(request): if "teradata" in markers: packages.extend(Teradata.get_packages()) - if "greenplum" in markers: + gp_package_version = os.getenv("ONETL_GP_PACKAGE_VERSION") + if "greenplum" in markers and gp_package_version != "local": packages.extend( Greenplum.get_packages( spark_version=str(pyspark_version), - package_version=os.getenv("ONETL_GP_PACKAGE_VERSION") or None, + package_version=gp_package_version, ), ) @@ -139,8 +140,6 @@ def get_spark_session(warehouse_dir, spark_metastore_dir, ivysettings_path, mave .config("spark.driver.memory", "1g") .config("spark.driver.maxResultSize", "1g") .config("spark.executor.cores", "1") - .config("spark.driver.bindAddress", "127.0.0.1") # prevent Spark from unreachable network connection - .config("spark.driver.host", "127.0.0.1") .config("spark.executor.memory", "1g") .config("spark.executor.allowSparkContext", "true") # Greenplum uses SparkContext on executor if master==local .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") diff --git a/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_clickhouse_writer_integration.py b/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_clickhouse_writer_integration.py index 459794b3..884cd015 100644 --- a/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_clickhouse_writer_integration.py +++ b/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_clickhouse_writer_integration.py @@ -6,7 +6,7 @@ 
pytestmark = pytest.mark.clickhouse -def test_clickhouse_writer_snapshot(spark, processing, prepare_schema_table): +def test_clickhouse_writer(spark, processing, prepare_schema_table): df = processing.create_spark_df(spark=spark) clickhouse = Clickhouse( diff --git a/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_greenplum_writer_integration.py b/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_greenplum_writer_integration.py index 338de0c6..d25d38f8 100644 --- a/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_greenplum_writer_integration.py +++ b/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_greenplum_writer_integration.py @@ -16,7 +16,7 @@ {"if_exists": "ignore"}, ], ) -def test_greenplum_writer_snapshot(spark, processing, get_schema_table, options): +def test_greenplum_writer(spark, processing, get_schema_table, options): df = processing.create_spark_df(spark=spark) greenplum = Greenplum( diff --git a/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_kafka_writer_integration.py b/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_kafka_writer_integration.py index cf045b31..dd4c045f 100644 --- a/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_kafka_writer_integration.py +++ b/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_kafka_writer_integration.py @@ -62,7 +62,7 @@ def kafka_spark_df(spark, kafka_processing): return spark.createDataFrame(data, schema=schema) -def test_kafka_writer_snapshot(spark, kafka_processing, kafka_spark_df): +def test_kafka_writer(spark, kafka_processing, kafka_spark_df): from pyspark.sql.functions import lit if get_spark_version(spark).major < 3: diff --git a/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_mongodb_writer_integration.py b/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_mongodb_writer_integration.py index 503d2ee2..edfd2151 100644 --- a/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_mongodb_writer_integration.py +++ b/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_mongodb_writer_integration.py @@ -19,7 +19,7 @@ {"if_exists": "ignore"}, ], ) -def test_mongodb_writer_snapshot(spark, processing, get_schema_table, options, caplog): +def test_mongodb_writer(spark, processing, get_schema_table, options, caplog): df = processing.create_spark_df(spark=spark) mongo = MongoDB( diff --git a/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_mssql_writer_integration.py b/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_mssql_writer_integration.py index 3e6cf35b..6f79f5fa 100644 --- a/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_mssql_writer_integration.py +++ b/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_mssql_writer_integration.py @@ -6,7 +6,7 @@ pytestmark = pytest.mark.mssql -def test_mssql_writer_snapshot(spark, processing, prepare_schema_table): +def test_mssql_writer(spark, processing, prepare_schema_table): df = processing.create_spark_df(spark=spark) mssql = MSSQL( diff --git a/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_mysql_writer_integration.py 
b/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_mysql_writer_integration.py index 86bc7cbb..5a345971 100644 --- a/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_mysql_writer_integration.py +++ b/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_mysql_writer_integration.py @@ -6,7 +6,7 @@ pytestmark = pytest.mark.mysql -def test_mysql_writer_snapshot(spark, processing, prepare_schema_table): +def test_mysql_writer(spark, processing, prepare_schema_table): df = processing.create_spark_df(spark=spark) mysql = MySQL( diff --git a/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_oracle_writer_integration.py b/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_oracle_writer_integration.py index f5083bab..779fc2e9 100644 --- a/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_oracle_writer_integration.py +++ b/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_oracle_writer_integration.py @@ -6,7 +6,7 @@ pytestmark = pytest.mark.oracle -def test_oracle_writer_snapshot(spark, processing, prepare_schema_table): +def test_oracle_writer(spark, processing, prepare_schema_table): df = processing.create_spark_df(spark=spark) oracle = Oracle( diff --git a/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_postgres_writer_integration.py b/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_postgres_writer_integration.py index ed651948..57483da0 100644 --- a/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_postgres_writer_integration.py +++ b/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_postgres_writer_integration.py @@ -16,7 +16,7 @@ {"if_exists": "ignore"}, ], ) -def test_postgres_writer_snapshot(spark, processing, get_schema_table, options): +def test_postgres_writer(spark, processing, get_schema_table, options): df = processing.create_spark_df(spark=spark) postgres = Postgres( @@ -44,7 +44,7 @@ def test_postgres_writer_snapshot(spark, processing, get_schema_table, options): ) -def test_postgres_writer_snapshot_with_dict_options(spark, processing, prepare_schema_table): +def test_postgres_writer_with_dict_options(spark, processing, prepare_schema_table): df = processing.create_spark_df(spark=spark) postgres = Postgres( @@ -72,7 +72,7 @@ def test_postgres_writer_snapshot_with_dict_options(spark, processing, prepare_s ) -def test_postgres_writer_snapshot_with_pydantic_options(spark, processing, prepare_schema_table): +def test_postgres_writer_with_pydantic_options(spark, processing, prepare_schema_table): df = processing.create_spark_df(spark=spark) postgres = Postgres( From 287eddf0acf556dbe94b88c92e625a4914752fb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Mon, 10 Jun 2024 09:54:40 +0000 Subject: [PATCH 05/64] Update README --- README.rst | 99 ++++++++++++++++++++++++------------------------------ 1 file changed, 43 insertions(+), 56 deletions(-) diff --git a/README.rst b/README.rst index 4f97ce08..0a4cbc97 100644 --- a/README.rst +++ b/README.rst @@ -65,62 +65,49 @@ Supported storages Database ~~~~~~~~ 
-+--------------+-------------------------------------------------------------------------------------------------------------------------+ -| Storage | Powered by | -+==============+=========================================================================================================================+ -| Clickhouse | Apache Spark `JDBC Data Source `_ | -+--------------+-------------------------------------------------------------------------------------------------------------------------+ -| MSSQL | Apache Spark `JDBC Data Source `_ | -+--------------+-------------------------------------------------------------------------------------------------------------------------+ -| MySQL | Apache Spark `JDBC Data Source `_ | -+--------------+-------------------------------------------------------------------------------------------------------------------------+ -| Postgres | Apache Spark `JDBC Data Source `_ | -+--------------+-------------------------------------------------------------------------------------------------------------------------+ -| Oracle | Apache Spark `JDBC Data Source `_ | -+--------------+-------------------------------------------------------------------------------------------------------------------------+ -| Teradata | Apache Spark `JDBC Data Source `_ | -+--------------+-------------------------------------------------------------------------------------------------------------------------+ -| Hive | Apache Spark `Hive integration `_ | -+--------------+-------------------------------------------------------------------------------------------------------------------------+ -| Kafka | Apache Spark `Kafka integration `_ | -+--------------+-------------------------------------------------------------------------------------------------------------------------+ -| Greenplum | VMware `Greenplum Spark connector `_ | -+--------------+-------------------------------------------------------------------------------------------------------------------------+ -| MongoDB | `MongoDB Spark connector `_ | -+--------------+-------------------------------------------------------------------------------------------------------------------------+ - -File -~~~~ -+--------------+--------------------------------------------------------------------+ -| Storage | Powered by | -+==============+====================================================================+ -| HDFS | `HDFS Python client `_ | -+--------------+--------------------------------------------------------------------+ -| S3 | `minio-py client `_ | -+--------------+--------------------------------------------------------------------+ -| SFTP | `Paramiko library `_ | -+--------------+--------------------------------------------------------------------+ -| FTP | `FTPUtil library `_ | -+--------------+--------------------------------------------------------------------+ -| FTPS | `FTPUtil library `_ | -+--------------+--------------------------------------------------------------------+ -| WebDAV | `WebdavClient3 library `_ | -+--------------+--------------------------------------------------------------------+ -| Samba | `pysmb library `_ | -+--------------+--------------------------------------------------------------------+ - -Files as DataFrame -~~~~~~~~~~~~~~~~~~ - -+--------------+---------------------------------------------------------------------------------------------------------------+ -| Storage | Powered by | 
-+==============+===============================================================================================================+ -| SparkLocalFS | Apache Spark `File Data Source `_ | -+--------------+---------------------------------------------------------------------------------------------------------------+ -| SparkHDFS | Apache Spark `File Data Source `_ | -+--------------+---------------------------------------------------------------------------------------------------------------+ -| SparkS3 | `Hadoop AWS `_ library | -+--------------+---------------------------------------------------------------------------------------------------------------+ ++--------------------+--------------+-------------------------------------------------------------------------------------------------------------------------+ +| Type | Storage | Powered by | ++====================+==============+=========================================================================================================================+ +| Database | Clickhouse | Apache Spark `JDBC Data Source `_ | ++ +--------------+ + +| | MSSQL | | ++ +--------------+ + +| | MySQL | | ++ +--------------+ + +| | Postgres | | ++ +--------------+ + +| | Oracle | | ++ +--------------+ + +| | Teradata | | ++ +--------------+-------------------------------------------------------------------------------------------------------------------------+ +| | Hive | Apache Spark `Hive integration `_ | ++ +--------------+-------------------------------------------------------------------------------------------------------------------------+ +| | Kafka | Apache Spark `Kafka integration `_ | ++ +--------------+-------------------------------------------------------------------------------------------------------------------------+ +| | Greenplum | VMware `Greenplum Spark connector `_ | ++ +--------------+-------------------------------------------------------------------------------------------------------------------------+ +| | MongoDB | `MongoDB Spark connector `_ | ++--------------------+--------------+-------------------------------------------------------------------------------------------------------------------------+ +| File | HDFS | `HDFS Python client `_ | ++ +--------------+-------------------------------------------------------------------------------------------------------------------------+ +| | S3 | `minio-py client `_ | ++ +--------------+-------------------------------------------------------------------------------------------------------------------------+ +| | SFTP | `Paramiko library `_ | ++ +--------------+-------------------------------------------------------------------------------------------------------------------------+ +| | FTP | `FTPUtil library `_ | ++ +--------------+ + +| | FTPS | | ++ +--------------+-------------------------------------------------------------------------------------------------------------------------+ +| | WebDAV | `WebdavClient3 library `_ | ++ +--------------+-------------------------------------------------------------------------------------------------------------------------+ +| | Samba | `pysmb library `_ | ++--------------------+--------------+-------------------------------------------------------------------------------------------------------------------------+ +| Files as DataFrame | SparkLocalFS | Apache Spark `File Data Source `_ | +| +--------------+ + +| | SparkHDFS | | +| 
+--------------+-------------------------------------------------------------------------------------------------------------------------+ +| | SparkS3 | `Hadoop AWS `_ library | ++--------------------+--------------+-------------------------------------------------------------------------------------------------------------------------+ .. documentation From 3211aaf26764ba77665fc21757f7b2ab7527656b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 10 Jun 2024 21:13:14 +0000 Subject: [PATCH 06/64] [pre-commit.ci] pre-commit autoupdate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/asottile/pyupgrade: v3.15.2 → v3.16.0](https://github.com/asottile/pyupgrade/compare/v3.15.2...v3.16.0) --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index baa40c29..202ed4f5 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -90,7 +90,7 @@ repos: - id: text-unicode-replacement-char - repo: https://github.com/asottile/pyupgrade - rev: v3.15.2 + rev: v3.16.0 hooks: - id: pyupgrade args: [--py37-plus, --keep-runtime-typing] From ce5a13526b8940bc2933b9ee7df4fc42f3bfd3ca Mon Sep 17 00:00:00 2001 From: maxim-lixakov Date: Thu, 13 Jun 2024 16:44:37 +0300 Subject: [PATCH 07/64] [DOP-16103] - add class format to HiveWriteOptions --- docs/changelog/next_release/292.feature.rst | 1 + .../db_connection/hive/connection.py | 27 ++++++++++---- .../connection/db_connection/hive/options.py | 35 +++++++++++++++++-- .../test_hive_writer_integration.py | 18 ++++++---- 4 files changed, 67 insertions(+), 14 deletions(-) create mode 100644 docs/changelog/next_release/292.feature.rst diff --git a/docs/changelog/next_release/292.feature.rst b/docs/changelog/next_release/292.feature.rst new file mode 100644 index 00000000..e50a5fcd --- /dev/null +++ b/docs/changelog/next_release/292.feature.rst @@ -0,0 +1 @@ +Add support for specifying file formats (``ORC``, ``Parquet``, ``CSV``, etc.) in ``HiveWriteOptions.format``: ``Hive.WriteOptions(format=ORC(compression="snappy"))``. diff --git a/onetl/connection/db_connection/hive/connection.py b/onetl/connection/db_connection/hive/connection.py index 7fcb4dce..857f8836 100644 --- a/onetl/connection/db_connection/hive/connection.py +++ b/onetl/connection/db_connection/hive/connection.py @@ -23,6 +23,7 @@ HiveWriteOptions, ) from onetl.connection.db_connection.hive.slots import HiveSlots +from onetl.file.format.file_format import ReadWriteFileFormat from onetl.hooks import slot, support_hooks from onetl.hwm import Window from onetl.log import log_lines, log_with_indent @@ -423,7 +424,12 @@ def _insert_into( ) -> None: write_options = self.WriteOptions.parse(options) - unsupported_options = write_options.dict(by_alias=True, exclude_unset=True, exclude={"if_exists"}) + unsupported_options = write_options.dict( + by_alias=True, + exclude_unset=True, + exclude_defaults=True, + exclude={"if_exists"}, + ) if unsupported_options: log.warning( "|%s| User-specified options %r are ignored while inserting into existing table. 
" @@ -458,17 +464,26 @@ def _save_as_table( write_options = self.WriteOptions.parse(options) writer = df.write - for method, value in write_options.dict(by_alias=True, exclude_none=True, exclude={"if_exists"}).items(): - # is the arguments that will be passed to the - # format orc, parquet methods and format simultaneously + for method, value in write_options.dict( # noqa: WPS352 + by_alias=True, + exclude_none=True, + exclude={"if_exists", "format"}, + ).items(): if hasattr(writer, method): if isinstance(value, Iterable) and not isinstance(value, str): - writer = getattr(writer, method)(*value) # noqa: WPS220 + writer = getattr(writer, method)(*value) else: - writer = getattr(writer, method)(value) # noqa: WPS220 + writer = getattr(writer, method)(value) else: writer = writer.option(method, value) + # deserialize passed OCR(), Parquet(), CSV(), etc. file formats + if isinstance(write_options.format, ReadWriteFileFormat): + writer = writer.format(write_options.format.name) + writer = writer.options(**write_options.format.dict()) + elif isinstance(write_options.format, str): + writer = writer.format(write_options.format) + mode = "append" if write_options.if_exists == HiveTableExistBehavior.APPEND else "overwrite" log.info("|%s| Saving data to a table %r ...", self.__class__.__name__, table) diff --git a/onetl/connection/db_connection/hive/options.py b/onetl/connection/db_connection/hive/options.py index a196487a..f6b5fde8 100644 --- a/onetl/connection/db_connection/hive/options.py +++ b/onetl/connection/db_connection/hive/options.py @@ -13,6 +13,7 @@ from typing_extensions import deprecated +from onetl.file.format.file_format import ReadWriteFileFormat from onetl.impl import GenericOptions @@ -198,10 +199,30 @@ class Config: does not affect behavior. """ - format: str = "orc" + format: Union[str, ReadWriteFileFormat] = "orc" """Format of files which should be used for storing table data. - Examples: ``orc`` (default), ``parquet``, ``csv`` (NOT recommended) + Examples + -------- + + - string format: ``"orc"`` (default), ``"parquet"``, ``"csv"`` (NOT recommended). + - format class instance: ``ORC(compression="snappy")``, ``Parquet()``, ``CSV(header=True, delimiter=",")``. + + .. code:: + + options = Hive.WriteOptions( + if_exists="append", + partition_by="reg_id", + format="orc", + ) + + # or using an ORC format class instance: + + options = Hive.WriteOptions( + if_exists="append", + partition_by="reg_id", + format=ORC(compression="snappy"), + ) .. 
note:: @@ -285,6 +306,16 @@ class Config: Used **only** while **creating new table**, or in case of ``if_exists=replace_entire_table`` """ + def dict(self, **kwargs): + d = super().dict(**kwargs) + if isinstance(self.format, ReadWriteFileFormat): + if self.format.name != self.__fields__["format"].default: + d["format"] = self.format.name + elif "format" in d: + d.pop("format") + d.update(self.format.dict(exclude={"name"})) + return d + @validator("sort_by") def _sort_by_cannot_be_used_without_bucket_by(cls, sort_by, values): options = values.copy() diff --git a/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_hive_writer_integration.py b/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_hive_writer_integration.py index a841f09f..20e2dc94 100644 --- a/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_hive_writer_integration.py +++ b/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_hive_writer_integration.py @@ -7,6 +7,7 @@ from onetl._util.spark import get_spark_version from onetl.connection import Hive from onetl.db import DBWriter +from onetl.file.format import CSV, ORC, Parquet pytestmark = pytest.mark.hive @@ -69,14 +70,17 @@ def test_hive_writer_with_options(spark, processing, get_schema_table, options): @pytest.mark.parametrize( - "options, fmt", + "options, format", [ - (Hive.WriteOptions(format="orc"), "orc"), (Hive.WriteOptions(), "orc"), # default + (Hive.WriteOptions(format="orc"), "orc"), (Hive.WriteOptions(format="parquet"), "parquet"), + (Hive.WriteOptions(format=ORC(compression="snappy")), "orc"), + (Hive.WriteOptions(format=CSV(sep=",", encoding="utf-8", inferSchema=True, compression="gzip")), "csv"), + (Hive.WriteOptions(format=Parquet(compression="snappy")), "parquet"), ], ) -def test_hive_writer_with_format(spark, processing, get_schema_table, options, fmt): +def test_hive_writer_with_format(spark, processing, get_schema_table, options, format): df = processing.create_spark_df(spark) hive = Hive(cluster="rnd-dwh", spark=spark) @@ -90,7 +94,7 @@ def test_hive_writer_with_format(spark, processing, get_schema_table, options, f response = hive.sql(f"SHOW CREATE TABLE {get_schema_table.full_name}") response = response.collect()[0][0] - assert f"USING {fmt}" in response + assert f"USING {format}" in response @pytest.mark.parametrize( @@ -264,11 +268,13 @@ def test_hive_writer_create_table_if_exists(spark, processing, get_schema_table, Hive.WriteOptions(bucketBy=(5, "id_int"), sortBy="hwm_int"), "{'bucketBy': (5, 'id_int'), 'sortBy': 'hwm_int'}", ), - (Hive.WriteOptions(compression="snappy"), "{'compression': 'snappy'}"), (Hive.WriteOptions(format="parquet"), "{'format': 'parquet'}"), + (Hive.WriteOptions(format=Parquet()), "{'format': 'parquet'}"), + (Hive.WriteOptions(compression="snappy"), "{'compression': 'snappy'}"), + (Hive.WriteOptions(format=ORC(compression="snappy")), "{'compression': 'snappy'}"), ], ) -def test_hive_writer_insert_into_with_options(spark, processing, get_schema_table, options, option_kv, caplog): +def test_hive_writer_insert_into_with_options_ignored(spark, processing, get_schema_table, options, option_kv, caplog): df = processing.create_spark_df(spark) hive = Hive(cluster="rnd-dwh", spark=spark) From 539da6a6ab123795de1215166e9746cfcd94889f Mon Sep 17 00:00:00 2001 From: maxim-lixakov Date: Fri, 14 Jun 2024 11:31:13 +0300 Subject: [PATCH 08/64] [DOP-16103] - move format logic to _format_write_options method --- 
.../db_connection/hive/connection.py | 33 ++++++++++++------- .../connection/db_connection/hive/options.py | 12 ++----- 2 files changed, 24 insertions(+), 21 deletions(-) diff --git a/onetl/connection/db_connection/hive/connection.py b/onetl/connection/db_connection/hive/connection.py index 857f8836..0a2b0ffa 100644 --- a/onetl/connection/db_connection/hive/connection.py +++ b/onetl/connection/db_connection/hive/connection.py @@ -23,7 +23,7 @@ HiveWriteOptions, ) from onetl.connection.db_connection.hive.slots import HiveSlots -from onetl.file.format.file_format import ReadWriteFileFormat +from onetl.file.format.file_format import WriteOnlyFileFormat from onetl.hooks import slot, support_hooks from onetl.hwm import Window from onetl.log import log_lines, log_with_indent @@ -423,13 +423,7 @@ def _insert_into( options: HiveWriteOptions | dict | None = None, ) -> None: write_options = self.WriteOptions.parse(options) - - unsupported_options = write_options.dict( - by_alias=True, - exclude_unset=True, - exclude_defaults=True, - exclude={"if_exists"}, - ) + unsupported_options = self._format_write_options(write_options) if unsupported_options: log.warning( "|%s| User-specified options %r are ignored while inserting into existing table. " @@ -455,6 +449,24 @@ def _insert_into( log.info("|%s| Data is successfully inserted into table %r.", self.__class__.__name__, table) + def _format_write_options(self, write_options: HiveWriteOptions) -> dict: + options_dict = write_options.dict( + by_alias=True, + exclude_unset=True, + exclude_defaults=True, + exclude={"if_exists"}, + ) + + if isinstance(write_options.format, WriteOnlyFileFormat): + if write_options.format.name != HiveWriteOptions.__fields__["format"].default: + options_dict["format"] = write_options.format.name + elif "format" in options_dict: + options_dict.pop("format") # remove format key if it matches the default + + options_dict.update(write_options.format.dict(exclude={"name"})) + + return options_dict + def _save_as_table( self, df: DataFrame, @@ -478,9 +490,8 @@ def _save_as_table( writer = writer.option(method, value) # deserialize passed OCR(), Parquet(), CSV(), etc. 
file formats - if isinstance(write_options.format, ReadWriteFileFormat): - writer = writer.format(write_options.format.name) - writer = writer.options(**write_options.format.dict()) + if isinstance(write_options.format, WriteOnlyFileFormat): + writer = write_options.format.apply_to_writer(writer) elif isinstance(write_options.format, str): writer = writer.format(write_options.format) diff --git a/onetl/connection/db_connection/hive/options.py b/onetl/connection/db_connection/hive/options.py index f6b5fde8..9b059a83 100644 --- a/onetl/connection/db_connection/hive/options.py +++ b/onetl/connection/db_connection/hive/options.py @@ -218,6 +218,8 @@ class Config: # or using an ORC format class instance: + from onetl.file.format import ORC + options = Hive.WriteOptions( if_exists="append", partition_by="reg_id", @@ -306,16 +308,6 @@ class Config: Used **only** while **creating new table**, or in case of ``if_exists=replace_entire_table`` """ - def dict(self, **kwargs): - d = super().dict(**kwargs) - if isinstance(self.format, ReadWriteFileFormat): - if self.format.name != self.__fields__["format"].default: - d["format"] = self.format.name - elif "format" in d: - d.pop("format") - d.update(self.format.dict(exclude={"name"})) - return d - @validator("sort_by") def _sort_by_cannot_be_used_without_bucket_by(cls, sort_by, values): options = values.copy() From dc052288b945c3f3787975bb5aa14066929a98bf Mon Sep 17 00:00:00 2001 From: maxim-lixakov Date: Fri, 14 Jun 2024 13:28:57 +0300 Subject: [PATCH 09/64] [DOP-16103] - improve logging --- docs/connection/db_connection/hive/write.rst | 4 ++++ onetl/connection/db_connection/hive/connection.py | 7 +------ .../test_hive_writer_integration.py | 3 ++- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/connection/db_connection/hive/write.rst b/docs/connection/db_connection/hive/write.rst index 95e54a5a..6b7ca91e 100644 --- a/docs/connection/db_connection/hive/write.rst +++ b/docs/connection/db_connection/hive/write.rst @@ -5,6 +5,10 @@ Writing to Hive using ``DBWriter`` For writing data to Hive, use :obj:`DBWriter `. +.. warning:: + When using ``DBWriter`` with ``Hive.WriteOptions``, the default spark data format configured in ``spark.sql.sources.default`` is overridden to use ``orc`` by default. This may affect performance and storage characteristics. 
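A short illustration of the behaviour this warning describes, assuming the ``Hive.WriteOptions`` API exercised in the tests above (comments state the expected outcome of writing with these options, not return values of the calls):

.. code-block:: python

    from onetl.connection import Hive

    # spark.sql.sources.default (e.g. "parquet") is not consulted here:
    Hive.WriteOptions(if_exists="append")  # new table is created as ORC
    Hive.WriteOptions(if_exists="append", format="parquet")  # explicit format wins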
+ + Examples -------- diff --git a/onetl/connection/db_connection/hive/connection.py b/onetl/connection/db_connection/hive/connection.py index 0a2b0ffa..81c50e87 100644 --- a/onetl/connection/db_connection/hive/connection.py +++ b/onetl/connection/db_connection/hive/connection.py @@ -453,16 +453,11 @@ def _format_write_options(self, write_options: HiveWriteOptions) -> dict: options_dict = write_options.dict( by_alias=True, exclude_unset=True, - exclude_defaults=True, exclude={"if_exists"}, ) if isinstance(write_options.format, WriteOnlyFileFormat): - if write_options.format.name != HiveWriteOptions.__fields__["format"].default: - options_dict["format"] = write_options.format.name - elif "format" in options_dict: - options_dict.pop("format") # remove format key if it matches the default - + options_dict["format"] = write_options.format.name options_dict.update(write_options.format.dict(exclude={"name"})) return options_dict diff --git a/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_hive_writer_integration.py b/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_hive_writer_integration.py index 20e2dc94..97f774f7 100644 --- a/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_hive_writer_integration.py +++ b/tests/tests_integration/tests_core_integration/tests_db_writer_integration/test_hive_writer_integration.py @@ -271,7 +271,8 @@ def test_hive_writer_create_table_if_exists(spark, processing, get_schema_table, (Hive.WriteOptions(format="parquet"), "{'format': 'parquet'}"), (Hive.WriteOptions(format=Parquet()), "{'format': 'parquet'}"), (Hive.WriteOptions(compression="snappy"), "{'compression': 'snappy'}"), - (Hive.WriteOptions(format=ORC(compression="snappy")), "{'compression': 'snappy'}"), + (Hive.WriteOptions(format="orc"), "{'format': 'orc'}"), + (Hive.WriteOptions(format=ORC(compression="snappy")), "{'format': 'orc', 'compression': 'snappy'}"), ], ) def test_hive_writer_insert_into_with_options_ignored(spark, processing, get_schema_table, options, option_kv, caplog): From 4e2c681bcd4a323646694d5315296213fedce2c8 Mon Sep 17 00:00:00 2001 From: Maxim Liksakov <67663774+maxim-lixakov@users.noreply.github.com> Date: Fri, 14 Jun 2024 13:42:44 +0300 Subject: [PATCH 10/64] Update docs/connection/db_connection/hive/write.rst Co-authored-by: Maxim Martynov --- docs/connection/db_connection/hive/write.rst | 10 +++++----- onetl/connection/db_connection/hive/options.py | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/connection/db_connection/hive/write.rst b/docs/connection/db_connection/hive/write.rst index 6b7ca91e..cd707fae 100644 --- a/docs/connection/db_connection/hive/write.rst +++ b/docs/connection/db_connection/hive/write.rst @@ -5,9 +5,6 @@ Writing to Hive using ``DBWriter`` For writing data to Hive, use :obj:`DBWriter `. -.. warning:: - When using ``DBWriter`` with ``Hive.WriteOptions``, the default spark data format configured in ``spark.sql.sources.default`` is overridden to use ``orc`` by default. This may affect performance and storage characteristics. - Examples -------- @@ -55,13 +52,16 @@ Use column-based write formats ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Prefer these write formats: - * `ORC `_ + * `ORC `_ (**default**) * `Parquet `_ * `Iceberg `_ * `Hudi `_ * `Delta `_ -For colum-based write formats, each file contains separated sections there column data is stored. The file footer contains +.. 
warning:: + When using ``DBWriter``, the default spark data format configured in ``spark.sql.sources.default`` is ignored, as ``Hive.WriteOptions(format=...)`` default value is explicitly set to ``orc``. + +For column-based write formats, each file contains separated sections where column data is stored. The file footer contains location of each column section/group. Spark can use this information to load only sections required by specific query, e.g. only selected columns, to drastically speed up the query. diff --git a/onetl/connection/db_connection/hive/options.py b/onetl/connection/db_connection/hive/options.py index 9b059a83..16d21a0e 100644 --- a/onetl/connection/db_connection/hive/options.py +++ b/onetl/connection/db_connection/hive/options.py @@ -13,7 +13,7 @@ from typing_extensions import deprecated -from onetl.file.format.file_format import ReadWriteFileFormat +from onetl.file.format.file_format import WriteOnlyFileFormat from onetl.impl import GenericOptions @@ -199,7 +199,7 @@ class Config: does not affect behavior. """ - format: Union[str, ReadWriteFileFormat] = "orc" + format: Union[str, WriteOnlyFileFormat] = "orc" """Format of files which should be used for storing table data. Examples From a0442768b51f8aeb0681d8a0a7de370821a27d2c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 17 Jun 2024 06:26:56 +0000 Subject: [PATCH 11/64] Bump mikefarah/yq from 4.44.1 to 4.44.2 in the github-actions group Bumps the github-actions group with 1 update: [mikefarah/yq](https://github.com/mikefarah/yq). Updates `mikefarah/yq` from 4.44.1 to 4.44.2 - [Release notes](https://github.com/mikefarah/yq/releases) - [Changelog](https://github.com/mikefarah/yq/blob/master/release_notes.txt) - [Commits](https://github.com/mikefarah/yq/compare/v4.44.1...v4.44.2) --- updated-dependencies: - dependency-name: mikefarah/yq dependency-type: direct:production update-type: version-update:semver-patch dependency-group: github-actions ... 
Signed-off-by: dependabot[bot] --- .github/workflows/get-matrix.yml | 38 ++++++++++++++++---------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/.github/workflows/get-matrix.yml b/.github/workflows/get-matrix.yml index 8f024cf8..5963ff9c 100644 --- a/.github/workflows/get-matrix.yml +++ b/.github/workflows/get-matrix.yml @@ -154,7 +154,7 @@ jobs: - name: Get Core matrix id: matrix-core - uses: mikefarah/yq@v4.44.1 + uses: mikefarah/yq@v4.44.2 with: cmd: yq -o=json '.matrix' .github/workflows/data/core/matrix.yml @@ -184,7 +184,7 @@ jobs: - name: Get Clickhouse matrix id: matrix-clickhouse - uses: mikefarah/yq@v4.44.1 + uses: mikefarah/yq@v4.44.2 with: cmd: yq -o=json '.matrix' .github/workflows/data/clickhouse/matrix.yml @@ -214,7 +214,7 @@ jobs: - name: Get Greenplum matrix id: matrix-greenplum - uses: mikefarah/yq@v4.44.1 + uses: mikefarah/yq@v4.44.2 with: cmd: yq -o=json '.matrix' .github/workflows/data/greenplum/matrix.yml @@ -244,7 +244,7 @@ jobs: - name: Get Hive matrix id: matrix-hive - uses: mikefarah/yq@v4.44.1 + uses: mikefarah/yq@v4.44.2 with: cmd: yq -o=json '.matrix' .github/workflows/data/hive/matrix.yml @@ -274,7 +274,7 @@ jobs: - name: Get Kafka matrix id: matrix-kafka - uses: mikefarah/yq@v4.44.1 + uses: mikefarah/yq@v4.44.2 with: cmd: yq -o=json '.matrix' .github/workflows/data/kafka/matrix.yml @@ -304,7 +304,7 @@ jobs: - name: Get LocalFS matrix id: matrix-local-fs - uses: mikefarah/yq@v4.44.1 + uses: mikefarah/yq@v4.44.2 with: cmd: yq -o=json '.matrix' .github/workflows/data/local-fs/matrix.yml @@ -334,7 +334,7 @@ jobs: - name: Get MongoDB matrix id: matrix-mongodb - uses: mikefarah/yq@v4.44.1 + uses: mikefarah/yq@v4.44.2 with: cmd: yq -o=json '.matrix' .github/workflows/data/mongodb/matrix.yml @@ -364,7 +364,7 @@ jobs: - name: Get MSSQL matrix id: matrix-mssql - uses: mikefarah/yq@v4.44.1 + uses: mikefarah/yq@v4.44.2 with: cmd: yq -o=json '.matrix' .github/workflows/data/mssql/matrix.yml @@ -394,7 +394,7 @@ jobs: - name: Get MySQL matrix id: matrix-mysql - uses: mikefarah/yq@v4.44.1 + uses: mikefarah/yq@v4.44.2 with: cmd: yq -o=json '.matrix' .github/workflows/data/mysql/matrix.yml @@ -424,7 +424,7 @@ jobs: - name: Get Oracle matrix id: matrix-oracle - uses: mikefarah/yq@v4.44.1 + uses: mikefarah/yq@v4.44.2 with: cmd: yq -o=json '.matrix' .github/workflows/data/oracle/matrix.yml @@ -454,7 +454,7 @@ jobs: - name: Get Postgres matrix id: matrix-postgres - uses: mikefarah/yq@v4.44.1 + uses: mikefarah/yq@v4.44.2 with: cmd: yq -o=json '.matrix' .github/workflows/data/postgres/matrix.yml @@ -484,7 +484,7 @@ jobs: - name: Get Teradata matrix id: matrix-teradata - uses: mikefarah/yq@v4.44.1 + uses: mikefarah/yq@v4.44.2 with: cmd: yq -o=json '.matrix' .github/workflows/data/teradata/matrix.yml @@ -514,7 +514,7 @@ jobs: - name: Get FTP matrix id: matrix-ftp - uses: mikefarah/yq@v4.44.1 + uses: mikefarah/yq@v4.44.2 with: cmd: yq -o=json '.matrix' .github/workflows/data/ftp/matrix.yml @@ -544,7 +544,7 @@ jobs: - name: Get FTPS matrix id: matrix-ftps - uses: mikefarah/yq@v4.44.1 + uses: mikefarah/yq@v4.44.2 with: cmd: yq -o=json '.matrix' .github/workflows/data/ftps/matrix.yml @@ -574,7 +574,7 @@ jobs: - name: Get HDFS matrix id: matrix-hdfs - uses: mikefarah/yq@v4.44.1 + uses: mikefarah/yq@v4.44.2 with: cmd: yq -o=json '.matrix' .github/workflows/data/hdfs/matrix.yml @@ -604,7 +604,7 @@ jobs: - name: Get S3 matrix id: matrix-s3 - uses: mikefarah/yq@v4.44.1 + uses: mikefarah/yq@v4.44.2 with: cmd: yq -o=json '.matrix' .github/workflows/data/s3/matrix.yml @@ 
-634,7 +634,7 @@ jobs: - name: Get SFTP matrix id: matrix-sftp - uses: mikefarah/yq@v4.44.1 + uses: mikefarah/yq@v4.44.2 with: cmd: yq -o=json '.matrix' .github/workflows/data/sftp/matrix.yml @@ -664,7 +664,7 @@ jobs: - name: Get Samba matrix id: matrix-samba - uses: mikefarah/yq@v4.44.1 + uses: mikefarah/yq@v4.44.2 with: cmd: yq -o=json '.matrix' .github/workflows/data/samba/matrix.yml @@ -694,6 +694,6 @@ jobs: - name: Get WebDAV matrix id: matrix-webdav - uses: mikefarah/yq@v4.44.1 + uses: mikefarah/yq@v4.44.2 with: cmd: yq -o=json '.matrix' .github/workflows/data/webdav/matrix.yml From 86714c9f7a99e640e98a99625ba291de80819571 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Mon, 17 Jun 2024 08:42:01 +0000 Subject: [PATCH 12/64] Add Spark metastore & warehouse to .gitignore --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 49d22c6e..31223eb6 100644 --- a/.gitignore +++ b/.gitignore @@ -153,3 +153,6 @@ dmypy.json # Local stuff docker-compose*override* !docker-compose.override.sample.yml + +metastore_db/ +spark-warehouse/ From d9c44c5fbb7a6054cd67776152b23e38215437c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Mon, 17 Jun 2024 09:07:31 +0000 Subject: [PATCH 13/64] [DOP-16923] Disable IPv6 for Greenplum container --- .github/workflows/test-greenplum.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/test-greenplum.yml b/.github/workflows/test-greenplum.yml index d54e9697..1bbc0658 100644 --- a/.github/workflows/test-greenplum.yml +++ b/.github/workflows/test-greenplum.yml @@ -45,6 +45,7 @@ jobs: TZ: UTC ports: - 5433:5432 + options: --sysctl net.ipv6.conf.all.disable_ipv6=1 steps: - name: Checkout code From 6b1582c8b24a1484f681cca0c38c9b2e5fcb6ba1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Mon, 17 Jun 2024 09:23:24 +0000 Subject: [PATCH 14/64] [DOP-16923] Update MySQL image minimal version --- .github/workflows/data/mysql/matrix.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/data/mysql/matrix.yml b/.github/workflows/data/mysql/matrix.yml index 8e46b42e..39061bde 100644 --- a/.github/workflows/data/mysql/matrix.yml +++ b/.github/workflows/data/mysql/matrix.yml @@ -24,14 +24,14 @@ matrix: - mysql-version: 8.4.0 <<: *max full: - # Min supported version by JDBC driver is 5.7 - - mysql-version: 5.7.6 + # Tags 5.7.6-5.6.12 cannot be downloaded since Docker v26: + # "Docker Image Format v1 and Docker Image manifest version 2, schema 1 support is disabled by default" + - mysql-version: 5.7.13 <<: *min - # Max supported version by JDBC driver is 8.3 - mysql-version: 8.4.0 <<: *max nightly: - - mysql-version: 5.7.6 + - mysql-version: 5.7.13 <<: *min - mysql-version: latest <<: *latest From 11e27fdcff3fb817aefbc0604107a21861fdea4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Mon, 17 Jun 2024 10:08:30 
+0000 Subject: [PATCH 15/64] [DOP-16923] Disable IPv6 for Greenplum container --- .github/workflows/test-greenplum.yml | 1 + docker-compose.yml | 1 + 2 files changed, 2 insertions(+) diff --git a/.github/workflows/test-greenplum.yml b/.github/workflows/test-greenplum.yml index 1bbc0658..4c1b4045 100644 --- a/.github/workflows/test-greenplum.yml +++ b/.github/workflows/test-greenplum.yml @@ -45,6 +45,7 @@ jobs: TZ: UTC ports: - 5433:5432 + # TODO: remove after https://github.com/andruche/docker-greenplum/pull/2 options: --sysctl net.ipv6.conf.all.disable_ipv6=1 steps: diff --git a/docker-compose.yml b/docker-compose.yml index 34f2c4fe..5316891a 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -33,6 +33,7 @@ services: - onetl extra_hosts: - host.docker.internal:host-gateway + # TODO: remove after https://github.com/andruche/docker-greenplum/pull/2 sysctls: - net.ipv6.conf.all.disable_ipv6=1 From 75c44b9189a1e7b507e9eb16aefaa0259fa5b436 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Mon, 17 Jun 2024 11:45:54 +0000 Subject: [PATCH 16/64] Fix CREATE TABLE examples --- docs/connection/db_connection/clickhouse/execute.rst | 2 +- docs/connection/db_connection/clickhouse/types.rst | 6 +++--- docs/connection/db_connection/greenplum/execute.rst | 2 +- docs/connection/db_connection/greenplum/types.rst | 2 +- docs/connection/db_connection/hive/execute.rst | 2 +- docs/connection/db_connection/mssql/execute.rst | 2 +- docs/connection/db_connection/mssql/types.rst | 4 ++-- docs/connection/db_connection/mysql/execute.rst | 2 +- docs/connection/db_connection/mysql/types.rst | 2 +- docs/connection/db_connection/oracle/execute.rst | 2 +- docs/connection/db_connection/oracle/types.rst | 2 +- docs/connection/db_connection/postgres/execute.rst | 2 +- docs/connection/db_connection/postgres/types.rst | 2 +- 13 files changed, 16 insertions(+), 16 deletions(-) diff --git a/docs/connection/db_connection/clickhouse/execute.rst b/docs/connection/db_connection/clickhouse/execute.rst index f33369c5..9232710a 100644 --- a/docs/connection/db_connection/clickhouse/execute.rst +++ b/docs/connection/db_connection/clickhouse/execute.rst @@ -90,7 +90,7 @@ Examples clickhouse.execute("DROP TABLE schema.table") clickhouse.execute( """ - CREATE TABLE schema.table AS ( + CREATE TABLE schema.table ( id UInt8, key String, value Float32 diff --git a/docs/connection/db_connection/clickhouse/types.rst b/docs/connection/db_connection/clickhouse/types.rst index 00a71551..21ddf0ba 100644 --- a/docs/connection/db_connection/clickhouse/types.rst +++ b/docs/connection/db_connection/clickhouse/types.rst @@ -84,7 +84,7 @@ Always prefer creating tables with specific types **BEFORE WRITING DATA**: clickhouse.execute( """ - CREATE TABLE default.target_tbl AS ( + CREATE TABLE default.target_tbl ( id UInt8, value DateTime64(6) -- specific type and precision ) @@ -398,7 +398,7 @@ For writing JSON data to ClickHouse, use the :obj:`JSON.serialize_column Date: Mon, 17 Jun 2024 21:12:21 +0000 Subject: [PATCH 17/64] [pre-commit.ci] pre-commit autoupdate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/pycqa/bandit: 1.7.8 → 1.7.9](https://github.com/pycqa/bandit/compare/1.7.8...1.7.9) --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml 
b/.pre-commit-config.yaml index 202ed4f5..15f38046 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -114,7 +114,7 @@ repos: - black==24.4.2 - repo: https://github.com/pycqa/bandit - rev: 1.7.8 + rev: 1.7.9 hooks: - id: bandit args: From 6b3f0d1635277b010f0fe96036cedeb04b69bbb0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Wed, 19 Jun 2024 15:17:44 +0000 Subject: [PATCH 18/64] Update coverage download step --- .github/workflows/test-core.yml | 2 +- .github/workflows/test-ftp.yml | 2 +- .github/workflows/test-ftps.yml | 2 +- .github/workflows/test-greenplum.yml | 2 +- .github/workflows/test-hdfs.yml | 2 +- .github/workflows/test-hive.yml | 2 +- .github/workflows/test-kafka.yml | 2 +- .github/workflows/test-local-fs.yml | 2 +- .github/workflows/test-mongodb.yml | 2 +- .github/workflows/test-mssql.yml | 2 +- .github/workflows/test-mysql.yml | 2 +- .github/workflows/test-oracle.yml | 2 +- .github/workflows/test-postgres.yml | 2 +- .github/workflows/test-s3.yml | 2 +- .github/workflows/test-samba.yml | 2 +- .github/workflows/test-sftp.yml | 2 +- .github/workflows/test-teradata.yml | 2 +- .github/workflows/test-webdav.yml | 2 +- .github/workflows/tests.yml | 7 +++---- 19 files changed, 21 insertions(+), 22 deletions(-) diff --git a/.github/workflows/test-core.yml b/.github/workflows/test-core.yml index 6008f925..8a0b3b7a 100644 --- a/.github/workflows/test-core.yml +++ b/.github/workflows/test-core.yml @@ -77,5 +77,5 @@ jobs: - name: Upload coverage results uses: actions/upload-artifact@v4 with: - name: core-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} + name: coverage-core-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} path: reports/* diff --git a/.github/workflows/test-ftp.yml b/.github/workflows/test-ftp.yml index e41e1f3e..dee06115 100644 --- a/.github/workflows/test-ftp.yml +++ b/.github/workflows/test-ftp.yml @@ -75,5 +75,5 @@ jobs: - name: Upload coverage results uses: actions/upload-artifact@v4 with: - name: ftp-${{ inputs.ftp-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} + name: coverage-ftp-${{ inputs.ftp-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} path: reports/* diff --git a/.github/workflows/test-ftps.yml b/.github/workflows/test-ftps.yml index 4fb9c623..fadb4406 100644 --- a/.github/workflows/test-ftps.yml +++ b/.github/workflows/test-ftps.yml @@ -75,5 +75,5 @@ jobs: - name: Upload coverage results uses: actions/upload-artifact@v4 with: - name: ftps-${{ inputs.ftps-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} + name: coverage-ftps-${{ inputs.ftps-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} path: reports/* diff --git a/.github/workflows/test-greenplum.yml b/.github/workflows/test-greenplum.yml index 4c1b4045..9bc79fdd 100644 --- a/.github/workflows/test-greenplum.yml +++ b/.github/workflows/test-greenplum.yml @@ -121,5 +121,5 @@ jobs: - name: Upload coverage results uses: actions/upload-artifact@v4 with: - name: greenplum-${{ inputs.greenplum-version }}-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} + name: coverage-greenplum-${{ inputs.greenplum-version }}-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} path: reports/* diff --git 
a/.github/workflows/test-hdfs.yml b/.github/workflows/test-hdfs.yml index 6e52a5df..41c339dc 100644 --- a/.github/workflows/test-hdfs.yml +++ b/.github/workflows/test-hdfs.yml @@ -98,5 +98,5 @@ jobs: - name: Upload coverage results uses: actions/upload-artifact@v4 with: - name: hdfs-${{ inputs.hadoop-version }}-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} + name: coverage-hdfs-${{ inputs.hadoop-version }}-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} path: reports/* diff --git a/.github/workflows/test-hive.yml b/.github/workflows/test-hive.yml index 893348ab..17af1d93 100644 --- a/.github/workflows/test-hive.yml +++ b/.github/workflows/test-hive.yml @@ -79,5 +79,5 @@ jobs: - name: Upload coverage results uses: actions/upload-artifact@v4 with: - name: hive-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} + name: coverage-hive-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} path: reports/* diff --git a/.github/workflows/test-kafka.yml b/.github/workflows/test-kafka.yml index 34c2894a..31f5e6e1 100644 --- a/.github/workflows/test-kafka.yml +++ b/.github/workflows/test-kafka.yml @@ -119,5 +119,5 @@ jobs: - name: Upload coverage results uses: actions/upload-artifact@v4 with: - name: kafka-${{ inputs.kafka-version }}-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} + name: coverage-kafka-${{ inputs.kafka-version }}-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} path: reports/* diff --git a/.github/workflows/test-local-fs.yml b/.github/workflows/test-local-fs.yml index f4b37c45..2672afe4 100644 --- a/.github/workflows/test-local-fs.yml +++ b/.github/workflows/test-local-fs.yml @@ -79,5 +79,5 @@ jobs: - name: Upload coverage results uses: actions/upload-artifact@v4 with: - name: local-fs-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} + name: coverage-local-fs-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} path: reports/* diff --git a/.github/workflows/test-mongodb.yml b/.github/workflows/test-mongodb.yml index a617450b..ec9e7fa4 100644 --- a/.github/workflows/test-mongodb.yml +++ b/.github/workflows/test-mongodb.yml @@ -91,5 +91,5 @@ jobs: - name: Upload coverage results uses: actions/upload-artifact@v4 with: - name: mongodb-${{ inputs.mongodb-version }}-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} + name: coverage-mongodb-${{ inputs.mongodb-version }}-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} path: reports/* diff --git a/.github/workflows/test-mssql.yml b/.github/workflows/test-mssql.yml index 0819887a..037a2c13 100644 --- a/.github/workflows/test-mssql.yml +++ b/.github/workflows/test-mssql.yml @@ -96,5 +96,5 @@ jobs: - name: Upload coverage results uses: actions/upload-artifact@v4 with: - name: mssql-${{ inputs.mssql-version }}-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} + name: coverage-mssql-${{ inputs.mssql-version }}-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} path: reports/* diff --git a/.github/workflows/test-mysql.yml b/.github/workflows/test-mysql.yml index e2035cfc..97691a6d 100644 --- a/.github/workflows/test-mysql.yml +++ b/.github/workflows/test-mysql.yml @@ -93,5 
+93,5 @@ jobs: - name: Upload coverage results uses: actions/upload-artifact@v4 with: - name: mysql-${{ inputs.mysql-version }}-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} + name: coverage-mysql-${{ inputs.mysql-version }}-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} path: reports/* diff --git a/.github/workflows/test-oracle.yml b/.github/workflows/test-oracle.yml index e11a57b8..2ba490c1 100644 --- a/.github/workflows/test-oracle.yml +++ b/.github/workflows/test-oracle.yml @@ -113,5 +113,5 @@ jobs: - name: Upload coverage results uses: actions/upload-artifact@v4 with: - name: oracle-${{ inputs.oracle-version }}-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} + name: coverage-oracle-${{ inputs.oracle-version }}-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} path: reports/* diff --git a/.github/workflows/test-postgres.yml b/.github/workflows/test-postgres.yml index ef31a037..7a9022b3 100644 --- a/.github/workflows/test-postgres.yml +++ b/.github/workflows/test-postgres.yml @@ -92,5 +92,5 @@ jobs: - name: Upload coverage results uses: actions/upload-artifact@v4 with: - name: postgres-${{ inputs.postgres-version }}-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} + name: coverage-postgres-${{ inputs.postgres-version }}-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} path: reports/* diff --git a/.github/workflows/test-s3.yml b/.github/workflows/test-s3.yml index 8da4540c..3179002c 100644 --- a/.github/workflows/test-s3.yml +++ b/.github/workflows/test-s3.yml @@ -93,5 +93,5 @@ jobs: - name: Upload coverage results uses: actions/upload-artifact@v4 with: - name: s3-${{ inputs.minio-version }}-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} + name: coverage-s3-${{ inputs.minio-version }}-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} path: reports/* diff --git a/.github/workflows/test-samba.yml b/.github/workflows/test-samba.yml index 58db08b8..4a2e30d1 100644 --- a/.github/workflows/test-samba.yml +++ b/.github/workflows/test-samba.yml @@ -73,5 +73,5 @@ jobs: - name: Upload coverage results uses: actions/upload-artifact@v4 with: - name: samba-${{ inputs.server-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} + name: coverage-samba-${{ inputs.server-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} path: reports/* diff --git a/.github/workflows/test-sftp.yml b/.github/workflows/test-sftp.yml index ffbf786f..eaa5e5a4 100644 --- a/.github/workflows/test-sftp.yml +++ b/.github/workflows/test-sftp.yml @@ -70,5 +70,5 @@ jobs: - name: Upload coverage results uses: actions/upload-artifact@v4 with: - name: sftp-${{ inputs.openssh-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} + name: coverage-sftp-${{ inputs.openssh-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} path: reports/* diff --git a/.github/workflows/test-teradata.yml b/.github/workflows/test-teradata.yml index b348da5f..8ba3ff60 100644 --- a/.github/workflows/test-teradata.yml +++ b/.github/workflows/test-teradata.yml @@ -79,5 +79,5 @@ jobs: - name: Upload coverage results uses: actions/upload-artifact@v4 with: - name: teradata-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} + name: 
coverage-teradata-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} path: reports/* diff --git a/.github/workflows/test-webdav.yml b/.github/workflows/test-webdav.yml index 47251964..34a94326 100644 --- a/.github/workflows/test-webdav.yml +++ b/.github/workflows/test-webdav.yml @@ -75,5 +75,5 @@ jobs: - name: Upload coverage results uses: actions/upload-artifact@v4 with: - name: webdav-${{ inputs.webdav-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} + name: coverage-webdav-${{ inputs.webdav-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} path: reports/* diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index f7a7cf07..fcd6352a 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -384,10 +384,9 @@ jobs: - name: Download all coverage reports uses: actions/download-artifact@v4 with: - path: reports - - - name: Move coverage data to the root folder - run: find reports -type f -exec mv '{}' reports \; + path: reports/ + pattern: coverage-* + merge-multiple: true - name: Generate coverate reports run: ./combine_coverage.sh From 06b561a7a7e642893c1feb500109a83de741207c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Wed, 19 Jun 2024 15:21:31 +0000 Subject: [PATCH 19/64] Update coverage download step --- .github/workflows/test-clickhouse.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-clickhouse.yml b/.github/workflows/test-clickhouse.yml index 4f8d436e..ba3bc21d 100644 --- a/.github/workflows/test-clickhouse.yml +++ b/.github/workflows/test-clickhouse.yml @@ -93,5 +93,5 @@ jobs: - name: Upload coverage results uses: actions/upload-artifact@v4 with: - name: clickhouse-${{ inputs.clickhouse-version }}-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} + name: coverage-clickhouse-${{ inputs.clickhouse-version }}-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} path: reports/* From 59af63dcd09078d5e9f5a51e2faf8d629242c9d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Thu, 20 Jun 2024 14:04:26 +0000 Subject: [PATCH 20/64] Add 'Last updated at' fields to build documentation --- .readthedocs.yml | 3 +++ docs/conf.py | 4 ++++ .../file_df_connection/spark_s3/troubleshooting.rst | 2 +- requirements/docs.txt | 1 + .../test_file_format_integration/test_json_integration.py | 5 +---- 5 files changed, 10 insertions(+), 5 deletions(-) diff --git a/.readthedocs.yml b/.readthedocs.yml index efb1a83c..3e5f91d2 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -4,6 +4,9 @@ build: os: ubuntu-22.04 tools: python: "3.12" + jobs: + post_checkout: + - git fetch --unshallow || true # TODO: remove after https://github.com/zqmillet/sphinx-plantuml/pull/4 commands: - python -m virtualenv $READTHEDOCS_VIRTUALENV_PATH diff --git a/docs/conf.py b/docs/conf.py index 9427e190..3cee9a25 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -59,6 +59,7 @@ "sphinx.ext.extlinks", "sphinx_favicon", "sphinxcontrib.autodoc_pydantic", + "sphinx_last_updated_by_git", ] numpydoc_show_class_members = False autodoc_pydantic_model_show_config = False @@ 
-80,6 +81,9 @@ towncrier_draft_include_empty = False towncrier_draft_working_directory = PROJECT_ROOT_DIR +# TODO: remove after https://github.com/mgeier/sphinx-last-updated-by-git/pull/77 +git_exclude_patterns = ["docs/_static/logo_wide.svg"] + github_username = "MobileTeleSystems" github_repository = "onetl" diff --git a/docs/connection/file_df_connection/spark_s3/troubleshooting.rst b/docs/connection/file_df_connection/spark_s3/troubleshooting.rst index 20b3b989..97d846de 100644 --- a/docs/connection/file_df_connection/spark_s3/troubleshooting.rst +++ b/docs/connection/file_df_connection/spark_s3/troubleshooting.rst @@ -38,7 +38,7 @@ How to determine reason Make logging more verbose ^^^^^^^^^^^^^^^^^^^^^^^^^ -Change Spark session log level to :ref:`DEBUG ` to print result of each attempt. +Change Spark session log level to :ref:`DEBUG ` to print result of each attempt. Resulting logs will look like this .. dropdown:: See log diff --git a/requirements/docs.txt b/requirements/docs.txt index 3776dbb0..154dbd31 100644 --- a/requirements/docs.txt +++ b/requirements/docs.txt @@ -7,6 +7,7 @@ sphinx sphinx-copybutton sphinx-design sphinx-favicon +sphinx-last-updated-by-git # TODO: uncomment after https://github.com/zqmillet/sphinx-plantuml/pull/4 # sphinx-plantuml sphinx-tabs diff --git a/tests/tests_integration/test_file_format_integration/test_json_integration.py b/tests/tests_integration/test_file_format_integration/test_json_integration.py index dcdbbc03..46fbc8c9 100644 --- a/tests/tests_integration/test_file_format_integration/test_json_integration.py +++ b/tests/tests_integration/test_file_format_integration/test_json_integration.py @@ -22,13 +22,10 @@ StructField, StructType, ) -except ImportError: - pytest.skip("Missing pyspark", allow_module_level=True) -try: from tests.util.assert_df import assert_equal_df except ImportError: - pytest.skip("Missing pandas", allow_module_level=True) + pytest.skip("Missing pandas or pyspark", allow_module_level=True) pytestmark = [pytest.mark.local_fs, pytest.mark.file_df_connection, pytest.mark.connection, pytest.mark.json] From dc589618cdc3f9effac9d2076ac9c18400440e20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Thu, 20 Jun 2024 14:07:11 +0000 Subject: [PATCH 21/64] Add 'Last updated at' fields to build documentation --- .readthedocs.yml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/.readthedocs.yml b/.readthedocs.yml index 3e5f91d2..ad825a0f 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -4,11 +4,9 @@ build: os: ubuntu-22.04 tools: python: "3.12" - jobs: - post_checkout: - - git fetch --unshallow || true - # TODO: remove after https://github.com/zqmillet/sphinx-plantuml/pull/4 commands: + - git fetch --unshallow || true + # TODO: remove after https://github.com/zqmillet/sphinx-plantuml/pull/4 - python -m virtualenv $READTHEDOCS_VIRTUALENV_PATH - python -m pip install --upgrade --no-cache-dir pip setuptools wheel - python -m pip install --upgrade --no-cache-dir sphinx readthedocs-sphinx-ext From 5bc200488d9b92bab9b59dd71d8e3f670431af46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Thu, 20 Jun 2024 14:27:19 +0000 Subject: [PATCH 22/64] Update ReadTheDocs config --- 
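For reference, before the ReadTheDocs build configuration is reworked below, the ``docs/conf.py`` pieces introduced by the documentation patch above amount roughly to the following sketch (every other setting in the real file is omitted):

.. code-block:: python

    # docs/conf.py (fragment)
    extensions = [
        # ... other Sphinx extensions already present in the file ...
        "sphinxcontrib.autodoc_pydantic",
        "sphinx_last_updated_by_git",  # renders "Last updated at" fields from git history
    ]

    # TODO: remove after https://github.com/mgeier/sphinx-last-updated-by-git/pull/77
    git_exclude_patterns = ["docs/_static/logo_wide.svg"]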
.readthedocs.yml | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/.readthedocs.yml b/.readthedocs.yml index ad825a0f..a0fc204d 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -4,17 +4,13 @@ build: os: ubuntu-22.04 tools: python: "3.12" - commands: - - git fetch --unshallow || true - # TODO: remove after https://github.com/zqmillet/sphinx-plantuml/pull/4 - - python -m virtualenv $READTHEDOCS_VIRTUALENV_PATH - - python -m pip install --upgrade --no-cache-dir pip setuptools wheel - - python -m pip install --upgrade --no-cache-dir sphinx readthedocs-sphinx-ext - - python -m pip install --exists-action=w --no-cache-dir -r requirements/docs.txt - - python -m pip install --exists-action=w --no-cache-dir --no-deps sphinx-plantuml - - python -m pip install --exists-action=w --upgrade --upgrade-strategy only-if-needed --no-cache-dir .[ftp,ftps,hdfs,samba,s3,sftp,webdav,spark] - - cat docs/conf.py - - cd docs && python -m sphinx -T -E -b html -d _build/doctrees -D language=en . $READTHEDOCS_OUTPUT/html + jobs: + post_checkout: + - git fetch --unshallow || true + post_create_environment: + - python -m pip install --exists-action=w --no-cache-dir --no-deps sphinx-plantuml # remove after https://github.com/zqmillet/sphinx-plantuml/pull/4 + - python -m pip install --exists-action=w --no-cache-dir -r requirements/docs.txt + - python -m pip install --exists-action=w --upgrade --upgrade-strategy only-if-needed --no-cache-dir .[ftp,ftps,hdfs,samba,s3,sftp,webdav,spark] # TODO: uncomment after https://github.com/zqmillet/sphinx-plantuml/pull/4 #python: From 69fce1b75abbc75bc7688aed4922926be64db8e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Thu, 20 Jun 2024 14:43:52 +0000 Subject: [PATCH 23/64] Update ReadTheDocs config --- docs/conf.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 3cee9a25..f781dddd 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -81,9 +81,6 @@ towncrier_draft_include_empty = False towncrier_draft_working_directory = PROJECT_ROOT_DIR -# TODO: remove after https://github.com/mgeier/sphinx-last-updated-by-git/pull/77 -git_exclude_patterns = ["docs/_static/logo_wide.svg"] - github_username = "MobileTeleSystems" github_repository = "onetl" @@ -122,6 +119,10 @@ favicons = [ {"rel": "icon", "href": "icon.svg", "type": "image/svg+xml"}, ] + +# TODO: remove after https://github.com/mgeier/sphinx-last-updated-by-git/pull/77 +git_exclude_patterns = ["docs/_static/logo_wide.svg"] + # The master toctree document. 
master_doc = "index" From 26c5fed031f1b01991295c168b4e409080369a26 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 2 Jul 2024 01:46:37 +0000 Subject: [PATCH 24/64] [pre-commit.ci] pre-commit autoupdate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/asottile/blacken-docs: 1.16.0 → 1.18.0](https://github.com/asottile/blacken-docs/compare/1.16.0...1.18.0) --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 15f38046..ccab6bca 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -107,7 +107,7 @@ repos: language_version: python3 - repo: https://github.com/asottile/blacken-docs - rev: 1.16.0 + rev: 1.18.0 hooks: - id: blacken-docs additional_dependencies: From b65732247ba76d82fe2a509c2877642eddc31692 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 15 Jul 2024 21:17:59 +0000 Subject: [PATCH 25/64] [pre-commit.ci] pre-commit autoupdate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/macisamuele/language-formatters-pre-commit-hooks: v2.13.0 → v2.14.0](https://github.com/macisamuele/language-formatters-pre-commit-hooks/compare/v2.13.0...v2.14.0) --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ccab6bca..f0c94a33 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -58,7 +58,7 @@ repos: args: [-w] - repo: https://github.com/macisamuele/language-formatters-pre-commit-hooks - rev: v2.13.0 + rev: v2.14.0 hooks: - id: pretty-format-yaml args: [--autofix, --indent, '2', --preserve-quotes, --offset, '2'] From c757b7089f4930811f3bf2dc0a356c7a3e9673f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Mon, 29 Jul 2024 11:09:11 +0000 Subject: [PATCH 26/64] Fix nightly tests --- docker/mssql/configure-db.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/mssql/configure-db.sh b/docker/mssql/configure-db.sh index 51b39ae3..38e224e3 100755 --- a/docker/mssql/configure-db.sh +++ b/docker/mssql/configure-db.sh @@ -19,7 +19,7 @@ while true; do exit 1 fi - DBSTATUS=$(/opt/mssql-tools/bin/sqlcmd -h -1 -t 1 -U sa -P ${MSSQL_SA_PASSWORD} -Q "SET NOCOUNT ON; Select SUM(state) from sys.databases" 2>/dev/null | sed -e 's/^[[:space:]]*//') + DBSTATUS=$(sqlcmd -h -1 -t 1 -U sa -P ${MSSQL_SA_PASSWORD} -Q "SET NOCOUNT ON; Select SUM(state) from sys.databases" 2>/dev/null | sed -e 's/^[[:space:]]*//') ERRCODE=$? if [[ "$DBSTATUS" -eq "0" && "$ERRCODE" -eq "0" ]]; then echo "INFO: Database ready." 
@@ -32,5 +32,5 @@ done # Run the setup script to create the DB and the schema in the DB echo "Running setup.sql"; -/opt/mssql-tools/bin/sqlcmd -S localhost -U sa -P $MSSQL_SA_PASSWORD -d master -i /usr/config/setup.sql; +sqlcmd -S localhost -U sa -P $MSSQL_SA_PASSWORD -d master -i /usr/config/setup.sql; echo "Success"; From df869d0e3ede22157c88e7b46ae3d050488b4d6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Mon, 29 Jul 2024 11:31:13 +0000 Subject: [PATCH 27/64] Fix MSSQL tests --- .env.dependencies | 3 +++ .github/workflows/data/mssql/matrix.yml | 4 ++-- docker-compose.yml | 9 +++++++++ 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/.env.dependencies b/.env.dependencies index 5fccfa15..ec75df37 100644 --- a/.env.dependencies +++ b/.env.dependencies @@ -21,6 +21,9 @@ MONGO_INITDB_ROOT_PASSWORD=E4j7h!9A # MSSQL ACCEPT_EULA=Y MSSQL_SA_PASSWORD=2astazeY +MSSQL_DATABASE=onetl +MSSQL_USER=onetl +MSSQL_PASSWORD=7ellowEl7akey # MySQL MYSQL_ROOT_PASSWORD=ohbuz9Eochaj9saibooK3thooGa5aesh diff --git a/.github/workflows/data/mssql/matrix.yml b/.github/workflows/data/mssql/matrix.yml index c46d98d0..a074ed7a 100644 --- a/.github/workflows/data/mssql/matrix.yml +++ b/.github/workflows/data/mssql/matrix.yml @@ -21,12 +21,12 @@ latest: &latest matrix: small: - - mssql-version: 2022-CU12-ubuntu-22.04 + - mssql-version: 2022-CU14-ubuntu-22.04 <<: *max full: - mssql-version: 2017-GA-ubuntu <<: *min - - mssql-version: 2022-CU12-ubuntu-22.04 + - mssql-version: 2022-CU14-ubuntu-22.04 <<: *max nightly: - mssql-version: 2017-GA-ubuntu diff --git a/docker-compose.yml b/docker-compose.yml index 5316891a..41d8af80 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -91,6 +91,9 @@ services: image: ${MSSQL_IMAGE:-mcr.microsoft.com/mssql/server:latest} restart: unless-stopped env_file: .env.dependencies + environment: + # fix for https://github.com/microsoft/mssql-docker/issues/892 + PATH: /opt/mssql-tools18/bin:/opt/mssql-tools/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin ports: - 1433:1433 volumes: @@ -99,6 +102,12 @@ services: networks: - onetl platform: linux/amd64 + healthcheck: + # Container is healthy only when database is created + test: ["CMD-SHELL", "sqlcmd -S localhost -d $$MSSQL_DATABASE -U $$MSSQL_USER -P $$MSSQL_PASSWORD -Q 'SELECT 1'"] + interval: 10s + timeout: 5s + retries: 5 mysql: image: ${MYSQL_IMAGE:-mysql:latest} From 267cc50562d16417907b81c91dbc8520345910c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Mon, 29 Jul 2024 12:01:05 +0000 Subject: [PATCH 28/64] Fix MSSQL tests --- .github/workflows/test-clickhouse.yml | 14 ++++++++++++++ .github/workflows/test-ftp.yml | 14 ++++++++++++++ .github/workflows/test-ftps.yml | 14 ++++++++++++++ .github/workflows/test-greenplum.yml | 14 ++++++++++++++ .github/workflows/test-hdfs.yml | 14 ++++++++++++++ .github/workflows/test-kafka.yml | 14 ++++++++++++++ .github/workflows/test-mongodb.yml | 14 ++++++++++++++ .github/workflows/test-mssql.yml | 14 ++++++++++++++ .github/workflows/test-mysql.yml | 14 ++++++++++++++ .github/workflows/test-oracle.yml | 16 +++++++++++++++- .github/workflows/test-postgres.yml | 14 ++++++++++++++ .github/workflows/test-s3.yml | 14 ++++++++++++++ 
.github/workflows/test-samba.yml | 14 ++++++++++++++ .github/workflows/test-sftp.yml | 14 ++++++++++++++ .github/workflows/test-webdav.yml | 14 ++++++++++++++ docker-compose.yml | 6 ++---- docker/mssql/configure-db.sh | 11 +++++++++-- 17 files changed, 222 insertions(+), 7 deletions(-) diff --git a/.github/workflows/test-clickhouse.yml b/.github/workflows/test-clickhouse.yml index ba3bc21d..db05402d 100644 --- a/.github/workflows/test-clickhouse.yml +++ b/.github/workflows/test-clickhouse.yml @@ -95,3 +95,17 @@ jobs: with: name: coverage-clickhouse-${{ inputs.clickhouse-version }}-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} path: reports/* + + - name: Dump Clickhouse logs on failure + if: failure() + uses: jwalton/gh-docker-logs@v2 + with: + images: ${{ inputs.clickhouse-image }} + dest: ./logs + + - name: Upload Clickhouse logs + uses: actions/upload-artifact@v4 + if: failure() + with: + name: container-logs-clickhouse-${{ inputs.clickhouse-version }}-spark-${{ inputs.spark-version }} + path: logs/* diff --git a/.github/workflows/test-ftp.yml b/.github/workflows/test-ftp.yml index dee06115..8e45ec32 100644 --- a/.github/workflows/test-ftp.yml +++ b/.github/workflows/test-ftp.yml @@ -67,6 +67,20 @@ jobs: source ./env ./pytest_runner.sh -m ftp + - name: Dump FTP logs on failure + if: failure() + uses: jwalton/gh-docker-logs@v2 + with: + images: chonjay21/ftps + dest: ./logs + + - name: Upload FTP logs + uses: actions/upload-artifact@v4 + if: failure() + with: + name: container-logs-ftp-${{ inputs.ftp-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} + path: logs/* + - name: Shutdown FTP if: always() run: | diff --git a/.github/workflows/test-ftps.yml b/.github/workflows/test-ftps.yml index fadb4406..dfe8ffed 100644 --- a/.github/workflows/test-ftps.yml +++ b/.github/workflows/test-ftps.yml @@ -67,6 +67,20 @@ jobs: source ./env ./pytest_runner.sh -m ftps + - name: Dump FTPS logs on failure + if: failure() + uses: jwalton/gh-docker-logs@v2 + with: + images: chonjay21/ftps + dest: ./logs + + - name: Upload FTPS logs + uses: actions/upload-artifact@v4 + if: failure() + with: + name: container-logs-ftps-${{ inputs.ftps-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} + path: logs/* + - name: Shutdown FTPS if: always() run: | diff --git a/.github/workflows/test-greenplum.yml b/.github/workflows/test-greenplum.yml index 9bc79fdd..5f24f779 100644 --- a/.github/workflows/test-greenplum.yml +++ b/.github/workflows/test-greenplum.yml @@ -118,6 +118,20 @@ jobs: GREENPLUM_PACKAGES_USER: ${{ secrets.GREENPLUM_PACKAGES_USER }} GREENPLUM_PACKAGES_PASSWORD: ${{ secrets.GREENPLUM_PACKAGES_PASSWORD }} + - name: Dump Greenplum logs on failure + if: failure() + uses: jwalton/gh-docker-logs@v2 + with: + images: andruche/greenplum + dest: ./logs + + - name: Upload Greenplum logs + uses: actions/upload-artifact@v4 + if: failure() + with: + name: container-logs-greenplum-${{ inputs.greenplum-version }}-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} + path: logs/* + - name: Upload coverage results uses: actions/upload-artifact@v4 with: diff --git a/.github/workflows/test-hdfs.yml b/.github/workflows/test-hdfs.yml index 41c339dc..e06f01ed 100644 --- a/.github/workflows/test-hdfs.yml +++ b/.github/workflows/test-hdfs.yml @@ -90,6 +90,20 @@ jobs: echo "127.0.0.1 hdfs" | sudo tee -a /etc/hosts ./pytest_runner.sh -m hdfs + - name: Dump HDFS logs on failure + if: failure() + uses: 
jwalton/gh-docker-logs@v2 + with: + images: mtsrus/hadoop + dest: ./logs + + - name: Upload HDFS logs + uses: actions/upload-artifact@v4 + if: failure() + with: + name: container-logs-hdfs-${{ inputs.hadoop-version }}-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} + path: logs/* + - name: Shutdown HDFS if: always() run: | diff --git a/.github/workflows/test-kafka.yml b/.github/workflows/test-kafka.yml index 31f5e6e1..b1f06552 100644 --- a/.github/workflows/test-kafka.yml +++ b/.github/workflows/test-kafka.yml @@ -116,6 +116,20 @@ jobs: source ./env ./pytest_runner.sh -m kafka + - name: Dump Kafka logs on failure + if: failure() + uses: jwalton/gh-docker-logs@v2 + with: + images: bitnami/kafka + dest: ./logs + + - name: Upload Kafka logs + uses: actions/upload-artifact@v4 + if: failure() + with: + name: container-logs-kafka-${{ inputs.kafka-version }}-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} + path: logs/* + - name: Upload coverage results uses: actions/upload-artifact@v4 with: diff --git a/.github/workflows/test-mongodb.yml b/.github/workflows/test-mongodb.yml index ec9e7fa4..334bbfc1 100644 --- a/.github/workflows/test-mongodb.yml +++ b/.github/workflows/test-mongodb.yml @@ -88,6 +88,20 @@ jobs: source ./env ./pytest_runner.sh -m mongodb + - name: Dump MongoDB logs on failure + if: failure() + uses: jwalton/gh-docker-logs@v2 + with: + images: mongo + dest: ./logs + + - name: Upload MongoDB logs + uses: actions/upload-artifact@v4 + if: failure() + with: + name: container-logs-mongodb-${{ inputs.mongodb-version }} + path: logs/* + - name: Upload coverage results uses: actions/upload-artifact@v4 with: diff --git a/.github/workflows/test-mssql.yml b/.github/workflows/test-mssql.yml index 037a2c13..0865492a 100644 --- a/.github/workflows/test-mssql.yml +++ b/.github/workflows/test-mssql.yml @@ -88,6 +88,20 @@ jobs: source ./env ./pytest_runner.sh -m mssql + - name: Dump MSSQL logs on failure + if: failure() + uses: jwalton/gh-docker-logs@v2 + with: + images: mcr.microsoft.com/mssql/server + dest: ./logs + + - name: Upload MSSQL logs + uses: actions/upload-artifact@v4 + if: failure() + with: + name: container-logs-mssql-${{ inputs.mssql-version }}-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} + path: logs/* + - name: Shutdown MSSQL if: always() run: | diff --git a/.github/workflows/test-mysql.yml b/.github/workflows/test-mysql.yml index 97691a6d..e305af6d 100644 --- a/.github/workflows/test-mysql.yml +++ b/.github/workflows/test-mysql.yml @@ -90,6 +90,20 @@ jobs: source ./env ./pytest_runner.sh -m mysql + - name: Dump MySQL logs on failure + if: failure() + uses: jwalton/gh-docker-logs@v2 + with: + images: mysql + dest: ./logs + + - name: Upload MySQL logs + uses: actions/upload-artifact@v4 + if: failure() + with: + name: container-logs-mysql-${{ inputs.mysql-version }}-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} + path: logs/* + - name: Upload coverage results uses: actions/upload-artifact@v4 with: diff --git a/.github/workflows/test-oracle.yml b/.github/workflows/test-oracle.yml index 2ba490c1..38a21daf 100644 --- a/.github/workflows/test-oracle.yml +++ b/.github/workflows/test-oracle.yml @@ -38,7 +38,7 @@ jobs: runs-on: ${{ inputs.os }} services: oracle: - image: "${{ inputs.oracle-image }}:${{ inputs.oracle-version }}" + image: ${{ inputs.oracle-image }}:${{ inputs.oracle-version }} env: TZ: UTC ORACLE_PASSWORD: 
maaxohmiGe9eep5x @@ -110,6 +110,20 @@ jobs: export "ONETL_ORA_SERVICE_NAME=${{ inputs.db-name }}" ./pytest_runner.sh -m oracle + - name: Dump Oracle logs on failure + if: failure() + uses: jwalton/gh-docker-logs@v2 + with: + images: ${{ inputs.oracle-image }} + dest: ./logs + + - name: Upload Oracle logs + uses: actions/upload-artifact@v4 + if: failure() + with: + name: container-logs-oracle-${{ inputs.oracle-version }}-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} + path: logs/* + - name: Upload coverage results uses: actions/upload-artifact@v4 with: diff --git a/.github/workflows/test-postgres.yml b/.github/workflows/test-postgres.yml index 7a9022b3..68236134 100644 --- a/.github/workflows/test-postgres.yml +++ b/.github/workflows/test-postgres.yml @@ -89,6 +89,20 @@ jobs: source ./env ./pytest_runner.sh -m postgres + - name: Dump Postgres logs on failure + if: failure() + uses: jwalton/gh-docker-logs@v2 + with: + images: postgres + dest: ./logs + + - name: Upload Postgres logs + uses: actions/upload-artifact@v4 + if: failure() + with: + name: container-logs-postgres-${{ inputs.postgres-version }}-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} + path: logs/* + - name: Upload coverage results uses: actions/upload-artifact@v4 with: diff --git a/.github/workflows/test-s3.yml b/.github/workflows/test-s3.yml index 3179002c..1ef595e6 100644 --- a/.github/workflows/test-s3.yml +++ b/.github/workflows/test-s3.yml @@ -90,6 +90,20 @@ jobs: source ./env ./pytest_runner.sh -m s3 + - name: Dump S3 logs on failure + if: failure() + uses: jwalton/gh-docker-logs@v2 + with: + images: bitnami/minio + dest: ./logs + + - name: Upload S3 logs + uses: actions/upload-artifact@v4 + if: failure() + with: + name: container-logs-s3-${{ inputs.minio-version }}-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} + path: logs/* + - name: Upload coverage results uses: actions/upload-artifact@v4 with: diff --git a/.github/workflows/test-samba.yml b/.github/workflows/test-samba.yml index 4a2e30d1..f7b07131 100644 --- a/.github/workflows/test-samba.yml +++ b/.github/workflows/test-samba.yml @@ -65,6 +65,20 @@ jobs: source ./env ./pytest_runner.sh -m samba + - name: Dump Samba logs on failure + if: failure() + uses: jwalton/gh-docker-logs@v2 + with: + images: elswork/samba + dest: ./logs + + - name: Upload Samba logs + uses: actions/upload-artifact@v4 + if: failure() + with: + name: container-logs-samba-${{ inputs.server-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} + path: logs/* + - name: Shutdown Samba if: always() run: | diff --git a/.github/workflows/test-sftp.yml b/.github/workflows/test-sftp.yml index eaa5e5a4..2ab8de1c 100644 --- a/.github/workflows/test-sftp.yml +++ b/.github/workflows/test-sftp.yml @@ -67,6 +67,20 @@ jobs: source ./env ./pytest_runner.sh -m sftp + - name: Dump SFTP logs on failure + if: failure() + uses: jwalton/gh-docker-logs@v2 + with: + images: linuxserver/openssh-server + dest: ./logs + + - name: Upload SFTP logs + uses: actions/upload-artifact@v4 + if: failure() + with: + name: container-logs-sftp-${{ inputs.openssh-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} + path: logs/* + - name: Upload coverage results uses: actions/upload-artifact@v4 with: diff --git a/.github/workflows/test-webdav.yml b/.github/workflows/test-webdav.yml index 34a94326..2ce0e4ef 100644 --- a/.github/workflows/test-webdav.yml +++ 
b/.github/workflows/test-webdav.yml @@ -67,6 +67,20 @@ jobs: source ./env ./pytest_runner.sh -m webdav + - name: Dump WebDAV logs on failure + if: failure() + uses: jwalton/gh-docker-logs@v2 + with: + images: chonjay21/webdav + dest: ./logs + + - name: Upload WebDAV logs + uses: actions/upload-artifact@v4 + if: failure() + with: + name: container-logs-webdav-${{ inputs.webdav-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} + path: logs/* + - name: Shutdown WebDAV if: always() run: | diff --git a/docker-compose.yml b/docker-compose.yml index 41d8af80..f5859bb5 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -91,9 +91,6 @@ services: image: ${MSSQL_IMAGE:-mcr.microsoft.com/mssql/server:latest} restart: unless-stopped env_file: .env.dependencies - environment: - # fix for https://github.com/microsoft/mssql-docker/issues/892 - PATH: /opt/mssql-tools18/bin:/opt/mssql-tools/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin ports: - 1433:1433 volumes: @@ -104,7 +101,8 @@ services: platform: linux/amd64 healthcheck: # Container is healthy only when database is created - test: ["CMD-SHELL", "sqlcmd -S localhost -d $$MSSQL_DATABASE -U $$MSSQL_USER -P $$MSSQL_PASSWORD -Q 'SELECT 1'"] + # TODO: replace with SELECT after fixing sqlcmd location: https://github.com/microsoft/mssql-docker/issues/892 + test: ["CMD-SHELL", "ls -lsah /var/opt/mssql/data/onetl.mdf"] interval: 10s timeout: 5s retries: 5 diff --git a/docker/mssql/configure-db.sh b/docker/mssql/configure-db.sh index 38e224e3..6c31f067 100755 --- a/docker/mssql/configure-db.sh +++ b/docker/mssql/configure-db.sh @@ -12,6 +12,13 @@ TIMEOUT=60 START=$(date +%s) echo "Configure DB script started at $(date)" +# fix for https://github.com/microsoft/mssql-docker/issues/892 +if [[ -d "/opt/mssql-tools18/bin" ]]; then + SQLCMD="/opt/mssql-tools18/bin/sqlcmd -No" +else + SQLCMD=/opt/mssql-tools/bin/sqlcmd +fi + while true; do DELTA=$(($(date +%s) - START)) if [[ $DELTA -gt $TIMEOUT ]]; then @@ -19,7 +26,7 @@ while true; do exit 1 fi - DBSTATUS=$(sqlcmd -h -1 -t 1 -U sa -P ${MSSQL_SA_PASSWORD} -Q "SET NOCOUNT ON; Select SUM(state) from sys.databases" 2>/dev/null | sed -e 's/^[[:space:]]*//') + DBSTATUS=$($SQLCMD -h -1 -t 1 -U sa -P ${MSSQL_SA_PASSWORD} -Q "SET NOCOUNT ON; Select SUM(state) from sys.databases" 2>/dev/null | sed -e 's/^[[:space:]]*//') ERRCODE=$? if [[ "$DBSTATUS" -eq "0" && "$ERRCODE" -eq "0" ]]; then echo "INFO: Database ready." 
@@ -32,5 +39,5 @@ done # Run the setup script to create the DB and the schema in the DB echo "Running setup.sql"; -sqlcmd -S localhost -U sa -P $MSSQL_SA_PASSWORD -d master -i /usr/config/setup.sql; +$SQLCMD -S localhost -U sa -P $MSSQL_SA_PASSWORD -d master -i /usr/config/setup.sql; echo "Success"; From 8d81a0e83a9ad853c1d9c02997d97174a3b0b449 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Mon, 29 Jul 2024 12:19:11 +0000 Subject: [PATCH 29/64] Update test matrix to use latest DB versions --- .github/workflows/data/clickhouse/matrix.yml | 4 +-- .github/workflows/data/kafka/matrix.yml | 2 +- .github/workflows/data/mongodb/matrix.yml | 21 ++++------- .github/workflows/data/mssql/matrix.yml | 19 ++++------ .github/workflows/data/mysql/matrix.yml | 23 +++++------- .github/workflows/data/oracle/matrix.yml | 35 +++++++------------ .github/workflows/data/postgres/matrix.yml | 21 ++++------- .github/workflows/data/s3/matrix.yml | 13 +++---- .github/workflows/data/samba/matrix.yml | 21 ++++------- .github/workflows/data/sftp/matrix.yml | 21 ++++------- .github/workflows/data/webdav/matrix.yml | 21 ++++------- .../db_connection/mysql/prerequisites.rst | 2 +- .../db_connection/postgres/prerequisites.rst | 2 +- 13 files changed, 70 insertions(+), 135 deletions(-) diff --git a/.github/workflows/data/clickhouse/matrix.yml b/.github/workflows/data/clickhouse/matrix.yml index 1469100a..928d315e 100644 --- a/.github/workflows/data/clickhouse/matrix.yml +++ b/.github/workflows/data/clickhouse/matrix.yml @@ -22,7 +22,7 @@ latest: &latest matrix: small: - clickhouse-image: clickhouse/clickhouse-server - clickhouse-version: 24.3.2.23-alpine + clickhouse-version: 24.6.3.70-alpine <<: *max full: # Clickhouse version with proper DateTime > DateTime64 comparison @@ -30,7 +30,7 @@ matrix: clickhouse-version: '21.1' <<: *min - clickhouse-image: clickhouse/clickhouse-server - clickhouse-version: 24.3.2.23-alpine + clickhouse-version: 24.6.3.70-alpine <<: *max nightly: - clickhouse-image: yandex/clickhouse-server diff --git a/.github/workflows/data/kafka/matrix.yml b/.github/workflows/data/kafka/matrix.yml index 8050948b..c5242cb5 100644 --- a/.github/workflows/data/kafka/matrix.yml +++ b/.github/workflows/data/kafka/matrix.yml @@ -10,7 +10,7 @@ min: &min os: ubuntu-latest max: &max - kafka-version: 3.7.0 + kafka-version: 3.7.1 pydantic-version: 2 spark-version: 3.5.1 python-version: '3.12' diff --git a/.github/workflows/data/mongodb/matrix.yml b/.github/workflows/data/mongodb/matrix.yml index a07bdd3b..98e1fe97 100644 --- a/.github/workflows/data/mongodb/matrix.yml +++ b/.github/workflows/data/mongodb/matrix.yml @@ -1,5 +1,6 @@ min: &min - # MongoDB connector does not support Spark 2 + mongodb-version: 4.0.0 + # MongoDB connector does not support Spark 2.x spark-version: 3.2.4 pydantic-version: 1 python-version: '3.7' @@ -7,6 +8,7 @@ min: &min os: ubuntu-latest max: &max + mongodb-version: 7.0.12 spark-version: 3.5.1 pydantic-version: 2 python-version: '3.12' @@ -14,6 +16,7 @@ max: &max os: ubuntu-latest latest: &latest + mongodb-version: latest spark-version: latest pydantic-version: latest python-version: '3.12' @@ -21,16 +24,6 @@ latest: &latest os: ubuntu-latest matrix: - small: - - mongodb-version: 7.0.9 - <<: *max - full: - - mongodb-version: 4.0.0 - <<: *min - - mongodb-version: 7.0.9 - <<: *max - nightly: - - mongodb-version: 4.0.0 - <<: 
*min - - mongodb-version: latest - <<: *latest + small: [*max] + full: [*min, *max] + nightly: [*min, *latest] diff --git a/.github/workflows/data/mssql/matrix.yml b/.github/workflows/data/mssql/matrix.yml index a074ed7a..fad2e738 100644 --- a/.github/workflows/data/mssql/matrix.yml +++ b/.github/workflows/data/mssql/matrix.yml @@ -1,4 +1,5 @@ min: &min + mssql-version: 2017-GA-ubuntu spark-version: 2.3.1 pydantic-version: 1 python-version: '3.7' @@ -6,6 +7,7 @@ min: &min os: ubuntu-latest max: &max + mssql-version: 2022-CU14-ubuntu-22.04 spark-version: 3.5.1 pydantic-version: 2 python-version: '3.12' @@ -13,6 +15,7 @@ max: &max os: ubuntu-latest latest: &latest + mssql-version: latest spark-version: latest pydantic-version: latest python-version: '3.12' @@ -20,16 +23,6 @@ latest: &latest os: ubuntu-latest matrix: - small: - - mssql-version: 2022-CU14-ubuntu-22.04 - <<: *max - full: - - mssql-version: 2017-GA-ubuntu - <<: *min - - mssql-version: 2022-CU14-ubuntu-22.04 - <<: *max - nightly: - - mssql-version: 2017-GA-ubuntu - <<: *min - - mssql-version: latest - <<: *latest + small: [*max] + full: [*min, *max] + nightly: [*min, *latest] diff --git a/.github/workflows/data/mysql/matrix.yml b/.github/workflows/data/mysql/matrix.yml index 39061bde..d2e70314 100644 --- a/.github/workflows/data/mysql/matrix.yml +++ b/.github/workflows/data/mysql/matrix.yml @@ -1,4 +1,7 @@ min: &min + # Tags 5.7.6-5.6.12 cannot be downloaded since Docker v26: + # "Docker Image Format v1 and Docker Image manifest version 2, schema 1 support is disabled by default" + mysql-version: 5.7.13 spark-version: 2.3.1 pydantic-version: 1 python-version: '3.7' @@ -6,6 +9,7 @@ min: &min os: ubuntu-latest max: &max + mysql-version: 9.0.1 spark-version: 3.5.1 pydantic-version: 2 python-version: '3.12' @@ -13,6 +17,7 @@ max: &max os: ubuntu-latest latest: &latest + mysql-version: latest spark-version: latest pydantic-version: latest python-version: '3.12' @@ -20,18 +25,6 @@ latest: &latest os: ubuntu-latest matrix: - small: - - mysql-version: 8.4.0 - <<: *max - full: - # Tags 5.7.6-5.6.12 cannot be downloaded since Docker v26: - # "Docker Image Format v1 and Docker Image manifest version 2, schema 1 support is disabled by default" - - mysql-version: 5.7.13 - <<: *min - - mysql-version: 8.4.0 - <<: *max - nightly: - - mysql-version: 5.7.13 - <<: *min - - mysql-version: latest - <<: *latest + small: [*max] + full: [*min, *max] + nightly: [*min, *latest] diff --git a/.github/workflows/data/oracle/matrix.yml b/.github/workflows/data/oracle/matrix.yml index c0a50fc2..7a79c68a 100644 --- a/.github/workflows/data/oracle/matrix.yml +++ b/.github/workflows/data/oracle/matrix.yml @@ -1,4 +1,7 @@ min: &min + oracle-image: gvenzl/oracle-xe + oracle-version: 11.2.0.2-slim-faststart + db-name: XE spark-version: 2.3.1 pydantic-version: 1 python-version: '3.7' @@ -6,6 +9,9 @@ min: &min os: ubuntu-latest max: &max + oracle-image: gvenzl/oracle-free + oracle-version: 23.4-slim-faststart + db-name: FREEPDB1 spark-version: 3.5.1 pydantic-version: 2 python-version: '3.12' @@ -13,6 +19,9 @@ max: &max os: ubuntu-latest latest: &latest + oracle-image: gvenzl/oracle-free + oracle-version: slim-faststart + db-name: FREEPDB1 spark-version: latest pydantic-version: latest python-version: '3.12' @@ -20,26 +29,6 @@ latest: &latest os: ubuntu-latest matrix: - small: - - oracle-image: gvenzl/oracle-free - oracle-version: 23.3-slim-faststart - db-name: FREEPDB1 - <<: *max - full: - - oracle-image: gvenzl/oracle-xe - oracle-version: 11.2.0.2-slim-faststart - 
db-name: XE - <<: *min - - oracle-image: gvenzl/oracle-free - oracle-version: 23.3-slim-faststart - db-name: FREEPDB1 - <<: *max - nightly: - - oracle-image: gvenzl/oracle-xe - oracle-version: 11.2.0.2-slim-faststart - db-name: XE - <<: *min - - oracle-image: gvenzl/oracle-free - oracle-version: slim-faststart - db-name: FREEPDB1 - <<: *latest + small: [*max] + full: [*min, *max] + nightly: [*min, *latest] diff --git a/.github/workflows/data/postgres/matrix.yml b/.github/workflows/data/postgres/matrix.yml index 7b8e296e..4c5b5f4e 100644 --- a/.github/workflows/data/postgres/matrix.yml +++ b/.github/workflows/data/postgres/matrix.yml @@ -1,4 +1,6 @@ min: &min + # Min supported version by JDBC driver is 8.4, but it is too ancient to be used by anyone in real life + postgres-version: 9.4.26-alpine spark-version: 2.3.1 pydantic-version: 1 python-version: '3.7' @@ -6,6 +8,7 @@ min: &min os: ubuntu-latest max: &max + postgres-version: 16.3-alpine spark-version: 3.5.1 pydantic-version: 2 python-version: '3.12' @@ -13,6 +16,7 @@ max: &max os: ubuntu-latest latest: &latest + postgres-version: alpine spark-version: latest pydantic-version: latest python-version: '3.12' @@ -20,17 +24,6 @@ latest: &latest os: ubuntu-latest matrix: - small: - - postgres-version: 16.2-alpine - <<: *max - full: - # Min supported version by JDBC driver is 8.4, but it is too ancient to be used by anyone in real life - - postgres-version: 9.4.26-alpine - <<: *min - - postgres-version: 16.2-alpine - <<: *max - nightly: - - postgres-version: 9.4.26-alpine - <<: *min - - postgres-version: alpine - <<: *latest + small: [*max] + full: [*min, *max] + nightly: [*min, *latest] diff --git a/.github/workflows/data/s3/matrix.yml b/.github/workflows/data/s3/matrix.yml index d9b9338f..06d4f748 100644 --- a/.github/workflows/data/s3/matrix.yml +++ b/.github/workflows/data/s3/matrix.yml @@ -9,7 +9,7 @@ min: &min os: ubuntu-latest max: &max - minio-version: 2024.4.18 + minio-version: 2024.7.26 spark-version: 3.5.1 pydantic-version: 2 python-version: '3.12' @@ -25,11 +25,6 @@ latest: &latest os: ubuntu-latest matrix: - small: - - *max - full: - - *min - - *max - nightly: - - *min - - *latest + small: [*max] + full: [*min, *max] + nightly: [*min, *latest] diff --git a/.github/workflows/data/samba/matrix.yml b/.github/workflows/data/samba/matrix.yml index b1e6b56d..045a093b 100644 --- a/.github/workflows/data/samba/matrix.yml +++ b/.github/workflows/data/samba/matrix.yml @@ -1,30 +1,23 @@ min: &min + # elswork/samba image versions does not correlate with smbd version, it is always 4.x + server-version: latest pydantic-version: 1 python-version: '3.7' os: ubuntu-latest max: &max + server-version: latest pydantic-version: 2 python-version: '3.12' os: ubuntu-latest latest: &latest + server-version: latest pydantic-version: latest python-version: '3.12' os: ubuntu-latest matrix: - small: - # elswork/samba image versions does not correlate with smbd version, it is always 4.x - - server-version: latest - <<: *max - full: - - server-version: latest - <<: *min - - server-version: latest - <<: *max - nightly: - - server-version: latest - <<: *min - - server-version: latest - <<: *latest + small: [*max] + full: [*min, *max] + nightly: [*min, *latest] diff --git a/.github/workflows/data/sftp/matrix.yml b/.github/workflows/data/sftp/matrix.yml index a32f6f82..5a5a757c 100644 --- a/.github/workflows/data/sftp/matrix.yml +++ b/.github/workflows/data/sftp/matrix.yml @@ -1,30 +1,23 @@ min: &min + # prior image versions does not accept incoming connections, 
seems like a bug + openssh-version: 8.1_p1-r0-ls5 pydantic-version: 1 python-version: '3.7' os: ubuntu-latest max: &max + openssh-version: 9.6_p1-r0-ls154 pydantic-version: 2 python-version: '3.12' os: ubuntu-latest latest: &latest + openssh-version: latest pydantic-version: latest python-version: '3.12' os: ubuntu-latest matrix: - small: - - openssh-version: 9.6_p1-r0-ls154 - <<: *max - full: - # prior image versions does not accept incoming connections, seems like a bug - - openssh-version: 8.1_p1-r0-ls5 - <<: *min - - openssh-version: 9.6_p1-r0-ls154 - <<: *max - nightly: - - openssh-version: 8.1_p1-r0-ls5 - <<: *min - - openssh-version: latest - <<: *latest + small: [*max] + full: [*min, *max] + nightly: [*min, *latest] diff --git a/.github/workflows/data/webdav/matrix.yml b/.github/workflows/data/webdav/matrix.yml index fb76e328..39c09fcd 100644 --- a/.github/workflows/data/webdav/matrix.yml +++ b/.github/workflows/data/webdav/matrix.yml @@ -1,30 +1,23 @@ min: &min + # chonjay21/webdav image has only latest tag + webdav-version: latest pydantic-version: 1 python-version: '3.7' os: ubuntu-latest max: &max + webdav-version: latest pydantic-version: 2 python-version: '3.12' os: ubuntu-latest latest: &latest + webdav-version: latest pydantic-version: latest python-version: '3.12' os: ubuntu-latest matrix: - small: - # chonjay21/webdav image has only latest tag - - webdav-version: latest - <<: *max - full: - - webdav-version: latest - <<: *min - - webdav-version: latest - <<: *max - nightly: - - webdav-version: latest - <<: *min - - webdav-version: latest - <<: *latest + small: [*max] + full: [*min, *max] + nightly: [*min, *latest] diff --git a/docs/connection/db_connection/mysql/prerequisites.rst b/docs/connection/db_connection/mysql/prerequisites.rst index 225e630b..b92f3320 100644 --- a/docs/connection/db_connection/mysql/prerequisites.rst +++ b/docs/connection/db_connection/mysql/prerequisites.rst @@ -6,7 +6,7 @@ Prerequisites Version Compatibility --------------------- -* MySQL server versions: 5.7 - 8.4 +* MySQL server versions: 5.7 - 9.0 * Spark versions: 2.3.x - 3.5.x * Java versions: 8 - 20 diff --git a/docs/connection/db_connection/postgres/prerequisites.rst b/docs/connection/db_connection/postgres/prerequisites.rst index 509b54bc..ef83144f 100644 --- a/docs/connection/db_connection/postgres/prerequisites.rst +++ b/docs/connection/db_connection/postgres/prerequisites.rst @@ -6,7 +6,7 @@ Prerequisites Version Compatibility --------------------- -* PostgreSQL server versions: 8.2 or higher +* PostgreSQL server versions: 8.2 - 16 * Spark versions: 2.3.x - 3.5.x * Java versions: 8 - 20 From 8f7f62fe8af5d95c961886ca64209377aeb0660b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Mon, 29 Jul 2024 13:03:26 +0000 Subject: [PATCH 30/64] Update test matrix --- .github/workflows/data/clickhouse/matrix.yml | 29 +++++++------------- .github/workflows/data/greenplum/matrix.yml | 27 ++++++------------ .github/workflows/data/hdfs/matrix.yml | 11 ++------ .github/workflows/data/kafka/matrix.yml | 11 ++------ 4 files changed, 25 insertions(+), 53 deletions(-) diff --git a/.github/workflows/data/clickhouse/matrix.yml b/.github/workflows/data/clickhouse/matrix.yml index 928d315e..6f1d7261 100644 --- a/.github/workflows/data/clickhouse/matrix.yml +++ b/.github/workflows/data/clickhouse/matrix.yml @@ -1,4 +1,7 @@ min: &min + # 
Clickhouse version with proper DateTime > DateTime64 comparison + clickhouse-image: yandex/clickhouse-server + clickhouse-version: '21.1' spark-version: 2.3.1 pydantic-version: 1 python-version: '3.7' @@ -6,6 +9,8 @@ min: &min os: ubuntu-latest max: &max + clickhouse-image: clickhouse/clickhouse-server + clickhouse-version: 24.6.3.70-alpine spark-version: 3.5.1 pydantic-version: 2 python-version: '3.12' @@ -13,6 +18,8 @@ max: &max os: ubuntu-latest latest: &latest + clickhouse-image: clickhouse/clickhouse-server + clickhouse-version: latest-alpine spark-version: latest pydantic-version: latest python-version: '3.12' @@ -20,22 +27,6 @@ latest: &latest os: ubuntu-latest matrix: - small: - - clickhouse-image: clickhouse/clickhouse-server - clickhouse-version: 24.6.3.70-alpine - <<: *max - full: - # Clickhouse version with proper DateTime > DateTime64 comparison - - clickhouse-image: yandex/clickhouse-server - clickhouse-version: '21.1' - <<: *min - - clickhouse-image: clickhouse/clickhouse-server - clickhouse-version: 24.6.3.70-alpine - <<: *max - nightly: - - clickhouse-image: yandex/clickhouse-server - clickhouse-version: '21.1' - <<: *min - - clickhouse-image: clickhouse/clickhouse-server - clickhouse-version: latest-alpine - <<: *latest + small: [*max] + full: [*min, *max] + nightly: [*min, *max, *latest] diff --git a/.github/workflows/data/greenplum/matrix.yml b/.github/workflows/data/greenplum/matrix.yml index 28ec20e7..0935a821 100644 --- a/.github/workflows/data/greenplum/matrix.yml +++ b/.github/workflows/data/greenplum/matrix.yml @@ -1,4 +1,6 @@ min: &min + greenplum-version: 6.23.1 + package-version: 2.2.0 # Spark 2.3.0 does not support passing ivysettings.xml spark-version: 2.3.1 pydantic-version: 1 @@ -7,6 +9,8 @@ min: &min os: ubuntu-latest max: &max + greenplum-version: 7.0.0 + package-version: 2.3.1 # Greenplum connector does not support Spark 3.3+ spark-version: 3.2.4 pydantic-version: 2 @@ -15,6 +19,8 @@ max: &max os: ubuntu-latest latest: &latest + greenplum-version: 7.0.0 + package-version: 2.3.1 # Greenplum connector does not support Spark 3.3+ spark-version: 3.2.4 pydantic-version: latest @@ -23,21 +29,6 @@ latest: &latest os: ubuntu-latest matrix: - small: - - greenplum-version: 7.0.0 - package-version: 2.3.1 - <<: *max - full: - - greenplum-version: 6.23.1 - package-version: 2.2.0 - <<: *min - - greenplum-version: 7.0.0 - package-version: 2.3.1 - <<: *max - nightly: - - greenplum-version: 6.23.1 - package-version: 2.2.0 - <<: *min - - greenplum-version: 7.0.0 - package-version: 2.3.1 - <<: *latest + small: [*max] + full: [*min, *max] + nightly: [*min, *max, *latest] diff --git a/.github/workflows/data/hdfs/matrix.yml b/.github/workflows/data/hdfs/matrix.yml index 6d8156c5..af4553f1 100644 --- a/.github/workflows/data/hdfs/matrix.yml +++ b/.github/workflows/data/hdfs/matrix.yml @@ -23,11 +23,6 @@ latest: &latest os: ubuntu-latest matrix: - small: - - *max - full: - - *min - - *max - nightly: - - *min - - *latest + small: [*max] + full: [*min, *max] + nightly: [*min, *max, *latest] diff --git a/.github/workflows/data/kafka/matrix.yml b/.github/workflows/data/kafka/matrix.yml index c5242cb5..1b9b2336 100644 --- a/.github/workflows/data/kafka/matrix.yml +++ b/.github/workflows/data/kafka/matrix.yml @@ -26,11 +26,6 @@ latest: &latest os: ubuntu-latest matrix: - small: - - *max - full: - - *min - - *max - nightly: - - *min - - *latest + small: [*max] + full: [*min, *max] + nightly: [*min, *max, *latest] From 9e955f7a083a3d3d8ef1dd828c93e64250a07aa1 Mon Sep 17 00:00:00 2001 
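The matrix refactoring above relies on plain YAML anchors and aliases: min, max and latest are anchored mappings, and the new flow-style lists such as "small: [*max]" or "nightly: [*min, *max, *latest]" resolve to the same fully populated entries as the previous block-style "<<: *max" merges, because the image/version keys were moved into the anchored mappings themselves. As a rough illustration only (not part of any patch in this series), the expansion can be checked with PyYAML; the file path below assumes the Clickhouse matrix shown above:

    # Sketch, assuming PyYAML is installed. This is effectively what
    # `yq -o=json '.matrix'` in get-matrix.yml sees: aliases already resolved.
    import json

    import yaml

    with open(".github/workflows/data/clickhouse/matrix.yml") as f:
        data = yaml.safe_load(f)

    # "small: [*max]" resolves to a one-element list containing the full
    # max mapping (clickhouse-image, clickhouse-version, spark-version, ...)
    print(json.dumps(data["matrix"]["small"], indent=2))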
From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Mon, 29 Jul 2024 14:23:31 +0000 Subject: [PATCH 31/64] Fix mypy warnings --- onetl/hooks/hook.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onetl/hooks/hook.py b/onetl/hooks/hook.py index 45200384..d49297f1 100644 --- a/onetl/hooks/hook.py +++ b/onetl/hooks/hook.py @@ -340,7 +340,7 @@ def __exit__(self, exc_type, value, traceback): # noqa: WPS231 raise raise RuntimeError("generator didn't stop after throw()") - def process_result(self, result: T) -> T | None: + def process_result(self, result): """ Handle original method call result, and return new value. From 31ee03fadd1153c22676433a3d567c9ec2dd994d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 29 Jul 2024 21:16:51 +0000 Subject: [PATCH 32/64] [pre-commit.ci] pre-commit autoupdate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/asottile/pyupgrade: v3.16.0 → v3.17.0](https://github.com/asottile/pyupgrade/compare/v3.16.0...v3.17.0) --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f0c94a33..d4f50d79 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -90,7 +90,7 @@ repos: - id: text-unicode-replacement-char - repo: https://github.com/asottile/pyupgrade - rev: v3.16.0 + rev: v3.17.0 hooks: - id: pyupgrade args: [--py37-plus, --keep-runtime-typing] From 4eac4668a1164d3daa8282b50a4a76b94f11fa2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Tue, 30 Jul 2024 07:42:09 +0000 Subject: [PATCH 33/64] Fix documentation build --- requirements/docs.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/docs.txt b/requirements/docs.txt index 154dbd31..a840f3da 100644 --- a/requirements/docs.txt +++ b/requirements/docs.txt @@ -1,4 +1,4 @@ -autodoc-pydantic<2 +autodoc-pydantic furo importlib-resources<6 numpydoc From 8d64cfbe111b2f8c528cdc6580b04564d5f841ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Tue, 30 Jul 2024 07:45:05 +0000 Subject: [PATCH 34/64] Fix documentation build --- requirements/docs.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements/docs.txt b/requirements/docs.txt index a840f3da..5ba51314 100644 --- a/requirements/docs.txt +++ b/requirements/docs.txt @@ -1,9 +1,9 @@ -autodoc-pydantic +autodoc-pydantic<2 furo importlib-resources<6 numpydoc pygments-csv-lexer -sphinx +sphinx<8 sphinx-copybutton sphinx-design sphinx-favicon From 48e6a64f333b85cf735f4e32d47e79df89d89877 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Tue, 30 Jul 2024 07:50:10 +0000 Subject: [PATCH 35/64] Fix documentation build --- .readthedocs.yml | 3 +++ requirements/docs.txt | 2 ++ 2 files changed, 5 insertions(+) diff --git 
a/.readthedocs.yml b/.readthedocs.yml index a0fc204d..aa073dab 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -11,6 +11,9 @@ build: - python -m pip install --exists-action=w --no-cache-dir --no-deps sphinx-plantuml # remove after https://github.com/zqmillet/sphinx-plantuml/pull/4 - python -m pip install --exists-action=w --no-cache-dir -r requirements/docs.txt - python -m pip install --exists-action=w --upgrade --upgrade-strategy only-if-needed --no-cache-dir .[ftp,ftps,hdfs,samba,s3,sftp,webdav,spark] + post_install: + # TODO: remove after upgrading autodoc-pydantic to v2 + - python -m pip install --exists-action=w --no-cache-dir "sphinx<8" # TODO: uncomment after https://github.com/zqmillet/sphinx-plantuml/pull/4 #python: diff --git a/requirements/docs.txt b/requirements/docs.txt index 5ba51314..03da763e 100644 --- a/requirements/docs.txt +++ b/requirements/docs.txt @@ -1,8 +1,10 @@ +# TODO: remove version limit after upgrading all Pydantic models to v2 autodoc-pydantic<2 furo importlib-resources<6 numpydoc pygments-csv-lexer +# TODO: remove version limit after upgrading autodoc-pydantic to v2 sphinx<8 sphinx-copybutton sphinx-design From e0f1555db01452bd13546140b007bf4a604c7027 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 5 Aug 2024 06:15:05 +0000 Subject: [PATCH 36/64] Bump mikefarah/yq from 4.44.2 to 4.44.3 in the github-actions group Bumps the github-actions group with 1 update: [mikefarah/yq](https://github.com/mikefarah/yq). Updates `mikefarah/yq` from 4.44.2 to 4.44.3 - [Release notes](https://github.com/mikefarah/yq/releases) - [Changelog](https://github.com/mikefarah/yq/blob/master/release_notes.txt) - [Commits](https://github.com/mikefarah/yq/compare/v4.44.2...v4.44.3) --- updated-dependencies: - dependency-name: mikefarah/yq dependency-type: direct:production update-type: version-update:semver-patch dependency-group: github-actions ... 
Signed-off-by: dependabot[bot] --- .github/workflows/get-matrix.yml | 38 ++++++++++++++++---------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/.github/workflows/get-matrix.yml b/.github/workflows/get-matrix.yml index 5963ff9c..eba22eaf 100644 --- a/.github/workflows/get-matrix.yml +++ b/.github/workflows/get-matrix.yml @@ -154,7 +154,7 @@ jobs: - name: Get Core matrix id: matrix-core - uses: mikefarah/yq@v4.44.2 + uses: mikefarah/yq@v4.44.3 with: cmd: yq -o=json '.matrix' .github/workflows/data/core/matrix.yml @@ -184,7 +184,7 @@ jobs: - name: Get Clickhouse matrix id: matrix-clickhouse - uses: mikefarah/yq@v4.44.2 + uses: mikefarah/yq@v4.44.3 with: cmd: yq -o=json '.matrix' .github/workflows/data/clickhouse/matrix.yml @@ -214,7 +214,7 @@ jobs: - name: Get Greenplum matrix id: matrix-greenplum - uses: mikefarah/yq@v4.44.2 + uses: mikefarah/yq@v4.44.3 with: cmd: yq -o=json '.matrix' .github/workflows/data/greenplum/matrix.yml @@ -244,7 +244,7 @@ jobs: - name: Get Hive matrix id: matrix-hive - uses: mikefarah/yq@v4.44.2 + uses: mikefarah/yq@v4.44.3 with: cmd: yq -o=json '.matrix' .github/workflows/data/hive/matrix.yml @@ -274,7 +274,7 @@ jobs: - name: Get Kafka matrix id: matrix-kafka - uses: mikefarah/yq@v4.44.2 + uses: mikefarah/yq@v4.44.3 with: cmd: yq -o=json '.matrix' .github/workflows/data/kafka/matrix.yml @@ -304,7 +304,7 @@ jobs: - name: Get LocalFS matrix id: matrix-local-fs - uses: mikefarah/yq@v4.44.2 + uses: mikefarah/yq@v4.44.3 with: cmd: yq -o=json '.matrix' .github/workflows/data/local-fs/matrix.yml @@ -334,7 +334,7 @@ jobs: - name: Get MongoDB matrix id: matrix-mongodb - uses: mikefarah/yq@v4.44.2 + uses: mikefarah/yq@v4.44.3 with: cmd: yq -o=json '.matrix' .github/workflows/data/mongodb/matrix.yml @@ -364,7 +364,7 @@ jobs: - name: Get MSSQL matrix id: matrix-mssql - uses: mikefarah/yq@v4.44.2 + uses: mikefarah/yq@v4.44.3 with: cmd: yq -o=json '.matrix' .github/workflows/data/mssql/matrix.yml @@ -394,7 +394,7 @@ jobs: - name: Get MySQL matrix id: matrix-mysql - uses: mikefarah/yq@v4.44.2 + uses: mikefarah/yq@v4.44.3 with: cmd: yq -o=json '.matrix' .github/workflows/data/mysql/matrix.yml @@ -424,7 +424,7 @@ jobs: - name: Get Oracle matrix id: matrix-oracle - uses: mikefarah/yq@v4.44.2 + uses: mikefarah/yq@v4.44.3 with: cmd: yq -o=json '.matrix' .github/workflows/data/oracle/matrix.yml @@ -454,7 +454,7 @@ jobs: - name: Get Postgres matrix id: matrix-postgres - uses: mikefarah/yq@v4.44.2 + uses: mikefarah/yq@v4.44.3 with: cmd: yq -o=json '.matrix' .github/workflows/data/postgres/matrix.yml @@ -484,7 +484,7 @@ jobs: - name: Get Teradata matrix id: matrix-teradata - uses: mikefarah/yq@v4.44.2 + uses: mikefarah/yq@v4.44.3 with: cmd: yq -o=json '.matrix' .github/workflows/data/teradata/matrix.yml @@ -514,7 +514,7 @@ jobs: - name: Get FTP matrix id: matrix-ftp - uses: mikefarah/yq@v4.44.2 + uses: mikefarah/yq@v4.44.3 with: cmd: yq -o=json '.matrix' .github/workflows/data/ftp/matrix.yml @@ -544,7 +544,7 @@ jobs: - name: Get FTPS matrix id: matrix-ftps - uses: mikefarah/yq@v4.44.2 + uses: mikefarah/yq@v4.44.3 with: cmd: yq -o=json '.matrix' .github/workflows/data/ftps/matrix.yml @@ -574,7 +574,7 @@ jobs: - name: Get HDFS matrix id: matrix-hdfs - uses: mikefarah/yq@v4.44.2 + uses: mikefarah/yq@v4.44.3 with: cmd: yq -o=json '.matrix' .github/workflows/data/hdfs/matrix.yml @@ -604,7 +604,7 @@ jobs: - name: Get S3 matrix id: matrix-s3 - uses: mikefarah/yq@v4.44.2 + uses: mikefarah/yq@v4.44.3 with: cmd: yq -o=json '.matrix' .github/workflows/data/s3/matrix.yml @@ 
-634,7 +634,7 @@ jobs: - name: Get SFTP matrix id: matrix-sftp - uses: mikefarah/yq@v4.44.2 + uses: mikefarah/yq@v4.44.3 with: cmd: yq -o=json '.matrix' .github/workflows/data/sftp/matrix.yml @@ -664,7 +664,7 @@ jobs: - name: Get Samba matrix id: matrix-samba - uses: mikefarah/yq@v4.44.2 + uses: mikefarah/yq@v4.44.3 with: cmd: yq -o=json '.matrix' .github/workflows/data/samba/matrix.yml @@ -694,6 +694,6 @@ jobs: - name: Get WebDAV matrix id: matrix-webdav - uses: mikefarah/yq@v4.44.2 + uses: mikefarah/yq@v4.44.3 with: cmd: yq -o=json '.matrix' .github/workflows/data/webdav/matrix.yml From 411f58cff049e6f22dfb3a87a4ad7d64b3d5898e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Mon, 5 Aug 2024 14:41:25 +0000 Subject: [PATCH 37/64] Fix documentation build --- requirements/docs.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements/docs.txt b/requirements/docs.txt index 03da763e..be2cd127 100644 --- a/requirements/docs.txt +++ b/requirements/docs.txt @@ -16,4 +16,5 @@ sphinx-tabs sphinx-toolbox sphinx_substitution_extensions sphinxcontrib-towncrier -towncrier +# TODO: remove upper limit after https://github.com/sphinx-contrib/sphinxcontrib-towncrier/issues/92 +towncrier<24.7 From c4a9cb895e02f2e50be988093c2fa915449e60df Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 5 Aug 2024 21:53:19 +0000 Subject: [PATCH 38/64] [pre-commit.ci] pre-commit autoupdate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/psf/black: 24.4.2 → 24.8.0](https://github.com/psf/black/compare/24.4.2...24.8.0) --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d4f50d79..8d9215d3 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -101,7 +101,7 @@ repos: - id: add-trailing-comma - repo: https://github.com/psf/black - rev: 24.4.2 + rev: 24.8.0 hooks: - id: black language_version: python3 From abff632d9ed44626dcdd8275b9ca8429e1b71af4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Wed, 7 Aug 2024 20:36:58 +0000 Subject: [PATCH 39/64] [DOP-18570] Implement SparkMetricsRecorder --- .github/workflows/data/file-df/tracked.txt | 6 +- onetl/_metrics/__init__.py | 17 ++ onetl/_metrics/command.py | 57 +++++ onetl/_metrics/driver.py | 39 ++++ onetl/_metrics/executor.py | 54 +++++ onetl/_metrics/extract.py | 113 ++++++++++ onetl/_metrics/input.py | 55 +++++ onetl/_metrics/listener/__init__.py | 29 +++ onetl/_metrics/listener/base.py | 178 +++++++++++++++ onetl/_metrics/listener/execution.py | 109 ++++++++++ onetl/_metrics/listener/job.py | 87 ++++++++ onetl/_metrics/listener/listener.py | 133 ++++++++++++ onetl/_metrics/listener/stage.py | 66 ++++++ onetl/_metrics/listener/task.py | 94 ++++++++ onetl/_metrics/output.py | 50 +++++ onetl/_metrics/recorder.py | 30 +++ onetl/_util/java.py | 34 +++ onetl/_util/scala.py | 7 + onetl/strategy/hwm_store/__init__.py | 2 +- onetl/version.py | 2 +- setup.cfg | 7 +- tests/.coveragerc | 1 + tests/fixtures/global_hwm_store.py | 2 +- tests/fixtures/processing/fixtures.py | 10 +- 
.../test_spark_metrics_recorder_file_df.py | 171 +++++++++++++++ .../test_spark_metrics_recorder_hive.py | 159 ++++++++++++++ .../test_spark_metrics_recorder_postgres.py | 205 ++++++++++++++++++ .../test_spark_command_metrics.py | 70 ++++++ .../test_metrics/test_spark_driver_metrics.py | 22 ++ .../test_spark_executor_metrics.py | 58 +++++ .../test_metrics/test_spark_input_metrics.py | 50 +++++ .../test_metrics/test_spark_output_metrics.py | 46 ++++ 32 files changed, 1952 insertions(+), 11 deletions(-) create mode 100644 onetl/_metrics/__init__.py create mode 100644 onetl/_metrics/command.py create mode 100644 onetl/_metrics/driver.py create mode 100644 onetl/_metrics/executor.py create mode 100644 onetl/_metrics/extract.py create mode 100644 onetl/_metrics/input.py create mode 100644 onetl/_metrics/listener/__init__.py create mode 100644 onetl/_metrics/listener/base.py create mode 100644 onetl/_metrics/listener/execution.py create mode 100644 onetl/_metrics/listener/job.py create mode 100644 onetl/_metrics/listener/listener.py create mode 100644 onetl/_metrics/listener/stage.py create mode 100644 onetl/_metrics/listener/task.py create mode 100644 onetl/_metrics/output.py create mode 100644 onetl/_metrics/recorder.py create mode 100644 tests/tests_integration/test_metrics/test_spark_metrics_recorder_file_df.py create mode 100644 tests/tests_integration/test_metrics/test_spark_metrics_recorder_hive.py create mode 100644 tests/tests_integration/test_metrics/test_spark_metrics_recorder_postgres.py create mode 100644 tests/tests_unit/test_metrics/test_spark_command_metrics.py create mode 100644 tests/tests_unit/test_metrics/test_spark_driver_metrics.py create mode 100644 tests/tests_unit/test_metrics/test_spark_executor_metrics.py create mode 100644 tests/tests_unit/test_metrics/test_spark_input_metrics.py create mode 100644 tests/tests_unit/test_metrics/test_spark_output_metrics.py diff --git a/.github/workflows/data/file-df/tracked.txt b/.github/workflows/data/file-df/tracked.txt index 880912b1..c1230737 100644 --- a/.github/workflows/data/file-df/tracked.txt +++ b/.github/workflows/data/file-df/tracked.txt @@ -1,6 +1,4 @@ .github/workflows/data/file-df/** -onetl/file_df_connection/spark_file_df_connection.py -onetl/file/file_df_reader/** -onetl/file/file_df_writer/** onetl/file/__init__.py -tests/resources/file_df_connection/** +**/*file_df* +**/*file_df*/** diff --git a/onetl/_metrics/__init__.py b/onetl/_metrics/__init__.py new file mode 100644 index 00000000..5d7482b6 --- /dev/null +++ b/onetl/_metrics/__init__.py @@ -0,0 +1,17 @@ +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 +from onetl._metrics.command import SparkCommandMetrics +from onetl._metrics.driver import SparkDriverMetrics +from onetl._metrics.executor import SparkExecutorMetrics +from onetl._metrics.input import SparkInputMetrics +from onetl._metrics.output import SparkOutputMetrics +from onetl._metrics.recorder import SparkMetricsRecorder + +__all__ = [ + "SparkCommandMetrics", + "SparkDriverMetrics", + "SparkMetricsRecorder", + "SparkExecutorMetrics", + "SparkInputMetrics", + "SparkOutputMetrics", +] diff --git a/onetl/_metrics/command.py b/onetl/_metrics/command.py new file mode 100644 index 00000000..2a8a53c6 --- /dev/null +++ b/onetl/_metrics/command.py @@ -0,0 +1,57 @@ +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import os +import textwrap + +try: + from pydantic.v1 import 
Field +except (ImportError, AttributeError): + from pydantic import Field # type: ignore[no-redef, assignment] + +from onetl._metrics.driver import SparkDriverMetrics +from onetl._metrics.executor import SparkExecutorMetrics +from onetl._metrics.input import SparkInputMetrics +from onetl._metrics.output import SparkOutputMetrics +from onetl.impl import BaseModel + +INDENT = " " * 4 + + +class SparkCommandMetrics(BaseModel): + input: SparkInputMetrics = Field(default_factory=SparkInputMetrics) + output: SparkOutputMetrics = Field(default_factory=SparkOutputMetrics) + driver: SparkDriverMetrics = Field(default_factory=SparkDriverMetrics) + executor: SparkExecutorMetrics = Field(default_factory=SparkExecutorMetrics) + + @property + def is_empty(self) -> bool: + return all([self.input.is_empty, self.output.is_empty]) + + def update(self, other: SparkCommandMetrics) -> SparkCommandMetrics: + self.input.update(other.input) + self.output.update(other.output) + self.driver.update(other.driver) + self.executor.update(other.executor) + return self + + @property + def details(self) -> str: + if self.is_empty: + return "No data" + + result = [] + if not self.input.is_empty: + result.append(f"Input:{os.linesep}{textwrap.indent(self.input.details, INDENT)}") + if not self.output.is_empty: + result.append(f"Output:{os.linesep}{textwrap.indent(self.output.details, INDENT)}") + if not self.driver.is_empty: + result.append(f"Driver:{os.linesep}{textwrap.indent(self.driver.details, INDENT)}") + if not self.executor.is_empty: + result.append(f"Executor:{os.linesep}{textwrap.indent(self.executor.details, INDENT)}") + + return os.linesep.join(result) + + def __str__(self): + return self.details diff --git a/onetl/_metrics/driver.py b/onetl/_metrics/driver.py new file mode 100644 index 00000000..4e685719 --- /dev/null +++ b/onetl/_metrics/driver.py @@ -0,0 +1,39 @@ +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import os + +from humanize import naturalsize + +from onetl.impl import BaseModel + +# Metrics themselves are considered a part of driver result, +# ignore if result is smaller than 1MB +MIN_DRIVER_BYTES = 1_000_000 + + +class SparkDriverMetrics(BaseModel): + in_memory_bytes: int = 0 + + @property + def is_empty(self) -> bool: + return self.in_memory_bytes < MIN_DRIVER_BYTES + + def update(self, other: SparkDriverMetrics) -> SparkDriverMetrics: + self.in_memory_bytes += other.in_memory_bytes + return self + + @property + def details(self) -> str: + if self.is_empty: + return "No data" + + result = [] + if self.in_memory_bytes >= MIN_DRIVER_BYTES: + result.append(f"In-memory data (approximate): {naturalsize(self.in_memory_bytes)}") + + return os.linesep.join(result) + + def __str__(self): + return self.details diff --git a/onetl/_metrics/executor.py b/onetl/_metrics/executor.py new file mode 100644 index 00000000..3fd6f3fc --- /dev/null +++ b/onetl/_metrics/executor.py @@ -0,0 +1,54 @@ +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import os +from datetime import timedelta + +from humanize import naturalsize, precisedelta + +from onetl.impl import BaseModel + + +class SparkExecutorMetrics(BaseModel): + total_run_time: timedelta = timedelta() + total_cpu_time: timedelta = timedelta() + peak_memory_bytes: int = 0 + memory_spilled_bytes: int = 0 + disk_spilled_bytes: int = 0 + + @property + def is_empty(self) -> bool: + return 
not self.total_run_time + + def update(self, other: SparkExecutorMetrics) -> SparkExecutorMetrics: + self.total_run_time += other.total_run_time + self.total_cpu_time += other.total_cpu_time + self.peak_memory_bytes += other.peak_memory_bytes + self.memory_spilled_bytes += other.memory_spilled_bytes + self.disk_spilled_bytes += other.disk_spilled_bytes + return self + + @property + def details(self) -> str: + if self.is_empty: + return "No data" + + result = [ + f"Total run time: {precisedelta(self.total_run_time)}", + f"Total CPU time: {precisedelta(self.total_cpu_time)}", + ] + + if self.peak_memory_bytes: + result.append(f"Peak memory: {naturalsize(self.peak_memory_bytes)}") + + if self.memory_spilled_bytes: + result.append(f"Memory spilled: {naturalsize(self.memory_spilled_bytes)}") + + if self.disk_spilled_bytes: + result.append(f"Disk spilled: {naturalsize(self.disk_spilled_bytes)}") + + return os.linesep.join(result) + + def __str__(self): + return self.details diff --git a/onetl/_metrics/extract.py b/onetl/_metrics/extract.py new file mode 100644 index 00000000..4789d8fd --- /dev/null +++ b/onetl/_metrics/extract.py @@ -0,0 +1,113 @@ +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import re +from datetime import timedelta +from typing import Any + +try: + from pydantic.v1 import ByteSize +except (ImportError, AttributeError): + from pydantic import ByteSize # type: ignore[no-redef, assignment] + +from onetl._metrics.command import SparkCommandMetrics +from onetl._metrics.driver import SparkDriverMetrics +from onetl._metrics.executor import SparkExecutorMetrics +from onetl._metrics.input import SparkInputMetrics +from onetl._metrics.listener.execution import ( + SparkListenerExecution, + SparkSQLMetricNames, +) +from onetl._metrics.output import SparkOutputMetrics + +# in some cases byte metrics have format "7.6 MiB", but sometimes it is: +# total (min, med, max (stageId: taskId))\n7.6 MiB (0.0 B, 7.6 MiB, 7.6 MiB (driver)) +NON_BYTE_SIZE = re.compile(r"^[^\d.]+|\(.*\)", flags=re.DOTALL) + + +def _get_int(data: dict[SparkSQLMetricNames, list[str]], key: Any) -> int | None: + try: + return int(data[key][0]) + except Exception: + return None + + +def _get_bytes(data: dict[SparkSQLMetricNames, list[str]], key: Any) -> int | None: + try: + raw_value = data[key][0] + normalized_value = NON_BYTE_SIZE.sub("", raw_value) + return int(ByteSize.validate(normalized_value)) + except Exception: + return None + + +def extract_metrics_from_execution(execution: SparkListenerExecution) -> SparkCommandMetrics: + input_read_bytes: int = 0 + input_read_rows: int = 0 + output_bytes: int = 0 + output_rows: int = 0 + + run_time_milliseconds: int = 0 + cpu_time_nanoseconds: int = 0 + peak_memory_bytes: int = 0 + memory_spilled_bytes: int = 0 + disk_spilled_bytes: int = 0 + result_size_bytes: int = 0 + + # some metrics are per-stage, and have to be summed, others are per-execution + for job in execution.jobs: + for stage in job.stages: + input_read_bytes += stage.metrics.input_metrics.bytes_read + input_read_rows += stage.metrics.input_metrics.records_read + output_bytes += stage.metrics.output_metrics.bytes_written + output_rows += stage.metrics.output_metrics.records_written + + run_time_milliseconds += stage.metrics.executor_run_time_milliseconds + cpu_time_nanoseconds += stage.metrics.executor_cpu_time_nanoseconds + peak_memory_bytes += stage.metrics.peak_execution_memory_bytes + memory_spilled_bytes += 
stage.metrics.memory_spilled_bytes + disk_spilled_bytes += stage.metrics.disk_spilled_bytes + result_size_bytes += stage.metrics.result_size_bytes + + # https://github.com/apache/spark/blob/v3.5.1/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala#L467-L473 + input_file_count = ( + _get_int(execution.metrics, SparkSQLMetricNames.NUMBER_OF_FILES_READ) + or _get_int(execution.metrics, SparkSQLMetricNames.STATIC_NUMBER_OF_FILES_READ) + or 0 + ) + input_raw_file_bytes = ( + _get_bytes(execution.metrics, SparkSQLMetricNames.SIZE_OF_FILES_READ) + or _get_bytes(execution.metrics, SparkSQLMetricNames.STATIC_SIZE_OF_FILES_READ) + or 0 + ) + input_read_partitions = _get_int(execution.metrics, SparkSQLMetricNames.NUMBER_OF_PARTITIONS_READ) or 0 + + output_files = _get_int(execution.metrics, SparkSQLMetricNames.NUMBER_OF_WRITTEN_FILES) or 0 + output_dynamic_partitions = _get_int(execution.metrics, SparkSQLMetricNames.NUMBER_OF_DYNAMIC_PART) or 0 + + return SparkCommandMetrics( + input=SparkInputMetrics( + read_rows=input_read_rows, + read_files=input_file_count, + read_bytes=input_read_bytes, + raw_file_bytes=input_raw_file_bytes, + read_partitions=input_read_partitions, + ), + output=SparkOutputMetrics( + written_rows=output_rows, + written_bytes=output_bytes, + created_files=output_files, + created_partitions=output_dynamic_partitions, + ), + driver=SparkDriverMetrics( + in_memory_bytes=result_size_bytes, + ), + executor=SparkExecutorMetrics( + total_run_time=timedelta(milliseconds=run_time_milliseconds), + total_cpu_time=timedelta(microseconds=cpu_time_nanoseconds / 1000), + peak_memory_bytes=peak_memory_bytes, + memory_spilled_bytes=memory_spilled_bytes, + disk_spilled_bytes=disk_spilled_bytes, + ), + ) diff --git a/onetl/_metrics/input.py b/onetl/_metrics/input.py new file mode 100644 index 00000000..39061311 --- /dev/null +++ b/onetl/_metrics/input.py @@ -0,0 +1,55 @@ +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import os +from pprint import pformat + +from humanize import naturalsize + +from onetl.impl import BaseModel + + +class SparkInputMetrics(BaseModel): + read_rows: int = 0 + read_files: int = 0 + read_partitions: int = 0 + read_bytes: int = 0 + raw_file_bytes: int = 0 + + @property + def is_empty(self) -> bool: + return not any([self.read_bytes, self.read_files, self.read_rows]) + + def update(self, other: SparkInputMetrics) -> SparkInputMetrics: + self.read_rows += other.read_rows + self.read_files += other.read_files + self.read_partitions += other.read_partitions + self.read_bytes += other.read_bytes + self.raw_file_bytes += other.raw_file_bytes + return self + + @property + def details(self) -> str: + if self.is_empty: + return "No data" + + result = [] + result.append(f"Read rows: {pformat(self.read_rows)}") + + if self.read_partitions: + result.append(f"Read partitions: {pformat(self.read_partitions)}") + + if self.read_files: + result.append(f"Read files: {pformat(self.read_files)}") + + if self.read_bytes: + result.append(f"Read size: {naturalsize(self.read_bytes)}") + + if self.raw_file_bytes and self.read_bytes != self.raw_file_bytes: + result.append(f"Raw files size: {naturalsize(self.raw_file_bytes)}") + + return os.linesep.join(result) + + def __str__(self): + return self.details diff --git a/onetl/_metrics/listener/__init__.py b/onetl/_metrics/listener/__init__.py new file mode 100644 index 00000000..112e4fba --- /dev/null +++ 
b/onetl/_metrics/listener/__init__.py @@ -0,0 +1,29 @@ +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 +from onetl._metrics.listener.execution import ( + SparkListenerExecution, + SparkListenerExecutionStatus, + SparkSQLMetricNames, +) +from onetl._metrics.listener.job import SparkListenerJob, SparkListenerJobStatus +from onetl._metrics.listener.listener import SparkMetricsListener +from onetl._metrics.listener.stage import SparkListenerStage, SparkListenerStageStatus +from onetl._metrics.listener.task import ( + SparkListenerTask, + SparkListenerTaskMetrics, + SparkListenerTaskStatus, +) + +__all__ = [ + "SparkListenerTask", + "SparkListenerTaskStatus", + "SparkListenerTaskMetrics", + "SparkListenerStage", + "SparkListenerStageStatus", + "SparkListenerJob", + "SparkListenerJobStatus", + "SparkListenerExecution", + "SparkListenerExecutionStatus", + "SparkSQLMetricNames", + "SparkMetricsListener", +] diff --git a/onetl/_metrics/listener/base.py b/onetl/_metrics/listener/base.py new file mode 100644 index 00000000..90432c7c --- /dev/null +++ b/onetl/_metrics/listener/base.py @@ -0,0 +1,178 @@ +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +from contextlib import suppress +from dataclasses import dataclass +from typing import TYPE_CHECKING + +from onetl._util.java import get_java_gateway, start_callback_server + +if TYPE_CHECKING: + from pyspark.sql import SparkSession + + +@dataclass +class BaseSparkListener: + """Base no-op SparkListener implementation. + + See `SparkListener `_ interface. + """ + + spark: SparkSession + + def activate(self): + start_callback_server(self.spark) + + # passing python listener object directly to addSparkListener or removeSparkListener leads to creating new java object each time. + # But removeSparkListener call has effect only on the same Java object passed to removeSparkListener. + # So we need to explicitly create Java object, and then pass it both calls. + gateway = get_java_gateway(self.spark) + java_list = gateway.jvm.java.util.ArrayList() + java_list.append(self) + self._java_listener = java_list[0] + + spark_context = self.spark.sparkContext._jsc.sc() # noqa: WPS437 + spark_context.addSparkListener(self._java_listener) + + def deactivate(self): + with suppress(Exception): + spark_context = self.spark.sparkContext._jsc.sc() # noqa: WPS437 + spark_context.removeSparkListener(self._java_listener) + + with suppress(Exception): + del self._java_listener + + def __enter__(self): + self.activate() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.deactivate() + + def __del__(self): # noqa: WPS603 + # If current object is collected by GC, deactivate listener + # and free bind Java object + self.deactivate() + + def equals(self, other): + # Java does not provide proper way to get object id for comparison, + # so we compare string representation which should contain some form of id + return other.toString() == self._java_listener.toString() + + def toString(self): + return type(self).__qualname__ + "@" + hex(id(self)) + + def hashCode(self): + return hash(self) + + # no cover: start + # method names are important for Java interface compatibility! 
+ def onApplicationEnd(self, application): + pass + + def onApplicationStart(self, application): + pass + + def onBlockManagerAdded(self, block_manager): + pass + + def onBlockManagerRemoved(self, block_manager): + pass + + def onBlockUpdated(self, block): + pass + + def onEnvironmentUpdate(self, environment): + pass + + def onExecutorAdded(self, executor): + pass + + def onExecutorMetricsUpdate(self, executor): + pass + + def onExecutorRemoved(self, executor): + pass + + def onExecutorBlacklisted(self, event): + pass + + def onExecutorBlacklistedForStage(self, event): + pass + + def onExecutorExcluded(self, event): + pass + + def onExecutorExcludedForStage(self, event): + pass + + def onExecutorUnblacklisted(self, event): + pass + + def onExecutorUnexcluded(self, event): + pass + + def onJobStart(self, event): + pass + + def onJobEnd(self, event): + pass + + def onNodeBlacklisted(self, node): + pass + + def onNodeBlacklistedForStage(self, stage): + pass + + def onNodeExcluded(self, node): + pass + + def onNodeExcludedForStage(self, node): + pass + + def onNodeUnblacklisted(self, node): + pass + + def onNodeUnexcluded(self, node): + pass + + def onOtherEvent(self, event): + pass + + def onResourceProfileAdded(self, resource_profile): + pass + + def onSpeculativeTaskSubmitted(self, task): + pass + + def onStageCompleted(self, event): + pass + + def onStageExecutorMetrics(self, metrics): + pass + + def onStageSubmitted(self, event): + pass + + def onTaskEnd(self, event): + pass + + def onTaskGettingResult(self, task): + pass + + def onTaskStart(self, event): + pass + + def onUnpersistRDD(self, rdd): + pass + + def onUnschedulableTaskSetAdded(self, task_set): + pass + + def onUnschedulableTaskSetRemoved(self, task_set): + pass + + # no cover: stop + class Java: + implements = ["org.apache.spark.scheduler.SparkListenerInterface"] diff --git a/onetl/_metrics/listener/execution.py b/onetl/_metrics/listener/execution.py new file mode 100644 index 00000000..728c4c2c --- /dev/null +++ b/onetl/_metrics/listener/execution.py @@ -0,0 +1,109 @@ +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +from collections import defaultdict +from dataclasses import dataclass, field +from enum import Enum + +from onetl._metrics.listener.job import SparkListenerJob, SparkListenerJobStatus + + +class SparkListenerExecutionStatus(str, Enum): + STARTED = "STARTED" + COMPLETE = "COMPLETE" + FAILED = "FAILED" + + def __str__(self): + return self.value + + +class SparkSQLMetricNames(str, Enum): # noqa: WPS338 + # Metric names passed to SQLMetrics.createMetric(...) + # But only those we're interested in. 
+ + # https://github.com/apache/spark/blob/v3.5.1/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala#L233C55-L233C87 + NUMBER_OF_PARTITIONS_READ = "number of partitions read" + + # https://github.com/apache/spark/blob/v3.5.1/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala#L225-L227 + NUMBER_OF_FILES_READ = "number of files read" + SIZE_OF_FILES_READ = "size of files read" + + # https://github.com/apache/spark/blob/v3.5.1/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala#L455-L456 + STATIC_NUMBER_OF_FILES_READ = "static number of files read" + STATIC_SIZE_OF_FILES_READ = "static size of files read" + + # https://github.com/apache/spark/blob/v3.5.1/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/BasicWriteStatsTracker.scala#L241-L246 + NUMBER_OF_DYNAMIC_PART = "number of dynamic part" + NUMBER_OF_WRITTEN_FILES = "number of written files" + + def __str__(self): + return self.value + + +@dataclass +class SparkListenerExecution: + id: int + description: str | None = None + external_id: str | None = None + status: SparkListenerExecutionStatus = SparkListenerExecutionStatus.STARTED + + # These metrics are emitted by any command performed within this execution, so we can have multiple values. + # Some metrics can be summarized, but some not, so we store a list. + metrics: dict[SparkSQLMetricNames, list[str]] = field(default_factory=lambda: defaultdict(list), repr=False) + + _jobs: dict[int, SparkListenerJob] = field(default_factory=dict, repr=False, init=False) + + @property + def jobs(self) -> list[SparkListenerJob]: + result = [] + for job_id in sorted(self._jobs.keys()): + result.append(self._jobs[job_id]) + return result + + def on_execution_start(self, event): + # https://github.com/apache/spark/blob/v3.5.1/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLListener.scala#L44-L58 + self.status = SparkListenerExecutionStatus.STARTED + + def on_execution_end(self, event): + # https://github.com/apache/spark/blob/v3.5.1/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLListener.scala#L61-L83 + for job in self._jobs.values(): + if job.status == SparkListenerJobStatus.FAILED: + self.status = SparkListenerExecutionStatus.FAILED + break + else: + self.status = SparkListenerExecutionStatus.COMPLETE + + def on_job_start(self, event): + job_id = event.jobId() + job = SparkListenerJob.create(event) + self._jobs[job_id] = job + job.on_job_start(event) + + def on_job_end(self, event): + job_id = event.jobId() + job = self._jobs.get(job_id) + + if job: + job.on_job_end(event) + + # in some cases Execution consists of just one job with same id + if job_id == self.id: + self.on_execution_end(event) + + # push down events + def on_stage_start(self, event): + for job in self._jobs.values(): + job.on_stage_start(event) + + def on_stage_end(self, event): + for job in self._jobs.values(): + job.on_stage_end(event) + + def on_task_start(self, event): + for job in self._jobs.values(): + job.on_task_start(event) + + def on_task_end(self, event): + for job in self._jobs.values(): + job.on_task_end(event) diff --git a/onetl/_metrics/listener/job.py b/onetl/_metrics/listener/job.py new file mode 100644 index 00000000..b3abbd06 --- /dev/null +++ b/onetl/_metrics/listener/job.py @@ -0,0 +1,87 @@ +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +from dataclasses import dataclass, field +from enum import 
Enum + +from onetl._metrics.listener.stage import SparkListenerStage, SparkListenerStageStatus +from onetl._util.scala import scala_seq_to_python_list + + +class SparkListenerJobStatus(str, Enum): + RUNNING = "RUNNING" + SUCCEEDED = "SUCCEEDED" + FAILED = "FAILED" + UNKNOWN = "UNKNOWN" + + def __str__(self): + return self.value + + +@dataclass +class SparkListenerJob: + id: int + description: str | None = None + group_id: str | None = None + call_site: str | None = None + status: SparkListenerJobStatus = SparkListenerJobStatus.UNKNOWN + + _stages: dict[int, SparkListenerStage] = field(default_factory=dict, repr=False, init=False) + + @property + def stages(self) -> list[SparkListenerStage]: + result = [] + for stage_id in sorted(self._stages.keys()): + result.append(self._stages[stage_id]) + return result + + @classmethod + def create(cls, event): + # https://spark.apache.org/docs/3.5.1/api/java/org/apache/spark/scheduler/SparkListenerJobSubmitted.html + # https://spark.apache.org/docs/3.5.1/api/java/org/apache/spark/scheduler/SparkListenerJobCompleted.html + result = cls( + id=event.jobId(), + description=event.properties().get("spark.job.description"), + group_id=event.properties().get("spark.jobGroup.id"), + call_site=event.properties().get("callSite.short"), + ) + + stage_ids = scala_seq_to_python_list(event.stageIds()) + stage_infos = scala_seq_to_python_list(event.stageInfos()) + for stage_id, stage_info in zip(stage_ids, stage_infos): + result._stages[stage_id] = SparkListenerStage.create(stage_info) # noqa: WPS437 + + return result + + def on_job_start(self, event): + self.status = SparkListenerJobStatus.RUNNING + + def on_job_end(self, event): + for stage in self._stages.values(): + if stage.status == SparkListenerStageStatus.FAILED: + self.status = SparkListenerJobStatus.FAILED + break + else: + self.status = SparkListenerJobStatus.SUCCEEDED + + def on_stage_start(self, event): + stage_id = event.stageInfo().stageId() + stage = self._stages.get(stage_id) + if stage: + stage.on_stage_start(event) + + def on_stage_end(self, event): + stage_id = event.stageInfo().stageId() + stage = self._stages.get(stage_id) + if stage: + stage.on_stage_end(event) + + # push down events + def on_task_start(self, event): + for stage in self._stages.values(): + stage.on_task_start(event) + + def on_task_end(self, event): + for stage in self._stages.values(): + stage.on_task_end(event) diff --git a/onetl/_metrics/listener/listener.py b/onetl/_metrics/listener/listener.py new file mode 100644 index 00000000..3421e5ae --- /dev/null +++ b/onetl/_metrics/listener/listener.py @@ -0,0 +1,133 @@ +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +from dataclasses import dataclass, field +from threading import current_thread +from typing import ClassVar + +from onetl._metrics.listener.base import BaseSparkListener +from onetl._metrics.listener.execution import ( + SparkListenerExecution, + SparkSQLMetricNames, +) + + +@dataclass +class SparkMetricsListener(BaseSparkListener): + THREAD_ID_KEY = "python.thread.id" + SQL_START_CLASS_NAME: ClassVar[str] = "org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart" + SQL_STOP_CLASS_NAME: ClassVar[str] = "org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd" + + _thread_id: str = field(default_factory=lambda: str(current_thread().ident), repr=False, init=False) + _recorded_executions: dict[int, SparkListenerExecution] = field(default_factory=dict, repr=False, 
init=False) + + def activate(self): + # we cannot override execution_id property as it set by Spark + # we also cannot use job tags, as they were implemented only in Spark 3.5+ + self.spark.sparkContext.setLocalProperty(self.THREAD_ID_KEY, self._thread_id) + return super().activate() + + def reset(self): + self._recorded_executions.clear() + return self + + @property + def executions(self): + return [ + execution for execution in self._recorded_executions.values() if execution.external_id == self._thread_id + ] + + def __enter__(self): + """Record only executions performed by current Spark thread. + + It is important to use this method only in combination with + :obj:`pyspark.util.InheritableThread` to preserve thread-local variables + between Python thread and Java thread. + """ + self.reset() + return super().__enter__() + + def onOtherEvent(self, event): + class_name = event.getClass().getName() + if class_name == self.SQL_START_CLASS_NAME: + self.onExecutionStart(event) + elif class_name == self.SQL_STOP_CLASS_NAME: + self.onExecutionEnd(event) + + def onExecutionStart(self, event): + execution_id = event.executionId() + description = event.description() + execution = SparkListenerExecution( + id=execution_id, + description=description, + ) + self._recorded_executions[execution_id] = execution + execution.on_execution_start(event) + + def onExecutionEnd(self, event): + execution_id = event.executionId() + execution = self._recorded_executions.get(execution_id) + if execution: + execution.on_execution_end(event) + + # Get execution metrics from SQLAppStatusStore, + # as SparkListenerSQLExecutionEnd event does not provide them: + # https://github.com/apache/spark/blob/v3.5.1/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusStore.scala + session_status_store = self.spark._jsparkSession.sharedState().statusStore() # noqa: WPS437 + raw_execution = session_status_store.execution(execution.id).get() + metrics = raw_execution.metrics() + metric_values = session_status_store.executionMetrics(execution.id) + for i in range(metrics.size()): + metric = metrics.apply(i) + metric_name = metric.name() + if metric_name not in SparkSQLMetricNames: + continue + metric_value = metric_values.get(metric.accumulatorId()) + if not metric_value.isDefined(): + continue + execution.metrics[SparkSQLMetricNames(metric_name)].append(metric_value.get()) + + def onJobStart(self, event): + execution_id = event.properties().get("spark.sql.execution.id") + execution_thread_id = event.properties().get(self.THREAD_ID_KEY) + if execution_id is None: + # single job execution + job_id = event.jobId() + execution = SparkListenerExecution( + id=job_id, + description=event.properties().get("spark.job.description"), + external_id=execution_thread_id, + ) + self._recorded_executions[job_id] = execution + else: + execution = self._recorded_executions.get(int(execution_id)) + if execution is None: + return + + if execution_thread_id: + # SparkListenerSQLExecutionStart does not have properties, but SparkListenerJobStart does, + # use it as a source of external_id + execution.external_id = execution_thread_id + + execution.on_job_start(event) + + def onJobEnd(self, event): + for execution in self._recorded_executions.values(): + execution.on_job_end(event) + + def onStageSubmitted(self, event): + for execution in self._recorded_executions.values(): + execution.on_stage_start(event) + + def onStageCompleted(self, event): + for execution in self._recorded_executions.values(): + execution.on_stage_end(event) + + 
def onTaskStart(self, event): + for execution in self._recorded_executions.values(): + execution.on_task_start(event) + + def onTaskEnd(self, event): + for execution in self._recorded_executions.values(): + execution.on_task_end(event) diff --git a/onetl/_metrics/listener/stage.py b/onetl/_metrics/listener/stage.py new file mode 100644 index 00000000..4bf4dffb --- /dev/null +++ b/onetl/_metrics/listener/stage.py @@ -0,0 +1,66 @@ +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +from dataclasses import dataclass, field +from enum import Enum + +from onetl._metrics.listener.task import SparkListenerTask, SparkListenerTaskMetrics + + +class SparkListenerStageStatus(str, Enum): + ACTIVE = "ACTIVE" + COMPLETE = "COMPLETE" + FAILED = "FAILED" + PENDING = "PENDING" + SKIPPED = "SKIPPED" + + def __str__(self): + return self.value + + +@dataclass +class SparkListenerStage: + # https://spark.apache.org/docs/3.5.1/api/java/org/apache/spark/scheduler/StageInfo.html + id: int + status: SparkListenerStageStatus = SparkListenerStageStatus.PENDING + metrics: SparkListenerTaskMetrics = field(default_factory=SparkListenerTaskMetrics, repr=False, init=False) + _tasks: dict[int, SparkListenerTask] = field(default_factory=dict, repr=False, init=False) + + @property + def tasks(self) -> list[SparkListenerTask]: + result = [] + for task_id in sorted(self._tasks.keys()): + result.append(self._tasks[task_id]) + return result + + @classmethod + def create(cls, stage_info): + return cls(id=stage_info.stageId()) + + def on_stage_start(self, event): + # https://spark.apache.org/docs/3.5.1/api/java/org/apache/spark/scheduler/SparkListenerStageSubmitted.html + self.status = SparkListenerStageStatus.ACTIVE + + def on_stage_end(self, event): + # https://spark.apache.org/docs/3.5.1/api/java/org/apache/spark/scheduler/SparkListenerStageCompleted.html + stage_info = event.stageInfo() + if stage_info.failureReason().isDefined(): + self.status = SparkListenerStageStatus.FAILED + elif not self.tasks: + self.status = SparkListenerStageStatus.SKIPPED + else: + self.status = SparkListenerStageStatus.COMPLETE + + self.metrics = SparkListenerTaskMetrics.create(stage_info.taskMetrics()) + + def on_task_start(self, event): + task_info = event.taskInfo() + task_id = task_info.taskId() + self._tasks[task_id] = SparkListenerTask.create(task_info) + + def on_task_end(self, event): + task_id = event.taskInfo().taskId() + task = self._tasks.get(task_id) + if task: + task.on_task_end(event) diff --git a/onetl/_metrics/listener/task.py b/onetl/_metrics/listener/task.py new file mode 100644 index 00000000..4b27ffcf --- /dev/null +++ b/onetl/_metrics/listener/task.py @@ -0,0 +1,94 @@ +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +from dataclasses import dataclass, field +from enum import Enum + + +class SparkListenerTaskStatus(str, Enum): + PENDING = "PENDING" + RUNNING = "RUNNING" + SUCCESS = "SUCCESS" + FAILED = "FAILED" + KILLED = "KILLED" + + def __str__(self): + return self.value + + +@dataclass +class SparkListenerTaskInputMetrics: + bytes_read: int = 0 + records_read: int = 0 + + @classmethod + def create(cls, task_input_metrics): + return cls( + bytes_read=task_input_metrics.bytesRead(), + records_read=task_input_metrics.recordsRead(), + ) + + +@dataclass +class SparkListenerTaskOutputMetrics: + bytes_written: int = 0 + records_written: int = 0 + + 
@classmethod + def create(cls, task_output_metrics): + return cls( + bytes_written=task_output_metrics.bytesWritten(), + records_written=task_output_metrics.recordsWritten(), + ) + + +@dataclass +class SparkListenerTaskMetrics: + """Python representation of Spark TaskMetrics object. + + See `documentation `_. + """ + + executor_run_time_milliseconds: int = 0 + executor_cpu_time_nanoseconds: int = 0 + peak_execution_memory_bytes: int = 0 + memory_spilled_bytes: int = 0 + disk_spilled_bytes: int = 0 + result_size_bytes: int = 0 + input_metrics: SparkListenerTaskInputMetrics = field(default_factory=SparkListenerTaskInputMetrics) + output_metrics: SparkListenerTaskOutputMetrics = field(default_factory=SparkListenerTaskOutputMetrics) + + @classmethod + def create(cls, task_metrics): + return cls( + executor_run_time_milliseconds=task_metrics.executorRunTime(), + executor_cpu_time_nanoseconds=task_metrics.executorCpuTime(), + peak_execution_memory_bytes=task_metrics.peakExecutionMemory(), + memory_spilled_bytes=task_metrics.memoryBytesSpilled(), + disk_spilled_bytes=task_metrics.diskBytesSpilled(), + result_size_bytes=task_metrics.resultSize(), + input_metrics=SparkListenerTaskInputMetrics.create(task_metrics.inputMetrics()), + output_metrics=SparkListenerTaskOutputMetrics.create(task_metrics.outputMetrics()), + ) + + +@dataclass +class SparkListenerTask: + id: int + status: SparkListenerTaskStatus = SparkListenerTaskStatus.PENDING + metrics: SparkListenerTaskMetrics | None = field(default=None, repr=False, init=False) + + @classmethod + def create(cls, task_info): + # https://spark.apache.org/docs/3.5.1/api/java/org/apache/spark/scheduler/TaskInfo.html + return cls(id=task_info.taskId()) + + def on_task_start(self, event): + # https://spark.apache.org/docs/3.5.1/api/java/org/apache/spark/scheduler/SparkListenerTaskStart.html + self.status = SparkListenerTaskStatus(event.taskInfo().status()) + + def on_task_end(self, event): + # https://spark.apache.org/docs/3.5.1/api/java/org/apache/spark/scheduler/SparkListenerTaskEnd.html + self.status = SparkListenerTaskStatus(event.taskInfo().status()) + self.metrics = SparkListenerTaskMetrics.create(event.taskMetrics()) diff --git a/onetl/_metrics/output.py b/onetl/_metrics/output.py new file mode 100644 index 00000000..8600bb68 --- /dev/null +++ b/onetl/_metrics/output.py @@ -0,0 +1,50 @@ +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import os +from pprint import pformat + +from humanize import naturalsize + +from onetl.impl import BaseModel + + +class SparkOutputMetrics(BaseModel): + written_bytes: int = 0 + written_rows: int = 0 + created_files: int = 0 + created_partitions: int = 0 + + @property + def is_empty(self) -> bool: + return not any([self.written_bytes, self.written_rows, self.created_files]) + + def update(self, other: SparkOutputMetrics) -> SparkOutputMetrics: + self.written_bytes += other.written_bytes + self.written_rows += other.written_rows + self.created_files += other.created_files + self.created_partitions = max([self.created_partitions, other.created_partitions]) + return self + + @property + def details(self) -> str: + if self.is_empty: + return "No data" + + result = [] + result.append(f"Written rows: {pformat(self.written_rows)}") + + if self.written_bytes: + result.append(f"Written size: {naturalsize(self.written_bytes)}") + + if self.created_files: + result.append(f"Created files: {pformat(self.created_files)}") + + if 
self.created_partitions: + result.append(f"Created partitions: {pformat(self.created_partitions)}") + + return os.linesep.join(result) + + def __str__(self): + return self.details diff --git a/onetl/_metrics/recorder.py b/onetl/_metrics/recorder.py new file mode 100644 index 00000000..4cc5745b --- /dev/null +++ b/onetl/_metrics/recorder.py @@ -0,0 +1,30 @@ +# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +from typing import TYPE_CHECKING + +from onetl._metrics.command import SparkCommandMetrics +from onetl._metrics.extract import extract_metrics_from_execution +from onetl._metrics.listener import SparkMetricsListener + +if TYPE_CHECKING: + from pyspark.sql import SparkSession + + +class SparkMetricsRecorder: + def __init__(self, spark: SparkSession): + self._listener = SparkMetricsListener(spark=spark) + + def __enter__(self): + self._listener.__enter__() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self._listener.__exit__(exc_type, exc_val, exc_tb) + + def metrics(self) -> SparkCommandMetrics: + result = SparkCommandMetrics() + for execution in self._listener.executions: + result = result.update(extract_metrics_from_execution(execution)) + return result diff --git a/onetl/_util/java.py b/onetl/_util/java.py index df88b1a5..45111432 100644 --- a/onetl/_util/java.py +++ b/onetl/_util/java.py @@ -4,6 +4,9 @@ from typing import TYPE_CHECKING +from onetl._util.spark import get_spark_version +from onetl._util.version import Version + if TYPE_CHECKING: from py4j.java_gateway import JavaGateway from pyspark.sql import SparkSession @@ -24,3 +27,34 @@ def try_import_java_class(spark_session: SparkSession, name: str): klass = getattr(gateway.jvm, name) gateway.help(klass, display=False) return klass + + +def start_callback_server(spark_session: SparkSession): + """ + Start Py4J callback server. Important to receive Java events on Python side, + e.g. in Spark Listener implementations. 
+ """ + gateway = get_java_gateway(spark_session) + if get_spark_version(spark_session) >= Version("2.4"): + from pyspark.java_gateway import ensure_callback_server_started + + ensure_callback_server_started(gateway) + return + + # PySpark 2.3 + if "_callback_server" not in gateway.__dict__ or gateway._callback_server is None: + from py4j.java_gateway import JavaObject + + gateway.callback_server_parameters.eager_load = True + gateway.callback_server_parameters.daemonize = True + gateway.callback_server_parameters.daemonize_connections = True + gateway.callback_server_parameters.port = 0 + gateway.start_callback_server(gateway.callback_server_parameters) + cbport = gateway._callback_server.server_socket.getsockname()[1] + gateway._callback_server.port = cbport + # gateway with real port + gateway._python_proxy_port = gateway._callback_server.port + # get the GatewayServer object in JVM by ID + java_gateway = JavaObject("GATEWAY_SERVER", gateway._gateway_client) + # update the port of CallbackClient with real port + java_gateway.resetCallbackClient(java_gateway.getCallbackClient().getAddress(), gateway._python_proxy_port) diff --git a/onetl/_util/scala.py b/onetl/_util/scala.py index 397a9157..5e6c21bc 100644 --- a/onetl/_util/scala.py +++ b/onetl/_util/scala.py @@ -12,3 +12,10 @@ def get_default_scala_version(spark_version: Version) -> Version: if spark_version.major < 3: return Version("2.11") return Version("2.12") + + +def scala_seq_to_python_list(seq) -> list: + result = [] + for i in range(seq.size()): + result.append(seq.apply(i)) + return result diff --git a/onetl/strategy/hwm_store/__init__.py b/onetl/strategy/hwm_store/__init__.py index 0b931301..7a0338d3 100644 --- a/onetl/strategy/hwm_store/__init__.py +++ b/onetl/strategy/hwm_store/__init__.py @@ -23,7 +23,7 @@ register_spark_type_to_hwm_type_mapping, ) -__all__ = [ # noqa: WPS410 +__all__ = [ "BaseHWMStore", "SparkTypeToHWM", "register_spark_type_to_hwm_type_mapping", diff --git a/onetl/version.py b/onetl/version.py index dada22dd..1a3c6cec 100644 --- a/onetl/version.py +++ b/onetl/version.py @@ -8,4 +8,4 @@ VERSION_FILE = Path(__file__).parent / "VERSION" -__version__ = VERSION_FILE.read_text().strip() # noqa: WPS410 +__version__ = VERSION_FILE.read_text().strip() diff --git a/setup.cfg b/setup.cfg index d12261ed..7ddb67b0 100644 --- a/setup.cfg +++ b/setup.cfg @@ -275,7 +275,9 @@ ignore = # WPS474 Found import object collision WPS474, # WPS318 Found extra indentation - WPS318 + WPS318, +# WPS410 Found wrong metadata variable: __all__ + WPS410 # http://flake8.pycqa.org/en/latest/user/options.html?highlight=per-file-ignores#cmdoption-flake8-per-file-ignores per-file-ignores = @@ -350,6 +352,9 @@ per-file-ignores = onetl/hooks/slot.py: # WPS210 Found too many local variables WPS210, + onetl/_metrics/listener/*: +# N802 function name 'onJobStart' should be lowercase + N802, tests/*: # Found too many empty lines in `def` WPS473, diff --git a/tests/.coveragerc b/tests/.coveragerc index 08633e6c..55af8c09 100644 --- a/tests/.coveragerc +++ b/tests/.coveragerc @@ -7,6 +7,7 @@ data_file = reports/.coverage [report] exclude_lines = pragma: no cover + no cover: start(?s:.)*?no cover: stop def __repr__ if self.debug: if settings.DEBUG diff --git a/tests/fixtures/global_hwm_store.py b/tests/fixtures/global_hwm_store.py index f10a0089..2e006b92 100644 --- a/tests/fixtures/global_hwm_store.py +++ b/tests/fixtures/global_hwm_store.py @@ -5,7 +5,7 @@ @pytest.fixture(scope="function", autouse=True) def global_hwm_store(request): # noqa: 
WPS325 test_function = request.function - entities = test_function.__name__.split("_") if test_function else [] + entities = set(test_function.__name__.split("_")) if test_function else set() if "strategy" in entities: with MemoryHWMStore(): diff --git a/tests/fixtures/processing/fixtures.py b/tests/fixtures/processing/fixtures.py index 3f541f69..9bb62689 100644 --- a/tests/fixtures/processing/fixtures.py +++ b/tests/fixtures/processing/fixtures.py @@ -21,10 +21,14 @@ def processing(request, spark): "kafka": ("tests.fixtures.processing.kafka", "KafkaProcessing"), } - db_storage_name = request.function.__name__.split("_")[1] - if db_storage_name not in processing_classes: - raise ValueError(f"Wrong name. Please use one of: {list(processing_classes.keys())}") + test_name_parts = set(request.function.__name__.split("_")) + matches = set(processing_classes.keys()) & test_name_parts + if not matches or len(matches) > 1: + raise ValueError( + f"Test name {request.function.__name__} should have one of these components: {list(processing_classes.keys())}", + ) + db_storage_name = matches.pop() module_name, class_name = processing_classes[db_storage_name] module = import_module(module_name) db_processing = getattr(module, class_name) diff --git a/tests/tests_integration/test_metrics/test_spark_metrics_recorder_file_df.py b/tests/tests_integration/test_metrics/test_spark_metrics_recorder_file_df.py new file mode 100644 index 00000000..f59acf89 --- /dev/null +++ b/tests/tests_integration/test_metrics/test_spark_metrics_recorder_file_df.py @@ -0,0 +1,171 @@ +import time +from contextlib import suppress +from pathlib import Path + +import pytest + +from onetl._metrics.recorder import SparkMetricsRecorder +from onetl._util.spark import get_spark_version +from onetl.file import FileDFReader, FileDFWriter +from onetl.file.format import CSV, JSON + +pytestmark = [ + pytest.mark.local_fs, + pytest.mark.file_df_connection, + pytest.mark.connection, + pytest.mark.csv, + # SparkListener does not give guarantees of delivering execution metrics in time + pytest.mark.flaky(reruns=5), +] + + +def test_spark_metrics_recorder_file_df_reader( + spark, + local_fs_file_df_connection_with_path_and_files, +): + local_fs, source_path, _ = local_fs_file_df_connection_with_path_and_files + files_path: Path = source_path / "csv/with_header" + + reader = FileDFReader( + connection=local_fs, + format=CSV(header=True), + source_path=files_path, + ) + + with SparkMetricsRecorder(spark) as recorder: + df = reader.run() + df.collect() + + time.sleep(0.1) # sleep to fetch late metrics from SparkListener + metrics = recorder.metrics() + assert metrics.input.read_rows + assert metrics.input.read_bytes + # file related metrics are too flaky to assert + + +def test_spark_metrics_recorder_file_df_reader_no_files( + spark, + local_fs_file_df_connection_with_path, + file_df_schema, +): + local_fs, source_path = local_fs_file_df_connection_with_path + + reader = FileDFReader( + connection=local_fs, + format=CSV(), + source_path=source_path, + df_schema=file_df_schema, + ) + + with SparkMetricsRecorder(spark) as recorder: + df = reader.run() + df.collect() + + time.sleep(0.1) # sleep to fetch late metrics from SparkListener + metrics = recorder.metrics() + assert not metrics.input.read_rows + assert not metrics.input.read_files + + +def test_spark_metrics_recorder_file_df_reader_no_data_after_filter( + spark, + local_fs_file_df_connection_with_path_and_files, + file_df_schema, +): + local_fs, source_path, _ = 
local_fs_file_df_connection_with_path_and_files + files_path = source_path / "csv/with_header" + + reader = FileDFReader( + connection=local_fs, + format=CSV(header=True), + source_path=files_path, + df_schema=file_df_schema, + ) + + with SparkMetricsRecorder(spark) as recorder: + df = reader.run().where("str_value = 'unknown'") + df.collect() + + time.sleep(0.1) # sleep to fetch late metrics from SparkListener + metrics = recorder.metrics() + + spark_version = get_spark_version(spark) + if spark_version.major >= 3: + # Spark 3.0 does not include skipped rows to metrics + assert not metrics.input.read_rows + else: + # Spark 2.0 does + assert metrics.input.read_rows + + +def test_spark_metrics_recorder_file_df_reader_error( + spark, + local_fs_file_df_connection_with_path_and_files, +): + local_fs, source_path, _ = local_fs_file_df_connection_with_path_and_files + files_path: Path = source_path / "csv/with_header" + + reader = FileDFReader( + connection=local_fs, + format=JSON(), + source_path=files_path, + ) + + with SparkMetricsRecorder(spark) as recorder: + with suppress(Exception): + df = reader.run() + df.collect() + + time.sleep(0.1) # sleep to fetch late metrics from SparkListener + metrics = recorder.metrics() + # some files metadata may be scanned, but file content was not read + assert not metrics.input.raw_file_bytes + + +def test_spark_metrics_recorder_file_df_writer( + spark, + local_fs_file_df_connection_with_path, + file_df_dataframe, +): + local_fs, target_path = local_fs_file_df_connection_with_path + + writer = FileDFWriter( + connection=local_fs, + format=CSV(), + target_path=target_path, + options=FileDFWriter.Options(if_exists="append"), + ) + + with SparkMetricsRecorder(spark) as recorder: + writer.run(file_df_dataframe) + + time.sleep(0.1) # sleep to fetch late metrics from SparkListener + metrics = recorder.metrics() + assert metrics.output.written_rows == file_df_dataframe.count() + assert metrics.output.written_bytes + # file related metrics are too flaky to assert + + +def test_spark_metrics_recorder_file_df_writer_empty_input( + spark, + local_fs_file_df_connection_with_path, + file_df_dataframe, +): + local_fs, target_path = local_fs_file_df_connection_with_path + + df = file_df_dataframe.limit(0) + + writer = FileDFWriter( + connection=local_fs, + format=CSV(), + target_path=target_path, + options=FileDFWriter.Options(if_exists="append"), + ) + + with SparkMetricsRecorder(spark) as recorder: + writer.run(df) + + time.sleep(0.1) # sleep to fetch late metrics from SparkListener + metrics = recorder.metrics() + assert not metrics.output.written_rows + assert not metrics.output.written_bytes diff --git a/tests/tests_integration/test_metrics/test_spark_metrics_recorder_hive.py b/tests/tests_integration/test_metrics/test_spark_metrics_recorder_hive.py new file mode 100644 index 00000000..7e8dc218 --- /dev/null +++ b/tests/tests_integration/test_metrics/test_spark_metrics_recorder_hive.py @@ -0,0 +1,159 @@ +import time + +import pytest + +from onetl._metrics.recorder import SparkMetricsRecorder +from onetl.connection import Hive +from onetl.db import DBReader, DBWriter +from tests.util.rand import rand_str + +pytestmark = [ + pytest.mark.hive, + pytest.mark.db_connection, + pytest.mark.connection, + # SparkListener does not give guarantees of delivering execution metrics in time + pytest.mark.flaky(reruns=5), +] + + +def test_spark_metrics_recorder_hive_read_count(spark, load_table_data): + hive = Hive(cluster="rnd-dwh", spark=spark) + reader = DBReader( + 
connection=hive, + source=load_table_data.full_name, + ) + + with SparkMetricsRecorder(spark) as recorder: + df = reader.run() + rows = df.count() + + time.sleep(0.1) # sleep to fetch late metrics from SparkListener + metrics = recorder.metrics() + assert metrics.input.read_rows == rows + assert metrics.input.read_bytes + # in some cases files are read, in some cases only metastore statistics is used + + +def test_spark_metrics_recorder_hive_read_collect(spark, load_table_data): + hive = Hive(cluster="rnd-dwh", spark=spark) + reader = DBReader( + connection=hive, + source=load_table_data.full_name, + ) + + with SparkMetricsRecorder(spark) as recorder: + df = reader.run() + rows = len(df.collect()) + + time.sleep(0.1) # sleep to fetch late metrics from SparkListener + metrics = recorder.metrics() + assert metrics.input.read_rows == rows + assert metrics.input.read_bytes + # file related metrics are too flaky to assert + + +def test_spark_metrics_recorder_hive_read_empty_source(spark, prepare_schema_table): + hive = Hive(cluster="rnd-dwh", spark=spark) + reader = DBReader( + connection=hive, + source=prepare_schema_table.full_name, + ) + + with SparkMetricsRecorder(spark) as recorder: + df = reader.run() + df.collect() + + time.sleep(0.1) # sleep to fetch late metrics from SparkListener + metrics = recorder.metrics() + assert not metrics.input.read_rows + assert not metrics.input.read_bytes + + +def test_spark_metrics_recorder_hive_read_no_data_after_filter(spark, load_table_data): + hive = Hive(cluster="rnd-dwh", spark=spark) + reader = DBReader( + connection=hive, + source=load_table_data.full_name, + where="1=0", + ) + + with SparkMetricsRecorder(spark) as recorder: + df = reader.run() + df.collect() + + time.sleep(0.1) # sleep to fetch late metrics from SparkListener + metrics = recorder.metrics() + assert not metrics.input.read_rows + assert not metrics.input.read_bytes + + +def test_spark_metrics_recorder_hive_sql(spark, load_table_data): + hive = Hive(cluster="rnd-dwh", spark=spark) + + with SparkMetricsRecorder(spark) as recorder: + df = hive.sql(f"SELECT * FROM {load_table_data.full_name}") + rows = len(df.collect()) + + time.sleep(0.1) # sleep to fetch late metrics from SparkListener + metrics = recorder.metrics() + assert metrics.input.read_rows == rows + assert metrics.input.read_bytes + # file related metrics are too flaky to assert + + +def test_spark_metrics_recorder_hive_write(spark, processing, get_schema_table): + df = processing.create_spark_df(spark) + + hive = Hive(cluster="rnd-dwh", spark=spark) + writer = DBWriter( + connection=hive, + target=get_schema_table.full_name, + ) + + with SparkMetricsRecorder(spark) as recorder: + writer.run(df) + + time.sleep(0.1) # sleep to fetch late metrics from SparkListener + metrics = recorder.metrics() + assert metrics.output.written_rows == df.count() + assert metrics.output.written_bytes + # file related metrics are too flaky to assert + + +def test_spark_metrics_recorder_hive_write_empty(spark, processing, get_schema_table): + df = processing.create_spark_df(spark).limit(0) + + hive = Hive(cluster="rnd-dwh", spark=spark) + writer = DBWriter( + connection=hive, + target=get_schema_table.full_name, + ) + + with SparkMetricsRecorder(spark) as recorder: + writer.run(df) + + time.sleep(0.1) # sleep to fetch late metrics from SparkListener + metrics = recorder.metrics() + assert not metrics.output.written_rows + + +def test_spark_metrics_recorder_hive_execute(request, spark, processing, get_schema_table): + df = 
processing.create_spark_df(spark) + view_name = rand_str() + df.createOrReplaceTempView(view_name) + + def finalizer(): + spark.sql(f"DROP VIEW IF EXISTS {view_name}") + + request.addfinalizer(finalizer) + + hive = Hive(cluster="rnd-dwh", spark=spark) + + with SparkMetricsRecorder(spark) as recorder: + hive.execute(f"CREATE TABLE {get_schema_table.full_name} AS SELECT * FROM {view_name}") + + time.sleep(0.1) # sleep to fetch late metrics from SparkListener + metrics = recorder.metrics() + assert metrics.output.written_rows == df.count() + assert metrics.output.written_bytes + # file related metrics are too flaky to assert diff --git a/tests/tests_integration/test_metrics/test_spark_metrics_recorder_postgres.py b/tests/tests_integration/test_metrics/test_spark_metrics_recorder_postgres.py new file mode 100644 index 00000000..67e31591 --- /dev/null +++ b/tests/tests_integration/test_metrics/test_spark_metrics_recorder_postgres.py @@ -0,0 +1,205 @@ +import time + +import pytest + +from onetl._metrics.recorder import SparkMetricsRecorder +from onetl._util.spark import get_spark_version +from onetl.connection import Postgres +from onetl.db import DBReader, DBWriter + +pytestmark = [ + pytest.mark.postgres, + pytest.mark.db_connection, + pytest.mark.connection, + # SparkListener does not give guarantees of delivering execution metrics in time + pytest.mark.flaky(reruns=5), +] + + +def test_spark_metrics_recorder_postgres_read(spark, processing, load_table_data): + postgres = Postgres( + host=processing.host, + port=processing.port, + user=processing.user, + password=processing.password, + database=processing.database, + spark=spark, + ) + + reader = DBReader( + connection=postgres, + source=load_table_data.full_name, + ) + + with SparkMetricsRecorder(spark) as recorder: + df = reader.run() + rows = len(df.collect()) + + time.sleep(0.1) # sleep to fetch late metrics from SparkListener + metrics = recorder.metrics() + assert metrics.input.read_rows == rows + # JDBC does not provide information about data size + assert not metrics.input.read_bytes + + +def test_spark_metrics_recorder_postgres_read_empty_source(spark, processing, prepare_schema_table): + postgres = Postgres( + host=processing.host, + port=processing.port, + user=processing.user, + password=processing.password, + database=processing.database, + spark=spark, + ) + + reader = DBReader( + connection=postgres, + source=prepare_schema_table.full_name, + ) + + with SparkMetricsRecorder(spark) as recorder: + df = reader.run() + df.collect() + + time.sleep(0.1) # sleep to fetch late metrics from SparkListener + metrics = recorder.metrics() + assert not metrics.input.read_rows + + +def test_spark_metrics_recorder_postgres_read_no_data_after_filter(spark, processing, load_table_data): + postgres = Postgres( + host=processing.host, + port=processing.port, + user=processing.user, + password=processing.password, + database=processing.database, + spark=spark, + ) + + reader = DBReader( + connection=postgres, + source=load_table_data.full_name, + where="1=0", + ) + + with SparkMetricsRecorder(spark) as recorder: + df = reader.run() + df.collect() + + time.sleep(0.1) # sleep to fetch late metrics from SparkListener + metrics = recorder.metrics() + assert not metrics.input.read_rows + + +def test_spark_metrics_recorder_postgres_sql(spark, processing, load_table_data): + postgres = Postgres( + host=processing.host, + port=processing.port, + user=processing.user, + password=processing.password, + database=processing.database, + spark=spark, + ) + + 
with SparkMetricsRecorder(spark) as recorder: + df = postgres.sql(f"SELECT * FROM {load_table_data.full_name}") + rows = len(df.collect()) + + time.sleep(0.1) # sleep to fetch late metrics from SparkListener + metrics = recorder.metrics() + assert metrics.input.read_rows == rows + + +def test_spark_metrics_recorder_postgres_write(spark, processing, get_schema_table): + postgres = Postgres( + host=processing.host, + port=processing.port, + user=processing.user, + password=processing.password, + database=processing.database, + spark=spark, + ) + + df = processing.create_spark_df(spark) + + writer = DBWriter( + connection=postgres, + target=get_schema_table.full_name, + ) + + with SparkMetricsRecorder(spark) as recorder: + writer.run(df) + + time.sleep(0.1) # sleep to fetch late metrics from SparkListener + metrics = recorder.metrics() + spark_version = get_spark_version(spark) + if spark_version.major >= 3: + # Spark started collecting JDBC write bytes only since Spark 3.0: + # https://issues.apache.org/jira/browse/SPARK-29461 + assert metrics.output.written_rows == df.count() + else: + assert not metrics.output.written_rows + # JDBC does not provide information about data size + assert not metrics.output.written_bytes + + +def test_spark_metrics_recorder_postgres_write_empty(spark, processing, get_schema_table): + postgres = Postgres( + host=processing.host, + port=processing.port, + user=processing.user, + password=processing.password, + database=processing.database, + spark=spark, + ) + df = processing.create_spark_df(spark).limit(0) + + writer = DBWriter( + connection=postgres, + target=get_schema_table.full_name, + ) + + with SparkMetricsRecorder(spark) as recorder: + writer.run(df) + + time.sleep(0.1) # sleep to fetch late metrics from SparkListener + metrics = recorder.metrics() + assert not metrics.output.written_rows + + +def test_spark_metrics_recorder_postgres_fetch(spark, processing, load_table_data): + postgres = Postgres( + host=processing.host, + port=processing.port, + user=processing.user, + password=processing.password, + database=processing.database, + spark=spark, + ) + + with SparkMetricsRecorder(spark) as recorder: + postgres.fetch(f"SELECT * FROM {load_table_data.full_name}") + + time.sleep(0.1) # sleep to fetch late metrics from SparkListener + metrics = recorder.metrics() + assert not metrics.input.read_rows + + +def test_spark_metrics_recorder_postgres_execute(spark, processing, load_table_data): + postgres = Postgres( + host=processing.host, + port=processing.port, + user=processing.user, + password=processing.password, + database=processing.database, + spark=spark, + ) + + new_table = load_table_data.full_name + "_new" + + with SparkMetricsRecorder(spark) as recorder: + postgres.execute(f"CREATE TABLE {new_table} AS SELECT * FROM {load_table_data.full_name}") + + time.sleep(0.1) # sleep to fetch late metrics from SparkListener + metrics = recorder.metrics() + assert not metrics.input.read_rows diff --git a/tests/tests_unit/test_metrics/test_spark_command_metrics.py b/tests/tests_unit/test_metrics/test_spark_command_metrics.py new file mode 100644 index 00000000..f4da3070 --- /dev/null +++ b/tests/tests_unit/test_metrics/test_spark_command_metrics.py @@ -0,0 +1,70 @@ +import textwrap +from datetime import timedelta + +from onetl._metrics.command import SparkCommandMetrics +from onetl._metrics.driver import SparkDriverMetrics +from onetl._metrics.executor import SparkExecutorMetrics +from onetl._metrics.input import SparkInputMetrics +from onetl._metrics.output 
import SparkOutputMetrics + + +def test_spark_metrics_command_is_empty(): + empty_metrics = SparkCommandMetrics() + assert empty_metrics.is_empty + + no_input_output = SparkCommandMetrics( + driver=SparkDriverMetrics(in_memory_bytes=1_000_000), + executor=SparkExecutorMetrics(total_run_time=timedelta(microseconds=1)), + ) + assert no_input_output.is_empty + + with_input = SparkCommandMetrics( + input=SparkInputMetrics(read_rows=1), + ) + assert not with_input.is_empty + + with_output = SparkCommandMetrics( + output=SparkOutputMetrics(written_rows=1), + ) + assert not with_output.is_empty + + +def test_spark_metrics_command_details(): + empty_metrics = SparkCommandMetrics() + assert empty_metrics.details == "No data" + assert str(empty_metrics) == empty_metrics.details + + jdbc_fetch_metrics = SparkCommandMetrics( + input=SparkInputMetrics(read_rows=1_000), + driver=SparkDriverMetrics(in_memory_bytes=1_000_000), + ) + + expected = textwrap.dedent( + """ + Input: + Read rows: 1000 + Driver: + In-memory data (approximate): 1.0 MB + """, + ) + assert jdbc_fetch_metrics.details == expected.strip() + assert str(jdbc_fetch_metrics) == jdbc_fetch_metrics.details + + jdbc_write_metrics = SparkCommandMetrics( + output=SparkOutputMetrics(written_rows=1_000), + executor=SparkExecutorMetrics( + total_run_time=timedelta(seconds=2), + total_cpu_time=timedelta(seconds=1), + ), + ) + expected = textwrap.dedent( + """ + Output: + Written rows: 1000 + Executor: + Total run time: 2 seconds + Total CPU time: 1 second + """, + ) + assert jdbc_write_metrics.details == expected.strip() + assert str(jdbc_write_metrics) == jdbc_write_metrics.details diff --git a/tests/tests_unit/test_metrics/test_spark_driver_metrics.py b/tests/tests_unit/test_metrics/test_spark_driver_metrics.py new file mode 100644 index 00000000..cd4c5dc9 --- /dev/null +++ b/tests/tests_unit/test_metrics/test_spark_driver_metrics.py @@ -0,0 +1,22 @@ +from onetl._metrics.driver import SparkDriverMetrics + + +def test_spark_metrics_driver_is_empty(): + empty_metrics = SparkDriverMetrics() + assert empty_metrics.is_empty + + metrics1 = SparkDriverMetrics(in_memory_bytes=1_000) + assert metrics1.is_empty + + metrics2 = SparkDriverMetrics(in_memory_bytes=1_000_000) + assert not metrics2.is_empty + + +def test_spark_metrics_driver_details(): + empty_metrics = SparkDriverMetrics() + assert empty_metrics.details == "No data" + assert str(empty_metrics) == empty_metrics.details + + jdbc_metrics = SparkDriverMetrics(in_memory_bytes=1_000_000) + assert jdbc_metrics.details == "In-memory data (approximate): 1.0 MB" + assert str(jdbc_metrics) == jdbc_metrics.details diff --git a/tests/tests_unit/test_metrics/test_spark_executor_metrics.py b/tests/tests_unit/test_metrics/test_spark_executor_metrics.py new file mode 100644 index 00000000..3acd7190 --- /dev/null +++ b/tests/tests_unit/test_metrics/test_spark_executor_metrics.py @@ -0,0 +1,58 @@ +import textwrap +from datetime import timedelta + +from onetl._metrics.executor import SparkExecutorMetrics + + +def test_spark_metrics_executor_is_empty(): + empty_metrics = SparkExecutorMetrics() + assert empty_metrics.is_empty + + run_metrics = SparkExecutorMetrics( + total_run_time=timedelta(microseconds=1), + ) + assert not run_metrics.is_empty + + +def test_spark_metrics_executor_details(): + empty_metrics = SparkExecutorMetrics() + assert empty_metrics.details == "No data" + assert str(empty_metrics) == empty_metrics.details + + full_metrics = SparkExecutorMetrics( + total_run_time=timedelta(hours=2), + 
total_cpu_time=timedelta(hours=1), + peak_memory_bytes=1_000_000_000, + memory_spilled_bytes=2_000_000_000, + disk_spilled_bytes=3_000_000_000, + ) + + assert ( + full_metrics.details + == textwrap.dedent( + """ + Total run time: 2 hours + Total CPU time: 1 hour + Peak memory: 1.0 GB + Memory spilled: 2.0 GB + Disk spilled: 3.0 GB + """, + ).strip() + ) + assert str(full_metrics) == full_metrics.details + + minimal_metrics = SparkExecutorMetrics( + total_run_time=timedelta(seconds=1), + total_cpu_time=timedelta(seconds=1), + ) + + assert ( + minimal_metrics.details + == textwrap.dedent( + """ + Total run time: 1 second + Total CPU time: 1 second + """, + ).strip() + ) + assert str(minimal_metrics) == minimal_metrics.details diff --git a/tests/tests_unit/test_metrics/test_spark_input_metrics.py b/tests/tests_unit/test_metrics/test_spark_input_metrics.py new file mode 100644 index 00000000..0de1a57a --- /dev/null +++ b/tests/tests_unit/test_metrics/test_spark_input_metrics.py @@ -0,0 +1,50 @@ +import textwrap + +from onetl._metrics.input import SparkInputMetrics + + +def test_spark_metrics_input_is_empty(): + empty_metrics = SparkInputMetrics() + assert empty_metrics.is_empty + + metrics1 = SparkInputMetrics(read_rows=1) + assert not metrics1.is_empty + + metrics2 = SparkInputMetrics(read_files=1) + assert not metrics2.is_empty + + metrics3 = SparkInputMetrics(read_bytes=1) + assert not metrics3.is_empty + + +def test_spark_metrics_input_details(): + empty_metrics = SparkInputMetrics() + assert empty_metrics.details == "No data" + assert str(empty_metrics) == empty_metrics.details + + file_df_metrics = SparkInputMetrics( + read_rows=1_000, + read_partitions=4, + read_files=4, + read_bytes=2_000_000, + raw_file_bytes=5_000_000, + ) + + expected = textwrap.dedent( + """ + Read rows: 1000 + Read partitions: 4 + Read files: 4 + Read size: 2.0 MB + Raw files size: 5.0 MB + """, + ) + assert file_df_metrics.details == expected.strip() + assert str(file_df_metrics) == file_df_metrics.details + + jdbc_metrics = SparkInputMetrics( + read_rows=1_000, + ) + + assert jdbc_metrics.details == "Read rows: 1000" + assert str(jdbc_metrics) == jdbc_metrics.details diff --git a/tests/tests_unit/test_metrics/test_spark_output_metrics.py b/tests/tests_unit/test_metrics/test_spark_output_metrics.py new file mode 100644 index 00000000..e8cb9ae7 --- /dev/null +++ b/tests/tests_unit/test_metrics/test_spark_output_metrics.py @@ -0,0 +1,46 @@ +import textwrap + +from onetl._metrics.output import SparkOutputMetrics + + +def test_spark_metrics_output_is_empty(): + empty_metrics = SparkOutputMetrics() + assert empty_metrics.is_empty + + metric1 = SparkOutputMetrics(written_rows=1) + assert not metric1.is_empty + + metric2 = SparkOutputMetrics(written_bytes=1) + assert not metric2.is_empty + + metric3 = SparkOutputMetrics(created_files=1) + assert not metric3.is_empty + + +def test_spark_metrics_output_details(): + empty_metrics = SparkOutputMetrics() + assert empty_metrics.details == "No data" + assert str(empty_metrics) == empty_metrics.details + + hive_metrics = SparkOutputMetrics( + written_rows=1_000, + written_bytes=2_000_000, + created_files=4, + created_partitions=4, + ) + + expected = textwrap.dedent( + """ + Written rows: 1000 + Written size: 2.0 MB + Created files: 4 + Created partitions: 4 + """, + ) + assert hive_metrics.details == expected.strip() + assert str(hive_metrics) == hive_metrics.details + + jdbc_metrics = SparkOutputMetrics(written_rows=1_000) + + assert jdbc_metrics.details == "Written rows: 
1000" + assert str(jdbc_metrics) == jdbc_metrics.details From 3c254053e496cae8d30b70f2c2baa74785fa96b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Thu, 8 Aug 2024 09:45:59 +0000 Subject: [PATCH 40/64] [DOP-18571] Collect and log Spark metrics in various method calls --- docs/changelog/next_release/303.feature.1.rst | 1 + docs/changelog/next_release/303.feature.2.rst | 10 +++ onetl/_util/spark.py | 17 ++++- onetl/base/base_db_connection.py | 3 +- onetl/base/base_file_df_connection.py | 4 +- .../db_connection/hive/connection.py | 49 +++++++++++-- .../jdbc_connection/connection.py | 8 ++- .../db_connection/jdbc_mixin/connection.py | 69 +++++++++++-------- .../db_connection/oracle/connection.py | 39 +++-------- onetl/db/db_writer/db_writer.py | 47 ++++++++++--- onetl/file/file_df_writer/file_df_writer.py | 40 +++++++++-- tests/.coveragerc | 2 +- .../test_postgres_integration.py | 4 +- 13 files changed, 208 insertions(+), 85 deletions(-) create mode 100644 docs/changelog/next_release/303.feature.1.rst create mode 100644 docs/changelog/next_release/303.feature.2.rst diff --git a/docs/changelog/next_release/303.feature.1.rst b/docs/changelog/next_release/303.feature.1.rst new file mode 100644 index 00000000..8c0b1e19 --- /dev/null +++ b/docs/changelog/next_release/303.feature.1.rst @@ -0,0 +1 @@ +Log estimated size of in-memory dataframe created by ``JDBC.fetch`` and ``JDBC.execute`` methods. diff --git a/docs/changelog/next_release/303.feature.2.rst b/docs/changelog/next_release/303.feature.2.rst new file mode 100644 index 00000000..92bbe13c --- /dev/null +++ b/docs/changelog/next_release/303.feature.2.rst @@ -0,0 +1,10 @@ +Collect Spark execution metrics in following methods, and log then in DEBUG mode: +* ``DBWriter.run()`` +* ``FileDFWriter.run()`` +* ``Hive.sql()`` +* ``Hive.execute()`` + +This is implemented using custom ``SparkListener`` which wraps the entire method call, and +then report collected metrics. But these metrics sometimes may be missing due to Spark architecture, +so they are not reliable source of information. That's why logs are printed only in DEBUG mode, and +are not returned as method call result. diff --git a/onetl/_util/spark.py b/onetl/_util/spark.py index f172b1c9..2b2edbaf 100644 --- a/onetl/_util/spark.py +++ b/onetl/_util/spark.py @@ -16,7 +16,7 @@ from pydantic import SecretStr # type: ignore[no-redef, assignment] if TYPE_CHECKING: - from pyspark.sql import SparkSession + from pyspark.sql import DataFrame, SparkSession from pyspark.sql.conf import RuntimeConfig @@ -136,6 +136,21 @@ def get_spark_version(spark_session: SparkSession) -> Version: return Version(spark_session.version) +def estimate_dataframe_size(spark_session: SparkSession, df: DataFrame) -> int: + """ + Estimate in-memory DataFrame size in bytes. If cannot be estimated, return 0. + + Using Spark's `SizeEstimator `_. + """ + try: + size_estimator = spark_session._jvm.org.apache.spark.util.SizeEstimator # type: ignore[union-attr] + return size_estimator.estimate(df._jdf) + except Exception: + # SizeEstimator uses Java reflection which may behave differently in different Java versions, + # and also may be prohibited. + return 0 + + def get_executor_total_cores(spark_session: SparkSession, include_driver: bool = False) -> tuple[int | float, dict]: """ Calculate maximum number of cores which can be used by Spark on all executors. 
diff --git a/onetl/base/base_db_connection.py b/onetl/base/base_db_connection.py index f9c7bcac..2c427deb 100644 --- a/onetl/base/base_db_connection.py +++ b/onetl/base/base_db_connection.py @@ -10,7 +10,7 @@ if TYPE_CHECKING: from etl_entities.hwm import HWM - from pyspark.sql import DataFrame + from pyspark.sql import DataFrame, SparkSession from pyspark.sql.types import StructField, StructType @@ -106,6 +106,7 @@ class BaseDBConnection(BaseConnection): Implements generic methods for reading and writing dataframe from/to database-like source """ + spark: SparkSession Dialect = BaseDBDialect @property diff --git a/onetl/base/base_file_df_connection.py b/onetl/base/base_file_df_connection.py index c54390ce..28c57f3c 100644 --- a/onetl/base/base_file_df_connection.py +++ b/onetl/base/base_file_df_connection.py @@ -11,7 +11,7 @@ from onetl.base.pure_path_protocol import PurePathProtocol if TYPE_CHECKING: - from pyspark.sql import DataFrame, DataFrameReader, DataFrameWriter + from pyspark.sql import DataFrame, DataFrameReader, DataFrameWriter, SparkSession from pyspark.sql.types import StructType @@ -72,6 +72,8 @@ class BaseFileDFConnection(BaseConnection): .. versionadded:: 0.9.0 """ + spark: SparkSession + @abstractmethod def check_if_format_supported( self, diff --git a/onetl/connection/db_connection/hive/connection.py b/onetl/connection/db_connection/hive/connection.py index 81c50e87..61032987 100644 --- a/onetl/connection/db_connection/hive/connection.py +++ b/onetl/connection/db_connection/hive/connection.py @@ -13,6 +13,7 @@ except (ImportError, AttributeError): from pydantic import validator # type: ignore[no-redef, assignment] +from onetl._metrics.recorder import SparkMetricsRecorder from onetl._util.spark import inject_spark_param from onetl._util.sql import clear_statement from onetl.connection.db_connection.db_connection import DBConnection @@ -210,8 +211,29 @@ def sql( log.info("|%s| Executing SQL query:", self.__class__.__name__) log_lines(log, query) - df = self._execute_sql(query) - log.info("|Spark| DataFrame successfully created from SQL statement") + with SparkMetricsRecorder(self.spark) as recorder: + try: + df = self._execute_sql(query) + except Exception: + log.error("|%s| Query failed", self.__class__.__name__) + + metrics = recorder.metrics() + if log.isEnabledFor(logging.DEBUG) and not metrics.is_empty: + # as SparkListener results are not guaranteed to be received in time, + # some metrics may be missing. To avoid confusion, log only in debug, and with a notice + log.info("|%s| Recorded metrics (some values may be missing!):", self.__class__.__name__) + log_lines(log, str(metrics), level=logging.DEBUG) + raise + + log.info("|Spark| DataFrame successfully created from SQL statement") + + metrics = recorder.metrics() + if log.isEnabledFor(logging.DEBUG) and not metrics.is_empty: + # as SparkListener results are not guaranteed to be received in time, + # some metrics may be missing. 
To avoid confusion, log only in debug, and with a notice + log.info("|%s| Recorded metrics (some values may be missing!):", self.__class__.__name__) + log_lines(log, str(metrics), level=logging.DEBUG) + return df @slot @@ -236,8 +258,27 @@ def execute( log.info("|%s| Executing statement:", self.__class__.__name__) log_lines(log, statement) - self._execute_sql(statement).collect() - log.info("|%s| Call succeeded", self.__class__.__name__) + with SparkMetricsRecorder(self.spark) as recorder: + try: + self._execute_sql(statement).collect() + except Exception: + log.error("|%s| Execution failed", self.__class__.__name__) + metrics = recorder.metrics() + if log.isEnabledFor(logging.DEBUG) and not metrics.is_empty: + # as SparkListener results are not guaranteed to be received in time, + # some metrics may be missing. To avoid confusion, log only in debug, and with a notice + log.info("|%s| Recorded metrics (some values may be missing!):", self.__class__.__name__) + log_lines(log, str(metrics), level=logging.DEBUG) + raise + + log.info("|%s| Execution succeeded", self.__class__.__name__) + + metrics = recorder.metrics() + if log.isEnabledFor(logging.DEBUG) and not metrics.is_empty: + # as SparkListener results are not guaranteed to be received in time, + # some metrics may be missing. To avoid confusion, log only in debug, and with a notice + log.info("|%s| Recorded metrics (some values may be missing!):", self.__class__.__name__) + log_lines(log, str(metrics), level=logging.DEBUG) @slot def write_df_to_target( diff --git a/onetl/connection/db_connection/jdbc_connection/connection.py b/onetl/connection/db_connection/jdbc_connection/connection.py index 5b0aebeb..2fc2f7cf 100644 --- a/onetl/connection/db_connection/jdbc_connection/connection.py +++ b/onetl/connection/db_connection/jdbc_connection/connection.py @@ -92,9 +92,13 @@ def sql( log.info("|%s| Executing SQL query (on executor):", self.__class__.__name__) log_lines(log, query) - df = self._query_on_executor(query, self.SQLOptions.parse(options)) + try: + df = self._query_on_executor(query, self.SQLOptions.parse(options)) + except Exception: + log.error("|%s| Query failed!", self.__class__.__name__) + raise - log.info("|Spark| DataFrame successfully created from SQL statement ") + log.info("|Spark| DataFrame successfully created from SQL statement") return df @slot diff --git a/onetl/connection/db_connection/jdbc_mixin/connection.py b/onetl/connection/db_connection/jdbc_mixin/connection.py index e8c19e38..84276147 100644 --- a/onetl/connection/db_connection/jdbc_mixin/connection.py +++ b/onetl/connection/db_connection/jdbc_mixin/connection.py @@ -9,15 +9,14 @@ from enum import Enum, auto from typing import TYPE_CHECKING, Callable, ClassVar, Optional, TypeVar -from onetl.impl.generic_options import GenericOptions - try: from pydantic.v1 import Field, PrivateAttr, SecretStr, validator except (ImportError, AttributeError): from pydantic import Field, PrivateAttr, SecretStr, validator # type: ignore[no-redef, assignment] +from onetl._metrics.command import SparkCommandMetrics from onetl._util.java import get_java_gateway, try_import_java_class -from onetl._util.spark import get_spark_version, stringify +from onetl._util.spark import estimate_dataframe_size, get_spark_version, stringify from onetl._util.sql import clear_statement from onetl._util.version import Version from onetl.connection.db_connection.jdbc_mixin.options import ( @@ -29,7 +28,7 @@ ) from onetl.exception import MISSING_JVM_CLASS_MSG from onetl.hooks import slot, support_hooks 
-from onetl.impl import FrozenModel +from onetl.impl import FrozenModel, GenericOptions from onetl.log import log_lines if TYPE_CHECKING: @@ -204,20 +203,27 @@ def fetch( log.info("|%s| Executing SQL query (on driver):", self.__class__.__name__) log_lines(log, query) - df = self._query_on_driver( - query, - ( - self.FetchOptions.parse(options.dict()) # type: ignore - if isinstance(options, JDBCMixinOptions) - else self.FetchOptions.parse(options) - ), + call_options = ( + self.FetchOptions.parse(options.dict()) # type: ignore + if isinstance(options, JDBCMixinOptions) + else self.FetchOptions.parse(options) ) - log.info( - "|%s| Query succeeded, resulting in-memory dataframe contains %d rows", - self.__class__.__name__, - df.count(), - ) + try: + df = self._query_on_driver(query, call_options) + except Exception: + log.error("|%s| Query failed!", self.__class__.__name__) + raise + + log.info("|%s| Query succeeded, created in-memory dataframe.", self.__class__.__name__) + + # as we don't actually use Spark for this method, SparkMetricsRecorder is useless. + # Just create metrics by hand, and fill them up using information based on dataframe content. + metrics = SparkCommandMetrics() + metrics.input.read_rows = df.count() + metrics.driver.in_memory_bytes = estimate_dataframe_size(self.spark, df) + log.info("|%s| Recorded metrics:", self.__class__.__name__) + log_lines(log, str(metrics)) return df @slot @@ -273,17 +279,26 @@ def execute( if isinstance(options, JDBCMixinOptions) else self.ExecuteOptions.parse(options) ) - df = self._call_on_driver(statement, call_options) - - if df is not None: - rows_count = df.count() - log.info( - "|%s| Execution succeeded, resulting in-memory dataframe contains %d rows", - self.__class__.__name__, - rows_count, - ) - else: - log.info("|%s| Execution succeeded, nothing returned", self.__class__.__name__) + + try: + df = self._call_on_driver(statement, call_options) + except Exception: + log.error("|%s| Execution failed!", self.__class__.__name__) + raise + + if not df: + log.info("|%s| Execution succeeded, nothing returned.", self.__class__.__name__) + return None + + log.info("|%s| Execution succeeded, created in-memory dataframe.", self.__class__.__name__) + # as we don't actually use Spark for this method, SparkMetricsRecorder is useless. + # Just create metrics by hand, and fill them up using information based on dataframe content. 
+ metrics = SparkCommandMetrics() + metrics.input.read_rows = df.count() + metrics.driver.in_memory_bytes = estimate_dataframe_size(self.spark, df) + + log.info("|%s| Recorded metrics:", self.__class__.__name__) + log_lines(log, str(metrics)) return df @validator("spark") diff --git a/onetl/connection/db_connection/oracle/connection.py b/onetl/connection/db_connection/oracle/connection.py index 04398950..c7669361 100644 --- a/onetl/connection/db_connection/oracle/connection.py +++ b/onetl/connection/db_connection/oracle/connection.py @@ -20,14 +20,12 @@ from etl_entities.instance import Host from onetl._util.classproperty import classproperty -from onetl._util.sql import clear_statement from onetl._util.version import Version from onetl.connection.db_connection.jdbc_connection import JDBCConnection from onetl.connection.db_connection.jdbc_connection.options import JDBCReadOptions from onetl.connection.db_connection.jdbc_mixin.options import ( JDBCExecuteOptions, JDBCFetchOptions, - JDBCOptions, ) from onetl.connection.db_connection.oracle.dialect import OracleDialect from onetl.connection.db_connection.oracle.options import ( @@ -43,8 +41,6 @@ from onetl.log import BASE_LOG_INDENT, log_lines # do not import PySpark here, as we allow user to use `Oracle.get_packages()` for creating Spark session - - if TYPE_CHECKING: from pyspark.sql import DataFrame @@ -290,32 +286,6 @@ def get_min_max_values( max_value = int(max_value) return min_value, max_value - @slot - def execute( - self, - statement: str, - options: JDBCOptions | JDBCExecuteOptions | dict | None = None, # noqa: WPS437 - ) -> DataFrame | None: - statement = clear_statement(statement) - - log.info("|%s| Executing statement (on driver):", self.__class__.__name__) - log_lines(log, statement) - - call_options = self.ExecuteOptions.parse(options) - df = self._call_on_driver(statement, call_options) - self._handle_compile_errors(statement.strip(), call_options) - - if df is not None: - rows_count = df.count() - log.info( - "|%s| Execution succeeded, resulting in-memory dataframe contains %d rows", - self.__class__.__name__, - rows_count, - ) - else: - log.info("|%s| Execution succeeded, nothing returned", self.__class__.__name__) - return df - @root_validator def _only_one_of_sid_or_service_name(cls, values): sid = values.get("sid") @@ -329,6 +299,15 @@ def _only_one_of_sid_or_service_name(cls, values): return values + def _call_on_driver( + self, + query: str, + options: JDBCExecuteOptions, + ) -> DataFrame | None: + result = super()._call_on_driver(query, options) + self._handle_compile_errors(query.strip(), options) + return result + def _parse_create_statement(self, statement: str) -> tuple[str, str, str] | None: """ Parses ``CREATE ... 
type_name [schema.]object_name ...`` statement diff --git a/onetl/db/db_writer/db_writer.py b/onetl/db/db_writer/db_writer.py index 666fce87..06dbd44c 100644 --- a/onetl/db/db_writer/db_writer.py +++ b/onetl/db/db_writer/db_writer.py @@ -2,7 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations -from logging import getLogger +import logging from typing import TYPE_CHECKING, Optional try: @@ -10,12 +10,15 @@ except (ImportError, AttributeError): from pydantic import Field, PrivateAttr, validator # type: ignore[no-redef, assignment] +from onetl._metrics.command import SparkCommandMetrics +from onetl._metrics.recorder import SparkMetricsRecorder from onetl.base import BaseDBConnection from onetl.hooks import slot, support_hooks from onetl.impl import FrozenModel, GenericOptions from onetl.log import ( entity_boundary_log, log_dataframe_schema, + log_lines, log_options, log_with_indent, ) @@ -23,7 +26,7 @@ if TYPE_CHECKING: from pyspark.sql import DataFrame -log = getLogger(__name__) +log = logging.getLogger(__name__) @support_hooks @@ -172,7 +175,7 @@ def validate_options(cls, options, values): return None @slot - def run(self, df: DataFrame): + def run(self, df: DataFrame) -> None: """ Method for writing your df to specified target. |support_hooks| @@ -188,7 +191,7 @@ def run(self, df: DataFrame): Examples -------- - Write df to target: + Write dataframe to target: .. code:: python @@ -198,18 +201,37 @@ def run(self, df: DataFrame): raise ValueError(f"DataFrame is streaming. {self.__class__.__name__} supports only batch DataFrames.") entity_boundary_log(log, msg=f"{self.__class__.__name__}.run() starts") - if not self._connection_checked: self._log_parameters() log_dataframe_schema(log, df) self.connection.check() self._connection_checked = True - self.connection.write_df_to_target( - df=df, - target=str(self.target), - **self._get_write_kwargs(), - ) + with SparkMetricsRecorder(self.connection.spark) as recorder: + try: + self.connection.write_df_to_target( + df=df, + target=str(self.target), + **self._get_write_kwargs(), + ) + except Exception: + metrics = recorder.metrics() + # SparkListener is not a reliable source of information, metrics may or may not be present. + # Because of this we also do not return these metrics as method result + if metrics.output.is_empty: + log.error( + "|%s| Error while writing dataframe.", + self.__class__.__name__, + ) + else: + log.error( + "|%s| Error while writing dataframe. 
Target MAY contain partially written data!", + self.__class__.__name__, + ) + self._log_metrics(metrics) + raise + finally: + self._log_metrics(recorder.metrics()) entity_boundary_log(log, msg=f"{self.__class__.__name__}.run() ends", char="-") @@ -225,3 +247,8 @@ def _get_write_kwargs(self) -> dict: return {"options": self.options} return {} + + def _log_metrics(self, metrics: SparkCommandMetrics) -> None: + if not metrics.is_empty: + log.debug("|%s| Recorded metrics (some values may be missing!):", self.__class__.__name__) + log_lines(log, str(metrics), level=logging.DEBUG) diff --git a/onetl/file/file_df_writer/file_df_writer.py b/onetl/file/file_df_writer/file_df_writer.py index a80f5480..6431219a 100644 --- a/onetl/file/file_df_writer/file_df_writer.py +++ b/onetl/file/file_df_writer/file_df_writer.py @@ -10,6 +10,8 @@ except (ImportError, AttributeError): from pydantic import PrivateAttr, validator # type: ignore[no-redef, assignment] +from onetl._metrics.command import SparkCommandMetrics +from onetl._metrics.recorder import SparkMetricsRecorder from onetl.base import BaseFileDFConnection, BaseWritableFileFormat, PurePathProtocol from onetl.file.file_df_writer.options import FileDFWriterOptions from onetl.hooks import slot, support_hooks @@ -17,6 +19,7 @@ from onetl.log import ( entity_boundary_log, log_dataframe_schema, + log_lines, log_options, log_with_indent, ) @@ -125,12 +128,32 @@ def run(self, df: DataFrame) -> None: self.connection.check() self._connection_checked = True - self.connection.write_df_as_files( - df=df, - path=self.target_path, - format=self.format, - options=self.options, - ) + with SparkMetricsRecorder(self.connection.spark) as recorder: + try: + self.connection.write_df_as_files( + df=df, + path=self.target_path, + format=self.format, + options=self.options, + ) + except Exception: + metrics = recorder.metrics() + if metrics.output.is_empty: + # SparkListener is not a reliable source of information, metrics may or may not be present. + # Because of this we also do not return these metrics as method result + log.error( + "|%s| Error while writing dataframe.", + self.__class__.__name__, + ) + else: + log.error( + "|%s| Error while writing dataframe. 
Target MAY contain partially written data!", + self.__class__.__name__, + ) + self._log_metrics(metrics) + raise + finally: + self._log_metrics(recorder.metrics()) entity_boundary_log(log, f"{self.__class__.__name__}.run() ends", char="-") @@ -143,6 +166,11 @@ def _log_parameters(self, df: DataFrame) -> None: log_options(log, options_dict) log_dataframe_schema(log, df) + def _log_metrics(self, metrics: SparkCommandMetrics) -> None: + if not metrics.is_empty: + log.debug("|%s| Recorded metrics (some values may be missing!):", self.__class__.__name__) + log_lines(log, str(metrics), level=logging.DEBUG) + @validator("target_path", pre=True) def _validate_target_path(cls, target_path, values): connection: BaseFileDFConnection = values["connection"] diff --git a/tests/.coveragerc b/tests/.coveragerc index 55af8c09..85c7bb2d 100644 --- a/tests/.coveragerc +++ b/tests/.coveragerc @@ -11,6 +11,7 @@ exclude_lines = def __repr__ if self.debug: if settings.DEBUG + if log.isEnabledFor\(logging.DEBUG\) raise AssertionError raise NotImplementedError if __name__ == .__main__.: @@ -20,6 +21,5 @@ exclude_lines = if pyspark_version if spark_version spark = SparkSession._instantiatedSession - if log.isEnabledFor(logging.DEBUG): if sys.version_info except .*ImportError diff --git a/tests/tests_integration/tests_db_connection_integration/test_postgres_integration.py b/tests/tests_integration/tests_db_connection_integration/test_postgres_integration.py index b72f8ac1..6cea95cc 100644 --- a/tests/tests_integration/tests_db_connection_integration/test_postgres_integration.py +++ b/tests/tests_integration/tests_db_connection_integration/test_postgres_integration.py @@ -1007,7 +1007,7 @@ def test_postgres_connection_sql_options( processing.assert_equal_df(df=df, other_frame=table_df) -def test_postgres_fetch_with_legacy_jdbc_options(spark, processing): +def test_postgres_connection_fetch_with_legacy_jdbc_options(spark, processing): postgres = Postgres( host=processing.host, port=processing.port, @@ -1023,7 +1023,7 @@ def test_postgres_fetch_with_legacy_jdbc_options(spark, processing): assert df is not None -def test_postgres_execute_with_legacy_jdbc_options(spark, processing): +def test_postgres_connection_execute_with_legacy_jdbc_options(spark, processing): postgres = Postgres( host=processing.host, port=processing.port, From d2103227553f41d6b0905cce7058ccb3899aad9e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Thu, 8 Aug 2024 16:21:10 +0000 Subject: [PATCH 41/64] [DOP-18743] Set default jobDescription --- docs/changelog/next_release/304.breaking.rst | 3 + docs/changelog/next_release/304.feature.rst | 6 ++ onetl/_util/hadoop.py | 4 +- onetl/_util/java.py | 2 +- onetl/_util/spark.py | 23 +++++ .../db_connection/clickhouse/connection.py | 3 + .../db_connection/greenplum/connection.py | 3 + .../db_connection/hive/connection.py | 14 ++- .../jdbc_connection/connection.py | 4 +- .../db_connection/jdbc_mixin/connection.py | 77 ++++++++------- .../db_connection/kafka/connection.py | 7 +- .../db_connection/mongodb/connection.py | 20 ++-- .../db_connection/mssql/connection.py | 9 ++ .../db_connection/mysql/connection.py | 3 + .../db_connection/oracle/connection.py | 6 ++ .../db_connection/postgres/connection.py | 3 + .../db_connection/teradata/connection.py | 3 + onetl/connection/file_connection/ftp.py | 5 +- onetl/connection/file_connection/ftps.py | 4 
- .../file_connection/hdfs/connection.py | 5 + onetl/connection/file_connection/s3.py | 5 +- onetl/connection/file_connection/samba.py | 5 +- onetl/connection/file_connection/sftp.py | 5 +- onetl/connection/file_connection/webdav.py | 5 +- .../spark_hdfs/connection.py | 3 + .../file_df_connection/spark_local_fs.py | 4 + .../file_df_connection/spark_s3/connection.py | 5 +- onetl/db/db_reader/db_reader.py | 92 ++++++++++-------- onetl/db/db_writer/db_writer.py | 29 ++++-- onetl/file/file_df_reader/file_df_reader.py | 27 +++--- onetl/file/file_df_writer/file_df_writer.py | 29 ++++-- tests/fixtures/spark.py | 3 +- .../test_clickhouse_unit.py | 4 +- .../test_greenplum_unit.py | 5 +- .../test_kafka_unit.py | 4 + .../test_mongodb_unit.py | 22 +++-- .../test_mssql_unit.py | 6 +- .../test_mysql_unit.py | 6 +- .../test_oracle_unit.py | 6 +- .../test_postgres_unit.py | 5 +- .../test_teradata_unit.py | 5 +- .../test_ftp_unit.py | 33 +++---- .../test_ftps_unit.py | 33 ++++--- .../test_hdfs_unit.py | 97 ++++++++++--------- .../test_s3_unit.py | 56 +++++------ .../test_samba_unit.py | 39 ++++---- .../test_sftp_unit.py | 46 +++++---- .../test_webdav_unit.py | 37 ++++--- .../test_spark_hdfs_unit.py | 39 ++++---- .../test_spark_local_fs_unit.py | 1 + .../test_spark_s3_unit.py | 47 ++++----- 51 files changed, 545 insertions(+), 362 deletions(-) create mode 100644 docs/changelog/next_release/304.breaking.rst create mode 100644 docs/changelog/next_release/304.feature.rst diff --git a/docs/changelog/next_release/304.breaking.rst b/docs/changelog/next_release/304.breaking.rst new file mode 100644 index 00000000..60598321 --- /dev/null +++ b/docs/changelog/next_release/304.breaking.rst @@ -0,0 +1,3 @@ +Change connection URL used for generating HWM names of S3 and Samba sources: +* ``smb://host:port`` -> ``smb://host:port/share`` +* ``s3://host:port`` -> ``s3://host:port/bucket`` diff --git a/docs/changelog/next_release/304.feature.rst b/docs/changelog/next_release/304.feature.rst new file mode 100644 index 00000000..97560354 --- /dev/null +++ b/docs/changelog/next_release/304.feature.rst @@ -0,0 +1,6 @@ +Generate default ``jobDescription`` based on currently executed method. Examples: +* ``DBWriter() -> Postgres[host:5432/database]`` +* ``MongoDB[localhost:27017/admin] -> DBReader.run()`` +* ``Hive[cluster].execute()`` + +If user already set custom ``jobDescription``, it will left intact. 
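For readers skimming the patch, here is a minimal sketch of the behaviour described in ``304.feature.rst``. The host, credentials, dataframe and Spark session below are hypothetical placeholders, and the exact generated description text may differ slightly from what is shown in the comments:

.. code-block:: python

    from onetl.connection import Postgres
    from onetl.db import DBWriter

    # assumes an existing SparkSession `spark` and a batch DataFrame `df`
    postgres = Postgres(
        host="some_host",  # placeholder host
        port=5432,
        database="database",
        user="user",
        password="secret",  # placeholder credentials
        spark=spark,
    )
    writer = DBWriter(connection=postgres, target="schema.table")

    # No jobDescription set by the user: the Spark UI shows a generated one,
    # e.g. "DBWriter.run() -> Postgres[some_host:5432/database]"
    writer.run(df)

    # A description set explicitly by the user takes precedence and is left intact
    spark.sparkContext.setJobDescription("nightly load of schema.table")
    writer.run(df)

The same pattern applies to the other methods wrapped with ``override_job_description()`` in this patch: the default description is set only for the duration of the call and is reset afterwards.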
diff --git a/onetl/_util/hadoop.py b/onetl/_util/hadoop.py index fdf275de..aed572e0 100644 --- a/onetl/_util/hadoop.py +++ b/onetl/_util/hadoop.py @@ -14,7 +14,7 @@ def get_hadoop_version(spark_session: SparkSession) -> Version: """ Get version of Hadoop libraries embedded to Spark """ - jvm = spark_session._jvm # noqa: WPS437 + jvm = spark_session._jvm # noqa: WPS437 # type: ignore[attr-defined] version_info = jvm.org.apache.hadoop.util.VersionInfo # type: ignore[union-attr] hadoop_version: str = version_info.getVersion() return Version(hadoop_version) @@ -24,4 +24,4 @@ def get_hadoop_config(spark_session: SparkSession): """ Get ``org.apache.hadoop.conf.Configuration`` object """ - return spark_session.sparkContext._jsc.hadoopConfiguration() + return spark_session.sparkContext._jsc.hadoopConfiguration() # type: ignore[attr-defined] diff --git a/onetl/_util/java.py b/onetl/_util/java.py index 45111432..1ec50a0d 100644 --- a/onetl/_util/java.py +++ b/onetl/_util/java.py @@ -16,7 +16,7 @@ def get_java_gateway(spark_session: SparkSession) -> JavaGateway: """ Get py4j Java gateway object """ - return spark_session._sc._gateway # noqa: WPS437 # type: ignore + return spark_session._sc._gateway # noqa: WPS437 # type: ignore[attr-defined] def try_import_java_class(spark_session: SparkSession, name: str): diff --git a/onetl/_util/spark.py b/onetl/_util/spark.py index 2b2edbaf..f7d018b3 100644 --- a/onetl/_util/spark.py +++ b/onetl/_util/spark.py @@ -19,6 +19,9 @@ from pyspark.sql import DataFrame, SparkSession from pyspark.sql.conf import RuntimeConfig +SPARK_JOB_DESCRIPTION_PROPERTY = "spark.job.description" +SPARK_JOB_GROUP_PROPERTY = "spark.jobGroup.id" + def stringify(value: Any, quote: bool = False) -> Any: # noqa: WPS212 """ @@ -200,3 +203,23 @@ def get_executor_total_cores(spark_session: SparkSession, include_driver: bool = expected_cores += 1 return expected_cores, config + + +@contextmanager +def override_job_description(spark_session: SparkSession, job_description: str): + """ + Override Spark job description. + + Unlike ``spark_session.sparkContext.setJobDescription``, this method resets job description + before exiting the context manager, instead of keeping it. + + If user set custom description, it will be left intact. 
+ """ + spark_context = spark_session.sparkContext + original_description = spark_context.getLocalProperty(SPARK_JOB_DESCRIPTION_PROPERTY) + + try: + spark_context.setLocalProperty(SPARK_JOB_DESCRIPTION_PROPERTY, original_description or job_description) + yield + finally: + spark_context.setLocalProperty(SPARK_JOB_DESCRIPTION_PROPERTY, original_description) # type: ignore[arg-type] diff --git a/onetl/connection/db_connection/clickhouse/connection.py b/onetl/connection/db_connection/clickhouse/connection.py index 0ca6d0ce..482cc941 100644 --- a/onetl/connection/db_connection/clickhouse/connection.py +++ b/onetl/connection/db_connection/clickhouse/connection.py @@ -196,6 +196,9 @@ def jdbc_params(self) -> dict: def instance_url(self) -> str: return f"{self.__class__.__name__.lower()}://{self.host}:{self.port}" + def __str__(self): + return f"{self.__class__.__name__}[{self.host}:{self.port}]" + @staticmethod def _build_statement( statement: str, diff --git a/onetl/connection/db_connection/greenplum/connection.py b/onetl/connection/db_connection/greenplum/connection.py index 7ed60539..0f40436f 100644 --- a/onetl/connection/db_connection/greenplum/connection.py +++ b/onetl/connection/db_connection/greenplum/connection.py @@ -267,6 +267,9 @@ def package_spark_3_2(cls) -> str: def instance_url(self) -> str: return f"{self.__class__.__name__.lower()}://{self.host}:{self.port}/{self.database}" + def __str__(self): + return f"{self.__class__.__name__}[{self.host}:{self.port}/{self.database}]" + @property def jdbc_url(self) -> str: return f"jdbc:postgresql://{self.host}:{self.port}/{self.database}" diff --git a/onetl/connection/db_connection/hive/connection.py b/onetl/connection/db_connection/hive/connection.py index 61032987..855a0ead 100644 --- a/onetl/connection/db_connection/hive/connection.py +++ b/onetl/connection/db_connection/hive/connection.py @@ -14,7 +14,7 @@ from pydantic import validator # type: ignore[no-redef, assignment] from onetl._metrics.recorder import SparkMetricsRecorder -from onetl._util.spark import inject_spark_param +from onetl._util.spark import inject_spark_param, override_job_description from onetl._util.sql import clear_statement from onetl.connection.db_connection.db_connection import DBConnection from onetl.connection.db_connection.hive.dialect import HiveDialect @@ -159,6 +159,9 @@ def get_current(cls, spark: SparkSession): def instance_url(self) -> str: return self.cluster + def __str__(self): + return f"{self.__class__.__name__}[{self.cluster}]" + @slot def check(self): log.debug("|%s| Detecting current cluster...", self.__class__.__name__) @@ -173,7 +176,8 @@ def check(self): log_lines(log, self._CHECK_QUERY, level=logging.DEBUG) try: - self._execute_sql(self._CHECK_QUERY).limit(1).collect() + with override_job_description(self.spark, f"{self}.check()"): + self._execute_sql(self._CHECK_QUERY).limit(1).collect() log.info("|%s| Connection is available.", self.__class__.__name__) except Exception as e: log.exception("|%s| Connection is unavailable", self.__class__.__name__) @@ -213,7 +217,8 @@ def sql( with SparkMetricsRecorder(self.spark) as recorder: try: - df = self._execute_sql(query) + with override_job_description(self.spark, f"{self}.sql()"): + df = self._execute_sql(query) except Exception: log.error("|%s| Query failed", self.__class__.__name__) @@ -260,7 +265,8 @@ def execute( with SparkMetricsRecorder(self.spark) as recorder: try: - self._execute_sql(statement).collect() + with override_job_description(self.spark, f"{self}.execute()"): + 
self._execute_sql(statement).collect() except Exception: log.error("|%s| Execution failed", self.__class__.__name__) metrics = recorder.metrics() diff --git a/onetl/connection/db_connection/jdbc_connection/connection.py b/onetl/connection/db_connection/jdbc_connection/connection.py index 2fc2f7cf..9d41298e 100644 --- a/onetl/connection/db_connection/jdbc_connection/connection.py +++ b/onetl/connection/db_connection/jdbc_connection/connection.py @@ -7,6 +7,7 @@ import warnings from typing import TYPE_CHECKING, Any +from onetl._util.spark import override_job_description from onetl._util.sql import clear_statement from onetl.connection.db_connection.db_connection import DBConnection from onetl.connection.db_connection.jdbc_connection.dialect import JDBCDialect @@ -93,7 +94,8 @@ def sql( log_lines(log, query) try: - df = self._query_on_executor(query, self.SQLOptions.parse(options)) + with override_job_description(self.spark, f"{self}.sql()"): + df = self._query_on_executor(query, self.SQLOptions.parse(options)) except Exception: log.error("|%s| Query failed!", self.__class__.__name__) raise diff --git a/onetl/connection/db_connection/jdbc_mixin/connection.py b/onetl/connection/db_connection/jdbc_mixin/connection.py index 84276147..8ec77d13 100644 --- a/onetl/connection/db_connection/jdbc_mixin/connection.py +++ b/onetl/connection/db_connection/jdbc_mixin/connection.py @@ -16,7 +16,12 @@ from onetl._metrics.command import SparkCommandMetrics from onetl._util.java import get_java_gateway, try_import_java_class -from onetl._util.spark import estimate_dataframe_size, get_spark_version, stringify +from onetl._util.spark import ( + estimate_dataframe_size, + get_spark_version, + override_job_description, + stringify, +) from onetl._util.sql import clear_statement from onetl._util.version import Version from onetl.connection.db_connection.jdbc_mixin.options import ( @@ -209,21 +214,22 @@ def fetch( else self.FetchOptions.parse(options) ) - try: - df = self._query_on_driver(query, call_options) - except Exception: - log.error("|%s| Query failed!", self.__class__.__name__) - raise - - log.info("|%s| Query succeeded, created in-memory dataframe.", self.__class__.__name__) - - # as we don't actually use Spark for this method, SparkMetricsRecorder is useless. - # Just create metrics by hand, and fill them up using information based on dataframe content. - metrics = SparkCommandMetrics() - metrics.input.read_rows = df.count() - metrics.driver.in_memory_bytes = estimate_dataframe_size(self.spark, df) - log.info("|%s| Recorded metrics:", self.__class__.__name__) - log_lines(log, str(metrics)) + with override_job_description(self.spark, f"{self}.fetch()"): + try: + df = self._query_on_driver(query, call_options) + except Exception: + log.error("|%s| Query failed!", self.__class__.__name__) + raise + + log.info("|%s| Query succeeded, created in-memory dataframe.", self.__class__.__name__) + + # as we don't actually use Spark for this method, SparkMetricsRecorder is useless. + # Just create metrics by hand, and fill them up using information based on dataframe content. 
+ metrics = SparkCommandMetrics() + metrics.input.read_rows = df.count() + metrics.driver.in_memory_bytes = estimate_dataframe_size(self.spark, df) + log.info("|%s| Recorded metrics:", self.__class__.__name__) + log_lines(log, str(metrics)) return df @slot @@ -280,25 +286,26 @@ def execute( else self.ExecuteOptions.parse(options) ) - try: - df = self._call_on_driver(statement, call_options) - except Exception: - log.error("|%s| Execution failed!", self.__class__.__name__) - raise - - if not df: - log.info("|%s| Execution succeeded, nothing returned.", self.__class__.__name__) - return None - - log.info("|%s| Execution succeeded, created in-memory dataframe.", self.__class__.__name__) - # as we don't actually use Spark for this method, SparkMetricsRecorder is useless. - # Just create metrics by hand, and fill them up using information based on dataframe content. - metrics = SparkCommandMetrics() - metrics.input.read_rows = df.count() - metrics.driver.in_memory_bytes = estimate_dataframe_size(self.spark, df) - - log.info("|%s| Recorded metrics:", self.__class__.__name__) - log_lines(log, str(metrics)) + with override_job_description(self.spark, f"{self}.execute()"): + try: + df = self._call_on_driver(statement, call_options) + except Exception: + log.error("|%s| Execution failed!", self.__class__.__name__) + raise + + if not df: + log.info("|%s| Execution succeeded, nothing returned.", self.__class__.__name__) + return None + + log.info("|%s| Execution succeeded, created in-memory dataframe.", self.__class__.__name__) + # as we don't actually use Spark for this method, SparkMetricsRecorder is useless. + # Just create metrics by hand, and fill them up using information based on dataframe content. + metrics = SparkCommandMetrics() + metrics.input.read_rows = df.count() + metrics.driver.in_memory_bytes = estimate_dataframe_size(self.spark, df) + + log.info("|%s| Recorded metrics:", self.__class__.__name__) + log_lines(log, str(metrics)) return df @validator("spark") diff --git a/onetl/connection/db_connection/kafka/connection.py b/onetl/connection/db_connection/kafka/connection.py index ce3829e4..b404eafb 100644 --- a/onetl/connection/db_connection/kafka/connection.py +++ b/onetl/connection/db_connection/kafka/connection.py @@ -497,7 +497,7 @@ def get_min_max_values( # https://kafka.apache.org/22/javadoc/org/apache/kafka/clients/consumer/KafkaConsumer.html#partitionsFor-java.lang.String- partition_infos = consumer.partitionsFor(source) - jvm = self.spark._jvm + jvm = self.spark._jvm # type: ignore[attr-defined] topic_partitions = [ jvm.org.apache.kafka.common.TopicPartition(source, p.partition()) # type: ignore[union-attr] for p in partition_infos @@ -542,6 +542,9 @@ def get_min_max_values( def instance_url(self): return "kafka://" + self.cluster + def __str__(self): + return f"{self.__class__.__name__}[{self.cluster}]" + @root_validator(pre=True) def _get_addresses_by_cluster(cls, values): cluster = values.get("cluster") @@ -639,7 +642,7 @@ def _get_java_consumer(self): return consumer_class(connection_properties) def _get_topics(self, timeout: int = 10) -> set[str]: - jvm = self.spark._jvm + jvm = self.spark._jvm # type: ignore[attr-defined] # Maybe we should not pass explicit timeout at all, # and instead use default.api.timeout.ms which is configurable via self.extra. 
# Think about this next time if someone see issues in real use diff --git a/onetl/connection/db_connection/mongodb/connection.py b/onetl/connection/db_connection/mongodb/connection.py index 568cd953..f81a3bf8 100644 --- a/onetl/connection/db_connection/mongodb/connection.py +++ b/onetl/connection/db_connection/mongodb/connection.py @@ -18,7 +18,7 @@ from onetl._util.classproperty import classproperty from onetl._util.java import try_import_java_class from onetl._util.scala import get_default_scala_version -from onetl._util.spark import get_spark_version +from onetl._util.spark import get_spark_version, override_job_description from onetl._util.version import Version from onetl.connection.db_connection.db_connection import DBConnection from onetl.connection.db_connection.mongodb.dialect import MongoDBDialect @@ -347,17 +347,25 @@ def pipeline( if pipeline: read_options["aggregation.pipeline"] = json.dumps(pipeline) read_options["connection.uri"] = self.connection_url - spark_reader = self.spark.read.format("mongodb").options(**read_options) - if df_schema: - spark_reader = spark_reader.schema(df_schema) + with override_job_description( + self.spark, + f"{self}.pipeline()", + ): + spark_reader = self.spark.read.format("mongodb").options(**read_options) - return spark_reader.load() + if df_schema: + spark_reader = spark_reader.schema(df_schema) + + return spark_reader.load() @property def instance_url(self) -> str: return f"{self.__class__.__name__.lower()}://{self.host}:{self.port}/{self.database}" + def __str__(self): + return f"{self.__class__.__name__}[{self.host}:{self.port}/{self.database}]" + @slot def check(self): log.info("|%s| Checking connection availability...", self.__class__.__name__) @@ -532,7 +540,7 @@ def _check_java_class_imported(cls, spark): return spark def _collection_exists(self, source: str) -> bool: - jvm = self.spark._jvm + jvm = self.spark._jvm # type: ignore[attr-defined] client = jvm.com.mongodb.client.MongoClients.create(self.connection_url) # type: ignore collections = set(client.getDatabase(self.database).listCollectionNames().iterator()) if source in collections: diff --git a/onetl/connection/db_connection/mssql/connection.py b/onetl/connection/db_connection/mssql/connection.py index 556cb4cb..f2a29b44 100644 --- a/onetl/connection/db_connection/mssql/connection.py +++ b/onetl/connection/db_connection/mssql/connection.py @@ -268,3 +268,12 @@ def instance_url(self) -> str: # for backward compatibility keep port number in legacy HWM instance url port = self.port or 1433 return f"{self.__class__.__name__.lower()}://{self.host}:{port}/{self.database}" + + def __str__(self): + extra_dict = self.extra.dict(by_alias=True) + instance_name = extra_dict.get("instanceName") + if instance_name: + return rf"{self.__class__.__name__}[{self.host}\{instance_name}/{self.database}]" + + port = self.port or 1433 + return f"{self.__class__.__name__}[{self.host}:{port}/{self.database}]" diff --git a/onetl/connection/db_connection/mysql/connection.py b/onetl/connection/db_connection/mysql/connection.py index 72090d58..e3c91196 100644 --- a/onetl/connection/db_connection/mysql/connection.py +++ b/onetl/connection/db_connection/mysql/connection.py @@ -175,3 +175,6 @@ def jdbc_params(self) -> dict: @property def instance_url(self) -> str: return f"{self.__class__.__name__.lower()}://{self.host}:{self.port}" + + def __str__(self): + return f"{self.__class__.__name__}[{self.host}:{self.port}]" diff --git a/onetl/connection/db_connection/oracle/connection.py 
b/onetl/connection/db_connection/oracle/connection.py index c7669361..40164fe1 100644 --- a/onetl/connection/db_connection/oracle/connection.py +++ b/onetl/connection/db_connection/oracle/connection.py @@ -262,6 +262,12 @@ def instance_url(self) -> str: return f"{self.__class__.__name__.lower()}://{self.host}:{self.port}/{self.service_name}" + def __str__(self): + if self.sid: + return f"{self.__class__.__name__}[{self.host}:{self.port}/{self.sid}]" + + return f"{self.__class__.__name__}[{self.host}:{self.port}/{self.service_name}]" + @slot def get_min_max_values( self, diff --git a/onetl/connection/db_connection/postgres/connection.py b/onetl/connection/db_connection/postgres/connection.py index 132d9727..1c11d9e3 100644 --- a/onetl/connection/db_connection/postgres/connection.py +++ b/onetl/connection/db_connection/postgres/connection.py @@ -182,6 +182,9 @@ def jdbc_params(self) -> dict[str, str]: def instance_url(self) -> str: return f"{self.__class__.__name__.lower()}://{self.host}:{self.port}/{self.database}" + def __str__(self): + return f"{self.__class__.__name__}[{self.host}:{self.port}/{self.database}]" + def _options_to_connection_properties( self, options: JDBCFetchOptions | JDBCExecuteOptions, diff --git a/onetl/connection/db_connection/teradata/connection.py b/onetl/connection/db_connection/teradata/connection.py index 6ef2637b..9c8f073c 100644 --- a/onetl/connection/db_connection/teradata/connection.py +++ b/onetl/connection/db_connection/teradata/connection.py @@ -208,3 +208,6 @@ def jdbc_url(self) -> str: @property def instance_url(self) -> str: return f"{self.__class__.__name__.lower()}://{self.host}:{self.port}" + + def __str__(self): + return f"{self.__class__.__name__}[{self.host}:{self.port}]" diff --git a/onetl/connection/file_connection/ftp.py b/onetl/connection/file_connection/ftp.py index b457b966..d5ff5216 100644 --- a/onetl/connection/file_connection/ftp.py +++ b/onetl/connection/file_connection/ftp.py @@ -105,7 +105,10 @@ class FTP(FileConnection, RenameDirMixin): @property def instance_url(self) -> str: - return f"ftp://{self.host}:{self.port}" + return f"{self.__class__.__name__.lower()}://{self.host}:{self.port}" + + def __str__(self): + return f"{self.__class__.__name__}[{self.host}:{self.port}]" @slot def path_exists(self, path: os.PathLike | str) -> bool: diff --git a/onetl/connection/file_connection/ftps.py b/onetl/connection/file_connection/ftps.py index 8cf9aa8f..0180edf4 100644 --- a/onetl/connection/file_connection/ftps.py +++ b/onetl/connection/file_connection/ftps.py @@ -95,10 +95,6 @@ class FTPS(FTP): ) """ - @property - def instance_url(self) -> str: - return f"ftps://{self.host}:{self.port}" - def _get_client(self) -> FTPHost: """ Returns a FTPS connection object diff --git a/onetl/connection/file_connection/hdfs/connection.py b/onetl/connection/file_connection/hdfs/connection.py index 056622fb..89c0ec96 100644 --- a/onetl/connection/file_connection/hdfs/connection.py +++ b/onetl/connection/file_connection/hdfs/connection.py @@ -264,6 +264,11 @@ def instance_url(self) -> str: return self.cluster return f"hdfs://{self.host}:{self.webhdfs_port}" + def __str__(self): + if self.cluster: + return f"{self.__class__.__name__}[{self.cluster}]" + return f"{self.__class__.__name__}[{self.host}:{self.webhdfs_port}]" + @slot def path_exists(self, path: os.PathLike | str) -> bool: return self.client.status(os.fspath(path), strict=False) diff --git a/onetl/connection/file_connection/s3.py b/onetl/connection/file_connection/s3.py index f8f584dc..0f411c85 100644 
--- a/onetl/connection/file_connection/s3.py +++ b/onetl/connection/file_connection/s3.py @@ -131,7 +131,10 @@ def validate_port(cls, values): @property def instance_url(self) -> str: - return f"s3://{self.host}:{self.port}" + return f"{self.__class__.__name__.lower()}://{self.host}:{self.port}/{self.bucket}" + + def __str__(self): + return f"{self.__class__.__name__}[{self.host}:{self.port}/{self.bucket}]" @slot def create_dir(self, path: os.PathLike | str) -> RemoteDirectory: diff --git a/onetl/connection/file_connection/samba.py b/onetl/connection/file_connection/samba.py index 9fc0857f..430e15a7 100644 --- a/onetl/connection/file_connection/samba.py +++ b/onetl/connection/file_connection/samba.py @@ -125,7 +125,10 @@ class Samba(FileConnection): @property def instance_url(self) -> str: - return f"smb://{self.host}:{self.port}" + return f"smb://{self.host}:{self.port}/{self.share}" + + def __str__(self): + return f"{self.__class__.__name__}[{self.host}:{self.port}/{self.share}]" @slot def check(self): diff --git a/onetl/connection/file_connection/sftp.py b/onetl/connection/file_connection/sftp.py index 8cd2ac1e..92db2adc 100644 --- a/onetl/connection/file_connection/sftp.py +++ b/onetl/connection/file_connection/sftp.py @@ -120,7 +120,10 @@ class SFTP(FileConnection, RenameDirMixin): @property def instance_url(self) -> str: - return f"sftp://{self.host}:{self.port}" + return f"{self.__class__.__name__.lower()}://{self.host}:{self.port}" + + def __str__(self): + return f"{self.__class__.__name__}[{self.host}:{self.port}]" @slot def path_exists(self, path: os.PathLike | str) -> bool: diff --git a/onetl/connection/file_connection/webdav.py b/onetl/connection/file_connection/webdav.py index aa540567..44ac766a 100644 --- a/onetl/connection/file_connection/webdav.py +++ b/onetl/connection/file_connection/webdav.py @@ -130,7 +130,10 @@ def check_port(cls, values): @property def instance_url(self) -> str: - return f"webdav://{self.host}:{self.port}" + return f"{self.__class__.__name__.lower()}://{self.host}:{self.port}" + + def __str__(self): + return f"{self.__class__.__name__}[{self.host}:{self.port}]" @slot def path_exists(self, path: os.PathLike | str) -> bool: diff --git a/onetl/connection/file_df_connection/spark_hdfs/connection.py b/onetl/connection/file_df_connection/spark_hdfs/connection.py index 26c1416e..10ff1005 100644 --- a/onetl/connection/file_df_connection/spark_hdfs/connection.py +++ b/onetl/connection/file_df_connection/spark_hdfs/connection.py @@ -164,6 +164,9 @@ def path_from_string(self, path: os.PathLike | str) -> Path: def instance_url(self): return self.cluster + def __str__(self): + return f"HDFS[{self.cluster}]" + def __enter__(self): return self diff --git a/onetl/connection/file_df_connection/spark_local_fs.py b/onetl/connection/file_df_connection/spark_local_fs.py index 839cbdae..71c70414 100644 --- a/onetl/connection/file_df_connection/spark_local_fs.py +++ b/onetl/connection/file_df_connection/spark_local_fs.py @@ -74,6 +74,10 @@ def instance_url(self): fqdn = socket.getfqdn() return f"file://{fqdn}" + def __str__(self): + # str should not make network requests + return "LocalFS" + @validator("spark") def _validate_spark(cls, spark): master = spark.conf.get("spark.master") diff --git a/onetl/connection/file_df_connection/spark_s3/connection.py b/onetl/connection/file_df_connection/spark_s3/connection.py index 1efe39d4..eb74d698 100644 --- a/onetl/connection/file_df_connection/spark_s3/connection.py +++ b/onetl/connection/file_df_connection/spark_s3/connection.py 
@@ -256,7 +256,10 @@ def path_from_string(self, path: os.PathLike | str) -> RemotePath: @property def instance_url(self): - return f"s3://{self.host}:{self.port}" + return f"s3://{self.host}:{self.port}/{self.bucket}" + + def __str__(self): + return f"S3[{self.host}:{self.port}/{self.bucket}]" def __enter__(self): return self diff --git a/onetl/db/db_reader/db_reader.py b/onetl/db/db_reader/db_reader.py index 91b3f21b..f560104d 100644 --- a/onetl/db/db_reader/db_reader.py +++ b/onetl/db/db_reader/db_reader.py @@ -17,7 +17,7 @@ except (ImportError, AttributeError): from pydantic import Field, PrivateAttr, root_validator, validator # type: ignore[no-redef, assignment] -from onetl._util.spark import try_import_pyspark +from onetl._util.spark import override_job_description, try_import_pyspark from onetl.base import ( BaseDBConnection, ContainsGetDFSchemaMethod, @@ -542,26 +542,30 @@ def has_data(self) -> bool: """ self._check_strategy() - if not self._connection_checked: - self._log_parameters() - self.connection.check() - - window, limit = self._calculate_window_and_limit() - if limit == 0: - return False - - df = self.connection.read_source_as_df( - source=str(self.source), - columns=self.columns, - hint=self.hint, - where=self.where, - df_schema=self.df_schema, - window=window, - limit=1, - **self._get_read_kwargs(), - ) + with override_job_description( + self.connection.spark, + f"{self.connection} -> {self.__class__.__name__}.has_data()", + ): + if not self._connection_checked: + self._log_parameters() + self.connection.check() + + window, limit = self._calculate_window_and_limit() + if limit == 0: + return False + + df = self.connection.read_source_as_df( + source=str(self.source), + columns=self.columns, + hint=self.hint, + where=self.where, + df_schema=self.df_schema, + window=window, + limit=1, + **self._get_read_kwargs(), + ) - return bool(df.take(1)) + return bool(df.take(1)) @slot def raise_if_no_data(self) -> None: @@ -633,28 +637,32 @@ def run(self) -> DataFrame: self._check_strategy() - if not self._connection_checked: - self._log_parameters() - self.connection.check() - self._connection_checked = True - - window, limit = self._calculate_window_and_limit() - - # update the HWM with the stop value - if self.hwm and window: - strategy: HWMStrategy = StrategyManager.get_current() # type: ignore[assignment] - strategy.update_hwm(window.stop_at.value) - - df = self.connection.read_source_as_df( - source=str(self.source), - columns=self.columns, - hint=self.hint, - where=self.where, - df_schema=self.df_schema, - window=window, - limit=limit, - **self._get_read_kwargs(), - ) + with override_job_description( + self.connection.spark, + f"{self.connection} -> {self.__class__.__name__}.run()", + ): + if not self._connection_checked: + self._log_parameters() + self.connection.check() + self._connection_checked = True + + window, limit = self._calculate_window_and_limit() + + # update the HWM with the stop value + if self.hwm and window: + strategy: HWMStrategy = StrategyManager.get_current() # type: ignore[assignment] + strategy.update_hwm(window.stop_at.value) + + df = self.connection.read_source_as_df( + source=str(self.source), + columns=self.columns, + hint=self.hint, + where=self.where, + df_schema=self.df_schema, + window=window, + limit=limit, + **self._get_read_kwargs(), + ) entity_boundary_log(log, msg=f"{self.__class__.__name__}.run() ends", char="-") return df diff --git a/onetl/db/db_writer/db_writer.py b/onetl/db/db_writer/db_writer.py index 06dbd44c..0b07ec4e 100644 --- 
a/onetl/db/db_writer/db_writer.py +++ b/onetl/db/db_writer/db_writer.py @@ -12,6 +12,7 @@ from onetl._metrics.command import SparkCommandMetrics from onetl._metrics.recorder import SparkMetricsRecorder +from onetl._util.spark import override_job_description from onetl.base import BaseDBConnection from onetl.hooks import slot, support_hooks from onetl.impl import FrozenModel, GenericOptions @@ -201,19 +202,27 @@ def run(self, df: DataFrame) -> None: raise ValueError(f"DataFrame is streaming. {self.__class__.__name__} supports only batch DataFrames.") entity_boundary_log(log, msg=f"{self.__class__.__name__}.run() starts") - if not self._connection_checked: - self._log_parameters() - log_dataframe_schema(log, df) - self.connection.check() - self._connection_checked = True + with override_job_description( + self.connection.spark, + f"{self.__class__.__name__}.run() -> {self.connection}", + ): + if not self._connection_checked: + self._log_parameters() + log_dataframe_schema(log, df) + self.connection.check() + self._connection_checked = True with SparkMetricsRecorder(self.connection.spark) as recorder: try: - self.connection.write_df_to_target( - df=df, - target=str(self.target), - **self._get_write_kwargs(), - ) + with override_job_description( + self.connection.spark, + f"{self.__class__.__name__}.run() -> {self.connection}", + ): + self.connection.write_df_to_target( + df=df, + target=str(self.target), + **self._get_write_kwargs(), + ) except Exception: metrics = recorder.metrics() # SparkListener is not a reliable source of information, metrics may or may not be present. diff --git a/onetl/file/file_df_reader/file_df_reader.py b/onetl/file/file_df_reader/file_df_reader.py index b18fc179..f1e2f01e 100644 --- a/onetl/file/file_df_reader/file_df_reader.py +++ b/onetl/file/file_df_reader/file_df_reader.py @@ -13,7 +13,7 @@ except (ImportError, AttributeError): from pydantic import PrivateAttr, validator # type: ignore[no-redef, assignment] -from onetl._util.spark import try_import_pyspark +from onetl._util.spark import override_job_description, try_import_pyspark from onetl.base import BaseFileDFConnection, BaseReadableFileFormat, PurePathProtocol from onetl.file.file_df_reader.options import FileDFReaderOptions from onetl.file.file_set import FileSet @@ -211,18 +211,23 @@ def run(self, files: Iterable[str | os.PathLike] | None = None) -> DataFrame: if not self._connection_checked: self._log_parameters(files) - paths: FileSet[PurePathProtocol] = FileSet() - if files is not None: - paths = FileSet(self._validate_files(files)) - elif self.source_path: - paths = FileSet([self.source_path]) + with override_job_description( + self.connection.spark, + f"{self.connection} -> {self.__class__.__name__}.run()", + ): + paths: FileSet[PurePathProtocol] = FileSet() + if files is not None: + paths = FileSet(self._validate_files(files)) + elif self.source_path: + paths = FileSet([self.source_path]) - if not self._connection_checked: - self.connection.check() - log_with_indent(log, "") - self._connection_checked = True + if not self._connection_checked: + self.connection.check() + log_with_indent(log, "") + self._connection_checked = True + + df = self._read_files(paths) - df = self._read_files(paths) entity_boundary_log(log, msg=f"{self.__class__.__name__}.run() ends", char="-") return df diff --git a/onetl/file/file_df_writer/file_df_writer.py b/onetl/file/file_df_writer/file_df_writer.py index 6431219a..35baaf15 100644 --- a/onetl/file/file_df_writer/file_df_writer.py +++ 
b/onetl/file/file_df_writer/file_df_writer.py @@ -12,6 +12,7 @@ from onetl._metrics.command import SparkCommandMetrics from onetl._metrics.recorder import SparkMetricsRecorder +from onetl._util.spark import override_job_description from onetl.base import BaseFileDFConnection, BaseWritableFileFormat, PurePathProtocol from onetl.file.file_df_writer.options import FileDFWriterOptions from onetl.hooks import slot, support_hooks @@ -123,19 +124,27 @@ def run(self, df: DataFrame) -> None: if df.isStreaming: raise ValueError(f"DataFrame is streaming. {self.__class__.__name__} supports only batch DataFrames.") - if not self._connection_checked: - self._log_parameters(df) - self.connection.check() - self._connection_checked = True + with override_job_description( + self.connection.spark, + f"{self.__class__.__name__}.run() -> {self.connection}", + ): + if not self._connection_checked: + self._log_parameters(df) + self.connection.check() + self._connection_checked = True with SparkMetricsRecorder(self.connection.spark) as recorder: try: - self.connection.write_df_as_files( - df=df, - path=self.target_path, - format=self.format, - options=self.options, - ) + with override_job_description( + self.connection.spark, + f"{self.__class__.__name__}.run() -> {self.connection}", + ): + self.connection.write_df_as_files( + df=df, + path=self.target_path, + format=self.format, + options=self.options, + ) except Exception: metrics = recorder.metrics() if metrics.output.is_empty: diff --git a/tests/fixtures/spark.py b/tests/fixtures/spark.py index e7248e84..7a9b812a 100644 --- a/tests/fixtures/spark.py +++ b/tests/fixtures/spark.py @@ -123,12 +123,11 @@ def excluded_packages(): @pytest.fixture( scope="session", - name="spark", params=[ pytest.param("real-spark", marks=[pytest.mark.db_connection, pytest.mark.connection]), ], ) -def get_spark_session(warehouse_dir, spark_metastore_dir, ivysettings_path, maven_packages, excluded_packages): +def spark(warehouse_dir, spark_metastore_dir, ivysettings_path, maven_packages, excluded_packages): from pyspark.sql import SparkSession spark = ( diff --git a/tests/tests_unit/tests_db_connection_unit/test_clickhouse_unit.py b/tests/tests_unit/tests_db_connection_unit/test_clickhouse_unit.py index ff36e0a6..287061d2 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_clickhouse_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_clickhouse_unit.py @@ -128,10 +128,10 @@ def test_clickhouse(spark_mock): "url": "jdbc:clickhouse://some_host:8123/database", } - assert "password='passwd'" not in str(conn) - assert "password='passwd'" not in repr(conn) + assert "passwd" not in repr(conn) assert conn.instance_url == "clickhouse://some_host:8123" + assert str(conn) == "Clickhouse[some_host:8123]" def test_clickhouse_with_port(spark_mock): diff --git a/tests/tests_unit/tests_db_connection_unit/test_greenplum_unit.py b/tests/tests_unit/tests_db_connection_unit/test_greenplum_unit.py index 0d382d44..47821642 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_greenplum_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_greenplum_unit.py @@ -129,10 +129,10 @@ def test_greenplum(spark_mock): "tcpKeepAlive": "true", } - assert "password='passwd'" not in str(conn) - assert "password='passwd'" not in repr(conn) + assert "passwd" not in repr(conn) assert conn.instance_url == "greenplum://some_host:5432/database" + assert str(conn) == "Greenplum[some_host:5432/database]" def test_greenplum_with_port(spark_mock): @@ -156,6 +156,7 @@ def 
test_greenplum_with_port(spark_mock): } assert conn.instance_url == "greenplum://some_host:5000/database" + assert str(conn) == "Greenplum[some_host:5000/database]" def test_greenplum_without_database_error(spark_mock): diff --git a/tests/tests_unit/tests_db_connection_unit/test_kafka_unit.py b/tests/tests_unit/tests_db_connection_unit/test_kafka_unit.py index 2e0ccd1a..74101388 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_kafka_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_kafka_unit.py @@ -181,6 +181,7 @@ def test_kafka_basic_auth_get_jaas_conf(spark_mock): assert conn.addresses == ["192.168.1.1"] assert conn.instance_url == "kafka://some_cluster" + assert str(conn) == "Kafka[some_cluster]" def test_kafka_anon_auth(spark_mock): @@ -194,6 +195,7 @@ def test_kafka_anon_auth(spark_mock): assert conn.addresses == ["192.168.1.1"] assert conn.instance_url == "kafka://some_cluster" + assert str(conn) == "Kafka[some_cluster]" @pytest.mark.parametrize("digest", ["SHA-256", "SHA-512"]) @@ -217,6 +219,7 @@ def test_kafka_scram_auth(spark_mock, digest): assert conn.addresses == ["192.168.1.1"] assert conn.instance_url == "kafka://some_cluster" + assert str(conn) == "Kafka[some_cluster]" def test_kafka_auth_keytab(spark_mock, create_keytab): @@ -235,6 +238,7 @@ def test_kafka_auth_keytab(spark_mock, create_keytab): assert conn.addresses == ["192.168.1.1"] assert conn.instance_url == "kafka://some_cluster" + assert str(conn) == "Kafka[some_cluster]" def test_kafka_empty_addresses(spark_mock): diff --git a/tests/tests_unit/tests_db_connection_unit/test_mongodb_unit.py b/tests/tests_unit/tests_db_connection_unit/test_mongodb_unit.py index f494e3de..9142848e 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_mongodb_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_mongodb_unit.py @@ -126,9 +126,10 @@ def test_mongodb(spark_mock): assert conn.database == "database" assert conn.connection_url == "mongodb://user:password@host:27017/database" + assert conn.instance_url == "mongodb://host:27017/database" + assert str(conn) == "MongoDB[host:27017/database]" - assert "password='passwd'" not in str(conn) - assert "password='passwd'" not in repr(conn) + assert "passwd" not in repr(conn) @pytest.mark.parametrize( @@ -150,7 +151,7 @@ def test_mongodb_options_hint(): def test_mongodb_with_port(spark_mock): - mongo = MongoDB( + conn = MongoDB( host="host", user="user", password="password", @@ -159,14 +160,15 @@ def test_mongodb_with_port(spark_mock): spark=spark_mock, ) - assert mongo.host == "host" - assert mongo.port == 12345 - assert mongo.user == "user" - assert mongo.password != "password" - assert mongo.password.get_secret_value() == "password" - assert mongo.database == "database" + assert conn.host == "host" + assert conn.port == 12345 + assert conn.user == "user" + assert conn.password != "password" + assert conn.password.get_secret_value() == "password" + assert conn.database == "database" - assert mongo.connection_url == "mongodb://user:password@host:12345/database" + assert conn.connection_url == "mongodb://user:password@host:12345/database" + assert conn.instance_url == "mongodb://host:12345/database" def test_mongodb_without_mandatory_args(spark_mock): diff --git a/tests/tests_unit/tests_db_connection_unit/test_mssql_unit.py b/tests/tests_unit/tests_db_connection_unit/test_mssql_unit.py index e1a18aa9..d9f3cfda 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_mssql_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_mssql_unit.py @@ 
-101,10 +101,10 @@ def test_mssql(spark_mock): "databaseName": "database", } - assert "password='passwd'" not in str(conn) - assert "password='passwd'" not in repr(conn) + assert "passwd" not in repr(conn) assert conn.instance_url == "mssql://some_host:1433/database" + assert str(conn) == "MSSQL[some_host:1433/database]" def test_mssql_with_custom_port(spark_mock): @@ -127,6 +127,7 @@ def test_mssql_with_custom_port(spark_mock): } assert conn.instance_url == "mssql://some_host:5000/database" + assert str(conn) == "MSSQL[some_host:5000/database]" def test_mssql_with_instance_name(spark_mock): @@ -157,6 +158,7 @@ def test_mssql_with_instance_name(spark_mock): } assert conn.instance_url == "mssql://some_host\\myinstance/database" + assert str(conn) == "MSSQL[some_host\\myinstance/database]" def test_mssql_without_database_error(spark_mock): diff --git a/tests/tests_unit/tests_db_connection_unit/test_mysql_unit.py b/tests/tests_unit/tests_db_connection_unit/test_mysql_unit.py index f2c68d93..0d57da48 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_mysql_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_mysql_unit.py @@ -89,10 +89,10 @@ def test_mysql(spark_mock): "useUnicode": "yes", } - assert "password='passwd'" not in str(conn) - assert "password='passwd'" not in repr(conn) + assert "passwd" not in repr(conn) assert conn.instance_url == "mysql://some_host:3306" + assert str(conn) == "MySQL[some_host:3306]" def test_mysql_with_port(spark_mock): @@ -116,6 +116,7 @@ def test_mysql_with_port(spark_mock): } assert conn.instance_url == "mysql://some_host:5000" + assert str(conn) == "MySQL[some_host:5000]" def test_mysql_without_database(spark_mock): @@ -139,6 +140,7 @@ def test_mysql_without_database(spark_mock): } assert conn.instance_url == "mysql://some_host:3306" + assert str(conn) == "MySQL[some_host:3306]" def test_mysql_with_extra(spark_mock): diff --git a/tests/tests_unit/tests_db_connection_unit/test_oracle_unit.py b/tests/tests_unit/tests_db_connection_unit/test_oracle_unit.py index ae7bf87c..dd02b5c9 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_oracle_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_oracle_unit.py @@ -110,10 +110,10 @@ def test_oracle(spark_mock): "url": "jdbc:oracle:thin:@some_host:1521:sid", } - assert "password='passwd'" not in str(conn) - assert "password='passwd'" not in repr(conn) + assert "passwd" not in repr(conn) assert conn.instance_url == "oracle://some_host:1521/sid" + assert str(conn) == "Oracle[some_host:1521/sid]" def test_oracle_with_port(spark_mock): @@ -135,6 +135,7 @@ def test_oracle_with_port(spark_mock): } assert conn.instance_url == "oracle://some_host:5000/sid" + assert str(conn) == "Oracle[some_host:5000/sid]" def test_oracle_uri_with_service_name(spark_mock): @@ -149,6 +150,7 @@ def test_oracle_uri_with_service_name(spark_mock): } assert conn.instance_url == "oracle://some_host:1521/service" + assert str(conn) == "Oracle[some_host:1521/service]" def test_oracle_without_sid_and_service_name(spark_mock): diff --git a/tests/tests_unit/tests_db_connection_unit/test_postgres_unit.py b/tests/tests_unit/tests_db_connection_unit/test_postgres_unit.py index 6e37417a..2b0080bf 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_postgres_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_postgres_unit.py @@ -90,10 +90,10 @@ def test_postgres(spark_mock): "stringtype": "unspecified", } - assert "password='passwd'" not in str(conn) - assert "password='passwd'" not in repr(conn) + assert 
"passwd" not in repr(conn) assert conn.instance_url == "postgres://some_host:5432/database" + assert str(conn) == "Postgres[some_host:5432/database]" def test_postgres_with_port(spark_mock): @@ -118,6 +118,7 @@ def test_postgres_with_port(spark_mock): } assert conn.instance_url == "postgres://some_host:5000/database" + assert str(conn) == "Postgres[some_host:5000/database]" def test_postgres_without_database_error(spark_mock): diff --git a/tests/tests_unit/tests_db_connection_unit/test_teradata_unit.py b/tests/tests_unit/tests_db_connection_unit/test_teradata_unit.py index bef65a55..39c90cce 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_teradata_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_teradata_unit.py @@ -89,10 +89,10 @@ def test_teradata(spark_mock): "url": conn.jdbc_url, } - assert "password='passwd'" not in str(conn) - assert "password='passwd'" not in repr(conn) + assert "passwd" not in repr(conn) assert conn.instance_url == "teradata://some_host:1025" + assert str(conn) == "Teradata[some_host:1025]" def test_teradata_with_port(spark_mock): @@ -117,6 +117,7 @@ def test_teradata_with_port(spark_mock): } assert conn.instance_url == "teradata://some_host:5000" + assert str(conn) == "Teradata[some_host:5000]" def test_teradata_without_database(spark_mock): diff --git a/tests/tests_unit/tests_file_connection_unit/test_ftp_unit.py b/tests/tests_unit/tests_file_connection_unit/test_ftp_unit.py index ab47c248..33f6b29f 100644 --- a/tests/tests_unit/tests_file_connection_unit/test_ftp_unit.py +++ b/tests/tests_unit/tests_file_connection_unit/test_ftp_unit.py @@ -8,35 +8,34 @@ def test_ftp_connection(): from onetl.connection import FTP - ftp = FTP(host="some_host", user="some_user", password="pwd") - assert isinstance(ftp, FileConnection) - assert ftp.host == "some_host" - assert ftp.user == "some_user" - assert ftp.password != "pwd" - assert ftp.password.get_secret_value() == "pwd" - assert ftp.port == 21 + conn = FTP(host="some_host", user="some_user", password="pwd") + assert isinstance(conn, FileConnection) + assert conn.host == "some_host" + assert conn.user == "some_user" + assert conn.password != "pwd" + assert conn.password.get_secret_value() == "pwd" + assert conn.port == 21 - assert "password='pwd'" not in str(ftp) - assert "password='pwd'" not in repr(ftp) + assert str(conn) == "FTP[some_host:21]" + assert "pwd" not in repr(conn) def test_ftp_connection_anonymous(): from onetl.connection import FTP - ftp = FTP(host="some_host") - - assert isinstance(ftp, FileConnection) - assert ftp.host == "some_host" - assert ftp.user is None - assert ftp.password is None + conn = FTP(host="some_host") + assert conn.host == "some_host" + assert conn.user is None + assert conn.password is None def test_ftp_connection_with_port(): from onetl.connection import FTP - ftp = FTP(host="some_host", user="some_user", password="pwd", port=500) + conn = FTP(host="some_host", user="some_user", password="pwd", port=500) - assert ftp.port == 500 + assert conn.port == 500 + assert str(conn) == "FTP[some_host:500]" def test_ftp_connection_without_mandatory_args(): diff --git a/tests/tests_unit/tests_file_connection_unit/test_ftps_unit.py b/tests/tests_unit/tests_file_connection_unit/test_ftps_unit.py index aa63de1e..c0b201e6 100644 --- a/tests/tests_unit/tests_file_connection_unit/test_ftps_unit.py +++ b/tests/tests_unit/tests_file_connection_unit/test_ftps_unit.py @@ -8,35 +8,36 @@ def test_ftps_connection(): from onetl.connection import FTPS - ftps = FTPS(host="some_host", 
user="some_user", password="pwd") - assert isinstance(ftps, FileConnection) - assert ftps.host == "some_host" - assert ftps.user == "some_user" - assert ftps.password != "pwd" - assert ftps.password.get_secret_value() == "pwd" - assert ftps.port == 21 + conn = FTPS(host="some_host", user="some_user", password="pwd") + assert isinstance(conn, FileConnection) + assert conn.host == "some_host" + assert conn.user == "some_user" + assert conn.password != "pwd" + assert conn.password.get_secret_value() == "pwd" + assert conn.port == 21 - assert "password='pwd'" not in str(ftps) - assert "password='pwd'" not in repr(ftps) + assert str(conn) == "FTPS[some_host:21]" + assert "pwd" not in repr(conn) def test_ftps_connection_anonymous(): from onetl.connection import FTPS - ftps = FTPS(host="some_host") + conn = FTPS(host="some_host") - assert isinstance(ftps, FileConnection) - assert ftps.host == "some_host" - assert ftps.user is None - assert ftps.password is None + assert isinstance(conn, FileConnection) + assert conn.host == "some_host" + assert conn.user is None + assert conn.password is None def test_ftps_connection_with_port(): from onetl.connection import FTPS - ftps = FTPS(host="some_host", user="some_user", password="pwd", port=500) + conn = FTPS(host="some_host", user="some_user", password="pwd", port=500) - assert ftps.port == 500 + assert conn.port == 500 + assert str(conn) == "FTPS[some_host:500]" def test_ftps_connection_without_mandatory_args(): diff --git a/tests/tests_unit/tests_file_connection_unit/test_hdfs_unit.py b/tests/tests_unit/tests_file_connection_unit/test_hdfs_unit.py index 2249e237..26b4cf7a 100644 --- a/tests/tests_unit/tests_file_connection_unit/test_hdfs_unit.py +++ b/tests/tests_unit/tests_file_connection_unit/test_hdfs_unit.py @@ -15,73 +15,74 @@ def test_hdfs_connection_with_host(): from onetl.connection import HDFS - hdfs = HDFS(host="some-host.domain.com") - assert isinstance(hdfs, FileConnection) - assert hdfs.host == "some-host.domain.com" - assert hdfs.webhdfs_port == 50070 - assert not hdfs.user - assert not hdfs.password - assert not hdfs.keytab - assert hdfs.instance_url == "hdfs://some-host.domain.com:50070" + conn = HDFS(host="some-host.domain.com") + assert isinstance(conn, FileConnection) + assert conn.host == "some-host.domain.com" + assert conn.webhdfs_port == 50070 + assert not conn.user + assert not conn.password + assert not conn.keytab + assert conn.instance_url == "hdfs://some-host.domain.com:50070" + assert str(conn) == "HDFS[some-host.domain.com:50070]" def test_hdfs_connection_with_cluster(): from onetl.connection import HDFS - hdfs = HDFS(cluster="rnd-dwh") - assert isinstance(hdfs, FileConnection) - assert hdfs.cluster == "rnd-dwh" - assert hdfs.webhdfs_port == 50070 - assert not hdfs.user - assert not hdfs.password - assert not hdfs.keytab - assert hdfs.instance_url == "rnd-dwh" + conn = HDFS(cluster="rnd-dwh") + assert conn.cluster == "rnd-dwh" + assert conn.webhdfs_port == 50070 + assert not conn.user + assert not conn.password + assert not conn.keytab + assert conn.instance_url == "rnd-dwh" + assert str(conn) == "HDFS[rnd-dwh]" def test_hdfs_connection_with_cluster_and_host(): from onetl.connection import HDFS - hdfs = HDFS(cluster="rnd-dwh", host="some-host.domain.com") - assert isinstance(hdfs, FileConnection) - assert hdfs.cluster == "rnd-dwh" - assert hdfs.host == "some-host.domain.com" - assert hdfs.instance_url == "rnd-dwh" + conn = HDFS(cluster="rnd-dwh", host="some-host.domain.com") + assert conn.cluster == "rnd-dwh" + assert 
conn.host == "some-host.domain.com" + assert conn.instance_url == "rnd-dwh" + assert str(conn) == "HDFS[rnd-dwh]" -def test_hdfs_connection_with_port(): +def test_hdfs_connection_with_host_and_port(): from onetl.connection import HDFS - hdfs = HDFS(host="some-host.domain.com", port=9080) - assert isinstance(hdfs, FileConnection) - assert hdfs.host == "some-host.domain.com" - assert hdfs.webhdfs_port == 9080 - assert hdfs.instance_url == "hdfs://some-host.domain.com:9080" + conn = HDFS(host="some-host.domain.com", port=9080) + assert conn.host == "some-host.domain.com" + assert conn.webhdfs_port == 9080 + assert conn.instance_url == "hdfs://some-host.domain.com:9080" + assert str(conn) == "HDFS[some-host.domain.com:9080]" def test_hdfs_connection_with_user(): from onetl.connection import HDFS - hdfs = HDFS(host="some-host.domain.com", user="some_user") - assert hdfs.host == "some-host.domain.com" - assert hdfs.webhdfs_port == 50070 - assert hdfs.user == "some_user" - assert not hdfs.password - assert not hdfs.keytab + conn = HDFS(host="some-host.domain.com", user="some_user") + assert conn.host == "some-host.domain.com" + assert conn.webhdfs_port == 50070 + assert conn.user == "some_user" + assert not conn.password + assert not conn.keytab def test_hdfs_connection_with_password(): from onetl.connection import HDFS - hdfs = HDFS(host="some-host.domain.com", user="some_user", password="pwd") - assert hdfs.host == "some-host.domain.com" - assert hdfs.webhdfs_port == 50070 - assert hdfs.user == "some_user" - assert hdfs.password != "pwd" - assert hdfs.password.get_secret_value() == "pwd" - assert not hdfs.keytab + conn = HDFS(host="some-host.domain.com", user="some_user", password="pwd") + assert conn.host == "some-host.domain.com" + assert conn.webhdfs_port == 50070 + assert conn.user == "some_user" + assert conn.password != "pwd" + assert conn.password.get_secret_value() == "pwd" + assert not conn.keytab + assert str(conn) == "HDFS[some-host.domain.com:50070]" - assert "password='pwd'" not in str(hdfs) - assert "password='pwd'" not in repr(hdfs) + assert "pwd" not in repr(conn) def test_hdfs_connection_with_keytab(request, tmp_path_factory): @@ -91,15 +92,15 @@ def test_hdfs_connection_with_keytab(request, tmp_path_factory): folder.mkdir(exist_ok=True, parents=True) keytab = folder / "user.keytab" keytab.touch() - hdfs = HDFS(host="some-host.domain.com", user="some_user", keytab=keytab) + conn = HDFS(host="some-host.domain.com", user="some_user", keytab=keytab) def finalizer(): shutil.rmtree(folder) request.addfinalizer(finalizer) - assert hdfs.user == "some_user" - assert not hdfs.password + assert conn.user == "some_user" + assert not conn.password def test_hdfs_connection_keytab_does_not_exist(): @@ -242,7 +243,7 @@ def get_webhdfs_port(cluster: str) -> int | None: assert HDFS(host="some-node.domain.com", cluster="rnd-dwh").webhdfs_port == 9080 -def test_hdfs_known_get_current(request, mocker): +def test_hdfs_known_get_current(request): from onetl.connection import HDFS # no hooks bound to HDFS.Slots.get_current_cluster @@ -259,5 +260,5 @@ def get_current_cluster() -> str: request.addfinalizer(get_current_cluster.disable) - hdfs = HDFS.get_current() - assert hdfs.cluster == "rnd-dwh" + conn = HDFS.get_current() + assert conn.cluster == "rnd-dwh" diff --git a/tests/tests_unit/tests_file_connection_unit/test_s3_unit.py b/tests/tests_unit/tests_file_connection_unit/test_s3_unit.py index e652c24e..524c24f6 100644 --- a/tests/tests_unit/tests_file_connection_unit/test_s3_unit.py +++ 
b/tests/tests_unit/tests_file_connection_unit/test_s3_unit.py @@ -6,29 +6,29 @@ def test_s3_connection(): from onetl.connection import S3 - s3 = S3( + conn = S3( host="some_host", access_key="access key", secret_key="some key", bucket="bucket", ) - assert s3.host == "some_host" - assert s3.access_key == "access key" - assert s3.secret_key != "some key" - assert s3.secret_key.get_secret_value() == "some key" - assert s3.protocol == "https" - assert s3.port == 443 - assert s3.instance_url == "s3://some_host:443" + assert conn.host == "some_host" + assert conn.access_key == "access key" + assert conn.secret_key != "some key" + assert conn.secret_key.get_secret_value() == "some key" + assert conn.protocol == "https" + assert conn.port == 443 + assert conn.instance_url == "s3://some_host:443/bucket" + assert str(conn) == "S3[some_host:443/bucket]" - assert "some key" not in str(s3) - assert "some key" not in repr(s3) + assert "some key" not in repr(conn) def test_s3_connection_with_session_token(): from onetl.connection import S3 - s3 = S3( + conn = S3( host="some_host", access_key="access_key", secret_key="some key", @@ -36,17 +36,16 @@ def test_s3_connection_with_session_token(): bucket="bucket", ) - assert s3.session_token != "some token" - assert s3.session_token.get_secret_value() == "some token" + assert conn.session_token != "some token" + assert conn.session_token.get_secret_value() == "some token" - assert "some token" not in str(s3) - assert "some token" not in repr(s3) + assert "some token" not in repr(conn) def test_s3_connection_https(): from onetl.connection import S3 - s3 = S3( + conn = S3( host="some_host", access_key="access_key", secret_key="secret_key", @@ -54,15 +53,16 @@ def test_s3_connection_https(): protocol="https", ) - assert s3.protocol == "https" - assert s3.port == 443 - assert s3.instance_url == "s3://some_host:443" + assert conn.protocol == "https" + assert conn.port == 443 + assert conn.instance_url == "s3://some_host:443/bucket" + assert str(conn) == "S3[some_host:443/bucket]" def test_s3_connection_http(): from onetl.connection import S3 - s3 = S3( + conn = S3( host="some_host", access_key="access_key", secret_key="secret_key", @@ -70,16 +70,17 @@ def test_s3_connection_http(): protocol="http", ) - assert s3.protocol == "http" - assert s3.port == 80 - assert s3.instance_url == "s3://some_host:80" + assert conn.protocol == "http" + assert conn.port == 80 + assert conn.instance_url == "s3://some_host:80/bucket" + assert str(conn) == "S3[some_host:80/bucket]" @pytest.mark.parametrize("protocol", ["http", "https"]) def test_s3_connection_with_port(protocol): from onetl.connection import S3 - s3 = S3( + conn = S3( host="some_host", port=9000, access_key="access_key", @@ -88,6 +89,7 @@ def test_s3_connection_with_port(protocol): protocol=protocol, ) - assert s3.protocol == protocol - assert s3.port == 9000 - assert s3.instance_url == "s3://some_host:9000" + assert conn.protocol == protocol + assert conn.port == 9000 + assert conn.instance_url == "s3://some_host:9000/bucket" + assert str(conn) == "S3[some_host:9000/bucket]" diff --git a/tests/tests_unit/tests_file_connection_unit/test_samba_unit.py b/tests/tests_unit/tests_file_connection_unit/test_samba_unit.py index 42f95b36..2dfd06e6 100644 --- a/tests/tests_unit/tests_file_connection_unit/test_samba_unit.py +++ b/tests/tests_unit/tests_file_connection_unit/test_samba_unit.py @@ -8,36 +8,39 @@ def test_samba_connection(): from onetl.connection import Samba - samba = Samba(host="some_host", share="share_name", 
user="some_user", password="pwd") - assert isinstance(samba, FileConnection) - assert samba.host == "some_host" - assert samba.protocol == "SMB" - assert samba.domain == "" - assert samba.auth_type == "NTLMv2" - assert samba.port == 445 - assert samba.user == "some_user" - assert samba.password != "pwd" - assert samba.password.get_secret_value() == "pwd" + conn = Samba(host="some_host", share="share_name", user="some_user", password="pwd") + assert isinstance(conn, FileConnection) + assert conn.host == "some_host" + assert conn.port == 445 + assert conn.share == "share_name" + assert conn.protocol == "SMB" + assert conn.domain == "" + assert conn.auth_type == "NTLMv2" + assert conn.user == "some_user" + assert conn.password != "pwd" + assert conn.password.get_secret_value() == "pwd" - assert "password='pwd'" not in str(samba) - assert "password='pwd'" not in repr(samba) + assert conn.instance_url == "smb://some_host:445/share_name" + assert str(conn) == "Samba[some_host:445/share_name]" + + assert "pwd" not in repr(conn) def test_samba_connection_with_net_bios(): from onetl.connection import Samba - samba = Samba(host="some_host", share="share_name", user="some_user", password="pwd", protocol="NetBIOS") - assert samba.protocol == "NetBIOS" - assert samba.port == 139 + conn = Samba(host="some_host", share="share_name", user="some_user", password="pwd", protocol="NetBIOS") + assert conn.protocol == "NetBIOS" + assert conn.port == 139 @pytest.mark.parametrize("protocol", ["SMB", "NetBIOS"]) def test_samba_connection_with_custom_port(protocol): from onetl.connection import Samba - samba = Samba(host="some_host", share="share_name", user="some_user", password="pwd", protocol=protocol, port=444) - assert samba.protocol == protocol - assert samba.port == 444 + conn = Samba(host="some_host", share="share_name", user="some_user", password="pwd", protocol=protocol, port=444) + assert conn.protocol == protocol + assert conn.port == 444 def test_samba_connection_without_mandatory_args(): diff --git a/tests/tests_unit/tests_file_connection_unit/test_sftp_unit.py b/tests/tests_unit/tests_file_connection_unit/test_sftp_unit.py index 11f6cfbd..d2e02b75 100644 --- a/tests/tests_unit/tests_file_connection_unit/test_sftp_unit.py +++ b/tests/tests_unit/tests_file_connection_unit/test_sftp_unit.py @@ -7,35 +7,41 @@ def test_sftp_connection_anonymous(): - from onetl.connection import SFTP + from onetl.connection import SFTP, FileConnection - sftp = SFTP(host="some_host") - assert sftp.host == "some_host" - assert sftp.port == 22 - assert not sftp.user - assert not sftp.password - assert not sftp.key_file + conn = SFTP(host="some_host") + assert isinstance(conn, FileConnection) + assert conn.host == "some_host" + assert conn.port == 22 + assert not conn.user + assert not conn.password + assert not conn.key_file + assert conn.instance_url == "sftp://some_host:22" + assert str(conn) == "SFTP[some_host:22]" def test_sftp_connection_with_port(): from onetl.connection import SFTP - sftp = SFTP(host="some_host", port=500) + conn = SFTP(host="some_host", port=500) - assert sftp.port == 500 + assert conn.port == 500 + assert conn.instance_url == "sftp://some_host:500" + assert str(conn) == "SFTP[some_host:500]" def test_sftp_connection_with_password(): from onetl.connection import SFTP - sftp = SFTP(host="some_host", user="some_user", password="pwd") - assert sftp.user == "some_user" - assert sftp.password != "pwd" - assert sftp.password.get_secret_value() == "pwd" - assert not sftp.key_file + conn = 
SFTP(host="some_host", user="some_user", password="pwd") + assert conn.user == "some_user" + assert conn.password != "pwd" + assert conn.password.get_secret_value() == "pwd" + assert not conn.key_file + assert conn.instance_url == "sftp://some_host:22" + assert str(conn) == "SFTP[some_host:22]" - assert "password='pwd'" not in str(sftp) - assert "password='pwd'" not in repr(sftp) + assert "pwd" not in repr(conn) def test_sftp_connection_with_key_file(request, tmp_path_factory): @@ -51,10 +57,10 @@ def finalizer(): request.addfinalizer(finalizer) - sftp = SFTP(host="some_host", user="some_user", key_file=key_file) - assert sftp.user == "some_user" - assert not sftp.password - assert sftp.key_file == key_file + conn = SFTP(host="some_host", user="some_user", key_file=key_file) + assert conn.user == "some_user" + assert not conn.password + assert conn.key_file == key_file def test_sftp_connection_key_file_does_not_exist(): diff --git a/tests/tests_unit/tests_file_connection_unit/test_webdav_unit.py b/tests/tests_unit/tests_file_connection_unit/test_webdav_unit.py index 7d92d494..7f458678 100644 --- a/tests/tests_unit/tests_file_connection_unit/test_webdav_unit.py +++ b/tests/tests_unit/tests_file_connection_unit/test_webdav_unit.py @@ -8,34 +8,39 @@ def test_webdav_connection(): from onetl.connection import WebDAV - webdav = WebDAV(host="some_host", user="some_user", password="pwd") - assert isinstance(webdav, FileConnection) - assert webdav.host == "some_host" - assert webdav.protocol == "https" - assert webdav.port == 443 - assert webdav.user == "some_user" - assert webdav.password != "pwd" - assert webdav.password.get_secret_value() == "pwd" + conn = WebDAV(host="some_host", user="some_user", password="pwd") + assert isinstance(conn, FileConnection) + assert conn.host == "some_host" + assert conn.protocol == "https" + assert conn.port == 443 + assert conn.user == "some_user" + assert conn.password != "pwd" + assert conn.password.get_secret_value() == "pwd" + assert conn.instance_url == "webdav://some_host:443" + assert str(conn) == "WebDAV[some_host:443]" - assert "password='pwd'" not in str(webdav) - assert "password='pwd'" not in repr(webdav) + assert "pwd" not in repr(conn) def test_webdav_connection_with_http(): from onetl.connection import WebDAV - webdav = WebDAV(host="some_host", user="some_user", password="pwd", protocol="http") - assert webdav.protocol == "http" - assert webdav.port == 80 + conn = WebDAV(host="some_host", user="some_user", password="pwd", protocol="http") + assert conn.protocol == "http" + assert conn.port == 80 + assert conn.instance_url == "webdav://some_host:80" + assert str(conn) == "WebDAV[some_host:80]" @pytest.mark.parametrize("protocol", ["http", "https"]) def test_webdav_connection_with_custom_port(protocol): from onetl.connection import WebDAV - webdav = WebDAV(host="some_host", user="some_user", password="pwd", port=500, protocol=protocol) - assert webdav.protocol == protocol - assert webdav.port == 500 + conn = WebDAV(host="some_host", user="some_user", password="pwd", port=500, protocol=protocol) + assert conn.protocol == protocol + assert conn.port == 500 + assert conn.instance_url == "webdav://some_host:500" + assert str(conn) == "WebDAV[some_host:500]" def test_webdav_connection_without_mandatory_args(): diff --git a/tests/tests_unit/tests_file_df_connection_unit/test_spark_hdfs_unit.py b/tests/tests_unit/tests_file_df_connection_unit/test_spark_hdfs_unit.py index 08ca6c1f..0d392c8d 100644 --- 
a/tests/tests_unit/tests_file_df_connection_unit/test_spark_hdfs_unit.py +++ b/tests/tests_unit/tests_file_df_connection_unit/test_spark_hdfs_unit.py @@ -12,28 +12,31 @@ def test_spark_hdfs_with_cluster(spark_mock): - hdfs = SparkHDFS(cluster="rnd-dwh", spark=spark_mock) - assert isinstance(hdfs, BaseFileDFConnection) - assert hdfs.cluster == "rnd-dwh" - assert hdfs.host is None - assert hdfs.ipc_port == 8020 - assert hdfs.instance_url == "rnd-dwh" + conn = SparkHDFS(cluster="rnd-dwh", spark=spark_mock) + assert isinstance(conn, BaseFileDFConnection) + assert conn.cluster == "rnd-dwh" + assert conn.host is None + assert conn.ipc_port == 8020 + assert conn.instance_url == "rnd-dwh" + assert str(conn) == "HDFS[rnd-dwh]" def test_spark_hdfs_with_cluster_and_host(spark_mock): - hdfs = SparkHDFS(cluster="rnd-dwh", host="some-host.domain.com", spark=spark_mock) - assert isinstance(hdfs, BaseFileDFConnection) - assert hdfs.cluster == "rnd-dwh" - assert hdfs.host == "some-host.domain.com" - assert hdfs.instance_url == "rnd-dwh" + conn = SparkHDFS(cluster="rnd-dwh", host="some-host.domain.com", spark=spark_mock) + assert isinstance(conn, BaseFileDFConnection) + assert conn.cluster == "rnd-dwh" + assert conn.host == "some-host.domain.com" + assert conn.instance_url == "rnd-dwh" + assert str(conn) == "HDFS[rnd-dwh]" def test_spark_hdfs_with_port(spark_mock): - hdfs = SparkHDFS(cluster="rnd-dwh", port=9020, spark=spark_mock) - assert isinstance(hdfs, BaseFileDFConnection) - assert hdfs.cluster == "rnd-dwh" - assert hdfs.ipc_port == 9020 - assert hdfs.instance_url == "rnd-dwh" + conn = SparkHDFS(cluster="rnd-dwh", port=9020, spark=spark_mock) + assert isinstance(conn, BaseFileDFConnection) + assert conn.cluster == "rnd-dwh" + assert conn.ipc_port == 9020 + assert conn.instance_url == "rnd-dwh" + assert str(conn) == "HDFS[rnd-dwh]" def test_spark_hdfs_without_cluster(spark_mock): @@ -143,5 +146,5 @@ def get_current_cluster() -> str: request.addfinalizer(get_current_cluster.disable) - hdfs = SparkHDFS.get_current(spark=spark_mock) - assert hdfs.cluster == "rnd-dwh" + conn = SparkHDFS.get_current(spark=spark_mock) + assert conn.cluster == "rnd-dwh" diff --git a/tests/tests_unit/tests_file_df_connection_unit/test_spark_local_fs_unit.py b/tests/tests_unit/tests_file_df_connection_unit/test_spark_local_fs_unit.py index e98c986c..ac41f7f8 100644 --- a/tests/tests_unit/tests_file_df_connection_unit/test_spark_local_fs_unit.py +++ b/tests/tests_unit/tests_file_df_connection_unit/test_spark_local_fs_unit.py @@ -13,6 +13,7 @@ def test_spark_local_fs_spark_local(spark_mock): conn = SparkLocalFS(spark=spark_mock) assert conn.spark == spark_mock assert conn.instance_url == f"file://{socket.getfqdn()}" + assert str(conn) == "LocalFS" @pytest.mark.parametrize("master", ["k8s", "yarn"]) diff --git a/tests/tests_unit/tests_file_df_connection_unit/test_spark_s3_unit.py b/tests/tests_unit/tests_file_df_connection_unit/test_spark_s3_unit.py index 99a20633..34ac4387 100644 --- a/tests/tests_unit/tests_file_df_connection_unit/test_spark_s3_unit.py +++ b/tests/tests_unit/tests_file_df_connection_unit/test_spark_s3_unit.py @@ -84,7 +84,7 @@ def spark_mock_hadoop_3(spark_mock): def test_spark_s3(spark_mock_hadoop_3): - s3 = SparkS3( + conn = SparkS3( host="some_host", access_key="access key", secret_key="some key", @@ -92,20 +92,20 @@ def test_spark_s3(spark_mock_hadoop_3): spark=spark_mock_hadoop_3, ) - assert s3.host == "some_host" - assert s3.access_key == "access key" - assert s3.secret_key != "some key" - assert 
s3.secret_key.get_secret_value() == "some key" - assert s3.protocol == "https" - assert s3.port == 443 - assert s3.instance_url == "s3://some_host:443" + assert conn.host == "some_host" + assert conn.access_key == "access key" + assert conn.secret_key != "some key" + assert conn.secret_key.get_secret_value() == "some key" + assert conn.protocol == "https" + assert conn.port == 443 + assert conn.instance_url == "s3://some_host:443/bucket" + assert str(conn) == "S3[some_host:443/bucket]" - assert "some key" not in str(s3) - assert "some key" not in repr(s3) + assert "some key" not in repr(conn) def test_spark_s3_with_protocol_https(spark_mock_hadoop_3): - s3 = SparkS3( + conn = SparkS3( host="some_host", access_key="access_key", secret_key="secret_key", @@ -114,13 +114,14 @@ def test_spark_s3_with_protocol_https(spark_mock_hadoop_3): spark=spark_mock_hadoop_3, ) - assert s3.protocol == "https" - assert s3.port == 443 - assert s3.instance_url == "s3://some_host:443" + assert conn.protocol == "https" + assert conn.port == 443 + assert conn.instance_url == "s3://some_host:443/bucket" + assert str(conn) == "S3[some_host:443/bucket]" def test_spark_s3_with_protocol_http(spark_mock_hadoop_3): - s3 = SparkS3( + conn = SparkS3( host="some_host", access_key="access_key", secret_key="secret_key", @@ -129,14 +130,15 @@ def test_spark_s3_with_protocol_http(spark_mock_hadoop_3): spark=spark_mock_hadoop_3, ) - assert s3.protocol == "http" - assert s3.port == 80 - assert s3.instance_url == "s3://some_host:80" + assert conn.protocol == "http" + assert conn.port == 80 + assert conn.instance_url == "s3://some_host:80/bucket" + assert str(conn) == "S3[some_host:80/bucket]" @pytest.mark.parametrize("protocol", ["http", "https"]) def test_spark_s3_with_port(spark_mock_hadoop_3, protocol): - s3 = SparkS3( + conn = SparkS3( host="some_host", port=9000, access_key="access_key", @@ -146,9 +148,10 @@ def test_spark_s3_with_port(spark_mock_hadoop_3, protocol): spark=spark_mock_hadoop_3, ) - assert s3.protocol == protocol - assert s3.port == 9000 - assert s3.instance_url == "s3://some_host:9000" + assert conn.protocol == protocol + assert conn.port == 9000 + assert conn.instance_url == "s3://some_host:9000/bucket" + assert str(conn) == "S3[some_host:9000/bucket]" @pytest.mark.parametrize( From 1382600ec8552eb22bc53e242f1afea1ac38ab01 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Tue, 20 Aug 2024 11:14:06 +0000 Subject: [PATCH 42/64] Fix documentation build --- docs/conf.py | 3 --- requirements/docs.txt | 3 ++- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index f781dddd..867d4daf 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -120,9 +120,6 @@ {"rel": "icon", "href": "icon.svg", "type": "image/svg+xml"}, ] -# TODO: remove after https://github.com/mgeier/sphinx-last-updated-by-git/pull/77 -git_exclude_patterns = ["docs/_static/logo_wide.svg"] - # The master toctree document. 
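Note on the connection unit tests above: the reworked assertions pin down two related representations for every connection object, a URL-like ``instance_url`` (now including the S3 bucket) and a compact ``str(conn)`` of the form ``ClassName[instance details]``, with secrets kept out of ``repr()``. The toy class below only illustrates the convention the tests assert; it is not the actual onetl implementation.

.. code-block:: python

    class ToyS3:
        """Illustration of the asserted naming convention, not onetl code."""

        def __init__(self, host: str, port: int, bucket: str):
            self.host = host
            self.port = port
            self.bucket = bucket

        @property
        def instance_url(self) -> str:
            # URL-like identifier, now bucket-aware
            return f"s3://{self.host}:{self.port}/{self.bucket}"

        def __str__(self) -> str:
            # compact human-readable form: ClassName[host:port/bucket]
            return f"S3[{self.host}:{self.port}/{self.bucket}]"


    conn = ToyS3("some_host", 443, "bucket")
    assert conn.instance_url == "s3://some_host:443/bucket"
    assert str(conn) == "S3[some_host:443/bucket]"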
master_doc = "index" diff --git a/requirements/docs.txt b/requirements/docs.txt index be2cd127..87768350 100644 --- a/requirements/docs.txt +++ b/requirements/docs.txt @@ -9,7 +9,8 @@ sphinx<8 sphinx-copybutton sphinx-design sphinx-favicon -sphinx-last-updated-by-git +# https://github.com/mgeier/sphinx-last-updated-by-git/pull/77 +sphinx-last-updated-by-git>=0.3.8 # TODO: uncomment after https://github.com/zqmillet/sphinx-plantuml/pull/4 # sphinx-plantuml sphinx-tabs From f4d1f3dbcf76acc1cdb4e587705b84f351241c70 Mon Sep 17 00:00:00 2001 From: Maxim Liksakov <67663774+maxim-lixakov@users.noreply.github.com> Date: Tue, 20 Aug 2024 17:54:27 +0300 Subject: [PATCH 43/64] [DOP-16999] - Add jdbc_dialect logging (#305) --- docs/changelog/next_release/305.feature.rst | 1 + .../jdbc_connection/connection.py | 2 ++ .../db_connection/jdbc_mixin/connection.py | 17 +++++++++++--- .../test_clickhouse_integration.py | 22 ++++++++++++++----- .../test_postgres_integration.py | 21 +++++++++++++----- .../test_strategy_incremental_batch.py | 6 ++++- 6 files changed, 53 insertions(+), 16 deletions(-) create mode 100644 docs/changelog/next_release/305.feature.rst diff --git a/docs/changelog/next_release/305.feature.rst b/docs/changelog/next_release/305.feature.rst new file mode 100644 index 00000000..c4c44dc6 --- /dev/null +++ b/docs/changelog/next_release/305.feature.rst @@ -0,0 +1 @@ +Add log.info about JDBC dialect usage: ``Detected dialect: 'org.apache.spark.sql.jdbc.MySQLDialect'`` diff --git a/onetl/connection/db_connection/jdbc_connection/connection.py b/onetl/connection/db_connection/jdbc_connection/connection.py index 9d41298e..0f3ac024 100644 --- a/onetl/connection/db_connection/jdbc_connection/connection.py +++ b/onetl/connection/db_connection/jdbc_connection/connection.py @@ -90,6 +90,7 @@ def sql( query = clear_statement(query) + log.info("|%s| Detected dialect: '%s'", self.__class__.__name__, self._get_spark_dialect_name()) log.info("|%s| Executing SQL query (on executor):", self.__class__.__name__) log_lines(log, query) @@ -195,6 +196,7 @@ def get_df_schema( columns: list[str] | None = None, options: JDBCReadOptions | None = None, ) -> StructType: + log.info("|%s| Detected dialect: '%s'", self.__class__.__name__, self._get_spark_dialect_name()) log.info("|%s| Fetching schema of table %r ...", self.__class__.__name__, source) query = self.dialect.get_sql_query(source, columns=columns, limit=0, compact=True) diff --git a/onetl/connection/db_connection/jdbc_mixin/connection.py b/onetl/connection/db_connection/jdbc_mixin/connection.py index 8ec77d13..2f25b5a9 100644 --- a/onetl/connection/db_connection/jdbc_mixin/connection.py +++ b/onetl/connection/db_connection/jdbc_mixin/connection.py @@ -205,6 +205,7 @@ def fetch( query = clear_statement(query) + log.info("|%s| Detected dialect: '%s'", self.__class__.__name__, self._get_spark_dialect_name()) log.info("|%s| Executing SQL query (on driver):", self.__class__.__name__) log_lines(log, query) @@ -277,6 +278,7 @@ def execute( statement = clear_statement(statement) + log.info("|%s| Detected dialect: '%s'", self.__class__.__name__, self._get_spark_dialect_name()) log.info("|%s| Executing statement (on driver):", self.__class__.__name__) log_lines(log, statement) @@ -417,6 +419,17 @@ def _get_jdbc_connection(self, options: JDBCFetchOptions | JDBCExecuteOptions): self._last_connection_and_options.data = (new_connection, options) return new_connection + def _get_spark_dialect_name(self) -> str: + """ + Returns the name of the JDBC dialect associated with the 
connection URL. + """ + dialect = self._get_spark_dialect().toString() + return dialect.split("$")[0] if "$" in dialect else dialect + + def _get_spark_dialect(self): + jdbc_dialects_package = self.spark._jvm.org.apache.spark.sql.jdbc + return jdbc_dialects_package.JdbcDialects.get(self.jdbc_url) + def _close_connections(self): with suppress(Exception): # connection maybe not opened yet @@ -559,9 +572,7 @@ def _resultset_to_dataframe(self, result_set) -> DataFrame: from pyspark.sql import DataFrame # noqa: WPS442 - jdbc_dialects_package = self.spark._jvm.org.apache.spark.sql.jdbc # type: ignore - jdbc_dialect = jdbc_dialects_package.JdbcDialects.get(self.jdbc_url) - + jdbc_dialect = self._get_spark_dialect() jdbc_utils_package = self.spark._jvm.org.apache.spark.sql.execution.datasources.jdbc # type: ignore jdbc_utils = jdbc_utils_package.JdbcUtils diff --git a/tests/tests_integration/tests_db_connection_integration/test_clickhouse_integration.py b/tests/tests_integration/tests_db_connection_integration/test_clickhouse_integration.py index 78656d83..aa9205b8 100644 --- a/tests/tests_integration/tests_db_connection_integration/test_clickhouse_integration.py +++ b/tests/tests_integration/tests_db_connection_integration/test_clickhouse_integration.py @@ -62,7 +62,7 @@ def test_clickhouse_connection_check_extra_is_handled_by_driver(spark, processin @pytest.mark.parametrize("suffix", ["", ";"]) -def test_clickhouse_connection_sql(spark, processing, load_table_data, suffix): +def test_clickhouse_connection_sql(spark, processing, load_table_data, suffix, caplog): clickhouse = Clickhouse( host=processing.host, port=processing.port, @@ -73,7 +73,11 @@ def test_clickhouse_connection_sql(spark, processing, load_table_data, suffix): ) table = load_table_data.full_name - df = clickhouse.sql(f"SELECT * FROM {table}{suffix}") + + with caplog.at_level(logging.INFO): + df = clickhouse.sql(f"SELECT * FROM {table}{suffix}") + assert "Detected dialect: 'org.apache.spark.sql.jdbc.NoopDialect'" in caplog.text + table_df = processing.get_expected_dataframe( schema=load_table_data.schema, table=load_table_data.table, @@ -91,7 +95,7 @@ def test_clickhouse_connection_sql(spark, processing, load_table_data, suffix): @pytest.mark.parametrize("suffix", ["", ";"]) -def test_clickhouse_connection_fetch(spark, processing, load_table_data, suffix): +def test_clickhouse_connection_fetch(spark, processing, load_table_data, suffix, caplog): clickhouse = Clickhouse( host=processing.host, port=processing.port, @@ -103,7 +107,10 @@ def test_clickhouse_connection_fetch(spark, processing, load_table_data, suffix) schema = load_table_data.schema table = load_table_data.full_name - df = clickhouse.fetch(f"SELECT * FROM {table}{suffix}") + + with caplog.at_level(logging.INFO): + df = clickhouse.fetch(f"SELECT * FROM {table}{suffix}") + assert "Detected dialect: 'org.apache.spark.sql.jdbc.NoopDialect'" in caplog.text table_df = processing.get_expected_dataframe( schema=load_table_data.schema, @@ -192,7 +199,7 @@ def test_clickhouse_connection_execute_ddl(spark, processing, get_schema_table, @pytest.mark.flaky @pytest.mark.parametrize("suffix", ["", ";"]) -def test_clickhouse_connection_execute_dml(request, spark, processing, load_table_data, suffix): +def test_clickhouse_connection_execute_dml(request, spark, processing, load_table_data, suffix, caplog): clickhouse = Clickhouse( host=processing.host, port=processing.port, @@ -242,7 +249,9 @@ def table_finalizer(): updated_df = pandas.concat([updated_rows, unchanged_rows]) 
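For context on the ``Detected dialect`` log lines asserted in these tests: Spark keeps a registry of JDBC dialects keyed by URL prefix, and ``_get_spark_dialect_name()`` asks that registry and strips the trailing ``$`` that Scala singleton objects append to their class name. A minimal sketch of the same lookup, assuming a JVM-backed Spark session and an illustrative Postgres URL:

.. code-block:: python

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.master("local[1]").getOrCreate()

    # same registry the connector uses: the dialect is chosen by JDBC URL prefix
    jdbc_dialects = spark._jvm.org.apache.spark.sql.jdbc.JdbcDialects
    dialect = jdbc_dialects.get("jdbc:postgresql://example.org:5432/db").toString()

    # Scala objects stringify as '...PostgresDialect$<suffix>', so cut at the first '$'
    dialect_name = dialect.split("$")[0] if "$" in dialect else dialect
    print(dialect_name)  # org.apache.spark.sql.jdbc.PostgresDialect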
processing.assert_equal_df(df=df, other_frame=updated_df, order_by="id_int") - clickhouse.execute(f"UPDATE {temp_table} SET hwm_int = 1 WHERE id_int < 50{suffix}") + with caplog.at_level(logging.INFO): + clickhouse.execute(f"UPDATE {temp_table} SET hwm_int = 1 WHERE id_int < 50{suffix}") + assert "Detected dialect: 'org.apache.spark.sql.jdbc.NoopDialect'" in caplog.text clickhouse.execute(f"ALTER TABLE {temp_table} DELETE WHERE id_int < 70{suffix}") df = clickhouse.fetch(f"SELECT * FROM {temp_table}{suffix}") @@ -273,6 +282,7 @@ def test_clickhouse_connection_execute_function( processing, load_table_data, suffix, + caplog, ): clickhouse = Clickhouse( host=processing.host, diff --git a/tests/tests_integration/tests_db_connection_integration/test_postgres_integration.py b/tests/tests_integration/tests_db_connection_integration/test_postgres_integration.py index 6cea95cc..ead0275e 100644 --- a/tests/tests_integration/tests_db_connection_integration/test_postgres_integration.py +++ b/tests/tests_integration/tests_db_connection_integration/test_postgres_integration.py @@ -48,7 +48,7 @@ def test_postgres_connection_check_fail(spark): @pytest.mark.parametrize("suffix", ["", ";"]) -def test_postgres_connection_sql(spark, processing, load_table_data, suffix): +def test_postgres_connection_sql(spark, processing, load_table_data, suffix, caplog): postgres = Postgres( host=processing.host, port=processing.port, @@ -60,7 +60,10 @@ def test_postgres_connection_sql(spark, processing, load_table_data, suffix): table = load_table_data.full_name - df = postgres.sql(f"SELECT * FROM {table}{suffix}") + with caplog.at_level(logging.INFO): + df = postgres.sql(f"SELECT * FROM {table}{suffix}") + assert "Detected dialect: 'org.apache.spark.sql.jdbc.PostgresDialect'" in caplog.text + table_df = processing.get_expected_dataframe( schema=load_table_data.schema, table=load_table_data.table, @@ -79,7 +82,7 @@ def test_postgres_connection_sql(spark, processing, load_table_data, suffix): @pytest.mark.parametrize("suffix", ["", ";"]) -def test_postgres_connection_fetch(spark, processing, load_table_data, suffix): +def test_postgres_connection_fetch(spark, processing, load_table_data, suffix, caplog): postgres = Postgres( host=processing.host, port=processing.port, @@ -91,7 +94,10 @@ def test_postgres_connection_fetch(spark, processing, load_table_data, suffix): table = load_table_data.full_name - df = postgres.fetch(f"SELECT * FROM {table}{suffix}", Postgres.FetchOptions(fetchsize=2)) + with caplog.at_level(logging.INFO): + df = postgres.fetch(f"SELECT * FROM {table}{suffix}", Postgres.FetchOptions(fetchsize=2)) + assert "Detected dialect: 'org.apache.spark.sql.jdbc.PostgresDialect'" in caplog.text + table_df = processing.get_expected_dataframe( schema=load_table_data.schema, table=load_table_data.table, @@ -1023,7 +1029,7 @@ def test_postgres_connection_fetch_with_legacy_jdbc_options(spark, processing): assert df is not None -def test_postgres_connection_execute_with_legacy_jdbc_options(spark, processing): +def test_postgres_connection_execute_with_legacy_jdbc_options(spark, processing, caplog): postgres = Postgres( host=processing.host, port=processing.port, @@ -1034,4 +1040,7 @@ def test_postgres_connection_execute_with_legacy_jdbc_options(spark, processing) ) options = Postgres.JDBCOptions(query_timeout=30) - postgres.execute("DROP TABLE IF EXISTS temp_table;", options=options) + + with caplog.at_level(logging.INFO): + postgres.execute("DROP TABLE IF EXISTS temp_table;", options=options) + assert "Detected dialect: 
'org.apache.spark.sql.jdbc.PostgresDialect'" in caplog.text diff --git a/tests/tests_integration/tests_strategy_integration/test_strategy_incremental_batch.py b/tests/tests_integration/tests_strategy_integration/test_strategy_incremental_batch.py index 66c7ad31..e72b91e8 100644 --- a/tests/tests_integration/tests_strategy_integration/test_strategy_incremental_batch.py +++ b/tests/tests_integration/tests_strategy_integration/test_strategy_incremental_batch.py @@ -1,3 +1,4 @@ +import logging import re import secrets from datetime import date, datetime, timedelta @@ -182,6 +183,7 @@ def test_postgres_strategy_incremental_batch_different_hwm_type_in_store( hwm_column, new_type, step, + caplog, ): postgres = Postgres( host=processing.host, @@ -200,7 +202,9 @@ def test_postgres_strategy_incremental_batch_different_hwm_type_in_store( with IncrementalBatchStrategy(step=step) as batches: for _ in batches: - reader.run() + with caplog.at_level(logging.INFO): + reader.run() + assert "Detected dialect: 'org.apache.spark.sql.jdbc.PostgresDialect'" in caplog.text # change table schema new_fields = {column_name: processing.get_column_type(column_name) for column_name in processing.column_names} From e3d83594685cbfcded56f9dc4978c67080b8a23a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Wed, 21 Aug 2024 10:07:00 +0000 Subject: [PATCH 44/64] Test Spark 3.5.2 --- .github/workflows/data/clickhouse/matrix.yml | 2 +- .github/workflows/data/core/matrix.yml | 2 +- .github/workflows/data/hdfs/matrix.yml | 2 +- .github/workflows/data/hive/matrix.yml | 2 +- .github/workflows/data/kafka/matrix.yml | 2 +- .github/workflows/data/local-fs/matrix.yml | 4 ++-- .github/workflows/data/mongodb/matrix.yml | 2 +- .github/workflows/data/mssql/matrix.yml | 2 +- .github/workflows/data/mysql/matrix.yml | 2 +- .github/workflows/data/oracle/matrix.yml | 2 +- .github/workflows/data/postgres/matrix.yml | 2 +- .github/workflows/data/s3/matrix.yml | 2 +- .github/workflows/data/teradata/matrix.yml | 2 +- CONTRIBUTING.rst | 2 +- README.rst | 6 +++--- docker-compose.yml | 2 +- docker/Dockerfile | 2 +- docs/changelog/next_release/306.feature.rst | 1 + .../db_connection/clickhouse/types.rst | 4 ++-- docs/connection/db_connection/mssql/types.rst | 4 ++-- docs/connection/db_connection/mysql/types.rst | 4 ++-- docs/connection/db_connection/oracle/types.rst | 4 ++-- docs/connection/db_connection/postgres/types.rst | 4 ++-- onetl/_metrics/extract.py | 2 +- onetl/_metrics/listener/base.py | 2 +- onetl/_metrics/listener/execution.py | 12 ++++++------ onetl/_metrics/listener/job.py | 4 ++-- onetl/_metrics/listener/listener.py | 2 +- onetl/_metrics/listener/stage.py | 6 +++--- onetl/_metrics/listener/task.py | 6 +++--- onetl/_util/spark.py | 2 +- .../connection/db_connection/kafka/connection.py | 4 ++-- .../file_df_connection/spark_s3/connection.py | 6 +++--- onetl/file/format/avro.py | 4 ++-- onetl/file/format/excel.py | 16 ++++++++-------- onetl/file/format/xml.py | 8 ++++---- .../tests/{spark-3.5.0.txt => spark-3.5.2.txt} | 2 +- .../test_file/test_format_unit/test_avro_unit.py | 8 ++++---- .../test_format_unit/test_excel_unit.py | 16 ++++++++-------- .../test_spark_s3_unit.py | 6 +++--- 40 files changed, 84 insertions(+), 83 deletions(-) create mode 100644 docs/changelog/next_release/306.feature.rst rename requirements/tests/{spark-3.5.0.txt => spark-3.5.2.txt} (76%) diff 
--git a/.github/workflows/data/clickhouse/matrix.yml b/.github/workflows/data/clickhouse/matrix.yml index 6f1d7261..d18856df 100644 --- a/.github/workflows/data/clickhouse/matrix.yml +++ b/.github/workflows/data/clickhouse/matrix.yml @@ -11,7 +11,7 @@ min: &min max: &max clickhouse-image: clickhouse/clickhouse-server clickhouse-version: 24.6.3.70-alpine - spark-version: 3.5.1 + spark-version: 3.5.2 pydantic-version: 2 python-version: '3.12' java-version: 20 diff --git a/.github/workflows/data/core/matrix.yml b/.github/workflows/data/core/matrix.yml index d20f074a..504f1d4d 100644 --- a/.github/workflows/data/core/matrix.yml +++ b/.github/workflows/data/core/matrix.yml @@ -6,7 +6,7 @@ min: &min os: ubuntu-latest max: &max - spark-version: 3.5.1 + spark-version: 3.5.2 pydantic-version: 2 python-version: '3.12' java-version: 20 diff --git a/.github/workflows/data/hdfs/matrix.yml b/.github/workflows/data/hdfs/matrix.yml index af4553f1..f8bae7d5 100644 --- a/.github/workflows/data/hdfs/matrix.yml +++ b/.github/workflows/data/hdfs/matrix.yml @@ -8,7 +8,7 @@ min: &min max: &max hadoop-version: hadoop3-hdfs - spark-version: 3.5.1 + spark-version: 3.5.2 pydantic-version: 2 python-version: '3.12' java-version: 20 diff --git a/.github/workflows/data/hive/matrix.yml b/.github/workflows/data/hive/matrix.yml index 6ce0d7a8..31b2120f 100644 --- a/.github/workflows/data/hive/matrix.yml +++ b/.github/workflows/data/hive/matrix.yml @@ -6,7 +6,7 @@ min: &min os: ubuntu-latest max: &max - spark-version: 3.5.1 + spark-version: 3.5.2 pydantic-version: 2 python-version: '3.12' java-version: 20 diff --git a/.github/workflows/data/kafka/matrix.yml b/.github/workflows/data/kafka/matrix.yml index 1b9b2336..4ff5fe64 100644 --- a/.github/workflows/data/kafka/matrix.yml +++ b/.github/workflows/data/kafka/matrix.yml @@ -12,7 +12,7 @@ min: &min max: &max kafka-version: 3.7.1 pydantic-version: 2 - spark-version: 3.5.1 + spark-version: 3.5.2 python-version: '3.12' java-version: 20 os: ubuntu-latest diff --git a/.github/workflows/data/local-fs/matrix.yml b/.github/workflows/data/local-fs/matrix.yml index d1337291..c4466f3c 100644 --- a/.github/workflows/data/local-fs/matrix.yml +++ b/.github/workflows/data/local-fs/matrix.yml @@ -20,8 +20,8 @@ min_excel: &min_excel os: ubuntu-latest max: &max - # Excel package currently has no release for 3.5.1 - spark-version: 3.5.0 + # Excel package currently has no release for 3.5.2 + spark-version: 3.5.1 pydantic-version: 2 python-version: '3.12' java-version: 20 diff --git a/.github/workflows/data/mongodb/matrix.yml b/.github/workflows/data/mongodb/matrix.yml index 98e1fe97..4c3d9d86 100644 --- a/.github/workflows/data/mongodb/matrix.yml +++ b/.github/workflows/data/mongodb/matrix.yml @@ -9,7 +9,7 @@ min: &min max: &max mongodb-version: 7.0.12 - spark-version: 3.5.1 + spark-version: 3.5.2 pydantic-version: 2 python-version: '3.12' java-version: 20 diff --git a/.github/workflows/data/mssql/matrix.yml b/.github/workflows/data/mssql/matrix.yml index fad2e738..3748a0a7 100644 --- a/.github/workflows/data/mssql/matrix.yml +++ b/.github/workflows/data/mssql/matrix.yml @@ -8,7 +8,7 @@ min: &min max: &max mssql-version: 2022-CU14-ubuntu-22.04 - spark-version: 3.5.1 + spark-version: 3.5.2 pydantic-version: 2 python-version: '3.12' java-version: 20 diff --git a/.github/workflows/data/mysql/matrix.yml b/.github/workflows/data/mysql/matrix.yml index d2e70314..17dacdb2 100644 --- a/.github/workflows/data/mysql/matrix.yml +++ b/.github/workflows/data/mysql/matrix.yml @@ -10,7 +10,7 @@ min: &min max: 
&max mysql-version: 9.0.1 - spark-version: 3.5.1 + spark-version: 3.5.2 pydantic-version: 2 python-version: '3.12' java-version: 20 diff --git a/.github/workflows/data/oracle/matrix.yml b/.github/workflows/data/oracle/matrix.yml index 7a79c68a..ccafa20f 100644 --- a/.github/workflows/data/oracle/matrix.yml +++ b/.github/workflows/data/oracle/matrix.yml @@ -12,7 +12,7 @@ max: &max oracle-image: gvenzl/oracle-free oracle-version: 23.4-slim-faststart db-name: FREEPDB1 - spark-version: 3.5.1 + spark-version: 3.5.2 pydantic-version: 2 python-version: '3.12' java-version: 20 diff --git a/.github/workflows/data/postgres/matrix.yml b/.github/workflows/data/postgres/matrix.yml index 4c5b5f4e..d37c3a83 100644 --- a/.github/workflows/data/postgres/matrix.yml +++ b/.github/workflows/data/postgres/matrix.yml @@ -9,7 +9,7 @@ min: &min max: &max postgres-version: 16.3-alpine - spark-version: 3.5.1 + spark-version: 3.5.2 pydantic-version: 2 python-version: '3.12' java-version: 20 diff --git a/.github/workflows/data/s3/matrix.yml b/.github/workflows/data/s3/matrix.yml index 06d4f748..405b8b68 100644 --- a/.github/workflows/data/s3/matrix.yml +++ b/.github/workflows/data/s3/matrix.yml @@ -10,7 +10,7 @@ min: &min max: &max minio-version: 2024.7.26 - spark-version: 3.5.1 + spark-version: 3.5.2 pydantic-version: 2 python-version: '3.12' java-version: 20 diff --git a/.github/workflows/data/teradata/matrix.yml b/.github/workflows/data/teradata/matrix.yml index 6c2a5545..d9792be6 100644 --- a/.github/workflows/data/teradata/matrix.yml +++ b/.github/workflows/data/teradata/matrix.yml @@ -1,5 +1,5 @@ max: &max - spark-version: 3.5.1 + spark-version: 3.5.2 pydantic-version: 2 python-version: '3.12' java-version: 20 diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index 7a70dbac..aa1a3c03 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -71,7 +71,7 @@ Create virtualenv and install dependencies: -r requirements/tests/postgres.txt \ -r requirements/tests/oracle.txt \ -r requirements/tests/pydantic-2.txt \ - -r requirements/tests/spark-3.5.1.txt + -r requirements/tests/spark-3.5.2.txt # TODO: remove after https://github.com/zqmillet/sphinx-plantuml/pull/4 pip install sphinx-plantuml --no-deps diff --git a/README.rst b/README.rst index 0a4cbc97..9def167f 100644 --- a/README.rst +++ b/README.rst @@ -184,7 +184,7 @@ Compatibility matrix +--------------------------------------------------------------+-------------+-------------+-------+ | `3.4.x `_ | 3.7 - 3.12 | 8u362 - 20 | 2.12 | +--------------------------------------------------------------+-------------+-------------+-------+ -| `3.5.x `_ | 3.8 - 3.12 | 8u371 - 20 | 2.12 | +| `3.5.x `_ | 3.8 - 3.12 | 8u371 - 20 | 2.12 | +--------------------------------------------------------------+-------------+-------------+-------+ .. _pyspark-install: @@ -199,7 +199,7 @@ or install PySpark explicitly: .. code:: bash - pip install onetl pyspark==3.5.1 # install a specific PySpark version + pip install onetl pyspark==3.5.2 # install a specific PySpark version or inject PySpark to ``sys.path`` in some other way BEFORE creating a class instance. 
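As a quick runnable restatement of the requirement above (PySpark must be importable before any onetl class is instantiated), assuming ``pyspark==3.5.2`` from the updated test matrix is already installed:

.. code-block:: python

    # pip install onetl pyspark==3.5.2
    from pyspark.sql import SparkSession

    # PySpark has to be importable before any onetl connection object is created
    spark = SparkSession.builder.master("local[1]").appName("onetl-spark-check").getOrCreate()
    assert spark.version.startswith("3.5"), spark.version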
**Otherwise connection object cannot be created.** @@ -540,7 +540,7 @@ Read files directly from S3 path, convert them to dataframe, transform it and th setup_logging() # Initialize new SparkSession with Hadoop AWS libraries and Postgres driver loaded - maven_packages = SparkS3.get_packages(spark_version="3.5.1") + Postgres.get_packages() + maven_packages = SparkS3.get_packages(spark_version="3.5.2") + Postgres.get_packages() spark = ( SparkSession.builder.appName("spark_app_onetl_demo") .config("spark.jars.packages", ",".join(maven_packages)) diff --git a/docker-compose.yml b/docker-compose.yml index f5859bb5..73e8a21e 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -9,7 +9,7 @@ services: context: . target: base args: - SPARK_VERSION: 3.5.1 + SPARK_VERSION: 3.5.2 env_file: .env.docker volumes: - ./:/app/ diff --git a/docker/Dockerfile b/docker/Dockerfile index d3d34ef2..68f40a52 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -44,7 +44,7 @@ ENV PATH=${ONETL_USER_HOME}/.local/bin:${PATH} COPY --chown=onetl:onetl ./run_tests.sh ./pytest_runner.sh ./combine_coverage.sh /app/ RUN chmod +x /app/run_tests.sh /app/pytest_runner.sh /app/combine_coverage.sh -ARG SPARK_VERSION=3.5.1 +ARG SPARK_VERSION=3.5.2 # Spark is heavy, and version change is quite rare COPY --chown=onetl:onetl ./requirements/tests/spark-${SPARK_VERSION}.txt /app/requirements/tests/ RUN pip install -r /app/requirements/tests/spark-${SPARK_VERSION}.txt diff --git a/docs/changelog/next_release/306.feature.rst b/docs/changelog/next_release/306.feature.rst new file mode 100644 index 00000000..1c2b95f7 --- /dev/null +++ b/docs/changelog/next_release/306.feature.rst @@ -0,0 +1 @@ +Update ``Excel`` package from ``0.20.3`` to ``0.20.4``, to include Spark 3.5.1 support. diff --git a/docs/connection/db_connection/clickhouse/types.rst b/docs/connection/db_connection/clickhouse/types.rst index 21ddf0ba..0d8c5675 100644 --- a/docs/connection/db_connection/clickhouse/types.rst +++ b/docs/connection/db_connection/clickhouse/types.rst @@ -106,8 +106,8 @@ References Here you can find source code with type conversions: * `Clickhouse -> JDBC `_ -* `JDBC -> Spark `_ -* `Spark -> JDBC `_ +* `JDBC -> Spark `_ +* `Spark -> JDBC `_ * `JDBC -> Clickhouse `_ Supported types diff --git a/docs/connection/db_connection/mssql/types.rst b/docs/connection/db_connection/mssql/types.rst index 807d62d9..852289ad 100644 --- a/docs/connection/db_connection/mssql/types.rst +++ b/docs/connection/db_connection/mssql/types.rst @@ -101,8 +101,8 @@ References Here you can find source code with type conversions: * `MSSQL -> JDBC `_ -* `JDBC -> Spark `_ -* `Spark -> JDBC `_ +* `JDBC -> Spark `_ +* `Spark -> JDBC `_ * `JDBC -> MSSQL `_ Supported types diff --git a/docs/connection/db_connection/mysql/types.rst b/docs/connection/db_connection/mysql/types.rst index 1ad6815c..001a221f 100644 --- a/docs/connection/db_connection/mysql/types.rst +++ b/docs/connection/db_connection/mysql/types.rst @@ -97,8 +97,8 @@ References Here you can find source code with type conversions: * `MySQL -> JDBC `_ -* `JDBC -> Spark `_ -* `Spark -> JDBC `_ +* `JDBC -> Spark `_ +* `Spark -> JDBC `_ * `JDBC -> MySQL `_ Supported types diff --git a/docs/connection/db_connection/oracle/types.rst b/docs/connection/db_connection/oracle/types.rst index 81b7da10..2433b0f7 100644 --- a/docs/connection/db_connection/oracle/types.rst +++ b/docs/connection/db_connection/oracle/types.rst @@ -101,8 +101,8 @@ See `List of Oracle types Spark `_ -* `Spark -> JDBC `_ +* `JDBC -> Spark `_ +* 
`Spark -> JDBC `_ Numeric types ~~~~~~~~~~~~~ diff --git a/docs/connection/db_connection/postgres/types.rst b/docs/connection/db_connection/postgres/types.rst index b4d9d202..f0fe8821 100644 --- a/docs/connection/db_connection/postgres/types.rst +++ b/docs/connection/db_connection/postgres/types.rst @@ -109,8 +109,8 @@ See `List of Postgres types JDBC `_ -* `JDBC -> Spark `_ -* `Spark -> JDBC `_ +* `JDBC -> Spark `_ +* `Spark -> JDBC `_ Numeric types ~~~~~~~~~~~~~ diff --git a/onetl/_metrics/extract.py b/onetl/_metrics/extract.py index 4789d8fd..8b623bb8 100644 --- a/onetl/_metrics/extract.py +++ b/onetl/_metrics/extract.py @@ -70,7 +70,7 @@ def extract_metrics_from_execution(execution: SparkListenerExecution) -> SparkCo disk_spilled_bytes += stage.metrics.disk_spilled_bytes result_size_bytes += stage.metrics.result_size_bytes - # https://github.com/apache/spark/blob/v3.5.1/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala#L467-L473 + # https://github.com/apache/spark/blob/v3.5.2/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala#L467-L473 input_file_count = ( _get_int(execution.metrics, SparkSQLMetricNames.NUMBER_OF_FILES_READ) or _get_int(execution.metrics, SparkSQLMetricNames.STATIC_NUMBER_OF_FILES_READ) diff --git a/onetl/_metrics/listener/base.py b/onetl/_metrics/listener/base.py index 90432c7c..a8d5b855 100644 --- a/onetl/_metrics/listener/base.py +++ b/onetl/_metrics/listener/base.py @@ -16,7 +16,7 @@ class BaseSparkListener: """Base no-op SparkListener implementation. - See `SparkListener `_ interface. + See `SparkListener `_ interface. """ spark: SparkSession diff --git a/onetl/_metrics/listener/execution.py b/onetl/_metrics/listener/execution.py index 728c4c2c..f5749e16 100644 --- a/onetl/_metrics/listener/execution.py +++ b/onetl/_metrics/listener/execution.py @@ -22,18 +22,18 @@ class SparkSQLMetricNames(str, Enum): # noqa: WPS338 # Metric names passed to SQLMetrics.createMetric(...) # But only those we're interested in. 
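A short aside on the ``SparkSQLMetricNames(str, Enum)`` pattern touched in this hunk: mixing ``str`` into the enum lets members compare equal to the raw metric display names Spark reports, so no ``.value`` conversion is needed when matching listener output. A self-contained illustration, with names mirroring two entries from the enum above:

.. code-block:: python

    from enum import Enum


    class MetricNames(str, Enum):
        # display names assigned by Spark via SQLMetrics.createMetric(...)
        NUMBER_OF_FILES_READ = "number of files read"
        SIZE_OF_FILES_READ = "size of files read"


    # str mix-in: members compare equal to plain strings coming from Spark
    assert MetricNames.NUMBER_OF_FILES_READ == "number of files read"
    assert "size of files read" == MetricNames.SIZE_OF_FILES_READ
    assert MetricNames.SIZE_OF_FILES_READ.value == "size of files read"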
- # https://github.com/apache/spark/blob/v3.5.1/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala#L233C55-L233C87 + # https://github.com/apache/spark/blob/v3.5.2/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala#L231 NUMBER_OF_PARTITIONS_READ = "number of partitions read" - # https://github.com/apache/spark/blob/v3.5.1/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala#L225-L227 + # https://github.com/apache/spark/blob/v3.5.2/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala#L225-L227 NUMBER_OF_FILES_READ = "number of files read" SIZE_OF_FILES_READ = "size of files read" - # https://github.com/apache/spark/blob/v3.5.1/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala#L455-L456 + # https://github.com/apache/spark/blob/v3.5.2/sql/core/src/main/scala/org/apache/spark/sql/execution/DataSourceScanExec.scala#L225-L227 STATIC_NUMBER_OF_FILES_READ = "static number of files read" STATIC_SIZE_OF_FILES_READ = "static size of files read" - # https://github.com/apache/spark/blob/v3.5.1/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/BasicWriteStatsTracker.scala#L241-L246 + # https://github.com/apache/spark/blob/v3.5.2/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/BasicWriteStatsTracker.scala#L241-L246 NUMBER_OF_DYNAMIC_PART = "number of dynamic part" NUMBER_OF_WRITTEN_FILES = "number of written files" @@ -62,11 +62,11 @@ def jobs(self) -> list[SparkListenerJob]: return result def on_execution_start(self, event): - # https://github.com/apache/spark/blob/v3.5.1/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLListener.scala#L44-L58 + # https://github.com/apache/spark/blob/v3.5.2/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLListener.scala#L44-L58 self.status = SparkListenerExecutionStatus.STARTED def on_execution_end(self, event): - # https://github.com/apache/spark/blob/v3.5.1/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLListener.scala#L61-L83 + # https://github.com/apache/spark/blob/v3.5.2/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLListener.scala#L61-L83 for job in self._jobs.values(): if job.status == SparkListenerJobStatus.FAILED: self.status = SparkListenerExecutionStatus.FAILED diff --git a/onetl/_metrics/listener/job.py b/onetl/_metrics/listener/job.py index b3abbd06..915f1f3d 100644 --- a/onetl/_metrics/listener/job.py +++ b/onetl/_metrics/listener/job.py @@ -38,8 +38,8 @@ def stages(self) -> list[SparkListenerStage]: @classmethod def create(cls, event): - # https://spark.apache.org/docs/3.5.1/api/java/org/apache/spark/scheduler/SparkListenerJobSubmitted.html - # https://spark.apache.org/docs/3.5.1/api/java/org/apache/spark/scheduler/SparkListenerJobCompleted.html + # https://spark.apache.org/docs/3.5.2/api/java/org/apache/spark/scheduler/SparkListenerJobSubmitted.html + # https://spark.apache.org/docs/3.5.2/api/java/org/apache/spark/scheduler/SparkListenerJobCompleted.html result = cls( id=event.jobId(), description=event.properties().get("spark.job.description"), diff --git a/onetl/_metrics/listener/listener.py b/onetl/_metrics/listener/listener.py index 3421e5ae..997f22a7 100644 --- a/onetl/_metrics/listener/listener.py +++ b/onetl/_metrics/listener/listener.py @@ -73,7 +73,7 @@ def onExecutionEnd(self, event): # Get execution metrics from SQLAppStatusStore, # as SparkListenerSQLExecutionEnd event does not provide them: - # 
https://github.com/apache/spark/blob/v3.5.1/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusStore.scala + # https://github.com/apache/spark/blob/v3.5.2/sql/core/src/main/scala/org/apache/spark/sql/execution/ui/SQLAppStatusStore.scala session_status_store = self.spark._jsparkSession.sharedState().statusStore() # noqa: WPS437 raw_execution = session_status_store.execution(execution.id).get() metrics = raw_execution.metrics() diff --git a/onetl/_metrics/listener/stage.py b/onetl/_metrics/listener/stage.py index 4bf4dffb..89d6a6ae 100644 --- a/onetl/_metrics/listener/stage.py +++ b/onetl/_metrics/listener/stage.py @@ -21,7 +21,7 @@ def __str__(self): @dataclass class SparkListenerStage: - # https://spark.apache.org/docs/3.5.1/api/java/org/apache/spark/scheduler/StageInfo.html + # https://spark.apache.org/docs/3.5.2/api/java/org/apache/spark/scheduler/StageInfo.html id: int status: SparkListenerStageStatus = SparkListenerStageStatus.PENDING metrics: SparkListenerTaskMetrics = field(default_factory=SparkListenerTaskMetrics, repr=False, init=False) @@ -39,11 +39,11 @@ def create(cls, stage_info): return cls(id=stage_info.stageId()) def on_stage_start(self, event): - # https://spark.apache.org/docs/3.5.1/api/java/org/apache/spark/scheduler/SparkListenerStageSubmitted.html + # https://spark.apache.org/docs/3.5.2/api/java/org/apache/spark/scheduler/SparkListenerStageSubmitted.html self.status = SparkListenerStageStatus.ACTIVE def on_stage_end(self, event): - # https://spark.apache.org/docs/3.5.1/api/java/org/apache/spark/scheduler/SparkListenerStageCompleted.html + # https://spark.apache.org/docs/3.5.2/api/java/org/apache/spark/scheduler/SparkListenerStageCompleted.html stage_info = event.stageInfo() if stage_info.failureReason().isDefined(): self.status = SparkListenerStageStatus.FAILED diff --git a/onetl/_metrics/listener/task.py b/onetl/_metrics/listener/task.py index 4b27ffcf..ced938a8 100644 --- a/onetl/_metrics/listener/task.py +++ b/onetl/_metrics/listener/task.py @@ -81,14 +81,14 @@ class SparkListenerTask: @classmethod def create(cls, task_info): - # https://spark.apache.org/docs/3.5.1/api/java/org/apache/spark/scheduler/TaskInfo.html + # https://spark.apache.org/docs/3.5.2/api/java/org/apache/spark/scheduler/TaskInfo.html return cls(id=task_info.taskId()) def on_task_start(self, event): - # https://spark.apache.org/docs/3.5.1/api/java/org/apache/spark/scheduler/SparkListenerTaskStart.html + # https://spark.apache.org/docs/3.5.2/api/java/org/apache/spark/scheduler/SparkListenerTaskStart.html self.status = SparkListenerTaskStatus(event.taskInfo().status()) def on_task_end(self, event): - # https://spark.apache.org/docs/3.5.1/api/java/org/apache/spark/scheduler/SparkListenerTaskEnd.html + # https://spark.apache.org/docs/3.5.2/api/java/org/apache/spark/scheduler/SparkListenerTaskEnd.html self.status = SparkListenerTaskStatus(event.taskInfo().status()) self.metrics = SparkListenerTaskMetrics.create(event.taskMetrics()) diff --git a/onetl/_util/spark.py b/onetl/_util/spark.py index f7d018b3..547095af 100644 --- a/onetl/_util/spark.py +++ b/onetl/_util/spark.py @@ -143,7 +143,7 @@ def estimate_dataframe_size(spark_session: SparkSession, df: DataFrame) -> int: """ Estimate in-memory DataFrame size in bytes. If cannot be estimated, return 0. - Using Spark's `SizeEstimator `_. + Using Spark's `SizeEstimator `_. 
""" try: size_estimator = spark_session._jvm.org.apache.spark.util.SizeEstimator # type: ignore[union-attr] diff --git a/onetl/connection/db_connection/kafka/connection.py b/onetl/connection/db_connection/kafka/connection.py index b404eafb..9b8bf2cd 100644 --- a/onetl/connection/db_connection/kafka/connection.py +++ b/onetl/connection/db_connection/kafka/connection.py @@ -332,7 +332,7 @@ def write_df_to_target( write_options.update(options.dict(by_alias=True, exclude_none=True, exclude={"if_exists"})) write_options["topic"] = target - # As of Apache Spark version 3.5.0, the mode 'error' is not functioning as expected. + # As of Apache Spark version 3.5.2, the mode 'error' is not functioning as expected. # This issue has been reported and can be tracked at: # https://issues.apache.org/jira/browse/SPARK-44774 mode = options.if_exists @@ -418,7 +418,7 @@ def get_packages( from onetl.connection import Kafka Kafka.get_packages(spark_version="3.2.4") - Kafka.get_packages(spark_version="3.2.4", scala_version="2.13") + Kafka.get_packages(spark_version="3.2.4", scala_version="2.12") """ diff --git a/onetl/connection/file_df_connection/spark_s3/connection.py b/onetl/connection/file_df_connection/spark_s3/connection.py index eb74d698..8fe07d10 100644 --- a/onetl/connection/file_df_connection/spark_s3/connection.py +++ b/onetl/connection/file_df_connection/spark_s3/connection.py @@ -133,7 +133,7 @@ class SparkS3(SparkFileDFConnection): from pyspark.sql import SparkSession # Create Spark session with Hadoop AWS libraries loaded - maven_packages = SparkS3.get_packages(spark_version="3.5.0") + maven_packages = SparkS3.get_packages(spark_version="3.5.2") # Some dependencies are not used, but downloading takes a lot of time. Skipping them. excluded_packages = [ "com.google.cloud.bigdataoss:gcs-connector", @@ -236,8 +236,8 @@ def get_packages( from onetl.connection import SparkS3 - SparkS3.get_packages(spark_version="3.5.0") - SparkS3.get_packages(spark_version="3.5.0", scala_version="2.12") + SparkS3.get_packages(spark_version="3.5.2") + SparkS3.get_packages(spark_version="3.5.2", scala_version="2.12") """ diff --git a/onetl/file/format/avro.py b/onetl/file/format/avro.py index 3699620b..418e4064 100644 --- a/onetl/file/format/avro.py +++ b/onetl/file/format/avro.py @@ -88,7 +88,7 @@ class Avro(ReadWriteFileFormat): from pyspark.sql import SparkSession # Create Spark session with Avro package loaded - maven_packages = Avro.get_packages(spark_version="3.5.0") + maven_packages = Avro.get_packages(spark_version="3.5.2") spark = ( SparkSession.builder.appName("spark-app-name") .config("spark.jars.packages", ",".join(maven_packages)) @@ -151,7 +151,7 @@ def get_packages( from onetl.file.format import Avro Avro.get_packages(spark_version="3.2.4") - Avro.get_packages(spark_version="3.2.4", scala_version="2.13") + Avro.get_packages(spark_version="3.2.4", scala_version="2.12") """ diff --git a/onetl/file/format/excel.py b/onetl/file/format/excel.py index 2ec12758..3f26522f 100644 --- a/onetl/file/format/excel.py +++ b/onetl/file/format/excel.py @@ -87,7 +87,7 @@ class Excel(ReadWriteFileFormat): from pyspark.sql import SparkSession # Create Spark session with Excel package loaded - maven_packages = Excel.get_packages(spark_version="3.5.0") + maven_packages = Excel.get_packages(spark_version="3.5.1") spark = ( SparkSession.builder.appName("spark-app-name") .config("spark.jars.packages", ",".join(maven_packages)) @@ -139,7 +139,7 @@ def get_packages( If ``None``, ``spark_version`` is used to determine Scala version. 
package_version : str, optional - Package version in format ``major.minor.patch``. Default is ``0.20.3``. + Package version in format ``major.minor.patch``. Default is ``0.20.4``. .. warning:: @@ -157,12 +157,12 @@ def get_packages( from onetl.file.format import Excel - Excel.get_packages(spark_version="3.5.0") - Excel.get_packages(spark_version="3.5.0", scala_version="2.13") + Excel.get_packages(spark_version="3.5.1") + Excel.get_packages(spark_version="3.5.1", scala_version="2.12") Excel.get_packages( - spark_version="3.5.0", - scala_version="2.13", - package_version="0.20.3", + spark_version="3.5.1", + scala_version="2.12", + package_version="0.20.4", ) """ @@ -176,7 +176,7 @@ def get_packages( raise ValueError(f"Package version should be at least 0.15, got {package_version}") log.warning("Passed custom package version %r, it is not guaranteed to be supported", package_version) else: - version = Version("0.20.3") + version = Version("0.20.4") spark_ver = Version(spark_version).min_digits(3) if spark_ver < Version("3.2"): diff --git a/onetl/file/format/xml.py b/onetl/file/format/xml.py index cc7cd477..11425809 100644 --- a/onetl/file/format/xml.py +++ b/onetl/file/format/xml.py @@ -119,7 +119,7 @@ class XML(ReadWriteFileFormat): from pyspark.sql import SparkSession # Create Spark session with XML package loaded - maven_packages = XML.get_packages(spark_version="3.5.0") + maven_packages = XML.get_packages(spark_version="3.5.2") spark = ( SparkSession.builder.appName("spark-app-name") .config("spark.jars.packages", ",".join(maven_packages)) @@ -184,10 +184,10 @@ def get_packages( # noqa: WPS231 from onetl.file.format import XML - XML.get_packages(spark_version="3.5.0") - XML.get_packages(spark_version="3.5.0", scala_version="2.12") + XML.get_packages(spark_version="3.5.2") + XML.get_packages(spark_version="3.5.2", scala_version="2.12") XML.get_packages( - spark_version="3.5.0", + spark_version="3.5.2", scala_version="2.12", package_version="0.18.0", ) diff --git a/requirements/tests/spark-3.5.0.txt b/requirements/tests/spark-3.5.2.txt similarity index 76% rename from requirements/tests/spark-3.5.0.txt rename to requirements/tests/spark-3.5.2.txt index 2e49168a..214f0d63 100644 --- a/requirements/tests/spark-3.5.0.txt +++ b/requirements/tests/spark-3.5.2.txt @@ -1,5 +1,5 @@ numpy>=1.16 pandas>=1.0 pyarrow>=1.0 -pyspark==3.5.0 +pyspark==3.5.2 sqlalchemy diff --git a/tests/tests_unit/test_file/test_format_unit/test_avro_unit.py b/tests/tests_unit/test_file/test_format_unit/test_avro_unit.py index 3c2ef160..53c7a67a 100644 --- a/tests/tests_unit/test_file/test_format_unit/test_avro_unit.py +++ b/tests/tests_unit/test_file/test_format_unit/test_avro_unit.py @@ -29,14 +29,14 @@ def test_avro_get_packages_scala_version_not_supported(): [ # Detect Scala version by Spark version ("2.4.0", None, "org.apache.spark:spark-avro_2.11:2.4.0"), - ("3.5.0", None, "org.apache.spark:spark-avro_2.12:3.5.0"), + ("3.5.2", None, "org.apache.spark:spark-avro_2.12:3.5.2"), # Override Scala version ("2.4.0", "2.11", "org.apache.spark:spark-avro_2.11:2.4.0"), ("2.4.0", "2.12", "org.apache.spark:spark-avro_2.12:2.4.0"), - ("3.5.0", "2.12", "org.apache.spark:spark-avro_2.12:3.5.0"), - ("3.5.0", "2.13", "org.apache.spark:spark-avro_2.13:3.5.0"), + ("3.5.2", "2.12", "org.apache.spark:spark-avro_2.12:3.5.2"), + ("3.5.2", "2.13", "org.apache.spark:spark-avro_2.13:3.5.2"), # Scala version contain three digits when only two needed - ("3.5.0", "2.12.1", "org.apache.spark:spark-avro_2.12:3.5.0"), + ("3.5.2", "2.12.1", 
"org.apache.spark:spark-avro_2.12:3.5.2"), ], ) def test_avro_get_packages(spark_version, scala_version, package): diff --git a/tests/tests_unit/test_file/test_format_unit/test_excel_unit.py b/tests/tests_unit/test_file/test_format_unit/test_excel_unit.py index 95dae3da..ecacb2ca 100644 --- a/tests/tests_unit/test_file/test_format_unit/test_excel_unit.py +++ b/tests/tests_unit/test_file/test_format_unit/test_excel_unit.py @@ -34,18 +34,18 @@ def test_excel_get_packages_package_version_not_supported(): "spark_version, scala_version, package_version, packages", [ # Detect Scala version by Spark version - ("3.2.4", None, None, ["com.crealytics:spark-excel_2.12:3.2.4_0.20.3"]), - ("3.5.0", None, None, ["com.crealytics:spark-excel_2.12:3.5.0_0.20.3"]), + ("3.2.4", None, None, ["com.crealytics:spark-excel_2.12:3.2.4_0.20.4"]), + ("3.5.2", None, None, ["com.crealytics:spark-excel_2.12:3.5.2_0.20.4"]), # Override Scala version - ("3.2.4", "2.12", None, ["com.crealytics:spark-excel_2.12:3.2.4_0.20.3"]), - ("3.2.4", "2.13", None, ["com.crealytics:spark-excel_2.13:3.2.4_0.20.3"]), - ("3.5.0", "2.12", None, ["com.crealytics:spark-excel_2.12:3.5.0_0.20.3"]), - ("3.5.0", "2.13", None, ["com.crealytics:spark-excel_2.13:3.5.0_0.20.3"]), + ("3.2.4", "2.12", None, ["com.crealytics:spark-excel_2.12:3.2.4_0.20.4"]), + ("3.2.4", "2.13", None, ["com.crealytics:spark-excel_2.13:3.2.4_0.20.4"]), + ("3.5.2", "2.12", None, ["com.crealytics:spark-excel_2.12:3.5.2_0.20.4"]), + ("3.5.2", "2.13", None, ["com.crealytics:spark-excel_2.13:3.5.2_0.20.4"]), # Override package version ("3.2.0", None, "0.16.0", ["com.crealytics:spark-excel_2.12:3.2.0_0.16.0"]), - ("3.5.0", None, "0.18.0", ["com.crealytics:spark-excel_2.12:3.5.0_0.18.0"]), + ("3.5.2", None, "0.18.0", ["com.crealytics:spark-excel_2.12:3.5.2_0.18.0"]), # Scala version contain three digits when only two needed - ("3.5.0", "2.12.1", None, ["com.crealytics:spark-excel_2.12:3.5.0_0.20.3"]), + ("3.5.2", "2.12.1", None, ["com.crealytics:spark-excel_2.12:3.5.2_0.20.4"]), ], ) def test_excel_get_packages(caplog, spark_version, scala_version, package_version, packages): diff --git a/tests/tests_unit/tests_file_df_connection_unit/test_spark_s3_unit.py b/tests/tests_unit/tests_file_df_connection_unit/test_spark_s3_unit.py index 34ac4387..9a5e6fac 100644 --- a/tests/tests_unit/tests_file_df_connection_unit/test_spark_s3_unit.py +++ b/tests/tests_unit/tests_file_df_connection_unit/test_spark_s3_unit.py @@ -10,9 +10,9 @@ @pytest.mark.parametrize( "spark_version, scala_version, package", [ - ("3.5.0", None, "org.apache.spark:spark-hadoop-cloud_2.12:3.5.0"), - ("3.5.0", "2.12", "org.apache.spark:spark-hadoop-cloud_2.12:3.5.0"), - ("3.5.0", "2.13", "org.apache.spark:spark-hadoop-cloud_2.13:3.5.0"), + ("3.5.2", None, "org.apache.spark:spark-hadoop-cloud_2.12:3.5.2"), + ("3.5.2", "2.12", "org.apache.spark:spark-hadoop-cloud_2.12:3.5.2"), + ("3.5.2", "2.13", "org.apache.spark:spark-hadoop-cloud_2.13:3.5.2"), ], ) def test_spark_s3_get_packages(spark_version, scala_version, package): From 75f74f7a96cb6cc2d18dc5989241ccfe061a6004 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 26 Aug 2024 06:52:34 +0000 Subject: [PATCH 45/64] Bump tj-actions/changed-files from 44 to 45 in the github-actions group Bumps the github-actions group with 1 update: [tj-actions/changed-files](https://github.com/tj-actions/changed-files). 
Updates `tj-actions/changed-files` from 44 to 45 - [Release notes](https://github.com/tj-actions/changed-files/releases) - [Changelog](https://github.com/tj-actions/changed-files/blob/main/HISTORY.md) - [Commits](https://github.com/tj-actions/changed-files/compare/v44...v45) --- updated-dependencies: - dependency-name: tj-actions/changed-files dependency-type: direct:production update-type: version-update:semver-major dependency-group: github-actions ... Signed-off-by: dependabot[bot] --- .github/workflows/get-matrix.yml | 46 ++++++++++++++++---------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/.github/workflows/get-matrix.yml b/.github/workflows/get-matrix.yml index eba22eaf..c169bc2d 100644 --- a/.github/workflows/get-matrix.yml +++ b/.github/workflows/get-matrix.yml @@ -86,7 +86,7 @@ jobs: - name: Check if base files are changed id: changed-base - uses: tj-actions/changed-files@v44 + uses: tj-actions/changed-files@v45 with: files_from_source_file: .github/workflows/data/base/tracked.txt files_ignore_from_source_file: .github/workflows/data/base/ignored.txt @@ -97,7 +97,7 @@ jobs: - name: Check if db-related files are changed id: changed-db - uses: tj-actions/changed-files@v44 + uses: tj-actions/changed-files@v45 with: files_from_source_file: .github/workflows/data/db/tracked.txt files_ignore_from_source_file: .github/workflows/data/db/ignored.txt @@ -108,7 +108,7 @@ jobs: - name: Check if file-related files are changed id: changed-file - uses: tj-actions/changed-files@v44 + uses: tj-actions/changed-files@v45 with: files_from_source_file: .github/workflows/data/file/tracked.txt files_ignore_from_source_file: .github/workflows/data/file/ignored.txt @@ -119,7 +119,7 @@ jobs: - name: Check if file-df-related files are changed id: changed-file-df - uses: tj-actions/changed-files@v44 + uses: tj-actions/changed-files@v45 with: files_from_source_file: .github/workflows/data/file-df/tracked.txt files_ignore_from_source_file: .github/workflows/data/file-df/ignored.txt @@ -130,7 +130,7 @@ jobs: - name: Check if core files are changed id: changed-core - uses: tj-actions/changed-files@v44 + uses: tj-actions/changed-files@v45 with: files_from_source_file: .github/workflows/data/core/tracked.txt files_ignore_from_source_file: .github/workflows/data/core/ignored.txt @@ -160,7 +160,7 @@ jobs: - name: Check if Clickhouse files are changed id: changed-clickhouse - uses: tj-actions/changed-files@v44 + uses: tj-actions/changed-files@v45 with: files_from_source_file: .github/workflows/data/clickhouse/tracked.txt files_ignore_from_source_file: .github/workflows/data/clickhouse/ignored.txt @@ -190,7 +190,7 @@ jobs: - name: Check if Greenplum files are changed id: changed-greenplum - uses: tj-actions/changed-files@v44 + uses: tj-actions/changed-files@v45 with: files_from_source_file: .github/workflows/data/greenplum/tracked.txt files_ignore_from_source_file: .github/workflows/data/greenplum/ignored.txt @@ -220,7 +220,7 @@ jobs: - name: Check if Hive files are changed id: changed-hive - uses: tj-actions/changed-files@v44 + uses: tj-actions/changed-files@v45 with: files_from_source_file: .github/workflows/data/hive/tracked.txt files_ignore_from_source_file: .github/workflows/data/hive/ignored.txt @@ -250,7 +250,7 @@ jobs: - name: Check if Kafka files are changed id: changed-kafka - uses: tj-actions/changed-files@v44 + uses: tj-actions/changed-files@v45 with: files_from_source_file: .github/workflows/data/kafka/tracked.txt files_ignore_from_source_file: 
.github/workflows/data/kafka/ignored.txt @@ -280,7 +280,7 @@ jobs: - name: Check if LocalFS files are changed id: changed-local-fs - uses: tj-actions/changed-files@v44 + uses: tj-actions/changed-files@v45 with: files_from_source_file: .github/workflows/data/local-fs/tracked.txt files_ignore_from_source_file: .github/workflows/data/local-fs/ignored.txt @@ -310,7 +310,7 @@ jobs: - name: Check if MongoDB files are changed id: changed-mongodb - uses: tj-actions/changed-files@v44 + uses: tj-actions/changed-files@v45 with: files_from_source_file: .github/workflows/data/mongodb/tracked.txt files_ignore_from_source_file: .github/workflows/data/mongodb/ignored.txt @@ -340,7 +340,7 @@ jobs: - name: Check if MSSQL files are changed id: changed-mssql - uses: tj-actions/changed-files@v44 + uses: tj-actions/changed-files@v45 with: files_from_source_file: .github/workflows/data/mssql/tracked.txt files_ignore_from_source_file: .github/workflows/data/mssql/ignored.txt @@ -370,7 +370,7 @@ jobs: - name: Check if MySQL files are changed id: changed-mysql - uses: tj-actions/changed-files@v44 + uses: tj-actions/changed-files@v45 with: files_from_source_file: .github/workflows/data/mysql/tracked.txt files_ignore_from_source_file: .github/workflows/data/mysql/ignored.txt @@ -400,7 +400,7 @@ jobs: - name: Check if Oracle files are changed id: changed-oracle - uses: tj-actions/changed-files@v44 + uses: tj-actions/changed-files@v45 with: files_from_source_file: .github/workflows/data/oracle/tracked.txt files_ignore_from_source_file: .github/workflows/data/oracle/ignored.txt @@ -430,7 +430,7 @@ jobs: - name: Check if Postgres files are changed id: changed-postgres - uses: tj-actions/changed-files@v44 + uses: tj-actions/changed-files@v45 with: files_from_source_file: .github/workflows/data/postgres/tracked.txt files_ignore_from_source_file: .github/workflows/data/postgres/ignored.txt @@ -460,7 +460,7 @@ jobs: - name: Check if Teradata files are changed id: changed-teradata - uses: tj-actions/changed-files@v44 + uses: tj-actions/changed-files@v45 with: files_from_source_file: .github/workflows/data/teradata/tracked.txt files_ignore_from_source_file: .github/workflows/data/teradata/ignored.txt @@ -490,7 +490,7 @@ jobs: - name: Check if FTP files are changed id: changed-ftp - uses: tj-actions/changed-files@v44 + uses: tj-actions/changed-files@v45 with: files_from_source_file: .github/workflows/data/ftp/tracked.txt files_ignore_from_source_file: .github/workflows/data/ftp/ignored.txt @@ -520,7 +520,7 @@ jobs: - name: Check if FTPS files are changed id: changed-ftps - uses: tj-actions/changed-files@v44 + uses: tj-actions/changed-files@v45 with: files_from_source_file: .github/workflows/data/ftps/tracked.txt files_ignore_from_source_file: .github/workflows/data/ftps/ignored.txt @@ -550,7 +550,7 @@ jobs: - name: Check if HDFS files are changed id: changed-hdfs - uses: tj-actions/changed-files@v44 + uses: tj-actions/changed-files@v45 with: files_from_source_file: .github/workflows/data/hdfs/tracked.txt files_ignore_from_source_file: .github/workflows/data/hdfs/ignored.txt @@ -580,7 +580,7 @@ jobs: - name: Check if S3 files are changed id: changed-s3 - uses: tj-actions/changed-files@v44 + uses: tj-actions/changed-files@v45 with: files_from_source_file: .github/workflows/data/s3/tracked.txt files_ignore_from_source_file: .github/workflows/data/s3/ignored.txt @@ -610,7 +610,7 @@ jobs: - name: Check if SFTP files are changed id: changed-sftp - uses: tj-actions/changed-files@v44 + uses: tj-actions/changed-files@v45 with: 
files_from_source_file: .github/workflows/data/sftp/tracked.txt files_ignore_from_source_file: .github/workflows/data/sftp/ignored.txt @@ -640,7 +640,7 @@ jobs: - name: Check if Samba files are changed id: changed-samba - uses: tj-actions/changed-files@v44 + uses: tj-actions/changed-files@v45 with: files_from_source_file: .github/workflows/data/samba/tracked.txt files_ignore_from_source_file: .github/workflows/data/samba/ignored.txt @@ -670,7 +670,7 @@ jobs: - name: Check if WebDAV files are changed id: changed-webdav - uses: tj-actions/changed-files@v44 + uses: tj-actions/changed-files@v45 with: files_from_source_file: .github/workflows/data/webdav/tracked.txt files_ignore_from_source_file: .github/workflows/data/webdav/ignored.txt From c6b09be3f019a08889f5a388dd6ceb3bd8ab714e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Mon, 26 Aug 2024 16:39:08 +0000 Subject: [PATCH 46/64] [DOP-18574] Relax check for number of yields in hooks --- docs/changelog/next_release/+yield.feature.rst | 1 + onetl/hooks/hook.py | 2 +- tests/tests_unit/test_hooks/test_hooks_callback.py | 4 ++-- 3 files changed, 4 insertions(+), 3 deletions(-) create mode 100644 docs/changelog/next_release/+yield.feature.rst diff --git a/docs/changelog/next_release/+yield.feature.rst b/docs/changelog/next_release/+yield.feature.rst new file mode 100644 index 00000000..efc58606 --- /dev/null +++ b/docs/changelog/next_release/+yield.feature.rst @@ -0,0 +1 @@ +Do not raise exception if yield-based hook does not yield anything (no yield means no override). diff --git a/onetl/hooks/hook.py b/onetl/hooks/hook.py index d49297f1..619cff7d 100644 --- a/onetl/hooks/hook.py +++ b/onetl/hooks/hook.py @@ -285,7 +285,7 @@ def __enter__(self): try: self.first_yield_result = self.gen.send(None) except StopIteration: - raise RuntimeError("generator didn't yield") from None + pass return self diff --git a/tests/tests_unit/test_hooks/test_hooks_callback.py b/tests/tests_unit/test_hooks/test_hooks_callback.py index d1b69cf9..7fbc00ed 100644 --- a/tests/tests_unit/test_hooks/test_hooks_callback.py +++ b/tests/tests_unit/test_hooks/test_hooks_callback.py @@ -291,8 +291,8 @@ def plus(self, arg: int) -> int: def modify_callback(self, arg: int): yield from (i for i in ()) # noqa: WPS335 - with pytest.raises(RuntimeError, match="generator didn't yield"): - Calculator(1).plus(2) + # no yield = no override + assert Calculator(1).plus(2) == 3 def test_hooks_execute_callback_too_many_yields(caplog): From 3c4a1e02ebbb3658642c3c96aeabc9539f51abc7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Wed, 28 Aug 2024 10:25:35 +0000 Subject: [PATCH 47/64] [DOP-18570] Fix SparkMetricsListener.onExecutionEnd on Python 3.9 and below --- onetl/_metrics/listener/execution.py | 4 ++++ onetl/_metrics/listener/listener.py | 4 +++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/onetl/_metrics/listener/execution.py b/onetl/_metrics/listener/execution.py index f5749e16..1fbc3925 100644 --- a/onetl/_metrics/listener/execution.py +++ b/onetl/_metrics/listener/execution.py @@ -40,6 +40,10 @@ class SparkSQLMetricNames(str, Enum): # noqa: WPS338 def __str__(self): return self.value + @classmethod + def values(cls): + return
set(cls.__members__.values()) + + @dataclass class SparkListenerExecution: diff --git a/onetl/_metrics/listener/listener.py b/onetl/_metrics/listener/listener.py index 997f22a7..e78dca2a 100644 --- a/onetl/_metrics/listener/listener.py +++ b/onetl/_metrics/listener/listener.py @@ -12,6 +12,8 @@ SparkSQLMetricNames, ) +KNOWN_METRICS = SparkSQLMetricNames.values() + @dataclass class SparkMetricsListener(BaseSparkListener): @@ -81,7 +83,7 @@ def onExecutionEnd(self, event): for i in range(metrics.size()): metric = metrics.apply(i) metric_name = metric.name() - if metric_name not in SparkSQLMetricNames: + if metric_name not in KNOWN_METRICS: continue metric_value = metric_values.get(metric.accumulatorId()) if not metric_value.isDefined(): From 98a57842336c3cb6b7db05e5845c472307665781 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Tue, 27 Aug 2024 12:40:00 +0000 Subject: [PATCH 48/64] [DOP-19024] Fix passing custom JDBC options to Greenplum.extra --- docs/changelog/next_release/308.bugfix.rst | 1 + .../db_connection/greenplum/connection.py | 42 ++++++++++++------- .../test_greenplum_unit.py | 28 +++++++++++++ 3 files changed, 57 insertions(+), 14 deletions(-) create mode 100644 docs/changelog/next_release/308.bugfix.rst diff --git a/docs/changelog/next_release/308.bugfix.rst b/docs/changelog/next_release/308.bugfix.rst new file mode 100644 index 00000000..3ffcdcc5 --- /dev/null +++ b/docs/changelog/next_release/308.bugfix.rst @@ -0,0 +1 @@ +Fix passing ``Greenplum(extra={"options": ...})`` during read/write operations. diff --git a/onetl/connection/db_connection/greenplum/connection.py b/onetl/connection/db_connection/greenplum/connection.py index 0f40436f..cc3191af 100644 --- a/onetl/connection/db_connection/greenplum/connection.py +++ b/onetl/connection/db_connection/greenplum/connection.py @@ -7,6 +7,7 @@ import textwrap import warnings from typing import TYPE_CHECKING, Any, ClassVar +from urllib.parse import quote, urlencode, urlparse, urlunparse from etl_entities.instance import Host @@ -274,17 +275,20 @@ def __str__(self): def jdbc_url(self) -> str: return f"jdbc:postgresql://{self.host}:{self.port}/{self.database}" + @property + def jdbc_custom_params(self) -> dict: + result = { + key: value + for key, value in self.extra.dict(by_alias=True).items() + if not (key.startswith("server.") or key.startswith("pool.")) + } + result["ApplicationName"] = result.get("ApplicationName", self.spark.sparkContext.appName) + return result + @property def jdbc_params(self) -> dict: result = super().jdbc_params - result.update( - { - key: value - for key, value in self.extra.dict(by_alias=True).items() - if not (key.startswith("server.") or key.startswith("pool.")) - }, - ) - result["ApplicationName"] = result.get("ApplicationName", self.spark.sparkContext.appName) + result.update(self.jdbc_custom_params) return result @slot @@ -305,7 +309,7 @@ def read_source_as_df( fake_query_for_log = self.dialect.get_sql_query(table=source, columns=columns, where=where, limit=limit) log_lines(log, fake_query_for_log) - df = self.spark.read.format("greenplum").options(**self._connector_params(source), **read_options).load() + df = self.spark.read.format("greenplum").options(**self._get_connector_params(source), **read_options).load() self._check_expected_jobs_number(df, action="read") if where: @@ -340,7 +344,7 @@ def write_df_to_target( else
write_options.if_exists.value ) df.write.format("greenplum").options( - **self._connector_params(target), + **self._get_connector_params(target), **options_dict, ).mode(mode).save() @@ -425,21 +429,31 @@ def _check_java_class_imported(cls, spark): raise ValueError(msg) from e return spark - def _connector_params( + def _get_connector_params( self, table: str, ) -> dict: schema, table_name = table.split(".") # noqa: WPS414 extra = self.extra.dict(by_alias=True, exclude_none=True) - extra = {key: value for key, value in extra.items() if key.startswith("server.") or key.startswith("pool.")} + greenplum_connector_options = { + key: value for key, value in extra.items() if key.startswith("server.") or key.startswith("pool.") + } + + # Greenplum connector requires all JDBC params to be passed via JDBC URL: + # https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.3/greenplum-connector-spark/using_the_connector.html#specifying-session-parameters + parsed_jdbc_url = urlparse(self.jdbc_url) + sorted_jdbc_params = [(k, v) for k, v in sorted(self.jdbc_custom_params.items(), key=lambda x: x[0].lower())] + jdbc_url_query = urlencode(sorted_jdbc_params, quote_via=quote) + jdbc_url = urlunparse(parsed_jdbc_url._replace(query=jdbc_url_query)) + return { "driver": self.DRIVER, - "url": self.jdbc_url, + "url": jdbc_url, "user": self.user, "password": self.password.get_secret_value(), "dbschema": schema, "dbtable": table_name, - **extra, + **greenplum_connector_options, } def _options_to_connection_properties(self, options: JDBCFetchOptions | JDBCExecuteOptions): diff --git a/tests/tests_unit/tests_db_connection_unit/test_greenplum_unit.py b/tests/tests_unit/tests_db_connection_unit/test_greenplum_unit.py index 47821642..b6ea9544 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_greenplum_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_greenplum_unit.py @@ -128,6 +128,14 @@ def test_greenplum(spark_mock): "ApplicationName": "abc", "tcpKeepAlive": "true", } + assert conn._get_connector_params("some.table") == { + "user": "user", + "password": "passwd", + "driver": "org.postgresql.Driver", + "url": "jdbc:postgresql://some_host:5432/database?ApplicationName=abc&tcpKeepAlive=true", + "dbschema": "some", + "dbtable": "table", + } assert "passwd" not in repr(conn) @@ -154,6 +162,14 @@ def test_greenplum_with_port(spark_mock): "ApplicationName": "abc", "tcpKeepAlive": "true", } + assert conn._get_connector_params("some.table") == { + "user": "user", + "password": "passwd", + "driver": "org.postgresql.Driver", + "url": "jdbc:postgresql://some_host:5000/database?ApplicationName=abc&tcpKeepAlive=true", + "dbschema": "some", + "dbtable": "table", + } assert conn.instance_url == "greenplum://some_host:5000/database" assert str(conn) == "Greenplum[some_host:5000/database]" @@ -174,6 +190,7 @@ def test_greenplum_with_extra(spark_mock): "autosave": "always", "tcpKeepAlive": "false", "ApplicationName": "override", + "options": "-c search_path=public", "server.port": 8000, "pool.maxSize": 40, }, @@ -191,6 +208,17 @@ def test_greenplum_with_extra(spark_mock): "ApplicationName": "override", "tcpKeepAlive": "false", "autosave": "always", + "options": "-c search_path=public", + } + assert conn._get_connector_params("some.table") == { + "user": "user", + "password": "passwd", + "driver": "org.postgresql.Driver", + "url": "jdbc:postgresql://some_host:5432/database?ApplicationName=override&autosave=always&options=-c%20search_path%3Dpublic&tcpKeepAlive=false", + "dbschema": "some", + 
"dbtable": "table", + "pool.maxSize": 40, + "server.port": 8000, } From a7d4f40497974e34d2f9f6015d2d93b664a71e7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Thu, 29 Aug 2024 06:59:00 +0000 Subject: [PATCH 49/64] Update LICENSE --- .spdx-license-header.txt | 2 +- LICENSE.txt | 2 +- docs/conf.py | 2 +- onetl/__init__.py | 2 +- onetl/_metrics/__init__.py | 2 +- onetl/_metrics/command.py | 2 +- onetl/_metrics/driver.py | 2 +- onetl/_metrics/executor.py | 2 +- onetl/_metrics/extract.py | 2 +- onetl/_metrics/input.py | 2 +- onetl/_metrics/listener/__init__.py | 2 +- onetl/_metrics/listener/base.py | 2 +- onetl/_metrics/listener/execution.py | 2 +- onetl/_metrics/listener/job.py | 2 +- onetl/_metrics/listener/listener.py | 2 +- onetl/_metrics/listener/stage.py | 2 +- onetl/_metrics/listener/task.py | 2 +- onetl/_metrics/output.py | 2 +- onetl/_metrics/recorder.py | 2 +- onetl/_util/__init__.py | 2 +- onetl/_util/classproperty.py | 2 +- onetl/_util/file.py | 2 +- onetl/_util/hadoop.py | 2 +- onetl/_util/java.py | 2 +- onetl/_util/scala.py | 2 +- onetl/_util/spark.py | 2 +- onetl/_util/sql.py | 2 +- onetl/_util/version.py | 2 +- onetl/base/__init__.py | 2 +- onetl/base/base_connection.py | 2 +- onetl/base/base_db_connection.py | 2 +- onetl/base/base_file_connection.py | 2 +- onetl/base/base_file_df_connection.py | 2 +- onetl/base/base_file_filter.py | 2 +- onetl/base/base_file_format.py | 2 +- onetl/base/base_file_limit.py | 2 +- onetl/base/contains_exception.py | 2 +- onetl/base/contains_get_df_schema.py | 2 +- onetl/base/contains_get_min_max_values.py | 2 +- onetl/base/path_protocol.py | 2 +- onetl/base/path_stat_protocol.py | 2 +- onetl/base/pure_path_protocol.py | 2 +- onetl/base/supports_rename_dir.py | 2 +- onetl/connection/__init__.py | 2 +- onetl/connection/db_connection/__init__.py | 2 +- onetl/connection/db_connection/clickhouse/__init__.py | 2 +- onetl/connection/db_connection/clickhouse/connection.py | 2 +- onetl/connection/db_connection/clickhouse/dialect.py | 2 +- onetl/connection/db_connection/clickhouse/options.py | 2 +- onetl/connection/db_connection/db_connection/__init__.py | 2 +- onetl/connection/db_connection/db_connection/connection.py | 2 +- onetl/connection/db_connection/db_connection/dialect.py | 2 +- onetl/connection/db_connection/dialect_mixins/__init__.py | 2 +- .../db_connection/dialect_mixins/not_support_columns.py | 2 +- .../db_connection/dialect_mixins/not_support_df_schema.py | 2 +- .../connection/db_connection/dialect_mixins/not_support_hint.py | 2 +- .../db_connection/dialect_mixins/not_support_where.py | 2 +- .../db_connection/dialect_mixins/requires_df_schema.py | 2 +- .../db_connection/dialect_mixins/support_columns_list.py | 2 +- .../connection/db_connection/dialect_mixins/support_hint_str.py | 2 +- .../db_connection/dialect_mixins/support_hwm_expression_str.py | 2 +- .../connection/db_connection/dialect_mixins/support_name_any.py | 2 +- .../dialect_mixins/support_name_with_schema_only.py | 2 +- .../db_connection/dialect_mixins/support_where_str.py | 2 +- onetl/connection/db_connection/greenplum/__init__.py | 2 +- onetl/connection/db_connection/greenplum/connection.py | 2 +- onetl/connection/db_connection/greenplum/connection_limit.py | 2 +- onetl/connection/db_connection/greenplum/dialect.py | 2 +- onetl/connection/db_connection/greenplum/options.py | 2 +- 
onetl/connection/db_connection/hive/__init__.py | 2 +- onetl/connection/db_connection/hive/connection.py | 2 +- onetl/connection/db_connection/hive/dialect.py | 2 +- onetl/connection/db_connection/hive/options.py | 2 +- onetl/connection/db_connection/hive/slots.py | 2 +- onetl/connection/db_connection/jdbc_connection/__init__.py | 2 +- onetl/connection/db_connection/jdbc_connection/connection.py | 2 +- onetl/connection/db_connection/jdbc_connection/dialect.py | 2 +- onetl/connection/db_connection/jdbc_connection/options.py | 2 +- onetl/connection/db_connection/jdbc_mixin/__init__.py | 2 +- onetl/connection/db_connection/jdbc_mixin/connection.py | 2 +- onetl/connection/db_connection/jdbc_mixin/options.py | 2 +- onetl/connection/db_connection/kafka/__init__.py | 2 +- onetl/connection/db_connection/kafka/connection.py | 2 +- onetl/connection/db_connection/kafka/dialect.py | 2 +- onetl/connection/db_connection/kafka/extra.py | 2 +- onetl/connection/db_connection/kafka/kafka_auth.py | 2 +- onetl/connection/db_connection/kafka/kafka_basic_auth.py | 2 +- onetl/connection/db_connection/kafka/kafka_kerberos_auth.py | 2 +- .../connection/db_connection/kafka/kafka_plaintext_protocol.py | 2 +- onetl/connection/db_connection/kafka/kafka_protocol.py | 2 +- onetl/connection/db_connection/kafka/kafka_scram_auth.py | 2 +- onetl/connection/db_connection/kafka/kafka_ssl_protocol.py | 2 +- onetl/connection/db_connection/kafka/options.py | 2 +- onetl/connection/db_connection/kafka/slots.py | 2 +- onetl/connection/db_connection/mongodb/__init__.py | 2 +- onetl/connection/db_connection/mongodb/connection.py | 2 +- onetl/connection/db_connection/mongodb/dialect.py | 2 +- onetl/connection/db_connection/mongodb/options.py | 2 +- onetl/connection/db_connection/mssql/__init__.py | 2 +- onetl/connection/db_connection/mssql/connection.py | 2 +- onetl/connection/db_connection/mssql/dialect.py | 2 +- onetl/connection/db_connection/mssql/options.py | 2 +- onetl/connection/db_connection/mysql/__init__.py | 2 +- onetl/connection/db_connection/mysql/connection.py | 2 +- onetl/connection/db_connection/mysql/dialect.py | 2 +- onetl/connection/db_connection/mysql/options.py | 2 +- onetl/connection/db_connection/oracle/__init__.py | 2 +- onetl/connection/db_connection/oracle/connection.py | 2 +- onetl/connection/db_connection/oracle/dialect.py | 2 +- onetl/connection/db_connection/oracle/options.py | 2 +- onetl/connection/db_connection/postgres/__init__.py | 2 +- onetl/connection/db_connection/postgres/connection.py | 2 +- onetl/connection/db_connection/postgres/dialect.py | 2 +- onetl/connection/db_connection/postgres/options.py | 2 +- onetl/connection/db_connection/teradata/__init__.py | 2 +- onetl/connection/db_connection/teradata/connection.py | 2 +- onetl/connection/db_connection/teradata/dialect.py | 2 +- onetl/connection/db_connection/teradata/options.py | 2 +- onetl/connection/file_connection/__init__.py | 2 +- onetl/connection/file_connection/file_connection.py | 2 +- onetl/connection/file_connection/ftp.py | 2 +- onetl/connection/file_connection/ftps.py | 2 +- onetl/connection/file_connection/hdfs/__init__.py | 2 +- onetl/connection/file_connection/hdfs/connection.py | 2 +- onetl/connection/file_connection/hdfs/slots.py | 2 +- onetl/connection/file_connection/mixins/__init__.py | 2 +- onetl/connection/file_connection/mixins/rename_dir_mixin.py | 2 +- onetl/connection/file_connection/s3.py | 2 +- onetl/connection/file_connection/samba.py | 2 +- onetl/connection/file_connection/sftp.py | 2 +- 
onetl/connection/file_connection/webdav.py | 2 +- onetl/connection/file_df_connection/__init__.py | 2 +- onetl/connection/file_df_connection/spark_file_df_connection.py | 2 +- onetl/connection/file_df_connection/spark_hdfs/__init__.py | 2 +- onetl/connection/file_df_connection/spark_hdfs/connection.py | 2 +- onetl/connection/file_df_connection/spark_hdfs/slots.py | 2 +- onetl/connection/file_df_connection/spark_local_fs.py | 2 +- onetl/connection/file_df_connection/spark_s3/__init__.py | 2 +- onetl/connection/file_df_connection/spark_s3/connection.py | 2 +- onetl/connection/file_df_connection/spark_s3/extra.py | 2 +- onetl/connection/kerberos_helpers.py | 2 +- onetl/core/__init__.py | 2 +- onetl/core/file_filter/__init__.py | 2 +- onetl/core/file_filter/file_filter.py | 2 +- onetl/core/file_limit/__init__.py | 2 +- onetl/core/file_limit/file_limit.py | 2 +- onetl/db/__init__.py | 2 +- onetl/db/db_reader/__init__.py | 2 +- onetl/db/db_reader/db_reader.py | 2 +- onetl/db/db_writer/__init__.py | 2 +- onetl/db/db_writer/db_writer.py | 2 +- onetl/exception.py | 2 +- onetl/file/__init__.py | 2 +- onetl/file/file_df_reader/__init__.py | 2 +- onetl/file/file_df_reader/file_df_reader.py | 2 +- onetl/file/file_df_reader/options.py | 2 +- onetl/file/file_df_writer/__init__.py | 2 +- onetl/file/file_df_writer/file_df_writer.py | 2 +- onetl/file/file_df_writer/options.py | 2 +- onetl/file/file_downloader/__init__.py | 2 +- onetl/file/file_downloader/file_downloader.py | 2 +- onetl/file/file_downloader/options.py | 2 +- onetl/file/file_downloader/result.py | 2 +- onetl/file/file_mover/__init__.py | 2 +- onetl/file/file_mover/file_mover.py | 2 +- onetl/file/file_mover/options.py | 2 +- onetl/file/file_mover/result.py | 2 +- onetl/file/file_result.py | 2 +- onetl/file/file_set.py | 2 +- onetl/file/file_uploader/__init__.py | 2 +- onetl/file/file_uploader/file_uploader.py | 2 +- onetl/file/file_uploader/options.py | 2 +- onetl/file/file_uploader/result.py | 2 +- onetl/file/filter/__init__.py | 2 +- onetl/file/filter/exclude_dir.py | 2 +- onetl/file/filter/file_hwm.py | 2 +- onetl/file/filter/glob.py | 2 +- onetl/file/filter/match_all_filters.py | 2 +- onetl/file/filter/regexp.py | 2 +- onetl/file/format/__init__.py | 2 +- onetl/file/format/avro.py | 2 +- onetl/file/format/csv.py | 2 +- onetl/file/format/excel.py | 2 +- onetl/file/format/file_format.py | 2 +- onetl/file/format/json.py | 2 +- onetl/file/format/jsonline.py | 2 +- onetl/file/format/orc.py | 2 +- onetl/file/format/parquet.py | 2 +- onetl/file/format/xml.py | 2 +- onetl/file/limit/__init__.py | 2 +- onetl/file/limit/limits_reached.py | 2 +- onetl/file/limit/limits_stop_at.py | 2 +- onetl/file/limit/max_files_count.py | 2 +- onetl/file/limit/reset_limits.py | 2 +- onetl/hooks/__init__.py | 2 +- onetl/hooks/hook.py | 2 +- onetl/hooks/hook_collection.py | 2 +- onetl/hooks/hooks_state.py | 2 +- onetl/hooks/method_inheritance_stack.py | 2 +- onetl/hooks/slot.py | 2 +- onetl/hooks/support_hooks.py | 2 +- onetl/hwm/__init__.py | 2 +- onetl/hwm/auto_hwm.py | 2 +- onetl/hwm/store/__init__.py | 2 +- onetl/hwm/store/hwm_class_registry.py | 2 +- onetl/hwm/store/yaml_hwm_store.py | 2 +- onetl/hwm/window.py | 2 +- onetl/impl/__init__.py | 2 +- onetl/impl/base_model.py | 2 +- onetl/impl/failed_local_file.py | 2 +- onetl/impl/file_exist_behavior.py | 2 +- onetl/impl/frozen_model.py | 2 +- onetl/impl/generic_options.py | 2 +- onetl/impl/local_path.py | 2 +- onetl/impl/path_container.py | 2 +- onetl/impl/path_repr.py | 2 +- onetl/impl/remote_directory.py | 2 +- 
onetl/impl/remote_file.py | 2 +- onetl/impl/remote_path.py | 2 +- onetl/impl/remote_path_stat.py | 2 +- onetl/log.py | 2 +- onetl/plugins/__init__.py | 2 +- onetl/plugins/import_plugins.py | 2 +- onetl/strategy/__init__.py | 2 +- onetl/strategy/base_strategy.py | 2 +- onetl/strategy/batch_hwm_strategy.py | 2 +- onetl/strategy/hwm_store/__init__.py | 2 +- onetl/strategy/hwm_strategy.py | 2 +- onetl/strategy/incremental_strategy.py | 2 +- onetl/strategy/snapshot_strategy.py | 2 +- onetl/strategy/strategy_manager.py | 2 +- onetl/version.py | 2 +- 232 files changed, 232 insertions(+), 232 deletions(-) diff --git a/.spdx-license-header.txt b/.spdx-license-header.txt index 19a8b2e4..44939ae1 100644 --- a/.spdx-license-header.txt +++ b/.spdx-license-header.txt @@ -1,2 +1,2 @@ -SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +SPDX-FileCopyrightText: 2021-2024 MTS PJSC SPDX-License-Identifier: Apache-2.0 diff --git a/LICENSE.txt b/LICENSE.txt index a22e190a..6b68d87e 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,4 +1,4 @@ -Copyright 2021-2024 MTS (Mobile Telesystems). All rights reserved. +Copyright 2021-2024 MTS PJSC. All rights reserved. Apache License Version 2.0, January 2004 diff --git a/docs/conf.py b/docs/conf.py index 867d4daf..e1cc58f0 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -25,7 +25,7 @@ # -- Project information ----------------------------------------------------- project = "onETL" -copyright = "2021-2024 MTS (Mobile Telesystems)" +copyright = "2021-2024 MTS PJSC" author = "DataOps.ETL" # The version info for the project you're documenting, acts as replacement for diff --git a/onetl/__init__.py b/onetl/__init__.py index 04793543..02dbd6aa 100644 --- a/onetl/__init__.py +++ b/onetl/__init__.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 import os diff --git a/onetl/_metrics/__init__.py b/onetl/_metrics/__init__.py index 5d7482b6..94099688 100644 --- a/onetl/_metrics/__init__.py +++ b/onetl/_metrics/__init__.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from onetl._metrics.command import SparkCommandMetrics from onetl._metrics.driver import SparkDriverMetrics diff --git a/onetl/_metrics/command.py b/onetl/_metrics/command.py index 2a8a53c6..c823e4c4 100644 --- a/onetl/_metrics/command.py +++ b/onetl/_metrics/command.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/_metrics/driver.py b/onetl/_metrics/driver.py index 4e685719..d3b49d96 100644 --- a/onetl/_metrics/driver.py +++ b/onetl/_metrics/driver.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/_metrics/executor.py b/onetl/_metrics/executor.py index 3fd6f3fc..bbb6d732 100644 --- a/onetl/_metrics/executor.py +++ b/onetl/_metrics/executor.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/_metrics/extract.py b/onetl/_metrics/extract.py index 8b623bb8..4b058092 100644 --- 
a/onetl/_metrics/extract.py +++ b/onetl/_metrics/extract.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/_metrics/input.py b/onetl/_metrics/input.py index 39061311..71451634 100644 --- a/onetl/_metrics/input.py +++ b/onetl/_metrics/input.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/_metrics/listener/__init__.py b/onetl/_metrics/listener/__init__.py index 112e4fba..720c3da6 100644 --- a/onetl/_metrics/listener/__init__.py +++ b/onetl/_metrics/listener/__init__.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from onetl._metrics.listener.execution import ( SparkListenerExecution, diff --git a/onetl/_metrics/listener/base.py b/onetl/_metrics/listener/base.py index a8d5b855..bbc6431c 100644 --- a/onetl/_metrics/listener/base.py +++ b/onetl/_metrics/listener/base.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/_metrics/listener/execution.py b/onetl/_metrics/listener/execution.py index 1fbc3925..a0d2a522 100644 --- a/onetl/_metrics/listener/execution.py +++ b/onetl/_metrics/listener/execution.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/_metrics/listener/job.py b/onetl/_metrics/listener/job.py index 915f1f3d..5581d76e 100644 --- a/onetl/_metrics/listener/job.py +++ b/onetl/_metrics/listener/job.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/_metrics/listener/listener.py b/onetl/_metrics/listener/listener.py index e78dca2a..04fe53c2 100644 --- a/onetl/_metrics/listener/listener.py +++ b/onetl/_metrics/listener/listener.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/_metrics/listener/stage.py b/onetl/_metrics/listener/stage.py index 89d6a6ae..b858e151 100644 --- a/onetl/_metrics/listener/stage.py +++ b/onetl/_metrics/listener/stage.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/_metrics/listener/task.py b/onetl/_metrics/listener/task.py index ced938a8..5a17ffc5 100644 --- a/onetl/_metrics/listener/task.py +++ b/onetl/_metrics/listener/task.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/_metrics/output.py b/onetl/_metrics/output.py index 8600bb68..8f27a346 100644 --- a/onetl/_metrics/output.py +++ b/onetl/_metrics/output.py @@ -1,4 
+1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/_metrics/recorder.py b/onetl/_metrics/recorder.py index 4cc5745b..4c65fe8d 100644 --- a/onetl/_metrics/recorder.py +++ b/onetl/_metrics/recorder.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/_util/__init__.py b/onetl/_util/__init__.py index 07325b1d..54237d1f 100644 --- a/onetl/_util/__init__.py +++ b/onetl/_util/__init__.py @@ -1,2 +1,2 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 diff --git a/onetl/_util/classproperty.py b/onetl/_util/classproperty.py index e971638a..4e0ed39e 100644 --- a/onetl/_util/classproperty.py +++ b/onetl/_util/classproperty.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/_util/file.py b/onetl/_util/file.py index ee27c57f..2dbb9915 100644 --- a/onetl/_util/file.py +++ b/onetl/_util/file.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/_util/hadoop.py b/onetl/_util/hadoop.py index aed572e0..12376749 100644 --- a/onetl/_util/hadoop.py +++ b/onetl/_util/hadoop.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/_util/java.py b/onetl/_util/java.py index 1ec50a0d..c0dcbd0d 100644 --- a/onetl/_util/java.py +++ b/onetl/_util/java.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/_util/scala.py b/onetl/_util/scala.py index 5e6c21bc..5d472f2f 100644 --- a/onetl/_util/scala.py +++ b/onetl/_util/scala.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/_util/spark.py b/onetl/_util/spark.py index 547095af..ab2090b0 100644 --- a/onetl/_util/spark.py +++ b/onetl/_util/spark.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/_util/sql.py b/onetl/_util/sql.py index 37aa09a7..80118555 100644 --- a/onetl/_util/sql.py +++ b/onetl/_util/sql.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 def clear_statement(statement: str) -> str: """ diff --git a/onetl/_util/version.py b/onetl/_util/version.py index 85bde1c7..075928c6 100644 --- a/onetl/_util/version.py +++ b/onetl/_util/version.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS 
PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/base/__init__.py b/onetl/base/__init__.py index 4178e7c9..7c30c412 100644 --- a/onetl/base/__init__.py +++ b/onetl/base/__init__.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from onetl.base.base_connection import BaseConnection from onetl.base.base_db_connection import BaseDBConnection, BaseDBDialect diff --git a/onetl/base/base_connection.py b/onetl/base/base_connection.py index dc2cbd4f..264fc367 100644 --- a/onetl/base/base_connection.py +++ b/onetl/base/base_connection.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from abc import ABC, abstractmethod from typing import TypeVar diff --git a/onetl/base/base_db_connection.py b/onetl/base/base_db_connection.py index 2c427deb..6e2e7e08 100644 --- a/onetl/base/base_db_connection.py +++ b/onetl/base/base_db_connection.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/base/base_file_connection.py b/onetl/base/base_file_connection.py index 81d57bfb..28949499 100644 --- a/onetl/base/base_file_connection.py +++ b/onetl/base/base_file_connection.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/base/base_file_df_connection.py b/onetl/base/base_file_df_connection.py index 28c57f3c..85e22da4 100644 --- a/onetl/base/base_file_df_connection.py +++ b/onetl/base/base_file_df_connection.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/base/base_file_filter.py b/onetl/base/base_file_filter.py index 01a9893f..d2fb0a64 100644 --- a/onetl/base/base_file_filter.py +++ b/onetl/base/base_file_filter.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/base/base_file_format.py b/onetl/base/base_file_format.py index a4c72e3e..17fb3612 100644 --- a/onetl/base/base_file_format.py +++ b/onetl/base/base_file_format.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/base/base_file_limit.py b/onetl/base/base_file_limit.py index d930690d..b8cdd0f3 100644 --- a/onetl/base/base_file_limit.py +++ b/onetl/base/base_file_limit.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/base/contains_exception.py b/onetl/base/contains_exception.py index cf9ae5aa..12b7a09c 100644 --- a/onetl/base/contains_exception.py +++ b/onetl/base/contains_exception.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 
2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from typing_extensions import Protocol, runtime_checkable diff --git a/onetl/base/contains_get_df_schema.py b/onetl/base/contains_get_df_schema.py index ccb7d34a..f607cf60 100644 --- a/onetl/base/contains_get_df_schema.py +++ b/onetl/base/contains_get_df_schema.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/base/contains_get_min_max_values.py b/onetl/base/contains_get_min_max_values.py index d23029d5..e2362269 100644 --- a/onetl/base/contains_get_min_max_values.py +++ b/onetl/base/contains_get_min_max_values.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/base/path_protocol.py b/onetl/base/path_protocol.py index 68594a10..eed285a2 100644 --- a/onetl/base/path_protocol.py +++ b/onetl/base/path_protocol.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/base/path_stat_protocol.py b/onetl/base/path_stat_protocol.py index a42f0569..961ad288 100644 --- a/onetl/base/path_stat_protocol.py +++ b/onetl/base/path_stat_protocol.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/base/pure_path_protocol.py b/onetl/base/pure_path_protocol.py index d5757c3b..ccf69c40 100644 --- a/onetl/base/pure_path_protocol.py +++ b/onetl/base/pure_path_protocol.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/base/supports_rename_dir.py b/onetl/base/supports_rename_dir.py index 8f0d3971..f44b42d1 100644 --- a/onetl/base/supports_rename_dir.py +++ b/onetl/base/supports_rename_dir.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/__init__.py b/onetl/connection/__init__.py index 4a25e210..608beb41 100644 --- a/onetl/connection/__init__.py +++ b/onetl/connection/__init__.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/db_connection/__init__.py b/onetl/connection/db_connection/__init__.py index 07325b1d..54237d1f 100644 --- a/onetl/connection/db_connection/__init__.py +++ b/onetl/connection/db_connection/__init__.py @@ -1,2 +1,2 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 diff --git a/onetl/connection/db_connection/clickhouse/__init__.py b/onetl/connection/db_connection/clickhouse/__init__.py index b830a78d..d2a57f44 100644 --- a/onetl/connection/db_connection/clickhouse/__init__.py +++ b/onetl/connection/db_connection/clickhouse/__init__.py @@ -1,4 +1,4 @@ -# 
SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from onetl.connection.db_connection.clickhouse.connection import ( Clickhouse, diff --git a/onetl/connection/db_connection/clickhouse/connection.py b/onetl/connection/db_connection/clickhouse/connection.py index 482cc941..21b282b3 100644 --- a/onetl/connection/db_connection/clickhouse/connection.py +++ b/onetl/connection/db_connection/clickhouse/connection.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/db_connection/clickhouse/dialect.py b/onetl/connection/db_connection/clickhouse/dialect.py index 2c03620d..394843b8 100644 --- a/onetl/connection/db_connection/clickhouse/dialect.py +++ b/onetl/connection/db_connection/clickhouse/dialect.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/db_connection/clickhouse/options.py b/onetl/connection/db_connection/clickhouse/options.py index 5e35c969..a6907793 100644 --- a/onetl/connection/db_connection/clickhouse/options.py +++ b/onetl/connection/db_connection/clickhouse/options.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 diff --git a/onetl/connection/db_connection/db_connection/__init__.py b/onetl/connection/db_connection/db_connection/__init__.py index 71439ddc..acdab0d3 100644 --- a/onetl/connection/db_connection/db_connection/__init__.py +++ b/onetl/connection/db_connection/db_connection/__init__.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from onetl.connection.db_connection.db_connection.connection import DBConnection from onetl.connection.db_connection.db_connection.dialect import DBDialect diff --git a/onetl/connection/db_connection/db_connection/connection.py b/onetl/connection/db_connection/db_connection/connection.py index 1372cd69..1158942c 100644 --- a/onetl/connection/db_connection/db_connection/connection.py +++ b/onetl/connection/db_connection/db_connection/connection.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/db_connection/db_connection/dialect.py b/onetl/connection/db_connection/db_connection/dialect.py index 73efba33..7080e324 100644 --- a/onetl/connection/db_connection/db_connection/dialect.py +++ b/onetl/connection/db_connection/db_connection/dialect.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/db_connection/dialect_mixins/__init__.py b/onetl/connection/db_connection/dialect_mixins/__init__.py index da36f089..b40538f5 100644 --- a/onetl/connection/db_connection/dialect_mixins/__init__.py +++ b/onetl/connection/db_connection/dialect_mixins/__init__.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# 
SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from onetl.connection.db_connection.dialect_mixins.not_support_columns import ( NotSupportColumns, diff --git a/onetl/connection/db_connection/dialect_mixins/not_support_columns.py b/onetl/connection/db_connection/dialect_mixins/not_support_columns.py index 2d98ac74..3ba8ae48 100644 --- a/onetl/connection/db_connection/dialect_mixins/not_support_columns.py +++ b/onetl/connection/db_connection/dialect_mixins/not_support_columns.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/db_connection/dialect_mixins/not_support_df_schema.py b/onetl/connection/db_connection/dialect_mixins/not_support_df_schema.py index b99f3873..1973799b 100644 --- a/onetl/connection/db_connection/dialect_mixins/not_support_df_schema.py +++ b/onetl/connection/db_connection/dialect_mixins/not_support_df_schema.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/db_connection/dialect_mixins/not_support_hint.py b/onetl/connection/db_connection/dialect_mixins/not_support_hint.py index 7680c4aa..47039532 100644 --- a/onetl/connection/db_connection/dialect_mixins/not_support_hint.py +++ b/onetl/connection/db_connection/dialect_mixins/not_support_hint.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/db_connection/dialect_mixins/not_support_where.py b/onetl/connection/db_connection/dialect_mixins/not_support_where.py index 122de982..7bb3956e 100644 --- a/onetl/connection/db_connection/dialect_mixins/not_support_where.py +++ b/onetl/connection/db_connection/dialect_mixins/not_support_where.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/db_connection/dialect_mixins/requires_df_schema.py b/onetl/connection/db_connection/dialect_mixins/requires_df_schema.py index 9b026572..cb9261bc 100644 --- a/onetl/connection/db_connection/dialect_mixins/requires_df_schema.py +++ b/onetl/connection/db_connection/dialect_mixins/requires_df_schema.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/db_connection/dialect_mixins/support_columns_list.py b/onetl/connection/db_connection/dialect_mixins/support_columns_list.py index e443cf8f..5d204f5c 100644 --- a/onetl/connection/db_connection/dialect_mixins/support_columns_list.py +++ b/onetl/connection/db_connection/dialect_mixins/support_columns_list.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/db_connection/dialect_mixins/support_hint_str.py b/onetl/connection/db_connection/dialect_mixins/support_hint_str.py index b90d9309..cb081808 100644 --- 
a/onetl/connection/db_connection/dialect_mixins/support_hint_str.py +++ b/onetl/connection/db_connection/dialect_mixins/support_hint_str.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/db_connection/dialect_mixins/support_hwm_expression_str.py b/onetl/connection/db_connection/dialect_mixins/support_hwm_expression_str.py index 157ffecd..e3a9ccc2 100644 --- a/onetl/connection/db_connection/dialect_mixins/support_hwm_expression_str.py +++ b/onetl/connection/db_connection/dialect_mixins/support_hwm_expression_str.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/db_connection/dialect_mixins/support_name_any.py b/onetl/connection/db_connection/dialect_mixins/support_name_any.py index dbe23024..1c7e55de 100644 --- a/onetl/connection/db_connection/dialect_mixins/support_name_any.py +++ b/onetl/connection/db_connection/dialect_mixins/support_name_any.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/db_connection/dialect_mixins/support_name_with_schema_only.py b/onetl/connection/db_connection/dialect_mixins/support_name_with_schema_only.py index d13cfc74..0e66b980 100644 --- a/onetl/connection/db_connection/dialect_mixins/support_name_with_schema_only.py +++ b/onetl/connection/db_connection/dialect_mixins/support_name_with_schema_only.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/db_connection/dialect_mixins/support_where_str.py b/onetl/connection/db_connection/dialect_mixins/support_where_str.py index c171ec4d..3949a6b1 100644 --- a/onetl/connection/db_connection/dialect_mixins/support_where_str.py +++ b/onetl/connection/db_connection/dialect_mixins/support_where_str.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/db_connection/greenplum/__init__.py b/onetl/connection/db_connection/greenplum/__init__.py index 8a401d0b..71fdd32d 100644 --- a/onetl/connection/db_connection/greenplum/__init__.py +++ b/onetl/connection/db_connection/greenplum/__init__.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from onetl.connection.db_connection.greenplum.connection import Greenplum from onetl.connection.db_connection.greenplum.dialect import GreenplumDialect diff --git a/onetl/connection/db_connection/greenplum/connection.py b/onetl/connection/db_connection/greenplum/connection.py index cc3191af..ff5c5a76 100644 --- a/onetl/connection/db_connection/greenplum/connection.py +++ b/onetl/connection/db_connection/greenplum/connection.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ 
import annotations diff --git a/onetl/connection/db_connection/greenplum/connection_limit.py b/onetl/connection/db_connection/greenplum/connection_limit.py index 32cb99d8..81ce6120 100644 --- a/onetl/connection/db_connection/greenplum/connection_limit.py +++ b/onetl/connection/db_connection/greenplum/connection_limit.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/db_connection/greenplum/dialect.py b/onetl/connection/db_connection/greenplum/dialect.py index f4bafa68..8a602215 100644 --- a/onetl/connection/db_connection/greenplum/dialect.py +++ b/onetl/connection/db_connection/greenplum/dialect.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/db_connection/greenplum/options.py b/onetl/connection/db_connection/greenplum/options.py index e1cd1902..4d19ae7a 100644 --- a/onetl/connection/db_connection/greenplum/options.py +++ b/onetl/connection/db_connection/greenplum/options.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/db_connection/hive/__init__.py b/onetl/connection/db_connection/hive/__init__.py index bc1a73e3..092d6c5a 100644 --- a/onetl/connection/db_connection/hive/__init__.py +++ b/onetl/connection/db_connection/hive/__init__.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from onetl.connection.db_connection.hive.connection import Hive from onetl.connection.db_connection.hive.dialect import HiveDialect diff --git a/onetl/connection/db_connection/hive/connection.py b/onetl/connection/db_connection/hive/connection.py index 855a0ead..6e21f9c3 100644 --- a/onetl/connection/db_connection/hive/connection.py +++ b/onetl/connection/db_connection/hive/connection.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/db_connection/hive/dialect.py b/onetl/connection/db_connection/hive/dialect.py index 38b737fe..15520bd4 100644 --- a/onetl/connection/db_connection/hive/dialect.py +++ b/onetl/connection/db_connection/hive/dialect.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/db_connection/hive/options.py b/onetl/connection/db_connection/hive/options.py index 16d21a0e..40097619 100644 --- a/onetl/connection/db_connection/hive/options.py +++ b/onetl/connection/db_connection/hive/options.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/db_connection/hive/slots.py b/onetl/connection/db_connection/hive/slots.py index 3044950f..de813e5d 100644 --- a/onetl/connection/db_connection/hive/slots.py +++ 
b/onetl/connection/db_connection/hive/slots.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/db_connection/jdbc_connection/__init__.py b/onetl/connection/db_connection/jdbc_connection/__init__.py index 476e7ea1..3b6eb5b2 100644 --- a/onetl/connection/db_connection/jdbc_connection/__init__.py +++ b/onetl/connection/db_connection/jdbc_connection/__init__.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from onetl.connection.db_connection.jdbc_connection.connection import JDBCConnection from onetl.connection.db_connection.jdbc_connection.dialect import JDBCDialect diff --git a/onetl/connection/db_connection/jdbc_connection/connection.py b/onetl/connection/db_connection/jdbc_connection/connection.py index 0f3ac024..2752dc25 100644 --- a/onetl/connection/db_connection/jdbc_connection/connection.py +++ b/onetl/connection/db_connection/jdbc_connection/connection.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/db_connection/jdbc_connection/dialect.py b/onetl/connection/db_connection/jdbc_connection/dialect.py index cbf0ceb6..1fce839a 100644 --- a/onetl/connection/db_connection/jdbc_connection/dialect.py +++ b/onetl/connection/db_connection/jdbc_connection/dialect.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/db_connection/jdbc_connection/options.py b/onetl/connection/db_connection/jdbc_connection/options.py index a2aa39ad..c04e8850 100644 --- a/onetl/connection/db_connection/jdbc_connection/options.py +++ b/onetl/connection/db_connection/jdbc_connection/options.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/db_connection/jdbc_mixin/__init__.py b/onetl/connection/db_connection/jdbc_mixin/__init__.py index 0f368a12..a4b3975b 100644 --- a/onetl/connection/db_connection/jdbc_mixin/__init__.py +++ b/onetl/connection/db_connection/jdbc_mixin/__init__.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from onetl.connection.db_connection.jdbc_mixin.connection import ( JDBCMixin, diff --git a/onetl/connection/db_connection/jdbc_mixin/connection.py b/onetl/connection/db_connection/jdbc_mixin/connection.py index 2f25b5a9..a6830ae4 100644 --- a/onetl/connection/db_connection/jdbc_mixin/connection.py +++ b/onetl/connection/db_connection/jdbc_mixin/connection.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/db_connection/jdbc_mixin/options.py b/onetl/connection/db_connection/jdbc_mixin/options.py index 2504c364..ce9710a1 100644 --- a/onetl/connection/db_connection/jdbc_mixin/options.py +++ 
b/onetl/connection/db_connection/jdbc_mixin/options.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/db_connection/kafka/__init__.py b/onetl/connection/db_connection/kafka/__init__.py index 71d01ebc..1eadb815 100644 --- a/onetl/connection/db_connection/kafka/__init__.py +++ b/onetl/connection/db_connection/kafka/__init__.py @@ -1,3 +1,3 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from onetl.connection.db_connection.kafka.connection import Kafka diff --git a/onetl/connection/db_connection/kafka/connection.py b/onetl/connection/db_connection/kafka/connection.py index 9b8bf2cd..93f9d821 100644 --- a/onetl/connection/db_connection/kafka/connection.py +++ b/onetl/connection/db_connection/kafka/connection.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/db_connection/kafka/dialect.py b/onetl/connection/db_connection/kafka/dialect.py index 16c4d605..b09c19cf 100644 --- a/onetl/connection/db_connection/kafka/dialect.py +++ b/onetl/connection/db_connection/kafka/dialect.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/db_connection/kafka/extra.py b/onetl/connection/db_connection/kafka/extra.py index 6dd95e2c..ff1b87cc 100644 --- a/onetl/connection/db_connection/kafka/extra.py +++ b/onetl/connection/db_connection/kafka/extra.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from onetl.impl import GenericOptions diff --git a/onetl/connection/db_connection/kafka/kafka_auth.py b/onetl/connection/db_connection/kafka/kafka_auth.py index f1bbdf10..2c451d5e 100644 --- a/onetl/connection/db_connection/kafka/kafka_auth.py +++ b/onetl/connection/db_connection/kafka/kafka_auth.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/db_connection/kafka/kafka_basic_auth.py b/onetl/connection/db_connection/kafka/kafka_basic_auth.py index 4038dd02..de5cc57f 100644 --- a/onetl/connection/db_connection/kafka/kafka_basic_auth.py +++ b/onetl/connection/db_connection/kafka/kafka_basic_auth.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/db_connection/kafka/kafka_kerberos_auth.py b/onetl/connection/db_connection/kafka/kafka_kerberos_auth.py index 40e9aa55..083c3787 100644 --- a/onetl/connection/db_connection/kafka/kafka_kerberos_auth.py +++ b/onetl/connection/db_connection/kafka/kafka_kerberos_auth.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git 
a/onetl/connection/db_connection/kafka/kafka_plaintext_protocol.py b/onetl/connection/db_connection/kafka/kafka_plaintext_protocol.py index 2dd3a6a9..011713a0 100644 --- a/onetl/connection/db_connection/kafka/kafka_plaintext_protocol.py +++ b/onetl/connection/db_connection/kafka/kafka_plaintext_protocol.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/db_connection/kafka/kafka_protocol.py b/onetl/connection/db_connection/kafka/kafka_protocol.py index 5d2a328c..8c884fb7 100644 --- a/onetl/connection/db_connection/kafka/kafka_protocol.py +++ b/onetl/connection/db_connection/kafka/kafka_protocol.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/db_connection/kafka/kafka_scram_auth.py b/onetl/connection/db_connection/kafka/kafka_scram_auth.py index 823d0f82..af2c6b3d 100644 --- a/onetl/connection/db_connection/kafka/kafka_scram_auth.py +++ b/onetl/connection/db_connection/kafka/kafka_scram_auth.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/db_connection/kafka/kafka_ssl_protocol.py b/onetl/connection/db_connection/kafka/kafka_ssl_protocol.py index 24dd52f6..ea464b36 100644 --- a/onetl/connection/db_connection/kafka/kafka_ssl_protocol.py +++ b/onetl/connection/db_connection/kafka/kafka_ssl_protocol.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/db_connection/kafka/options.py b/onetl/connection/db_connection/kafka/options.py index e2a4a8d3..68f89711 100644 --- a/onetl/connection/db_connection/kafka/options.py +++ b/onetl/connection/db_connection/kafka/options.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/db_connection/kafka/slots.py b/onetl/connection/db_connection/kafka/slots.py index 2abf00ce..15e0f48f 100644 --- a/onetl/connection/db_connection/kafka/slots.py +++ b/onetl/connection/db_connection/kafka/slots.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/db_connection/mongodb/__init__.py b/onetl/connection/db_connection/mongodb/__init__.py index b452ca53..839d31c0 100644 --- a/onetl/connection/db_connection/mongodb/__init__.py +++ b/onetl/connection/db_connection/mongodb/__init__.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from onetl.connection.db_connection.mongodb.connection import MongoDB, MongoDBExtra from onetl.connection.db_connection.mongodb.dialect import MongoDBDialect diff --git a/onetl/connection/db_connection/mongodb/connection.py b/onetl/connection/db_connection/mongodb/connection.py index 
f81a3bf8..4cc7e3ed 100644 --- a/onetl/connection/db_connection/mongodb/connection.py +++ b/onetl/connection/db_connection/mongodb/connection.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/db_connection/mongodb/dialect.py b/onetl/connection/db_connection/mongodb/dialect.py index 247c58aa..8faba8b3 100644 --- a/onetl/connection/db_connection/mongodb/dialect.py +++ b/onetl/connection/db_connection/mongodb/dialect.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/db_connection/mongodb/options.py b/onetl/connection/db_connection/mongodb/options.py index 223e05ec..323c1f93 100644 --- a/onetl/connection/db_connection/mongodb/options.py +++ b/onetl/connection/db_connection/mongodb/options.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/db_connection/mssql/__init__.py b/onetl/connection/db_connection/mssql/__init__.py index 5b07949f..747ff244 100644 --- a/onetl/connection/db_connection/mssql/__init__.py +++ b/onetl/connection/db_connection/mssql/__init__.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from onetl.connection.db_connection.mssql.connection import MSSQL, MSSQLExtra from onetl.connection.db_connection.mssql.dialect import MSSQLDialect diff --git a/onetl/connection/db_connection/mssql/connection.py b/onetl/connection/db_connection/mssql/connection.py index f2a29b44..18a08e32 100644 --- a/onetl/connection/db_connection/mssql/connection.py +++ b/onetl/connection/db_connection/mssql/connection.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/db_connection/mssql/dialect.py b/onetl/connection/db_connection/mssql/dialect.py index 7dcfdd61..6be43c80 100644 --- a/onetl/connection/db_connection/mssql/dialect.py +++ b/onetl/connection/db_connection/mssql/dialect.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/db_connection/mssql/options.py b/onetl/connection/db_connection/mssql/options.py index c14e38b6..856536ab 100644 --- a/onetl/connection/db_connection/mssql/options.py +++ b/onetl/connection/db_connection/mssql/options.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from onetl.connection.db_connection.jdbc_connection.options import ( diff --git a/onetl/connection/db_connection/mysql/__init__.py b/onetl/connection/db_connection/mysql/__init__.py index 490f356e..df5dcb7b 100644 --- a/onetl/connection/db_connection/mysql/__init__.py +++ b/onetl/connection/db_connection/mysql/__init__.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# 
SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from onetl.connection.db_connection.mysql.connection import MySQL, MySQLExtra from onetl.connection.db_connection.mysql.dialect import MySQLDialect diff --git a/onetl/connection/db_connection/mysql/connection.py b/onetl/connection/db_connection/mysql/connection.py index e3c91196..15ab2a62 100644 --- a/onetl/connection/db_connection/mysql/connection.py +++ b/onetl/connection/db_connection/mysql/connection.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/db_connection/mysql/dialect.py b/onetl/connection/db_connection/mysql/dialect.py index b9c186e2..5b59bc38 100644 --- a/onetl/connection/db_connection/mysql/dialect.py +++ b/onetl/connection/db_connection/mysql/dialect.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/db_connection/mysql/options.py b/onetl/connection/db_connection/mysql/options.py index 06abd6d2..60018fc3 100644 --- a/onetl/connection/db_connection/mysql/options.py +++ b/onetl/connection/db_connection/mysql/options.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 diff --git a/onetl/connection/db_connection/oracle/__init__.py b/onetl/connection/db_connection/oracle/__init__.py index 3bcca706..4c880840 100644 --- a/onetl/connection/db_connection/oracle/__init__.py +++ b/onetl/connection/db_connection/oracle/__init__.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from onetl.connection.db_connection.oracle.connection import Oracle, OracleExtra from onetl.connection.db_connection.oracle.dialect import OracleDialect diff --git a/onetl/connection/db_connection/oracle/connection.py b/onetl/connection/db_connection/oracle/connection.py index 40164fe1..96ba9bd7 100644 --- a/onetl/connection/db_connection/oracle/connection.py +++ b/onetl/connection/db_connection/oracle/connection.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/db_connection/oracle/dialect.py b/onetl/connection/db_connection/oracle/dialect.py index 70e0eff3..2f121871 100644 --- a/onetl/connection/db_connection/oracle/dialect.py +++ b/onetl/connection/db_connection/oracle/dialect.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/db_connection/oracle/options.py b/onetl/connection/db_connection/oracle/options.py index 61b82e1b..a9cc7ae2 100644 --- a/onetl/connection/db_connection/oracle/options.py +++ b/onetl/connection/db_connection/oracle/options.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 diff --git a/onetl/connection/db_connection/postgres/__init__.py 
b/onetl/connection/db_connection/postgres/__init__.py index 3eef06fa..43ac2584 100644 --- a/onetl/connection/db_connection/postgres/__init__.py +++ b/onetl/connection/db_connection/postgres/__init__.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from onetl.connection.db_connection.postgres.connection import Postgres, PostgresExtra from onetl.connection.db_connection.postgres.dialect import PostgresDialect diff --git a/onetl/connection/db_connection/postgres/connection.py b/onetl/connection/db_connection/postgres/connection.py index 1c11d9e3..ac5e50d1 100644 --- a/onetl/connection/db_connection/postgres/connection.py +++ b/onetl/connection/db_connection/postgres/connection.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/db_connection/postgres/dialect.py b/onetl/connection/db_connection/postgres/dialect.py index 0e4f67ab..1dca8ec9 100644 --- a/onetl/connection/db_connection/postgres/dialect.py +++ b/onetl/connection/db_connection/postgres/dialect.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/db_connection/postgres/options.py b/onetl/connection/db_connection/postgres/options.py index 3a4dd806..8e6741c6 100644 --- a/onetl/connection/db_connection/postgres/options.py +++ b/onetl/connection/db_connection/postgres/options.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from onetl.connection.db_connection.jdbc_connection.options import ( diff --git a/onetl/connection/db_connection/teradata/__init__.py b/onetl/connection/db_connection/teradata/__init__.py index 8356d51e..b29dbf26 100644 --- a/onetl/connection/db_connection/teradata/__init__.py +++ b/onetl/connection/db_connection/teradata/__init__.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from onetl.connection.db_connection.teradata.connection import Teradata, TeradataExtra from onetl.connection.db_connection.teradata.dialect import TeradataDialect diff --git a/onetl/connection/db_connection/teradata/connection.py b/onetl/connection/db_connection/teradata/connection.py index 9c8f073c..71f4aeed 100644 --- a/onetl/connection/db_connection/teradata/connection.py +++ b/onetl/connection/db_connection/teradata/connection.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/db_connection/teradata/dialect.py b/onetl/connection/db_connection/teradata/dialect.py index ac225ce4..b7fc9c47 100644 --- a/onetl/connection/db_connection/teradata/dialect.py +++ b/onetl/connection/db_connection/teradata/dialect.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/db_connection/teradata/options.py 
b/onetl/connection/db_connection/teradata/options.py index eb77f8c8..4f879e70 100644 --- a/onetl/connection/db_connection/teradata/options.py +++ b/onetl/connection/db_connection/teradata/options.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from onetl.connection.db_connection.jdbc_connection.options import ( diff --git a/onetl/connection/file_connection/__init__.py b/onetl/connection/file_connection/__init__.py index 07325b1d..54237d1f 100644 --- a/onetl/connection/file_connection/__init__.py +++ b/onetl/connection/file_connection/__init__.py @@ -1,2 +1,2 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 diff --git a/onetl/connection/file_connection/file_connection.py b/onetl/connection/file_connection/file_connection.py index 0a158409..de5916c4 100644 --- a/onetl/connection/file_connection/file_connection.py +++ b/onetl/connection/file_connection/file_connection.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/file_connection/ftp.py b/onetl/connection/file_connection/ftp.py index d5ff5216..135f6bbb 100644 --- a/onetl/connection/file_connection/ftp.py +++ b/onetl/connection/file_connection/ftp.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/file_connection/ftps.py b/onetl/connection/file_connection/ftps.py index 0180edf4..ef69ae44 100644 --- a/onetl/connection/file_connection/ftps.py +++ b/onetl/connection/file_connection/ftps.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 import ftplib # noqa: S402 # nosec import textwrap diff --git a/onetl/connection/file_connection/hdfs/__init__.py b/onetl/connection/file_connection/hdfs/__init__.py index 0eedb25b..ac1bb02f 100644 --- a/onetl/connection/file_connection/hdfs/__init__.py +++ b/onetl/connection/file_connection/hdfs/__init__.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from onetl.connection.file_connection.hdfs.connection import HDFS from onetl.connection.file_connection.hdfs.slots import HDFSSlots diff --git a/onetl/connection/file_connection/hdfs/connection.py b/onetl/connection/file_connection/hdfs/connection.py index 89c0ec96..8cb6d1b5 100644 --- a/onetl/connection/file_connection/hdfs/connection.py +++ b/onetl/connection/file_connection/hdfs/connection.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/file_connection/hdfs/slots.py b/onetl/connection/file_connection/hdfs/slots.py index 2f75fefa..5e6dac2e 100644 --- a/onetl/connection/file_connection/hdfs/slots.py +++ b/onetl/connection/file_connection/hdfs/slots.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # 
SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/file_connection/mixins/__init__.py b/onetl/connection/file_connection/mixins/__init__.py index 7b11e58a..422a1a03 100644 --- a/onetl/connection/file_connection/mixins/__init__.py +++ b/onetl/connection/file_connection/mixins/__init__.py @@ -1,3 +1,3 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from onetl.connection.file_connection.mixins.rename_dir_mixin import RenameDirMixin diff --git a/onetl/connection/file_connection/mixins/rename_dir_mixin.py b/onetl/connection/file_connection/mixins/rename_dir_mixin.py index c110745c..858e29ed 100644 --- a/onetl/connection/file_connection/mixins/rename_dir_mixin.py +++ b/onetl/connection/file_connection/mixins/rename_dir_mixin.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/file_connection/s3.py b/onetl/connection/file_connection/s3.py index 0f411c85..2b941483 100644 --- a/onetl/connection/file_connection/s3.py +++ b/onetl/connection/file_connection/s3.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/file_connection/samba.py b/onetl/connection/file_connection/samba.py index 430e15a7..c5fe74a5 100644 --- a/onetl/connection/file_connection/samba.py +++ b/onetl/connection/file_connection/samba.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/file_connection/sftp.py b/onetl/connection/file_connection/sftp.py index 92db2adc..37a74a40 100644 --- a/onetl/connection/file_connection/sftp.py +++ b/onetl/connection/file_connection/sftp.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/file_connection/webdav.py b/onetl/connection/file_connection/webdav.py index 44ac766a..2b9e50eb 100644 --- a/onetl/connection/file_connection/webdav.py +++ b/onetl/connection/file_connection/webdav.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/file_df_connection/__init__.py b/onetl/connection/file_df_connection/__init__.py index 07325b1d..54237d1f 100644 --- a/onetl/connection/file_df_connection/__init__.py +++ b/onetl/connection/file_df_connection/__init__.py @@ -1,2 +1,2 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 diff --git a/onetl/connection/file_df_connection/spark_file_df_connection.py b/onetl/connection/file_df_connection/spark_file_df_connection.py index 06121139..c75b060c 100644 --- a/onetl/connection/file_df_connection/spark_file_df_connection.py +++ b/onetl/connection/file_df_connection/spark_file_df_connection.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS 
(Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/file_df_connection/spark_hdfs/__init__.py b/onetl/connection/file_df_connection/spark_hdfs/__init__.py index 6977eb4a..338638e9 100644 --- a/onetl/connection/file_df_connection/spark_hdfs/__init__.py +++ b/onetl/connection/file_df_connection/spark_hdfs/__init__.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from onetl.connection.file_df_connection.spark_hdfs.connection import SparkHDFS from onetl.connection.file_df_connection.spark_hdfs.slots import SparkHDFSSlots diff --git a/onetl/connection/file_df_connection/spark_hdfs/connection.py b/onetl/connection/file_df_connection/spark_hdfs/connection.py index 10ff1005..36d20d4b 100644 --- a/onetl/connection/file_df_connection/spark_hdfs/connection.py +++ b/onetl/connection/file_df_connection/spark_hdfs/connection.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/file_df_connection/spark_hdfs/slots.py b/onetl/connection/file_df_connection/spark_hdfs/slots.py index 4dab6b54..c8e0123d 100644 --- a/onetl/connection/file_df_connection/spark_hdfs/slots.py +++ b/onetl/connection/file_df_connection/spark_hdfs/slots.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/file_df_connection/spark_local_fs.py b/onetl/connection/file_df_connection/spark_local_fs.py index 71c70414..678b1dd8 100644 --- a/onetl/connection/file_df_connection/spark_local_fs.py +++ b/onetl/connection/file_df_connection/spark_local_fs.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/file_df_connection/spark_s3/__init__.py b/onetl/connection/file_df_connection/spark_s3/__init__.py index 303a7496..2a72ab82 100644 --- a/onetl/connection/file_df_connection/spark_s3/__init__.py +++ b/onetl/connection/file_df_connection/spark_s3/__init__.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from onetl.connection.file_df_connection.spark_s3.connection import SparkS3 from onetl.connection.file_df_connection.spark_s3.extra import SparkS3Extra diff --git a/onetl/connection/file_df_connection/spark_s3/connection.py b/onetl/connection/file_df_connection/spark_s3/connection.py index 8fe07d10..182955cd 100644 --- a/onetl/connection/file_df_connection/spark_s3/connection.py +++ b/onetl/connection/file_df_connection/spark_s3/connection.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/connection/file_df_connection/spark_s3/extra.py b/onetl/connection/file_df_connection/spark_s3/extra.py index 440eabed..62af3a5b 100644 --- a/onetl/connection/file_df_connection/spark_s3/extra.py +++ 
b/onetl/connection/file_df_connection/spark_s3/extra.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 import re diff --git a/onetl/connection/kerberos_helpers.py b/onetl/connection/kerberos_helpers.py index 5e2bd65b..b1dd5019 100644 --- a/onetl/connection/kerberos_helpers.py +++ b/onetl/connection/kerberos_helpers.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/core/__init__.py b/onetl/core/__init__.py index 1768b603..9796b173 100644 --- a/onetl/core/__init__.py +++ b/onetl/core/__init__.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 import textwrap import warnings diff --git a/onetl/core/file_filter/__init__.py b/onetl/core/file_filter/__init__.py index 9f2c9a5a..8e875f72 100644 --- a/onetl/core/file_filter/__init__.py +++ b/onetl/core/file_filter/__init__.py @@ -1,3 +1,3 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from onetl.core.file_filter.file_filter import FileFilter diff --git a/onetl/core/file_filter/file_filter.py b/onetl/core/file_filter/file_filter.py index a8fd7b69..5a885170 100644 --- a/onetl/core/file_filter/file_filter.py +++ b/onetl/core/file_filter/file_filter.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/core/file_limit/__init__.py b/onetl/core/file_limit/__init__.py index 58759c16..9913f1f4 100644 --- a/onetl/core/file_limit/__init__.py +++ b/onetl/core/file_limit/__init__.py @@ -1,3 +1,3 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from onetl.core.file_limit.file_limit import FileLimit diff --git a/onetl/core/file_limit/file_limit.py b/onetl/core/file_limit/file_limit.py index d3b98718..de82dafe 100644 --- a/onetl/core/file_limit/file_limit.py +++ b/onetl/core/file_limit/file_limit.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/db/__init__.py b/onetl/db/__init__.py index 2cd60981..115d2a6a 100644 --- a/onetl/db/__init__.py +++ b/onetl/db/__init__.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from onetl.db.db_reader import DBReader from onetl.db.db_writer import DBWriter diff --git a/onetl/db/db_reader/__init__.py b/onetl/db/db_reader/__init__.py index a71bb526..66f207cd 100644 --- a/onetl/db/db_reader/__init__.py +++ b/onetl/db/db_reader/__init__.py @@ -1,3 +1,3 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from onetl.db.db_reader.db_reader import DBReader diff --git a/onetl/db/db_reader/db_reader.py b/onetl/db/db_reader/db_reader.py index f560104d..a4f45ab0 100644 --- a/onetl/db/db_reader/db_reader.py +++ 
b/onetl/db/db_reader/db_reader.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/db/db_writer/__init__.py b/onetl/db/db_writer/__init__.py index b181c7f0..f5408a18 100644 --- a/onetl/db/db_writer/__init__.py +++ b/onetl/db/db_writer/__init__.py @@ -1,3 +1,3 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from onetl.db.db_writer.db_writer import DBWriter diff --git a/onetl/db/db_writer/db_writer.py b/onetl/db/db_writer/db_writer.py index 0b07ec4e..c261ad23 100644 --- a/onetl/db/db_writer/db_writer.py +++ b/onetl/db/db_writer/db_writer.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/exception.py b/onetl/exception.py index 03650e9a..e3c965f9 100644 --- a/onetl/exception.py +++ b/onetl/exception.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 import textwrap diff --git a/onetl/file/__init__.py b/onetl/file/__init__.py index a747f037..6f6fdd4f 100644 --- a/onetl/file/__init__.py +++ b/onetl/file/__init__.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from onetl.file.file_df_reader import FileDFReader from onetl.file.file_df_writer import FileDFWriter diff --git a/onetl/file/file_df_reader/__init__.py b/onetl/file/file_df_reader/__init__.py index b7b1ff18..a273107a 100644 --- a/onetl/file/file_df_reader/__init__.py +++ b/onetl/file/file_df_reader/__init__.py @@ -1,3 +1,3 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from onetl.file.file_df_reader.file_df_reader import FileDFReader diff --git a/onetl/file/file_df_reader/file_df_reader.py b/onetl/file/file_df_reader/file_df_reader.py index f1e2f01e..517e23a9 100644 --- a/onetl/file/file_df_reader/file_df_reader.py +++ b/onetl/file/file_df_reader/file_df_reader.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/file/file_df_reader/options.py b/onetl/file/file_df_reader/options.py index 714cf1a9..d6559bfb 100644 --- a/onetl/file/file_df_reader/options.py +++ b/onetl/file/file_df_reader/options.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/file/file_df_writer/__init__.py b/onetl/file/file_df_writer/__init__.py index 0cbb35bb..37407c47 100644 --- a/onetl/file/file_df_writer/__init__.py +++ b/onetl/file/file_df_writer/__init__.py @@ -1,3 +1,3 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from onetl.file.file_df_writer.file_df_writer import FileDFWriter diff --git a/onetl/file/file_df_writer/file_df_writer.py b/onetl/file/file_df_writer/file_df_writer.py 
index 35baaf15..0daea008 100644 --- a/onetl/file/file_df_writer/file_df_writer.py +++ b/onetl/file/file_df_writer/file_df_writer.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/file/file_df_writer/options.py b/onetl/file/file_df_writer/options.py index 01bd9ee4..81971919 100644 --- a/onetl/file/file_df_writer/options.py +++ b/onetl/file/file_df_writer/options.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/file/file_downloader/__init__.py b/onetl/file/file_downloader/__init__.py index e19f5052..dd6660d0 100644 --- a/onetl/file/file_downloader/__init__.py +++ b/onetl/file/file_downloader/__init__.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from onetl.file.file_downloader.file_downloader import FileDownloader from onetl.file.file_downloader.options import FileDownloaderOptions diff --git a/onetl/file/file_downloader/file_downloader.py b/onetl/file/file_downloader/file_downloader.py index 069f8c69..ffd4925c 100644 --- a/onetl/file/file_downloader/file_downloader.py +++ b/onetl/file/file_downloader/file_downloader.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/file/file_downloader/options.py b/onetl/file/file_downloader/options.py index 91dd44d9..54b74b32 100644 --- a/onetl/file/file_downloader/options.py +++ b/onetl/file/file_downloader/options.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/file/file_downloader/result.py b/onetl/file/file_downloader/result.py index 96d184e9..36d3bff1 100644 --- a/onetl/file/file_downloader/result.py +++ b/onetl/file/file_downloader/result.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/file/file_mover/__init__.py b/onetl/file/file_mover/__init__.py index a1baa0db..f1260d07 100644 --- a/onetl/file/file_mover/__init__.py +++ b/onetl/file/file_mover/__init__.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from onetl.file.file_mover.file_mover import FileMover from onetl.file.file_mover.options import FileMoverOptions diff --git a/onetl/file/file_mover/file_mover.py b/onetl/file/file_mover/file_mover.py index 0bb2e666..3fd7e5c6 100644 --- a/onetl/file/file_mover/file_mover.py +++ b/onetl/file/file_mover/file_mover.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/file/file_mover/options.py b/onetl/file/file_mover/options.py index ce9c12b2..9d52139f 100644 --- a/onetl/file/file_mover/options.py +++ 
b/onetl/file/file_mover/options.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/file/file_mover/result.py b/onetl/file/file_mover/result.py index 99313d0f..e7afcdab 100644 --- a/onetl/file/file_mover/result.py +++ b/onetl/file/file_mover/result.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/file/file_result.py b/onetl/file/file_result.py index d2b9aec6..01844d7f 100644 --- a/onetl/file/file_result.py +++ b/onetl/file/file_result.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/file/file_set.py b/onetl/file/file_set.py index 2447ad04..c4fb2159 100644 --- a/onetl/file/file_set.py +++ b/onetl/file/file_set.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 import os import textwrap diff --git a/onetl/file/file_uploader/__init__.py b/onetl/file/file_uploader/__init__.py index 85fc5fcd..9b1a2974 100644 --- a/onetl/file/file_uploader/__init__.py +++ b/onetl/file/file_uploader/__init__.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from onetl.file.file_uploader.file_uploader import FileUploader from onetl.file.file_uploader.options import FileUploaderOptions diff --git a/onetl/file/file_uploader/file_uploader.py b/onetl/file/file_uploader/file_uploader.py index fc6709ce..ebcae6f1 100644 --- a/onetl/file/file_uploader/file_uploader.py +++ b/onetl/file/file_uploader/file_uploader.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/file/file_uploader/options.py b/onetl/file/file_uploader/options.py index b046ec3a..98db43eb 100644 --- a/onetl/file/file_uploader/options.py +++ b/onetl/file/file_uploader/options.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/file/file_uploader/result.py b/onetl/file/file_uploader/result.py index 34638bae..cf45a0c4 100644 --- a/onetl/file/file_uploader/result.py +++ b/onetl/file/file_uploader/result.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/file/filter/__init__.py b/onetl/file/filter/__init__.py index 1ebee030..88e2f835 100644 --- a/onetl/file/filter/__init__.py +++ b/onetl/file/filter/__init__.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from onetl.file.filter.exclude_dir import ExcludeDir from onetl.file.filter.file_hwm import FileHWMFilter diff --git a/onetl/file/filter/exclude_dir.py 
b/onetl/file/filter/exclude_dir.py index f5b096d2..d0bc1f7e 100644 --- a/onetl/file/filter/exclude_dir.py +++ b/onetl/file/filter/exclude_dir.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/file/filter/file_hwm.py b/onetl/file/filter/file_hwm.py index 232bf451..398cab21 100644 --- a/onetl/file/filter/file_hwm.py +++ b/onetl/file/filter/file_hwm.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/file/filter/glob.py b/onetl/file/filter/glob.py index db622cfd..529feae2 100644 --- a/onetl/file/filter/glob.py +++ b/onetl/file/filter/glob.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/file/filter/match_all_filters.py b/onetl/file/filter/match_all_filters.py index 484bee93..cfc59d17 100644 --- a/onetl/file/filter/match_all_filters.py +++ b/onetl/file/filter/match_all_filters.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 import logging from typing import Iterable diff --git a/onetl/file/filter/regexp.py b/onetl/file/filter/regexp.py index 48e321ad..e698bc15 100644 --- a/onetl/file/filter/regexp.py +++ b/onetl/file/filter/regexp.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/file/format/__init__.py b/onetl/file/format/__init__.py index f149f5c2..a7e17229 100644 --- a/onetl/file/format/__init__.py +++ b/onetl/file/format/__init__.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from onetl.file.format.avro import Avro from onetl.file.format.csv import CSV diff --git a/onetl/file/format/avro.py b/onetl/file/format/avro.py index 418e4064..1f6e2e0e 100644 --- a/onetl/file/format/avro.py +++ b/onetl/file/format/avro.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/file/format/csv.py b/onetl/file/format/csv.py index 1c4442fd..c958bb69 100644 --- a/onetl/file/format/csv.py +++ b/onetl/file/format/csv.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/file/format/excel.py b/onetl/file/format/excel.py index 3f26522f..a62d2941 100644 --- a/onetl/file/format/excel.py +++ b/onetl/file/format/excel.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/file/format/file_format.py b/onetl/file/format/file_format.py index 4a7de4f3..e7998223 100644 --- a/onetl/file/format/file_format.py +++ 
b/onetl/file/format/file_format.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/file/format/json.py b/onetl/file/format/json.py index 085d125e..bfbda83d 100644 --- a/onetl/file/format/json.py +++ b/onetl/file/format/json.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/file/format/jsonline.py b/onetl/file/format/jsonline.py index 1d1c910d..d573e26e 100644 --- a/onetl/file/format/jsonline.py +++ b/onetl/file/format/jsonline.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/file/format/orc.py b/onetl/file/format/orc.py index f108a150..b342de43 100644 --- a/onetl/file/format/orc.py +++ b/onetl/file/format/orc.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/file/format/parquet.py b/onetl/file/format/parquet.py index f96ad444..f3c9a75b 100644 --- a/onetl/file/format/parquet.py +++ b/onetl/file/format/parquet.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/file/format/xml.py b/onetl/file/format/xml.py index 11425809..7946f997 100644 --- a/onetl/file/format/xml.py +++ b/onetl/file/format/xml.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/file/limit/__init__.py b/onetl/file/limit/__init__.py index 1e00ffdd..2d353fc2 100644 --- a/onetl/file/limit/__init__.py +++ b/onetl/file/limit/__init__.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from onetl.file.limit.limits_reached import limits_reached from onetl.file.limit.limits_stop_at import limits_stop_at diff --git a/onetl/file/limit/limits_reached.py b/onetl/file/limit/limits_reached.py index 27d7fb50..8171df93 100644 --- a/onetl/file/limit/limits_reached.py +++ b/onetl/file/limit/limits_reached.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/file/limit/limits_stop_at.py b/onetl/file/limit/limits_stop_at.py index 035ac642..9e478a8a 100644 --- a/onetl/file/limit/limits_stop_at.py +++ b/onetl/file/limit/limits_stop_at.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/file/limit/max_files_count.py b/onetl/file/limit/max_files_count.py index ec604ff4..c62292fa 100644 --- a/onetl/file/limit/max_files_count.py +++ b/onetl/file/limit/max_files_count.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 
MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/file/limit/reset_limits.py b/onetl/file/limit/reset_limits.py index de9201da..8f95a911 100644 --- a/onetl/file/limit/reset_limits.py +++ b/onetl/file/limit/reset_limits.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/hooks/__init__.py b/onetl/hooks/__init__.py index 6002b402..8ba492dc 100644 --- a/onetl/hooks/__init__.py +++ b/onetl/hooks/__init__.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from onetl.hooks.hook import HookPriority, hook from onetl.hooks.hooks_state import resume_all_hooks, skip_all_hooks, stop_all_hooks diff --git a/onetl/hooks/hook.py b/onetl/hooks/hook.py index 619cff7d..abf010ef 100644 --- a/onetl/hooks/hook.py +++ b/onetl/hooks/hook.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/hooks/hook_collection.py b/onetl/hooks/hook_collection.py index d715086c..443aa9f2 100644 --- a/onetl/hooks/hook_collection.py +++ b/onetl/hooks/hook_collection.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/hooks/hooks_state.py b/onetl/hooks/hooks_state.py index 53a2c0c3..4d489f0f 100644 --- a/onetl/hooks/hooks_state.py +++ b/onetl/hooks/hooks_state.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/hooks/method_inheritance_stack.py b/onetl/hooks/method_inheritance_stack.py index 99fef076..4999fc6f 100644 --- a/onetl/hooks/method_inheritance_stack.py +++ b/onetl/hooks/method_inheritance_stack.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/hooks/slot.py b/onetl/hooks/slot.py index 6d4b0b87..ee066606 100644 --- a/onetl/hooks/slot.py +++ b/onetl/hooks/slot.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/hooks/support_hooks.py b/onetl/hooks/support_hooks.py index 33d440b5..d6323107 100644 --- a/onetl/hooks/support_hooks.py +++ b/onetl/hooks/support_hooks.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/hwm/__init__.py b/onetl/hwm/__init__.py index e516ebea..88984b6b 100644 --- a/onetl/hwm/__init__.py +++ b/onetl/hwm/__init__.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from onetl.hwm.auto_hwm import AutoDetectHWM from 
onetl.hwm.window import Edge, Window diff --git a/onetl/hwm/auto_hwm.py b/onetl/hwm/auto_hwm.py index 5d346433..bb996509 100644 --- a/onetl/hwm/auto_hwm.py +++ b/onetl/hwm/auto_hwm.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/hwm/store/__init__.py b/onetl/hwm/store/__init__.py index 0e34caa0..4fb2d991 100644 --- a/onetl/hwm/store/__init__.py +++ b/onetl/hwm/store/__init__.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 import textwrap import warnings diff --git a/onetl/hwm/store/hwm_class_registry.py b/onetl/hwm/store/hwm_class_registry.py index 82b0eef2..b15b77af 100644 --- a/onetl/hwm/store/hwm_class_registry.py +++ b/onetl/hwm/store/hwm_class_registry.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/hwm/store/yaml_hwm_store.py b/onetl/hwm/store/yaml_hwm_store.py index 2ffd24f6..4a7ad690 100644 --- a/onetl/hwm/store/yaml_hwm_store.py +++ b/onetl/hwm/store/yaml_hwm_store.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/hwm/window.py b/onetl/hwm/window.py index 7a902000..35251dad 100644 --- a/onetl/hwm/window.py +++ b/onetl/hwm/window.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/impl/__init__.py b/onetl/impl/__init__.py index 76d32f48..f8396131 100644 --- a/onetl/impl/__init__.py +++ b/onetl/impl/__init__.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from onetl.impl.base_model import BaseModel from onetl.impl.failed_local_file import FailedLocalFile diff --git a/onetl/impl/base_model.py b/onetl/impl/base_model.py index 3208619e..7478a823 100644 --- a/onetl/impl/base_model.py +++ b/onetl/impl/base_model.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 # isort: skip_file diff --git a/onetl/impl/failed_local_file.py b/onetl/impl/failed_local_file.py index 34a12bb9..a53c2fbe 100644 --- a/onetl/impl/failed_local_file.py +++ b/onetl/impl/failed_local_file.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/impl/file_exist_behavior.py b/onetl/impl/file_exist_behavior.py index 1081aa0e..0933bf44 100644 --- a/onetl/impl/file_exist_behavior.py +++ b/onetl/impl/file_exist_behavior.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 import logging import warnings diff --git a/onetl/impl/frozen_model.py b/onetl/impl/frozen_model.py index 10cb06a6..e9965b53 100644 --- a/onetl/impl/frozen_model.py +++ 
b/onetl/impl/frozen_model.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/impl/generic_options.py b/onetl/impl/generic_options.py index 8d3e629b..df4fea7a 100644 --- a/onetl/impl/generic_options.py +++ b/onetl/impl/generic_options.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/impl/local_path.py b/onetl/impl/local_path.py index 0dc70986..0f52681e 100644 --- a/onetl/impl/local_path.py +++ b/onetl/impl/local_path.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 import os import sys diff --git a/onetl/impl/path_container.py b/onetl/impl/path_container.py index 85ee7e82..b974aeb6 100644 --- a/onetl/impl/path_container.py +++ b/onetl/impl/path_container.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/impl/path_repr.py b/onetl/impl/path_repr.py index 5a61b740..ad007280 100644 --- a/onetl/impl/path_repr.py +++ b/onetl/impl/path_repr.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/impl/remote_directory.py b/onetl/impl/remote_directory.py index 2b52346e..6e0188e0 100644 --- a/onetl/impl/remote_directory.py +++ b/onetl/impl/remote_directory.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/impl/remote_file.py b/onetl/impl/remote_file.py index 86193d44..1f0408ce 100644 --- a/onetl/impl/remote_file.py +++ b/onetl/impl/remote_file.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/impl/remote_path.py b/onetl/impl/remote_path.py index 78fce394..672cee3e 100644 --- a/onetl/impl/remote_path.py +++ b/onetl/impl/remote_path.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from pathlib import PurePosixPath diff --git a/onetl/impl/remote_path_stat.py b/onetl/impl/remote_path_stat.py index 85974803..7b500962 100644 --- a/onetl/impl/remote_path_stat.py +++ b/onetl/impl/remote_path_stat.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/log.py b/onetl/log.py index aeed6403..d7fdd8e8 100644 --- a/onetl/log.py +++ b/onetl/log.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/plugins/__init__.py b/onetl/plugins/__init__.py index 8127ff0d..e73221fb 100644 
--- a/onetl/plugins/__init__.py +++ b/onetl/plugins/__init__.py @@ -1,3 +1,3 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from onetl.plugins.import_plugins import import_plugins diff --git a/onetl/plugins/import_plugins.py b/onetl/plugins/import_plugins.py index f22bd280..17ea9280 100644 --- a/onetl/plugins/import_plugins.py +++ b/onetl/plugins/import_plugins.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/strategy/__init__.py b/onetl/strategy/__init__.py index 1a0ff464..6fb490c3 100644 --- a/onetl/strategy/__init__.py +++ b/onetl/strategy/__init__.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from onetl.strategy.base_strategy import BaseStrategy from onetl.strategy.incremental_strategy import ( diff --git a/onetl/strategy/base_strategy.py b/onetl/strategy/base_strategy.py index 0daf309d..ccac5361 100644 --- a/onetl/strategy/base_strategy.py +++ b/onetl/strategy/base_strategy.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/strategy/batch_hwm_strategy.py b/onetl/strategy/batch_hwm_strategy.py index 1e5ec5b5..deaaf80c 100644 --- a/onetl/strategy/batch_hwm_strategy.py +++ b/onetl/strategy/batch_hwm_strategy.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/strategy/hwm_store/__init__.py b/onetl/strategy/hwm_store/__init__.py index 7a0338d3..de994c20 100644 --- a/onetl/strategy/hwm_store/__init__.py +++ b/onetl/strategy/hwm_store/__init__.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 # TODO: remove in 1.0.0 diff --git a/onetl/strategy/hwm_strategy.py b/onetl/strategy/hwm_strategy.py index 02249554..570b6c2a 100644 --- a/onetl/strategy/hwm_strategy.py +++ b/onetl/strategy/hwm_strategy.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/strategy/incremental_strategy.py b/onetl/strategy/incremental_strategy.py index 0397514b..14ac8862 100644 --- a/onetl/strategy/incremental_strategy.py +++ b/onetl/strategy/incremental_strategy.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/strategy/snapshot_strategy.py b/onetl/strategy/snapshot_strategy.py index 77ed4b35..0c0a54b0 100644 --- a/onetl/strategy/snapshot_strategy.py +++ b/onetl/strategy/snapshot_strategy.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/strategy/strategy_manager.py 
b/onetl/strategy/strategy_manager.py index bb380ae5..f66930c7 100644 --- a/onetl/strategy/strategy_manager.py +++ b/onetl/strategy/strategy_manager.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/onetl/version.py b/onetl/version.py index 1a3c6cec..af26d895 100644 --- a/onetl/version.py +++ b/onetl/version.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2021-2024 MTS (Mobile Telesystems) +# SPDX-FileCopyrightText: 2021-2024 MTS PJSC # SPDX-License-Identifier: Apache-2.0 """ __version__ parameter required to be able to output to the console From cf1dca4f0ec34d88f03257118a376bbc7c7ae59d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Thu, 29 Aug 2024 09:59:10 +0000 Subject: [PATCH 50/64] Bump version --- docs/changelog/0.12.0.rst | 53 +++++++++++++++++++ docs/changelog/index.rst | 1 + .../changelog/next_release/+yield.feature.rst | 1 - docs/changelog/next_release/292.feature.rst | 1 - docs/changelog/next_release/303.feature.1.rst | 1 - docs/changelog/next_release/303.feature.2.rst | 10 ---- docs/changelog/next_release/304.breaking.rst | 3 -- docs/changelog/next_release/304.feature.rst | 6 --- docs/changelog/next_release/305.feature.rst | 1 - docs/changelog/next_release/306.feature.rst | 1 - docs/changelog/next_release/308.bugfix.rst | 1 - onetl/VERSION | 2 +- 12 files changed, 55 insertions(+), 26 deletions(-) create mode 100644 docs/changelog/0.12.0.rst delete mode 100644 docs/changelog/next_release/+yield.feature.rst delete mode 100644 docs/changelog/next_release/292.feature.rst delete mode 100644 docs/changelog/next_release/303.feature.1.rst delete mode 100644 docs/changelog/next_release/303.feature.2.rst delete mode 100644 docs/changelog/next_release/304.breaking.rst delete mode 100644 docs/changelog/next_release/304.feature.rst delete mode 100644 docs/changelog/next_release/305.feature.rst delete mode 100644 docs/changelog/next_release/306.feature.rst delete mode 100644 docs/changelog/next_release/308.bugfix.rst diff --git a/docs/changelog/0.12.0.rst b/docs/changelog/0.12.0.rst new file mode 100644 index 00000000..dc4f4b83 --- /dev/null +++ b/docs/changelog/0.12.0.rst @@ -0,0 +1,53 @@ +0.12.0 (2024-08-29) +=================== + +Breaking Changes +---------------- + +- Change connection URL used for generating HWM names of S3 and Samba sources: + * ``smb://host:port`` -> ``smb://host:port/share`` + * ``s3://host:port`` -> ``s3://host:port/bucket`` (:github:pull:`304`) + +- Update ``Excel`` package from ``0.20.3`` to ``0.20.4``, to include Spark 3.5.1 support. (:github:pull:`306`) + +Features +-------- + +- Add support for specifying file formats (``ORC``, ``Parquet``, ``CSV``, etc.) in ``HiveWriteOptions.format`` (:github:pull:`292`): + + .. code:: python + + Hive.WriteOptions(format=ORC(compression="snappy")) + +- Collect Spark execution metrics in the following methods, and log them in DEBUG mode: + * ``DBWriter.run()`` + * ``FileDFWriter.run()`` + * ``Hive.sql()`` + * ``Hive.execute()`` + + This is implemented using a custom ``SparkListener`` which wraps the entire method call, and + then reports collected metrics. But these metrics may sometimes be missing due to Spark architecture, + so they are not a reliable source of information.
That's why logs are printed only in DEBUG mode, and + are not returned as method call result. (:github:pull:`303`) + +- Generate default ``jobDescription`` based on currently executed method. Examples: + * ``DBWriter() -> Postgres[host:5432/database]`` + * ``MongoDB[localhost:27017/admin] -> DBReader.run()`` + * ``Hive[cluster].execute()`` + + If user already set custom ``jobDescription``, it will left intact. (:github:pull:`304`) + +- Add log.info about JDBC dialect usage (:github:pull:`305`): + + .. code:: text + + |MySQL| Detected dialect: 'org.apache.spark.sql.jdbc.MySQLDialect' + +- Log estimated size of in-memory dataframe created by ``JDBC.fetch`` and ``JDBC.execute`` methods. (:github:pull:`303`) + + +Bug Fixes +--------- + +- Fix passing ``Greenplum(extra={"options": ...)`` during read/write operations. (:github:pull:`308`) +- Do not raise exception if yield-based hook whas something past (and only one) ``yield``. diff --git a/docs/changelog/index.rst b/docs/changelog/index.rst index 4bdac946..7700528e 100644 --- a/docs/changelog/index.rst +++ b/docs/changelog/index.rst @@ -3,6 +3,7 @@ :caption: Changelog DRAFT + 0.12.0 0.11.1 0.11.0 0.10.2 diff --git a/docs/changelog/next_release/+yield.feature.rst b/docs/changelog/next_release/+yield.feature.rst deleted file mode 100644 index efc58606..00000000 --- a/docs/changelog/next_release/+yield.feature.rst +++ /dev/null @@ -1 +0,0 @@ -Do not raise exception if yield-based hook whas something past (and only one) ``yield``. diff --git a/docs/changelog/next_release/292.feature.rst b/docs/changelog/next_release/292.feature.rst deleted file mode 100644 index e50a5fcd..00000000 --- a/docs/changelog/next_release/292.feature.rst +++ /dev/null @@ -1 +0,0 @@ -Add support for specifying file formats (``ORC``, ``Parquet``, ``CSV``, etc.) in ``HiveWriteOptions.format``: ``Hive.WriteOptions(format=ORC(compression="snappy"))``. diff --git a/docs/changelog/next_release/303.feature.1.rst b/docs/changelog/next_release/303.feature.1.rst deleted file mode 100644 index 8c0b1e19..00000000 --- a/docs/changelog/next_release/303.feature.1.rst +++ /dev/null @@ -1 +0,0 @@ -Log estimated size of in-memory dataframe created by ``JDBC.fetch`` and ``JDBC.execute`` methods. diff --git a/docs/changelog/next_release/303.feature.2.rst b/docs/changelog/next_release/303.feature.2.rst deleted file mode 100644 index 92bbe13c..00000000 --- a/docs/changelog/next_release/303.feature.2.rst +++ /dev/null @@ -1,10 +0,0 @@ -Collect Spark execution metrics in following methods, and log then in DEBUG mode: -* ``DBWriter.run()`` -* ``FileDFWriter.run()`` -* ``Hive.sql()`` -* ``Hive.execute()`` - -This is implemented using custom ``SparkListener`` which wraps the entire method call, and -then report collected metrics. But these metrics sometimes may be missing due to Spark architecture, -so they are not reliable source of information. That's why logs are printed only in DEBUG mode, and -are not returned as method call result. 
diff --git a/docs/changelog/next_release/304.breaking.rst b/docs/changelog/next_release/304.breaking.rst deleted file mode 100644 index 60598321..00000000 --- a/docs/changelog/next_release/304.breaking.rst +++ /dev/null @@ -1,3 +0,0 @@ -Change connection URL used for generating HWM names of S3 and Samba sources: -* ``smb://host:port`` -> ``smb://host:port/share`` -* ``s3://host:port`` -> ``s3://host:port/bucket`` diff --git a/docs/changelog/next_release/304.feature.rst b/docs/changelog/next_release/304.feature.rst deleted file mode 100644 index 97560354..00000000 --- a/docs/changelog/next_release/304.feature.rst +++ /dev/null @@ -1,6 +0,0 @@ -Generate default ``jobDescription`` based on currently executed method. Examples: -* ``DBWriter() -> Postgres[host:5432/database]`` -* ``MongoDB[localhost:27017/admin] -> DBReader.run()`` -* ``Hive[cluster].execute()`` - -If user already set custom ``jobDescription``, it will left intact. diff --git a/docs/changelog/next_release/305.feature.rst b/docs/changelog/next_release/305.feature.rst deleted file mode 100644 index c4c44dc6..00000000 --- a/docs/changelog/next_release/305.feature.rst +++ /dev/null @@ -1 +0,0 @@ -Add log.info about JDBC dialect usage: ``Detected dialect: 'org.apache.spark.sql.jdbc.MySQLDialect'`` diff --git a/docs/changelog/next_release/306.feature.rst b/docs/changelog/next_release/306.feature.rst deleted file mode 100644 index 1c2b95f7..00000000 --- a/docs/changelog/next_release/306.feature.rst +++ /dev/null @@ -1 +0,0 @@ -Update ``Excel`` package from ``0.20.3`` to ``0.20.4``, to include Spark 3.5.1 support. diff --git a/docs/changelog/next_release/308.bugfix.rst b/docs/changelog/next_release/308.bugfix.rst deleted file mode 100644 index 3ffcdcc5..00000000 --- a/docs/changelog/next_release/308.bugfix.rst +++ /dev/null @@ -1 +0,0 @@ -Fix passing ``Greenplum(extra={"options": ...)`` during read/write operations. 
diff --git a/onetl/VERSION b/onetl/VERSION index bc859cbd..ac454c6a 100644 --- a/onetl/VERSION +++ b/onetl/VERSION @@ -1 +1 @@ -0.11.2 +0.12.0 From 7e70db16b71251a0c68716a2811e529aeb8062e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Thu, 29 Aug 2024 09:59:50 +0000 Subject: [PATCH 51/64] [DOP-16999] Log detected JDBC dialect while using DBReader --- onetl/connection/db_connection/greenplum/connection.py | 1 + onetl/connection/db_connection/jdbc_connection/connection.py | 1 + 2 files changed, 2 insertions(+) diff --git a/onetl/connection/db_connection/greenplum/connection.py b/onetl/connection/db_connection/greenplum/connection.py index ff5c5a76..7f7e8961 100644 --- a/onetl/connection/db_connection/greenplum/connection.py +++ b/onetl/connection/db_connection/greenplum/connection.py @@ -357,6 +357,7 @@ def get_df_schema( columns: list[str] | None = None, options: JDBCReadOptions | None = None, ) -> StructType: + log.info("|%s| Detected dialect: '%s'", self.__class__.__name__, self._get_spark_dialect_name()) log.info("|%s| Fetching schema of table %r ...", self.__class__.__name__, source) query = self.dialect.get_sql_query(source, columns=columns, limit=0, compact=True) diff --git a/onetl/connection/db_connection/jdbc_connection/connection.py b/onetl/connection/db_connection/jdbc_connection/connection.py index 2752dc25..0ea3078c 100644 --- a/onetl/connection/db_connection/jdbc_connection/connection.py +++ b/onetl/connection/db_connection/jdbc_connection/connection.py @@ -159,6 +159,7 @@ def read_source_as_df( limit=limit, ) + log.info("|%s| Detected dialect: '%s'", self.__class__.__name__, self._get_spark_dialect_name()) log.info("|%s| Executing SQL query (on executor):", self.__class__.__name__) log_lines(log, query) From 82685e966d968c518fce87f87aa18dafd0967f3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Thu, 29 Aug 2024 10:22:57 +0000 Subject: [PATCH 52/64] Update JDBC & MongoDB packages to latest versions --- docs/changelog/0.12.0.rst | 8 ++++++++ .../db_connection/mongodb/types.rst | 4 ++-- .../db_connection/clickhouse/connection.py | 8 ++++---- .../db_connection/mongodb/connection.py | 14 ++++++------- .../db_connection/mssql/connection.py | 10 +++++----- .../db_connection/mysql/connection.py | 8 ++++---- .../db_connection/oracle/connection.py | 10 +++++----- .../db_connection/postgres/connection.py | 8 ++++---- onetl/file/format/xml.py | 2 +- .../test_clickhouse_unit.py | 6 +++--- .../test_mongodb_unit.py | 18 ++++++++--------- .../test_mssql_unit.py | 20 +++++++++---------- .../test_mysql_unit.py | 6 +++--- .../test_oracle_unit.py | 16 +++++++-------- .../test_postgres_unit.py | 8 ++++---- 15 files changed, 77 insertions(+), 69 deletions(-) diff --git a/docs/changelog/0.12.0.rst b/docs/changelog/0.12.0.rst index dc4f4b83..d0a87cc7 100644 --- a/docs/changelog/0.12.0.rst +++ b/docs/changelog/0.12.0.rst @@ -8,6 +8,14 @@ Breaking Changes * ``smb://host:port`` -> ``smb://host:port/share`` * ``s3://host:port`` -> ``s3://host:port/bucket`` (:github:pull:`304`) +- Update DB connectors/drivers to latest versions: + * Clickhouse ``0.6.0-patch5`` → ``0.6.4`` + * MongoDB ``10.3.0`` → ``10.4.0`` + * MSSQL ``12.6.2`` → ``12.8.1`` + * MySQL ``8.4.0`` → 
``9.0.0`` + * Oracle ``23.4.0.24.05`` → ``23.5.0.24.07`` + * Postgres ``42.7.3`` → ``42.7.4`` + - Update ``Excel`` package from ``0.20.3`` to ``0.20.4``, to include Spark 3.5.1 support. (:github:pull:`306`) Features diff --git a/docs/connection/db_connection/mongodb/types.rst b/docs/connection/db_connection/mongodb/types.rst index 4b22b7cb..f701ac93 100644 --- a/docs/connection/db_connection/mongodb/types.rst +++ b/docs/connection/db_connection/mongodb/types.rst @@ -73,8 +73,8 @@ References Here you can find source code with type conversions: -* `MongoDB -> Spark `_ -* `Spark -> MongoDB `_ +* `MongoDB -> Spark `_ +* `Spark -> MongoDB `_ Supported types --------------- diff --git a/onetl/connection/db_connection/clickhouse/connection.py b/onetl/connection/db_connection/clickhouse/connection.py index 21b282b3..fad82942 100644 --- a/onetl/connection/db_connection/clickhouse/connection.py +++ b/onetl/connection/db_connection/clickhouse/connection.py @@ -37,7 +37,7 @@ class Config: class Clickhouse(JDBCConnection): """Clickhouse JDBC connection. |support_hooks| - Based on Maven package `com.clickhouse:clickhouse-jdbc:0.6.0-patch5 `_ + Based on Maven package `com.clickhouse:clickhouse-jdbc:0.6.4 `_ (`official Clickhouse JDBC driver `_). .. seealso:: @@ -139,7 +139,7 @@ def get_packages( Parameters ---------- package_version : str, optional - ClickHouse JDBC version client packages. Defaults to ``0.6.0-patch5``. + ClickHouse JDBC version client packages. Defaults to ``0.6.4``. .. versionadded:: 0.11.0 @@ -158,7 +158,7 @@ def get_packages( Clickhouse.get_packages(package_version="0.6.0", apache_http_client_version="5.3.1") """ - default_jdbc_version = "0.6.0-patch5" + default_jdbc_version = "0.6.4" default_http_version = "5.3.1" jdbc_version = Version(package_version or default_jdbc_version).min_digits(3) @@ -177,7 +177,7 @@ def get_packages( @classproperty def package(self) -> str: """Get a single string of package names to be downloaded by Spark for establishing a Clickhouse connection.""" - return "com.clickhouse:clickhouse-jdbc:0.6.0-patch5,com.clickhouse:clickhouse-http-client:0.6.0-patch5,org.apache.httpcomponents.client5:httpclient5:5.3.1" + return "com.clickhouse:clickhouse-jdbc:0.6.4,com.clickhouse:clickhouse-http-client:0.6.4,org.apache.httpcomponents.client5:httpclient5:5.3.1" @property def jdbc_url(self) -> str: diff --git a/onetl/connection/db_connection/mongodb/connection.py b/onetl/connection/db_connection/mongodb/connection.py index 4cc7e3ed..f406a232 100644 --- a/onetl/connection/db_connection/mongodb/connection.py +++ b/onetl/connection/db_connection/mongodb/connection.py @@ -50,7 +50,7 @@ class Config: class MongoDB(DBConnection): """MongoDB connection. |support_hooks| - Based on package `org.mongodb.spark:mongo-spark-connector:10.3.0 `_ + Based on package `org.mongodb.spark:mongo-spark-connector:10.4.0 `_ (`MongoDB connector for Spark `_) .. seealso:: @@ -153,7 +153,7 @@ def get_packages( Spark version in format ``major.minor``. Used only if ``scala_version=None``. package_version : str, optional - Specifies the version of the MongoDB Spark connector to use. Defaults to ``10.3.0``. + Specifies the version of the MongoDB Spark connector to use. Defaults to ``10.4.0``. .. 
versionadded:: 0.11.0 @@ -166,10 +166,10 @@ def get_packages( MongoDB.get_packages(scala_version="2.12") # specify custom connector version - MongoDB.get_packages(scala_version="2.12", package_version="10.3.0") + MongoDB.get_packages(scala_version="2.12", package_version="10.4.0") """ - default_package_version = "10.3.0" + default_package_version = "10.4.0" if scala_version: scala_ver = Version(scala_version).min_digits(2) @@ -196,7 +196,7 @@ def package_spark_3_2(cls) -> str: "use `MongoDB.get_packages(spark_version='3.2')` instead" ) warnings.warn(msg, UserWarning, stacklevel=3) - return "org.mongodb.spark:mongo-spark-connector_2.12:10.3.0" + return "org.mongodb.spark:mongo-spark-connector_2.12:10.4.0" @classproperty def package_spark_3_3(cls) -> str: @@ -206,7 +206,7 @@ def package_spark_3_3(cls) -> str: "use `MongoDB.get_packages(spark_version='3.3')` instead" ) warnings.warn(msg, UserWarning, stacklevel=3) - return "org.mongodb.spark:mongo-spark-connector_2.12:10.3.0" + return "org.mongodb.spark:mongo-spark-connector_2.12:10.4.0" @classproperty def package_spark_3_4(cls) -> str: @@ -216,7 +216,7 @@ def package_spark_3_4(cls) -> str: "use `MongoDB.get_packages(spark_version='3.4')` instead" ) warnings.warn(msg, UserWarning, stacklevel=3) - return "org.mongodb.spark:mongo-spark-connector_2.12:10.3.0" + return "org.mongodb.spark:mongo-spark-connector_2.12:10.4.0" @slot def pipeline( diff --git a/onetl/connection/db_connection/mssql/connection.py b/onetl/connection/db_connection/mssql/connection.py index 18a08e32..50235332 100644 --- a/onetl/connection/db_connection/mssql/connection.py +++ b/onetl/connection/db_connection/mssql/connection.py @@ -34,7 +34,7 @@ class Config: class MSSQL(JDBCConnection): """MSSQL JDBC connection. |support_hooks| - Based on Maven package `com.microsoft.sqlserver:mssql-jdbc:12.6.2.jre8 `_ + Based on Maven package `com.microsoft.sqlserver:mssql-jdbc:12.8.1.jre8 `_ (`official MSSQL JDBC driver `_). @@ -205,7 +205,7 @@ def get_packages( java_version : str, optional Java major version, defaults to ``8``. Must be ``8`` or ``11``. package_version : str, optional - Specifies the version of the MSSQL JDBC driver to use. Defaults to ``12.6.2.``. + Specifies the version of the MSSQL JDBC driver to use. Defaults to ``12.8.1.``. Examples -------- @@ -216,10 +216,10 @@ def get_packages( MSSQL.get_packages() # specify Java and package versions - MSSQL.get_packages(java_version="8", package_version="12.6.2.jre11") + MSSQL.get_packages(java_version="8", package_version="12.8.1.jre11") """ default_java_version = "8" - default_package_version = "12.6.2" + default_package_version = "12.8.1" java_ver = Version(java_version or default_java_version) if java_ver.major < 8: @@ -241,7 +241,7 @@ def package(cls) -> str: """Get package name to be downloaded by Spark.""" msg = "`MSSQL.package` will be removed in 1.0.0, use `MSSQL.get_packages()` instead" warnings.warn(msg, UserWarning, stacklevel=3) - return "com.microsoft.sqlserver:mssql-jdbc:12.6.2.jre8" + return "com.microsoft.sqlserver:mssql-jdbc:12.8.1.jre8" @property def jdbc_url(self) -> str: diff --git a/onetl/connection/db_connection/mysql/connection.py b/onetl/connection/db_connection/mysql/connection.py index 15ab2a62..2588d79a 100644 --- a/onetl/connection/db_connection/mysql/connection.py +++ b/onetl/connection/db_connection/mysql/connection.py @@ -36,7 +36,7 @@ class Config: class MySQL(JDBCConnection): """MySQL JDBC connection. 
|support_hooks| - Based on Maven package `com.mysql:mysql-connector-j:8.4.0 `_ + Based on Maven package `com.mysql:mysql-connector-j:9.0.0 `_ (`official MySQL JDBC driver `_). .. seealso:: @@ -132,7 +132,7 @@ def get_packages(cls, package_version: str | None = None) -> list[str]: Parameters ---------- package_version : str, optional - Specifies the version of the MySQL JDBC driver to use. Defaults to ``8.4.0``. + Specifies the version of the MySQL JDBC driver to use. Defaults to ``9.0.0``. .. versionadded:: 0.11.0 @@ -147,7 +147,7 @@ def get_packages(cls, package_version: str | None = None) -> list[str]: # specify a custom package version MySQL.get_packages(package_version="8.2.0") """ - default_version = "8.4.0" + default_version = "9.0.0" version = Version(package_version or default_version).min_digits(3) return [f"com.mysql:mysql-connector-j:{version}"] @@ -157,7 +157,7 @@ def package(cls) -> str: """Get package name to be downloaded by Spark.""" msg = "`MySQL.package` will be removed in 1.0.0, use `MySQL.get_packages()` instead" warnings.warn(msg, UserWarning, stacklevel=3) - return "com.mysql:mysql-connector-j:8.4.0" + return "com.mysql:mysql-connector-j:9.0.0" @property def jdbc_url(self) -> str: diff --git a/onetl/connection/db_connection/oracle/connection.py b/onetl/connection/db_connection/oracle/connection.py index 96ba9bd7..0d163d22 100644 --- a/onetl/connection/db_connection/oracle/connection.py +++ b/onetl/connection/db_connection/oracle/connection.py @@ -79,7 +79,7 @@ class Config: class Oracle(JDBCConnection): """Oracle JDBC connection. |support_hooks| - Based on Maven package `com.oracle.database.jdbc:ojdbc8:23.4.0.24.05 `_ + Based on Maven package `com.oracle.database.jdbc:ojdbc8:23.5.0.24.07 `_ (`official Oracle JDBC driver `_). .. seealso:: @@ -208,7 +208,7 @@ def get_packages( java_version : str, optional Java major version, defaults to "8". Must be "8" or "11". package_version : str, optional - Specifies the version of the Oracle JDBC driver to use. Defaults to "23.4.0.24.05". + Specifies the version of the Oracle JDBC driver to use. Defaults to "23.5.0.24.07". Examples -------- @@ -220,11 +220,11 @@ def get_packages( Oracle.get_packages() # specify Java and package versions - Oracle.get_packages(java_version="8", package_version="23.4.0.24.05") + Oracle.get_packages(java_version="8", package_version="23.5.0.24.07") """ default_java_version = "8" - default_package_version = "23.4.0.24.05" + default_package_version = "23.5.0.24.07" java_ver = Version(java_version or default_java_version) if java_ver.major < 8: @@ -240,7 +240,7 @@ def package(cls) -> str: """Get package name to be downloaded by Spark.""" msg = "`Oracle.package` will be removed in 1.0.0, use `Oracle.get_packages()` instead" warnings.warn(msg, UserWarning, stacklevel=3) - return "com.oracle.database.jdbc:ojdbc8:23.4.0.24.05" + return "com.oracle.database.jdbc:ojdbc8:23.5.0.24.07" @property def jdbc_url(self) -> str: diff --git a/onetl/connection/db_connection/postgres/connection.py b/onetl/connection/db_connection/postgres/connection.py index ac5e50d1..c8a272ba 100644 --- a/onetl/connection/db_connection/postgres/connection.py +++ b/onetl/connection/db_connection/postgres/connection.py @@ -44,7 +44,7 @@ class Config: class Postgres(JDBCConnection): """PostgreSQL JDBC connection. |support_hooks| - Based on Maven package `org.postgresql:postgresql:42.7.3 `_ + Based on Maven package `org.postgresql:postgresql:42.7.4 `_ (`official Postgres JDBC driver `_). .. 
seealso:: @@ -140,7 +140,7 @@ def get_packages(cls, package_version: str | None = None) -> list[str]: Parameters ---------- package_version : str, optional - Specifies the version of the PostgreSQL JDBC driver to use. Defaults to ``42.7.3``. + Specifies the version of the PostgreSQL JDBC driver to use. Defaults to ``42.7.4``. Examples -------- @@ -155,7 +155,7 @@ def get_packages(cls, package_version: str | None = None) -> list[str]: Postgres.get_packages(package_version="42.6.0") """ - default_version = "42.7.3" + default_version = "42.7.4" version = Version(package_version or default_version).min_digits(3) return [f"org.postgresql:postgresql:{version}"] @@ -165,7 +165,7 @@ def package(cls) -> str: """Get package name to be downloaded by Spark.""" msg = "`Postgres.package` will be removed in 1.0.0, use `Postgres.get_packages()` instead" warnings.warn(msg, UserWarning, stacklevel=3) - return "org.postgresql:postgresql:42.7.3" + return "org.postgresql:postgresql:42.7.4" @property def jdbc_url(self) -> str: diff --git a/onetl/file/format/xml.py b/onetl/file/format/xml.py index 7946f997..2e1ad003 100644 --- a/onetl/file/format/xml.py +++ b/onetl/file/format/xml.py @@ -200,7 +200,7 @@ def get_packages( # noqa: WPS231 raise ValueError(f"Package version must be above 0.13, got {version}") log.warning("Passed custom package version %r, it is not guaranteed to be supported", package_version) else: - version = Version("0.18.0").min_digits(3) + version = Version("0.18.0") spark_ver = Version(spark_version) scala_ver = Version(scala_version).min_digits(2) if scala_version else get_default_scala_version(spark_ver) diff --git a/tests/tests_unit/tests_db_connection_unit/test_clickhouse_unit.py b/tests/tests_unit/tests_db_connection_unit/test_clickhouse_unit.py index 287061d2..9bf7a068 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_clickhouse_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_clickhouse_unit.py @@ -11,7 +11,7 @@ def test_clickhouse_driver(): def test_clickhouse_package(): expected_packages = ( - "com.clickhouse:clickhouse-jdbc:0.6.0-patch5,com.clickhouse:clickhouse-http-client:0.6.0-patch5," + "com.clickhouse:clickhouse-jdbc:0.6.4,com.clickhouse:clickhouse-http-client:0.6.4," "org.apache.httpcomponents.client5:httpclient5:5.3.1" ) assert Clickhouse.package == expected_packages @@ -24,8 +24,8 @@ def test_clickhouse_package(): None, None, [ - "com.clickhouse:clickhouse-jdbc:0.6.0-patch5", - "com.clickhouse:clickhouse-http-client:0.6.0-patch5", + "com.clickhouse:clickhouse-jdbc:0.6.4", + "com.clickhouse:clickhouse-http-client:0.6.4", "org.apache.httpcomponents.client5:httpclient5:5.3.1", ], ), diff --git a/tests/tests_unit/tests_db_connection_unit/test_mongodb_unit.py b/tests/tests_unit/tests_db_connection_unit/test_mongodb_unit.py index 9142848e..4e67f93a 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_mongodb_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_mongodb_unit.py @@ -12,9 +12,9 @@ def test_mongodb_package(): warning_msg = re.escape("will be removed in 1.0.0, use `MongoDB.get_packages(spark_version=") with pytest.warns(UserWarning, match=warning_msg): - assert MongoDB.package_spark_3_2 == "org.mongodb.spark:mongo-spark-connector_2.12:10.3.0" - assert MongoDB.package_spark_3_3 == "org.mongodb.spark:mongo-spark-connector_2.12:10.3.0" - assert MongoDB.package_spark_3_4 == "org.mongodb.spark:mongo-spark-connector_2.12:10.3.0" + assert MongoDB.package_spark_3_2 == "org.mongodb.spark:mongo-spark-connector_2.12:10.4.0" + assert 
MongoDB.package_spark_3_3 == "org.mongodb.spark:mongo-spark-connector_2.12:10.4.0" + assert MongoDB.package_spark_3_4 == "org.mongodb.spark:mongo-spark-connector_2.12:10.4.0" def test_mongodb_get_packages_no_input(): @@ -50,16 +50,16 @@ def test_mongodb_get_packages_scala_version_not_supported(scala_version): @pytest.mark.parametrize( "spark_version, scala_version, package_version, package", [ - (None, "2.12", "10.3.0", "org.mongodb.spark:mongo-spark-connector_2.12:10.3.0"), - (None, "2.13", "10.3.0", "org.mongodb.spark:mongo-spark-connector_2.13:10.3.0"), - ("3.2", None, "10.3.0", "org.mongodb.spark:mongo-spark-connector_2.12:10.3.0"), - ("3.3", None, "10.3.0", "org.mongodb.spark:mongo-spark-connector_2.12:10.3.0"), - ("3.4", None, "10.3.0", "org.mongodb.spark:mongo-spark-connector_2.12:10.3.0"), + (None, "2.12", "10.4.0", "org.mongodb.spark:mongo-spark-connector_2.12:10.4.0"), + (None, "2.13", "10.4.0", "org.mongodb.spark:mongo-spark-connector_2.13:10.4.0"), + ("3.2", None, "10.4.0", "org.mongodb.spark:mongo-spark-connector_2.12:10.4.0"), + ("3.3", None, "10.4.0", "org.mongodb.spark:mongo-spark-connector_2.12:10.4.0"), + ("3.4", None, "10.4.0", "org.mongodb.spark:mongo-spark-connector_2.12:10.4.0"), ("3.2", "2.12", "10.1.1", "org.mongodb.spark:mongo-spark-connector_2.12:10.1.1"), ("3.4", "2.13", "10.1.1", "org.mongodb.spark:mongo-spark-connector_2.13:10.1.1"), ("3.2", "2.12", "10.2.1", "org.mongodb.spark:mongo-spark-connector_2.12:10.2.1"), ("3.2", "2.12", "10.2.0", "org.mongodb.spark:mongo-spark-connector_2.12:10.2.0"), - ("3.2.4", "2.12.1", "10.3.0", "org.mongodb.spark:mongo-spark-connector_2.12:10.3.0"), + ("3.2.4", "2.12.1", "10.4.0", "org.mongodb.spark:mongo-spark-connector_2.12:10.4.0"), ], ) def test_mongodb_get_packages(spark_version, scala_version, package_version, package): diff --git a/tests/tests_unit/tests_db_connection_unit/test_mssql_unit.py b/tests/tests_unit/tests_db_connection_unit/test_mssql_unit.py index d9f3cfda..aedd1990 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_mssql_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_mssql_unit.py @@ -14,23 +14,23 @@ def test_mssql_class_attributes(): def test_mssql_package(): warning_msg = re.escape("will be removed in 1.0.0, use `MSSQL.get_packages()` instead") with pytest.warns(UserWarning, match=warning_msg): - assert MSSQL.package == "com.microsoft.sqlserver:mssql-jdbc:12.6.2.jre8" + assert MSSQL.package == "com.microsoft.sqlserver:mssql-jdbc:12.8.1.jre8" @pytest.mark.parametrize( "java_version, package_version, expected_packages", [ - (None, None, ["com.microsoft.sqlserver:mssql-jdbc:12.6.2.jre8"]), - ("8", None, ["com.microsoft.sqlserver:mssql-jdbc:12.6.2.jre8"]), - ("9", None, ["com.microsoft.sqlserver:mssql-jdbc:12.6.2.jre8"]), - ("11", None, ["com.microsoft.sqlserver:mssql-jdbc:12.6.2.jre11"]), - ("20", None, ["com.microsoft.sqlserver:mssql-jdbc:12.6.2.jre11"]), - ("8", "12.6.2.jre8", ["com.microsoft.sqlserver:mssql-jdbc:12.6.2.jre8"]), - ("11", "12.6.2.jre11", ["com.microsoft.sqlserver:mssql-jdbc:12.6.2.jre11"]), + (None, None, ["com.microsoft.sqlserver:mssql-jdbc:12.8.1.jre8"]), + ("8", None, ["com.microsoft.sqlserver:mssql-jdbc:12.8.1.jre8"]), + ("9", None, ["com.microsoft.sqlserver:mssql-jdbc:12.8.1.jre8"]), + ("11", None, ["com.microsoft.sqlserver:mssql-jdbc:12.8.1.jre11"]), + ("20", None, ["com.microsoft.sqlserver:mssql-jdbc:12.8.1.jre11"]), + ("8", "12.8.1.jre8", ["com.microsoft.sqlserver:mssql-jdbc:12.8.1.jre8"]), + ("11", "12.8.1.jre11", 
["com.microsoft.sqlserver:mssql-jdbc:12.8.1.jre11"]), ("11", "12.7.0.jre11-preview", ["com.microsoft.sqlserver:mssql-jdbc:12.7.0.jre11-preview"]), ("8", "12.7.0.jre8-preview", ["com.microsoft.sqlserver:mssql-jdbc:12.7.0.jre8-preview"]), - ("8", "12.6.2", ["com.microsoft.sqlserver:mssql-jdbc:12.6.2.jre8"]), - ("11", "12.6.2", ["com.microsoft.sqlserver:mssql-jdbc:12.6.2.jre11"]), + ("8", "12.8.1", ["com.microsoft.sqlserver:mssql-jdbc:12.8.1.jre8"]), + ("11", "12.8.1", ["com.microsoft.sqlserver:mssql-jdbc:12.8.1.jre11"]), ], ) def test_mssql_get_packages(java_version, package_version, expected_packages): diff --git a/tests/tests_unit/tests_db_connection_unit/test_mysql_unit.py b/tests/tests_unit/tests_db_connection_unit/test_mysql_unit.py index 0d57da48..54913c07 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_mysql_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_mysql_unit.py @@ -14,14 +14,14 @@ def test_mysql_class_attributes(): def test_mysql_package(): warning_msg = re.escape("will be removed in 1.0.0, use `MySQL.get_packages()` instead") with pytest.warns(UserWarning, match=warning_msg): - assert MySQL.package == "com.mysql:mysql-connector-j:8.4.0" + assert MySQL.package == "com.mysql:mysql-connector-j:9.0.0" @pytest.mark.parametrize( "package_version, expected_packages", [ - (None, ["com.mysql:mysql-connector-j:8.4.0"]), - ("8.4.0", ["com.mysql:mysql-connector-j:8.4.0"]), + (None, ["com.mysql:mysql-connector-j:9.0.0"]), + ("9.0.0", ["com.mysql:mysql-connector-j:9.0.0"]), ("8.1.0", ["com.mysql:mysql-connector-j:8.1.0"]), ("8.0.33", ["com.mysql:mysql-connector-j:8.0.33"]), ], diff --git a/tests/tests_unit/tests_db_connection_unit/test_oracle_unit.py b/tests/tests_unit/tests_db_connection_unit/test_oracle_unit.py index dd02b5c9..d0618c89 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_oracle_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_oracle_unit.py @@ -14,11 +14,11 @@ def test_oracle_class_attributes(): def test_oracle_package(): warning_msg = re.escape("will be removed in 1.0.0, use `Oracle.get_packages()` instead") with pytest.warns(UserWarning, match=warning_msg): - assert Oracle.package == "com.oracle.database.jdbc:ojdbc8:23.4.0.24.05" + assert Oracle.package == "com.oracle.database.jdbc:ojdbc8:23.5.0.24.07" def test_oracle_get_packages_no_input(): - assert Oracle.get_packages() == ["com.oracle.database.jdbc:ojdbc8:23.4.0.24.05"] + assert Oracle.get_packages() == ["com.oracle.database.jdbc:ojdbc8:23.5.0.24.07"] @pytest.mark.parametrize("java_version", ["7", "6"]) @@ -30,16 +30,16 @@ def test_oracle_get_packages_java_version_not_supported(java_version): @pytest.mark.parametrize( "java_version, package_version, expected_packages", [ - (None, None, ["com.oracle.database.jdbc:ojdbc8:23.4.0.24.05"]), - ("8", None, ["com.oracle.database.jdbc:ojdbc8:23.4.0.24.05"]), - ("8", "23.4.0.24.05", ["com.oracle.database.jdbc:ojdbc8:23.4.0.24.05"]), + (None, None, ["com.oracle.database.jdbc:ojdbc8:23.5.0.24.07"]), + ("8", None, ["com.oracle.database.jdbc:ojdbc8:23.5.0.24.07"]), + ("8", "23.5.0.24.07", ["com.oracle.database.jdbc:ojdbc8:23.5.0.24.07"]), ("8", "21.13.0.0", ["com.oracle.database.jdbc:ojdbc8:21.13.0.0"]), - ("9", None, ["com.oracle.database.jdbc:ojdbc8:23.4.0.24.05"]), + ("9", None, ["com.oracle.database.jdbc:ojdbc8:23.5.0.24.07"]), ("9", "21.13.0.0", ["com.oracle.database.jdbc:ojdbc8:21.13.0.0"]), - ("11", None, ["com.oracle.database.jdbc:ojdbc11:23.4.0.24.05"]), + ("11", None, ["com.oracle.database.jdbc:ojdbc11:23.5.0.24.07"]), ("11", 
"21.13.0.0", ["com.oracle.database.jdbc:ojdbc11:21.13.0.0"]), ("17", "21.13.0.0", ["com.oracle.database.jdbc:ojdbc11:21.13.0.0"]), - ("20", "23.4.0.24.05", ["com.oracle.database.jdbc:ojdbc11:23.4.0.24.05"]), + ("20", "23.5.0.24.07", ["com.oracle.database.jdbc:ojdbc11:23.5.0.24.07"]), ], ) def test_oracle_get_packages(java_version, package_version, expected_packages): diff --git a/tests/tests_unit/tests_db_connection_unit/test_postgres_unit.py b/tests/tests_unit/tests_db_connection_unit/test_postgres_unit.py index 2b0080bf..eae69579 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_postgres_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_postgres_unit.py @@ -14,15 +14,15 @@ def test_postgres_class_attributes(): def test_postgres_package(): warning_msg = re.escape("will be removed in 1.0.0, use `Postgres.get_packages()` instead") with pytest.warns(UserWarning, match=warning_msg): - assert Postgres.package == "org.postgresql:postgresql:42.7.3" + assert Postgres.package == "org.postgresql:postgresql:42.7.4" @pytest.mark.parametrize( "package_version, expected_packages", [ - (None, ["org.postgresql:postgresql:42.7.3"]), - ("42.7.3", ["org.postgresql:postgresql:42.7.3"]), - ("42.7.3-patch", ["org.postgresql:postgresql:42.7.3-patch"]), + (None, ["org.postgresql:postgresql:42.7.4"]), + ("42.7.4", ["org.postgresql:postgresql:42.7.4"]), + ("42.7.4-patch", ["org.postgresql:postgresql:42.7.4-patch"]), ("42.6.0", ["org.postgresql:postgresql:42.6.0"]), ], ) From d50c123086a5176d19e5c088c2e926a9ae5964f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Thu, 29 Aug 2024 11:37:00 +0000 Subject: [PATCH 53/64] Update JDBC prerequisites --- docs/connection/db_connection/clickhouse/prerequisites.rst | 4 +++- docs/connection/db_connection/mongodb/prerequisites.rst | 2 +- docs/connection/db_connection/mssql/prerequisites.rst | 4 +++- docs/connection/db_connection/mysql/prerequisites.rst | 6 ++++-- docs/connection/db_connection/oracle/prerequisites.rst | 4 +++- docs/connection/db_connection/postgres/prerequisites.rst | 2 +- 6 files changed, 15 insertions(+), 7 deletions(-) diff --git a/docs/connection/db_connection/clickhouse/prerequisites.rst b/docs/connection/db_connection/clickhouse/prerequisites.rst index 03384b1a..b9a7577b 100644 --- a/docs/connection/db_connection/clickhouse/prerequisites.rst +++ b/docs/connection/db_connection/clickhouse/prerequisites.rst @@ -6,7 +6,9 @@ Prerequisites Version Compatibility --------------------- -* Clickhouse server versions: 21.1 or higher +* Clickhouse server versions: + * Officially declared: 22.8 or higher + * Actually supported: 21.1 or higher * Spark versions: 2.3.x - 3.5.x * Java versions: 8 - 20 diff --git a/docs/connection/db_connection/mongodb/prerequisites.rst b/docs/connection/db_connection/mongodb/prerequisites.rst index 7df5f502..5cc8179a 100644 --- a/docs/connection/db_connection/mongodb/prerequisites.rst +++ b/docs/connection/db_connection/mongodb/prerequisites.rst @@ -10,7 +10,7 @@ Version Compatibility * Spark versions: 3.2.x - 3.5.x * Java versions: 8 - 20 -See `official documentation `_. +See `official documentation `_. 
Installing PySpark ------------------ diff --git a/docs/connection/db_connection/mssql/prerequisites.rst b/docs/connection/db_connection/mssql/prerequisites.rst index 8dde0f6c..c3c9059a 100644 --- a/docs/connection/db_connection/mssql/prerequisites.rst +++ b/docs/connection/db_connection/mssql/prerequisites.rst @@ -6,7 +6,9 @@ Prerequisites Version Compatibility --------------------- -* SQL Server versions: 2014 - 2022 +* SQL Server versions: + * Officially declared: 2016 - 2022 + * Actually supported: 2014 - 2022 * Spark versions: 2.3.x - 3.5.x * Java versions: 8 - 20 diff --git a/docs/connection/db_connection/mysql/prerequisites.rst b/docs/connection/db_connection/mysql/prerequisites.rst index b92f3320..b98186bc 100644 --- a/docs/connection/db_connection/mysql/prerequisites.rst +++ b/docs/connection/db_connection/mysql/prerequisites.rst @@ -6,11 +6,13 @@ Prerequisites Version Compatibility --------------------- -* MySQL server versions: 5.7 - 9.0 +* MySQL server versions: + * Officially declared: 8.0 - 9.0 + * Actually supported: 5.7 - 9.0 * Spark versions: 2.3.x - 3.5.x * Java versions: 8 - 20 -See `official documentation `_. +See `official documentation `_. Installing PySpark ------------------ diff --git a/docs/connection/db_connection/oracle/prerequisites.rst b/docs/connection/db_connection/oracle/prerequisites.rst index b5b64e43..82fa55a4 100644 --- a/docs/connection/db_connection/oracle/prerequisites.rst +++ b/docs/connection/db_connection/oracle/prerequisites.rst @@ -6,7 +6,9 @@ Prerequisites Version Compatibility --------------------- -* Oracle Server versions: 23, 21, 19, 18, 12.2 and __probably__ 11.2 (tested, but it's not mentioned in official docs). +* Oracle Server versions: + * Officially declared: 19 - 23 + * Actually supported: 11.2 - 23 * Spark versions: 2.3.x - 3.5.x * Java versions: 8 - 20 diff --git a/docs/connection/db_connection/postgres/prerequisites.rst b/docs/connection/db_connection/postgres/prerequisites.rst index ef83144f..fd8aadc8 100644 --- a/docs/connection/db_connection/postgres/prerequisites.rst +++ b/docs/connection/db_connection/postgres/prerequisites.rst @@ -10,7 +10,7 @@ Version Compatibility * Spark versions: 2.3.x - 3.5.x * Java versions: 8 - 20 -See `official documentation `_. +See `official documentation `_. 
Installing PySpark ------------------ From 4010f9beff61e629da932ab9ca295a78ce56f9e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Fri, 30 Aug 2024 08:07:46 +0000 Subject: [PATCH 54/64] Update DB prerequisites --- .github/workflows/data/clickhouse/matrix.yml | 2 +- .github/workflows/data/local-fs/matrix.yml | 3 +-- .github/workflows/data/mongodb/matrix.yml | 2 +- .github/workflows/data/postgres/matrix.yml | 4 ++-- .github/workflows/data/s3/matrix.yml | 2 +- .github/workflows/data/sftp/matrix.yml | 2 +- docker-compose.yml | 2 +- docs/connection/db_connection/clickhouse/prerequisites.rst | 2 +- docs/connection/db_connection/greenplum/prerequisites.rst | 4 +++- docs/connection/db_connection/hive/prerequisites.rst | 4 +++- docs/connection/db_connection/kafka/prerequisites.rst | 4 +++- docs/connection/db_connection/mongodb/prerequisites.rst | 4 +++- docs/connection/db_connection/mssql/prerequisites.rst | 2 +- docs/connection/db_connection/mysql/prerequisites.rst | 2 +- docs/connection/db_connection/oracle/prerequisites.rst | 2 +- docs/connection/db_connection/postgres/prerequisites.rst | 4 +++- docs/connection/db_connection/teradata/prerequisites.rst | 4 +++- 17 files changed, 30 insertions(+), 19 deletions(-) diff --git a/.github/workflows/data/clickhouse/matrix.yml b/.github/workflows/data/clickhouse/matrix.yml index d18856df..1373d5e8 100644 --- a/.github/workflows/data/clickhouse/matrix.yml +++ b/.github/workflows/data/clickhouse/matrix.yml @@ -10,7 +10,7 @@ min: &min max: &max clickhouse-image: clickhouse/clickhouse-server - clickhouse-version: 24.6.3.70-alpine + clickhouse-version: 24.8.2.3-alpine spark-version: 3.5.2 pydantic-version: 2 python-version: '3.12' diff --git a/.github/workflows/data/local-fs/matrix.yml b/.github/workflows/data/local-fs/matrix.yml index c4466f3c..081d9eb8 100644 --- a/.github/workflows/data/local-fs/matrix.yml +++ b/.github/workflows/data/local-fs/matrix.yml @@ -20,8 +20,7 @@ min_excel: &min_excel os: ubuntu-latest max: &max - # Excel package currently has no release for 3.5.2 - spark-version: 3.5.1 + spark-version: 3.5.2 pydantic-version: 2 python-version: '3.12' java-version: 20 diff --git a/.github/workflows/data/mongodb/matrix.yml b/.github/workflows/data/mongodb/matrix.yml index 4c3d9d86..11892d65 100644 --- a/.github/workflows/data/mongodb/matrix.yml +++ b/.github/workflows/data/mongodb/matrix.yml @@ -8,7 +8,7 @@ min: &min os: ubuntu-latest max: &max - mongodb-version: 7.0.12 + mongodb-version: 7.0.14 spark-version: 3.5.2 pydantic-version: 2 python-version: '3.12' diff --git a/.github/workflows/data/postgres/matrix.yml b/.github/workflows/data/postgres/matrix.yml index d37c3a83..cd63ae03 100644 --- a/.github/workflows/data/postgres/matrix.yml +++ b/.github/workflows/data/postgres/matrix.yml @@ -1,5 +1,5 @@ min: &min - # Min supported version by JDBC driver is 8.4, but it is too ancient to be used by anyone in real life + # Min supported version by JDBC driver is 8.2, but it is too ancient to be used by anyone in real life postgres-version: 9.4.26-alpine spark-version: 2.3.1 pydantic-version: 1 @@ -8,7 +8,7 @@ min: &min os: ubuntu-latest max: &max - postgres-version: 16.3-alpine + postgres-version: 16.4-alpine spark-version: 3.5.2 pydantic-version: 2 python-version: '3.12' diff --git a/.github/workflows/data/s3/matrix.yml b/.github/workflows/data/s3/matrix.yml index 405b8b68..ffb9aff9 
100644 --- a/.github/workflows/data/s3/matrix.yml +++ b/.github/workflows/data/s3/matrix.yml @@ -9,7 +9,7 @@ min: &min os: ubuntu-latest max: &max - minio-version: 2024.7.26 + minio-version: RELEASE.2024-08-29T01-40-52Z spark-version: 3.5.2 pydantic-version: 2 python-version: '3.12' diff --git a/.github/workflows/data/sftp/matrix.yml b/.github/workflows/data/sftp/matrix.yml index 5a5a757c..e54f796c 100644 --- a/.github/workflows/data/sftp/matrix.yml +++ b/.github/workflows/data/sftp/matrix.yml @@ -6,7 +6,7 @@ min: &min os: ubuntu-latest max: &max - openssh-version: 9.6_p1-r0-ls154 + openssh-version: 9.7_p1-r4-ls166 pydantic-version: 2 python-version: '3.12' os: ubuntu-latest diff --git a/docker-compose.yml b/docker-compose.yml index 73e8a21e..d32f682a 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -118,7 +118,7 @@ services: platform: linux/amd64 postgres: - image: ${POSTGRES_IMAGE:-postgres:15.2-alpine} + image: ${POSTGRES_IMAGE:-postgres:alpine} restart: unless-stopped env_file: .env.dependencies ports: diff --git a/docs/connection/db_connection/clickhouse/prerequisites.rst b/docs/connection/db_connection/clickhouse/prerequisites.rst index b9a7577b..c66be635 100644 --- a/docs/connection/db_connection/clickhouse/prerequisites.rst +++ b/docs/connection/db_connection/clickhouse/prerequisites.rst @@ -8,7 +8,7 @@ Version Compatibility * Clickhouse server versions: * Officially declared: 22.8 or higher - * Actually supported: 21.1 or higher + * Actually tested: 21.1, 24.8 * Spark versions: 2.3.x - 3.5.x * Java versions: 8 - 20 diff --git a/docs/connection/db_connection/greenplum/prerequisites.rst b/docs/connection/db_connection/greenplum/prerequisites.rst index 04595766..b3cf52d7 100644 --- a/docs/connection/db_connection/greenplum/prerequisites.rst +++ b/docs/connection/db_connection/greenplum/prerequisites.rst @@ -6,7 +6,9 @@ Prerequisites Version Compatibility --------------------- -* Greenplum server versions: 5.x, 6.x, 7.x (requires ``Greenplum.get_packages(package_version="2.3.0")`` or higher) +* Greenplum server versions: + * Officially declared: 5.x, 6.x, and 7.x (which requires ``Greenplum.get_packages(package_version="2.3.0")`` or higher) + * Actually tested: 6.23, 7.0 * Spark versions: 2.3.x - 3.2.x (Spark 3.3+ is not supported yet) * Java versions: 8 - 11 diff --git a/docs/connection/db_connection/hive/prerequisites.rst b/docs/connection/db_connection/hive/prerequisites.rst index d690f918..0f56e7ba 100644 --- a/docs/connection/db_connection/hive/prerequisites.rst +++ b/docs/connection/db_connection/hive/prerequisites.rst @@ -14,7 +14,9 @@ Prerequisites Version Compatibility --------------------- -* Hive Metastore version: 0.12 - 3.1.3 (may require to add proper .jar file explicitly) +* Hive Metastore version: + * Officially declared: 0.12 - 3.1.3 (may require to add proper .jar file explicitly) + * Actually tested: 1.2.100, 2.3.10, 3.1.3 * Spark versions: 2.3.x - 3.5.x * Java versions: 8 - 20 diff --git a/docs/connection/db_connection/kafka/prerequisites.rst b/docs/connection/db_connection/kafka/prerequisites.rst index 29f5885b..db2598d1 100644 --- a/docs/connection/db_connection/kafka/prerequisites.rst +++ b/docs/connection/db_connection/kafka/prerequisites.rst @@ -6,7 +6,9 @@ Prerequisites Version Compatibility --------------------- -* Kafka server versions: 0.10 or higher +* Kafka server versions: + * Officially declared: 0.10 or higher + * Actually tested: 3.2.3, 3.7.1 (only 3.x supports message headers) * Spark versions: 2.4.x - 3.5.x * Java versions: 8 - 17 diff 
--git a/docs/connection/db_connection/mongodb/prerequisites.rst b/docs/connection/db_connection/mongodb/prerequisites.rst index 5cc8179a..8a01d675 100644 --- a/docs/connection/db_connection/mongodb/prerequisites.rst +++ b/docs/connection/db_connection/mongodb/prerequisites.rst @@ -6,7 +6,9 @@ Prerequisites Version Compatibility --------------------- -* MongoDB server versions: 4.0 or higher +* MongoDB server versions: + * Officially declared: 4.0 or higher + * Actually tested: 4.0, 7.0 * Spark versions: 3.2.x - 3.5.x * Java versions: 8 - 20 diff --git a/docs/connection/db_connection/mssql/prerequisites.rst b/docs/connection/db_connection/mssql/prerequisites.rst index c3c9059a..4e9fd263 100644 --- a/docs/connection/db_connection/mssql/prerequisites.rst +++ b/docs/connection/db_connection/mssql/prerequisites.rst @@ -8,7 +8,7 @@ Version Compatibility * SQL Server versions: * Officially declared: 2016 - 2022 - * Actually supported: 2014 - 2022 + * Actually tested: 2014, 2022 * Spark versions: 2.3.x - 3.5.x * Java versions: 8 - 20 diff --git a/docs/connection/db_connection/mysql/prerequisites.rst b/docs/connection/db_connection/mysql/prerequisites.rst index b98186bc..15b7c574 100644 --- a/docs/connection/db_connection/mysql/prerequisites.rst +++ b/docs/connection/db_connection/mysql/prerequisites.rst @@ -8,7 +8,7 @@ Version Compatibility * MySQL server versions: * Officially declared: 8.0 - 9.0 - * Actually supported: 5.7 - 9.0 + * Actually tested: 5.7, 9.0 * Spark versions: 2.3.x - 3.5.x * Java versions: 8 - 20 diff --git a/docs/connection/db_connection/oracle/prerequisites.rst b/docs/connection/db_connection/oracle/prerequisites.rst index 82fa55a4..35dd7569 100644 --- a/docs/connection/db_connection/oracle/prerequisites.rst +++ b/docs/connection/db_connection/oracle/prerequisites.rst @@ -8,7 +8,7 @@ Version Compatibility * Oracle Server versions: * Officially declared: 19 - 23 - * Actually supported: 11.2 - 23 + * Actually tested: 11.2, 23 * Spark versions: 2.3.x - 3.5.x * Java versions: 8 - 20 diff --git a/docs/connection/db_connection/postgres/prerequisites.rst b/docs/connection/db_connection/postgres/prerequisites.rst index fd8aadc8..b1961b0d 100644 --- a/docs/connection/db_connection/postgres/prerequisites.rst +++ b/docs/connection/db_connection/postgres/prerequisites.rst @@ -6,7 +6,9 @@ Prerequisites Version Compatibility --------------------- -* PostgreSQL server versions: 8.2 - 16 +* PostgreSQL server versions: + * Officially declared: 8.2 - 16 + * Actually tested: 9.4, 16 * Spark versions: 2.3.x - 3.5.x * Java versions: 8 - 20 diff --git a/docs/connection/db_connection/teradata/prerequisites.rst b/docs/connection/db_connection/teradata/prerequisites.rst index 294f9d53..69f83c49 100644 --- a/docs/connection/db_connection/teradata/prerequisites.rst +++ b/docs/connection/db_connection/teradata/prerequisites.rst @@ -6,7 +6,9 @@ Prerequisites Version Compatibility --------------------- -* Teradata server versions: 16.10 - 20.0 +* Teradata server versions: + * Officially declared: 16.10 - 20.0 + * Actually tested: 16.10 * Spark versions: 2.3.x - 3.5.x * Java versions: 8 - 20 From cb6187402201a8bf71740cbfad33711e16bc7084 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Fri, 30 Aug 2024 09:19:31 +0000 Subject: [PATCH 55/64] Fix CI --- .github/workflows/data/local-fs/matrix.yml | 3 ++- .github/workflows/data/s3/matrix.yml | 2 
+- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/data/local-fs/matrix.yml b/.github/workflows/data/local-fs/matrix.yml index 081d9eb8..365d1d3c 100644 --- a/.github/workflows/data/local-fs/matrix.yml +++ b/.github/workflows/data/local-fs/matrix.yml @@ -20,7 +20,8 @@ min_excel: &min_excel os: ubuntu-latest max: &max - spark-version: 3.5.2 + # Excel package currently supports Spark 3.5.1 max + spark-version: 3.5.1 pydantic-version: 2 python-version: '3.12' java-version: 20 diff --git a/.github/workflows/data/s3/matrix.yml b/.github/workflows/data/s3/matrix.yml index ffb9aff9..3990f312 100644 --- a/.github/workflows/data/s3/matrix.yml +++ b/.github/workflows/data/s3/matrix.yml @@ -9,7 +9,7 @@ min: &min os: ubuntu-latest max: &max - minio-version: RELEASE.2024-08-29T01-40-52Z + minio-version: 2024.8.29 spark-version: 3.5.2 pydantic-version: 2 python-version: '3.12' From baad512aba92f40c28edfb63265386e0d60d93ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Mon, 2 Sep 2024 07:50:47 +0000 Subject: [PATCH 56/64] [DOP-18948] Add 0.11.2 to CHANGELOG --- docs/changelog/0.11.2.rst | 7 +++++++ docs/changelog/0.12.0.rst | 2 +- docs/changelog/index.rst | 1 + 3 files changed, 9 insertions(+), 1 deletion(-) create mode 100644 docs/changelog/0.11.2.rst diff --git a/docs/changelog/0.11.2.rst b/docs/changelog/0.11.2.rst new file mode 100644 index 00000000..5c954cbf --- /dev/null +++ b/docs/changelog/0.11.2.rst @@ -0,0 +1,7 @@ +0.11.2 (2024-09-02) +=================== + +Bug Fixes +--------- + +- Fix passing ``Greenplum(extra={"options": ...})`` during read/write operations. (:github:pull:`308`) diff --git a/docs/changelog/0.12.0.rst b/docs/changelog/0.12.0.rst index d0a87cc7..eb8af50d 100644 --- a/docs/changelog/0.12.0.rst +++ b/docs/changelog/0.12.0.rst @@ -57,5 +57,5 @@ Features Bug Fixes --------- -- Fix passing ``Greenplum(extra={"options": ...)`` during read/write operations. (:github:pull:`308`) +- Fix passing ``Greenplum(extra={"options": ...})`` during read/write operations. (:github:pull:`308`) - Do not raise exception if yield-based hook has something past (and only one) ``yield``. diff --git a/docs/changelog/index.rst b/docs/changelog/index.rst index 7700528e..756f0cb1 100644 --- a/docs/changelog/index.rst +++ b/docs/changelog/index.rst @@ -4,6 +4,7 @@ DRAFT 0.12.0 + 0.11.2 0.11.1 0.11.0 0.10.2 From 2e713e345f76b9e06322848f208cad5faba32a57 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Mon, 2 Sep 2024 08:02:46 +0000 Subject: [PATCH 57/64] [DOP-18743] Update jobDescription format --- docs/changelog/0.12.0.rst | 4 ++-- .../db_connection/mongodb/connection.py | 5 +---- onetl/db/db_reader/db_reader.py | 15 +++++++-------- onetl/db/db_writer/db_writer.py | 15 +++++++-------- onetl/file/file_df_reader/file_df_reader.py | 11 +++++++---- onetl/file/file_df_writer/file_df_writer.py | 15 +++++++-------- 6 files changed, 31 insertions(+), 34 deletions(-) diff --git a/docs/changelog/0.12.0.rst b/docs/changelog/0.12.0.rst index eb8af50d..4a09682a 100644 --- a/docs/changelog/0.12.0.rst +++ b/docs/changelog/0.12.0.rst @@ -39,8 +39,8 @@ Features are not returned as method call result.
(:github:pull:`303`) - Generate default ``jobDescription`` based on currently executed method. Examples: - * ``DBWriter() -> Postgres[host:5432/database]`` - * ``MongoDB[localhost:27017/admin] -> DBReader.run()`` + * ``DBWriter[schema.table].run() -> Postgres[host:5432/database]`` + * ``MongoDB[localhost:27017/admin] -> DBReader[mycollection].run()`` * ``Hive[cluster].execute()`` If user already set custom ``jobDescription``, it will be left intact. (:github:pull:`304`) diff --git a/onetl/connection/db_connection/mongodb/connection.py b/onetl/connection/db_connection/mongodb/connection.py index f406a232..f5495238 100644 --- a/onetl/connection/db_connection/mongodb/connection.py +++ b/onetl/connection/db_connection/mongodb/connection.py @@ -348,10 +348,7 @@ def pipeline( read_options["aggregation.pipeline"] = json.dumps(pipeline) read_options["connection.uri"] = self.connection_url - with override_job_description( - self.spark, - f"{self}.pipeline()", - ): + with override_job_description(self.spark, f"{self}.pipeline()"): spark_reader = self.spark.read.format("mongodb").options(**read_options) if df_schema: diff --git a/onetl/db/db_reader/db_reader.py b/onetl/db/db_reader/db_reader.py index a4f45ab0..42f17db3 100644 --- a/onetl/db/db_reader/db_reader.py +++ b/onetl/db/db_reader/db_reader.py @@ -542,10 +542,8 @@ def has_data(self) -> bool: """ self._check_strategy() - with override_job_description( - self.connection.spark, - f"{self.connection} -> {self.__class__.__name__}.has_data()", - ): + job_description = f"{self}.has_data()" + with override_job_description(self.connection.spark, job_description): if not self._connection_checked: self._log_parameters() self.connection.check() @@ -637,10 +635,8 @@ def run(self) -> DataFrame: self._check_strategy() - with override_job_description( - self.connection.spark, - f"{self.connection} -> {self.__class__.__name__}.run()", - ): + job_description = f"{self}.run() -> {self.connection}" + with override_job_description(self.connection.spark, job_description): if not self._connection_checked: self._log_parameters() self.connection.check() @@ -667,6 +663,9 @@ def run(self) -> DataFrame: entity_boundary_log(log, msg=f"{self.__class__.__name__}.run() ends", char="-") return df + def __str__(self): + return f"{self.__class__.__name__}[{self.source}]" + def _check_strategy(self): strategy = StrategyManager.get_current() class_name = type(self).__name__ diff --git a/onetl/db/db_writer/db_writer.py b/onetl/db/db_writer/db_writer.py index c261ad23..5206c07d 100644 --- a/onetl/db/db_writer/db_writer.py +++ b/onetl/db/db_writer/db_writer.py @@ -202,10 +202,9 @@ def run(self, df: DataFrame) -> None: raise ValueError(f"DataFrame is streaming.
{self.__class__.__name__} supports only batch DataFrames.") entity_boundary_log(log, msg=f"{self.__class__.__name__}.run() starts") - with override_job_description( - self.connection.spark, - f"{self.__class__.__name__}.run() -> {self.connection}", - ): + + job_description = f"{self}.run() -> {self.connection}" + with override_job_description(self.connection.spark, job_description): if not self._connection_checked: self._log_parameters() log_dataframe_schema(log, df) @@ -214,10 +213,7 @@ def run(self, df: DataFrame) -> None: with SparkMetricsRecorder(self.connection.spark) as recorder: try: - with override_job_description( - self.connection.spark, - f"{self.__class__.__name__}.run() -> {self.connection}", - ): + with override_job_description(self.connection.spark, job_description): self.connection.write_df_to_target( df=df, target=str(self.target), @@ -244,6 +240,9 @@ def run(self, df: DataFrame) -> None: entity_boundary_log(log, msg=f"{self.__class__.__name__}.run() ends", char="-") + def __str__(self): + return f"{self.__class__.__name__}[{self.target}]" + def _log_parameters(self) -> None: log.info("|Spark| -> |%s| Writing DataFrame to target using parameters:", self.connection.__class__.__name__) log_with_indent(log, "target = '%s'", self.target) diff --git a/onetl/file/file_df_reader/file_df_reader.py b/onetl/file/file_df_reader/file_df_reader.py index 517e23a9..ed83bbc5 100644 --- a/onetl/file/file_df_reader/file_df_reader.py +++ b/onetl/file/file_df_reader/file_df_reader.py @@ -211,10 +211,8 @@ def run(self, files: Iterable[str | os.PathLike] | None = None) -> DataFrame: if not self._connection_checked: self._log_parameters(files) - with override_job_description( - self.connection.spark, - f"{self.connection} -> {self.__class__.__name__}.run()", - ): + job_description = f"{self}.run() -> {self.connection}" + with override_job_description(self.connection.spark, job_description): paths: FileSet[PurePathProtocol] = FileSet() if files is not None: paths = FileSet(self._validate_files(files)) @@ -231,6 +229,11 @@ def run(self, files: Iterable[str | os.PathLike] | None = None) -> DataFrame: entity_boundary_log(log, msg=f"{self.__class__.__name__}.run() ends", char="-") return df + def __str__(self): + if self.source_path: + return f"{self.__class__.__name__}[{os.fspath(self.source_path)}]" + return f"{self.__class__.__name__}" + def _read_files(self, paths: FileSet[PurePathProtocol]) -> DataFrame: log.info("|%s| Paths to be read:", self.__class__.__name__) log_lines(log, str(paths)) diff --git a/onetl/file/file_df_writer/file_df_writer.py b/onetl/file/file_df_writer/file_df_writer.py index 0daea008..aeda5f7b 100644 --- a/onetl/file/file_df_writer/file_df_writer.py +++ b/onetl/file/file_df_writer/file_df_writer.py @@ -3,6 +3,7 @@ from __future__ import annotations import logging +import os from typing import TYPE_CHECKING try: @@ -124,10 +125,8 @@ def run(self, df: DataFrame) -> None: if df.isStreaming: raise ValueError(f"DataFrame is streaming. 
{self.__class__.__name__} supports only batch DataFrames.") - with override_job_description( - self.connection.spark, - f"{self.__class__.__name__}.run() -> {self.connection}", - ): + job_description = f"{self}).run() -> {self.connection}" + with override_job_description(self.connection.spark, job_description): if not self._connection_checked: self._log_parameters(df) self.connection.check() @@ -135,10 +134,7 @@ def run(self, df: DataFrame) -> None: with SparkMetricsRecorder(self.connection.spark) as recorder: try: - with override_job_description( - self.connection.spark, - f"{self.__class__.__name__}.run() -> {self.connection}", - ): + with override_job_description(self.connection.spark, job_description): self.connection.write_df_as_files( df=df, path=self.target_path, @@ -166,6 +162,9 @@ def run(self, df: DataFrame) -> None: entity_boundary_log(log, f"{self.__class__.__name__}.run() ends", char="-") + def __str__(self): + return f"{self.__class__.__name__}[{os.fspath(self.target_path)}]" + def _log_parameters(self, df: DataFrame) -> None: log.info("|Spark| -> |%s| Writing dataframe using parameters:", self.connection.__class__.__name__) log_with_indent(log, "target_path = '%s'", self.target_path) From 6aa7cdf83f68e3b8bc84a9963a9d37c949db2ccb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Mon, 2 Sep 2024 08:08:33 +0000 Subject: [PATCH 58/64] [DOP-18743] Update setup.py --- docs/changelog/0.12.0.rst | 2 +- setup.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/changelog/0.12.0.rst b/docs/changelog/0.12.0.rst index 4a09682a..7aca258a 100644 --- a/docs/changelog/0.12.0.rst +++ b/docs/changelog/0.12.0.rst @@ -9,7 +9,7 @@ Breaking Changes * ``s3://host:port`` -> ``s3://host:port/bucket`` (:github:pull:`304`) - Update DB connectors/drivers to latest versions: - * Clickhouse ``0.6.0-patch5`` → ``0.6.4`` + * Clickhouse ``0.6.0-patch5`` → ``0.6.5`` * MongoDB ``10.3.0`` → ``10.4.0`` * MSSQL ``12.6.2`` → ``12.8.1`` * MySQL ``8.4.0`` → ``9.0.0`` diff --git a/setup.py b/setup.py index c7ce5d0d..2c1126f0 100644 --- a/setup.py +++ b/setup.py @@ -67,6 +67,7 @@ def parse_requirements(file: Path) -> list[str]: "Development Status :: 3 - Alpha", "Framework :: Pydantic", "Framework :: Pydantic :: 1", + "Framework :: Pydantic :: 2", "Intended Audience :: Developers", "License :: OSI Approved :: Apache Software License", "Operating System :: OS Independent", From 61695dc20bdba035db1725bb7e27fed7958abb77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Mon, 2 Sep 2024 08:40:13 +0000 Subject: [PATCH 59/64] [DOP-18948] Fix CHANGELOG --- docs/changelog/0.12.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/changelog/0.12.0.rst b/docs/changelog/0.12.0.rst index 7aca258a..4a09682a 100644 --- a/docs/changelog/0.12.0.rst +++ b/docs/changelog/0.12.0.rst @@ -9,7 +9,7 @@ Breaking Changes * ``s3://host:port`` -> ``s3://host:port/bucket`` (:github:pull:`304`) - Update DB connectors/drivers to latest versions: - * Clickhouse ``0.6.0-patch5`` → ``0.6.5`` + * Clickhouse ``0.6.0-patch5`` → ``0.6.4`` * MongoDB ``10.3.0`` → ``10.4.0`` * MSSQL ``12.6.2`` → ``12.8.1`` * MySQL ``8.4.0`` → ``9.0.0`` From 
3d5bfe51d219936945cf423e1b0a242779d5f924 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Mon, 2 Sep 2024 09:32:49 +0000 Subject: [PATCH 60/64] [DOP-18743] Update jobDescription format --- docs/changelog/0.12.0.rst | 4 ++-- onetl/db/db_reader/db_reader.py | 7 ++----- onetl/db/db_writer/db_writer.py | 5 +---- onetl/file/file_df_reader/file_df_reader.py | 11 +++++------ onetl/file/file_df_writer/file_df_writer.py | 6 +----- 5 files changed, 11 insertions(+), 22 deletions(-) diff --git a/docs/changelog/0.12.0.rst b/docs/changelog/0.12.0.rst index 4a09682a..34c9c18b 100644 --- a/docs/changelog/0.12.0.rst +++ b/docs/changelog/0.12.0.rst @@ -39,8 +39,8 @@ Features are not returned as method call result. (:github:pull:`303`) - Generate default ``jobDescription`` based on currently executed method. Examples: - * ``DBWriter[schema.table].run() -> Postgres[host:5432/database]`` - * ``MongoDB[localhost:27017/admin] -> DBReader[mycollection].run()`` + * ``DBWriter.run(schema.table) -> Postgres[host:5432/database]`` + * ``MongoDB[localhost:27017/admin] -> DBReader.has_data(mycollection)`` * ``Hive[cluster].execute()`` If user already set custom ``jobDescription``, it will be left intact. (:github:pull:`304`) diff --git a/onetl/db/db_reader/db_reader.py b/onetl/db/db_reader/db_reader.py index 42f17db3..4ad3d236 100644 --- a/onetl/db/db_reader/db_reader.py +++ b/onetl/db/db_reader/db_reader.py @@ -542,7 +542,7 @@ def has_data(self) -> bool: """ self._check_strategy() - job_description = f"{self}.has_data()" + job_description = f"{self.__class__.__name__}.has_data({self.source})" with override_job_description(self.connection.spark, job_description): if not self._connection_checked: self._log_parameters() @@ -635,7 +635,7 @@ def run(self) -> DataFrame: self._check_strategy() - job_description = f"{self}.run() -> {self.connection}" + job_description = f"{self.__class__.__name__}.run({self.source}) -> {self.connection}" with override_job_description(self.connection.spark, job_description): if not self._connection_checked: self._log_parameters() self.connection.check() @@ -663,9 +663,6 @@ def run(self) -> DataFrame: entity_boundary_log(log, msg=f"{self.__class__.__name__}.run() ends", char="-") return df - def __str__(self): - return f"{self.__class__.__name__}[{self.source}]" - def _check_strategy(self): strategy = StrategyManager.get_current() class_name = type(self).__name__ diff --git a/onetl/db/db_writer/db_writer.py b/onetl/db/db_writer/db_writer.py index 5206c07d..3bcf63aa 100644 --- a/onetl/db/db_writer/db_writer.py +++ b/onetl/db/db_writer/db_writer.py @@ -203,7 +203,7 @@ def run(self, df: DataFrame) -> None: entity_boundary_log(log, msg=f"{self.__class__.__name__}.run() starts") - job_description = f"{self}.run() -> {self.connection}" + job_description = f"{self.__class__.__name__}.run({self.target}) -> {self.connection}" with override_job_description(self.connection.spark, job_description): if not self._connection_checked: self._log_parameters() @@ -240,9 +240,6 @@ def run(self, df: DataFrame) -> None: entity_boundary_log(log, msg=f"{self.__class__.__name__}.run() ends", char="-") - def __str__(self): - return f"{self.__class__.__name__}[{self.target}]" - def _log_parameters(self) -> None: log.info("|Spark| -> |%s| Writing DataFrame to target using parameters:", self.connection.__class__.__name__) log_with_indent(log, "target = '%s'", self.target) diff
--git a/onetl/file/file_df_reader/file_df_reader.py b/onetl/file/file_df_reader/file_df_reader.py index ed83bbc5..36aab796 100644 --- a/onetl/file/file_df_reader/file_df_reader.py +++ b/onetl/file/file_df_reader/file_df_reader.py @@ -211,7 +211,11 @@ def run(self, files: Iterable[str | os.PathLike] | None = None) -> DataFrame: if not self._connection_checked: self._log_parameters(files) - job_description = f"{self}.run() -> {self.connection}" + if files: + job_description = f"{self.__class__.__name__}.run([..files..]) -> {self.connection}" + else: + job_description = f"{self.__class__.__name__}.run({self.source_path}) -> {self.connection}" + with override_job_description(self.connection.spark, job_description): paths: FileSet[PurePathProtocol] = FileSet() if files is not None: @@ -229,11 +233,6 @@ def run(self, files: Iterable[str | os.PathLike] | None = None) -> DataFrame: entity_boundary_log(log, msg=f"{self.__class__.__name__}.run() ends", char="-") return df - def __str__(self): - if self.source_path: - return f"{self.__class__.__name__}[{os.fspath(self.source_path)}]" - return f"{self.__class__.__name__}" - def _read_files(self, paths: FileSet[PurePathProtocol]) -> DataFrame: log.info("|%s| Paths to be read:", self.__class__.__name__) log_lines(log, str(paths)) diff --git a/onetl/file/file_df_writer/file_df_writer.py b/onetl/file/file_df_writer/file_df_writer.py index aeda5f7b..037fc7ee 100644 --- a/onetl/file/file_df_writer/file_df_writer.py +++ b/onetl/file/file_df_writer/file_df_writer.py @@ -3,7 +3,6 @@ from __future__ import annotations import logging -import os from typing import TYPE_CHECKING try: @@ -125,7 +124,7 @@ def run(self, df: DataFrame) -> None: if df.isStreaming: raise ValueError(f"DataFrame is streaming. {self.__class__.__name__} supports only batch DataFrames.") - job_description = f"{self}).run() -> {self.connection}" + job_description = f"{self.__class__.__name__}.run({self.target_path}) -> {self.connection}" with override_job_description(self.connection.spark, job_description): if not self._connection_checked: self._log_parameters(df) @@ -162,9 +161,6 @@ def run(self, df: DataFrame) -> None: entity_boundary_log(log, f"{self.__class__.__name__}.run() ends", char="-") - def __str__(self): - return f"{self.__class__.__name__}[{os.fspath(self.target_path)}]" - def _log_parameters(self, df: DataFrame) -> None: log.info("|Spark| -> |%s| Writing dataframe using parameters:", self.connection.__class__.__name__) log_with_indent(log, "target_path = '%s'", self.target_path) From fb86f9222f6c548e6e79a984228240ad74bfa120 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Mon, 2 Sep 2024 15:00:28 +0000 Subject: [PATCH 61/64] [DOP-18948] Fix CI tests --- .github/workflows/test-clickhouse.yml | 14 ++++++++------ .github/workflows/test-core.yml | 2 ++ .github/workflows/test-ftp.yml | 2 ++ .github/workflows/test-ftps.yml | 2 ++ .github/workflows/test-greenplum.yml | 2 ++ .github/workflows/test-hdfs.yml | 2 ++ .github/workflows/test-hive.yml | 2 ++ .github/workflows/test-kafka.yml | 2 ++ .github/workflows/test-local-fs.yml | 2 ++ .github/workflows/test-mongodb.yml | 2 ++ .github/workflows/test-mssql.yml | 2 ++ .github/workflows/test-mysql.yml | 2 ++ .github/workflows/test-oracle.yml | 2 ++ .github/workflows/test-postgres.yml | 2 ++ .github/workflows/test-s3.yml | 2 ++ .github/workflows/test-samba.yml | 2 ++ 
.github/workflows/test-sftp.yml | 2 ++ .github/workflows/test-teradata.yml | 2 ++ .github/workflows/test-webdav.yml | 2 ++ 19 files changed, 44 insertions(+), 6 deletions(-) diff --git a/.github/workflows/test-clickhouse.yml b/.github/workflows/test-clickhouse.yml index db05402d..32dead8f 100644 --- a/.github/workflows/test-clickhouse.yml +++ b/.github/workflows/test-clickhouse.yml @@ -90,12 +90,6 @@ jobs: source ./env ./pytest_runner.sh -m clickhouse - - name: Upload coverage results - uses: actions/upload-artifact@v4 - with: - name: coverage-clickhouse-${{ inputs.clickhouse-version }}-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} - path: reports/* - - name: Dump Clickhouse logs on failure if: failure() uses: jwalton/gh-docker-logs@v2 @@ -109,3 +103,11 @@ jobs: with: name: container-logs-clickhouse-${{ inputs.clickhouse-version }}-spark-${{ inputs.spark-version }} path: logs/* + + - name: Upload coverage results + uses: actions/upload-artifact@v4 + with: + name: coverage-clickhouse-${{ inputs.clickhouse-version }}-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} + path: reports/* + # https://github.com/actions/upload-artifact/issues/602 + include-hidden-files: true diff --git a/.github/workflows/test-core.yml b/.github/workflows/test-core.yml index 8a0b3b7a..94fe3075 100644 --- a/.github/workflows/test-core.yml +++ b/.github/workflows/test-core.yml @@ -79,3 +79,5 @@ jobs: with: name: coverage-core-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} path: reports/* + # https://github.com/actions/upload-artifact/issues/602 + include-hidden-files: true diff --git a/.github/workflows/test-ftp.yml b/.github/workflows/test-ftp.yml index 8e45ec32..e91c31e3 100644 --- a/.github/workflows/test-ftp.yml +++ b/.github/workflows/test-ftp.yml @@ -80,6 +80,8 @@ jobs: with: name: container-logs-ftp-${{ inputs.ftp-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} path: logs/* + # https://github.com/actions/upload-artifact/issues/602 + include-hidden-files: true - name: Shutdown FTP if: always() diff --git a/.github/workflows/test-ftps.yml b/.github/workflows/test-ftps.yml index dfe8ffed..89eff2e9 100644 --- a/.github/workflows/test-ftps.yml +++ b/.github/workflows/test-ftps.yml @@ -91,3 +91,5 @@ jobs: with: name: coverage-ftps-${{ inputs.ftps-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} path: reports/* + # https://github.com/actions/upload-artifact/issues/602 + include-hidden-files: true diff --git a/.github/workflows/test-greenplum.yml b/.github/workflows/test-greenplum.yml index 5f24f779..297a52cc 100644 --- a/.github/workflows/test-greenplum.yml +++ b/.github/workflows/test-greenplum.yml @@ -137,3 +137,5 @@ jobs: with: name: coverage-greenplum-${{ inputs.greenplum-version }}-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} path: reports/* + # https://github.com/actions/upload-artifact/issues/602 + include-hidden-files: true diff --git a/.github/workflows/test-hdfs.yml b/.github/workflows/test-hdfs.yml index e06f01ed..98398f52 100644 --- a/.github/workflows/test-hdfs.yml +++ b/.github/workflows/test-hdfs.yml @@ -114,3 +114,5 @@ jobs: with: name: coverage-hdfs-${{ inputs.hadoop-version }}-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} path: reports/* + # https://github.com/actions/upload-artifact/issues/602 + include-hidden-files: true diff --git 
a/.github/workflows/test-hive.yml b/.github/workflows/test-hive.yml index 17af1d93..7c74ed7a 100644 --- a/.github/workflows/test-hive.yml +++ b/.github/workflows/test-hive.yml @@ -81,3 +81,5 @@ jobs: with: name: coverage-hive-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} path: reports/* + # https://github.com/actions/upload-artifact/issues/602 + include-hidden-files: true diff --git a/.github/workflows/test-kafka.yml b/.github/workflows/test-kafka.yml index b1f06552..127498d1 100644 --- a/.github/workflows/test-kafka.yml +++ b/.github/workflows/test-kafka.yml @@ -135,3 +135,5 @@ jobs: with: name: coverage-kafka-${{ inputs.kafka-version }}-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} path: reports/* + # https://github.com/actions/upload-artifact/issues/602 + include-hidden-files: true diff --git a/.github/workflows/test-local-fs.yml b/.github/workflows/test-local-fs.yml index 2672afe4..27e2fc03 100644 --- a/.github/workflows/test-local-fs.yml +++ b/.github/workflows/test-local-fs.yml @@ -81,3 +81,5 @@ jobs: with: name: coverage-local-fs-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} path: reports/* + # https://github.com/actions/upload-artifact/issues/602 + include-hidden-files: true diff --git a/.github/workflows/test-mongodb.yml b/.github/workflows/test-mongodb.yml index 334bbfc1..e1c03774 100644 --- a/.github/workflows/test-mongodb.yml +++ b/.github/workflows/test-mongodb.yml @@ -107,3 +107,5 @@ jobs: with: name: coverage-mongodb-${{ inputs.mongodb-version }}-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} path: reports/* + # https://github.com/actions/upload-artifact/issues/602 + include-hidden-files: true diff --git a/.github/workflows/test-mssql.yml b/.github/workflows/test-mssql.yml index 0865492a..cb7252f9 100644 --- a/.github/workflows/test-mssql.yml +++ b/.github/workflows/test-mssql.yml @@ -112,3 +112,5 @@ jobs: with: name: coverage-mssql-${{ inputs.mssql-version }}-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} path: reports/* + # https://github.com/actions/upload-artifact/issues/602 + include-hidden-files: true diff --git a/.github/workflows/test-mysql.yml b/.github/workflows/test-mysql.yml index e305af6d..8f0b7871 100644 --- a/.github/workflows/test-mysql.yml +++ b/.github/workflows/test-mysql.yml @@ -109,3 +109,5 @@ jobs: with: name: coverage-mysql-${{ inputs.mysql-version }}-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} path: reports/* + # https://github.com/actions/upload-artifact/issues/602 + include-hidden-files: true diff --git a/.github/workflows/test-oracle.yml b/.github/workflows/test-oracle.yml index 38a21daf..37ee14e4 100644 --- a/.github/workflows/test-oracle.yml +++ b/.github/workflows/test-oracle.yml @@ -129,3 +129,5 @@ jobs: with: name: coverage-oracle-${{ inputs.oracle-version }}-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} path: reports/* + # https://github.com/actions/upload-artifact/issues/602 + include-hidden-files: true diff --git a/.github/workflows/test-postgres.yml b/.github/workflows/test-postgres.yml index 68236134..cd668a00 100644 --- a/.github/workflows/test-postgres.yml +++ b/.github/workflows/test-postgres.yml @@ -108,3 +108,5 @@ jobs: with: name: coverage-postgres-${{ inputs.postgres-version }}-spark-${{ inputs.spark-version }}-python-${{ 
inputs.python-version }}-os-${{ inputs.os }} path: reports/* + # https://github.com/actions/upload-artifact/issues/602 + include-hidden-files: true diff --git a/.github/workflows/test-s3.yml b/.github/workflows/test-s3.yml index 1ef595e6..af25f71e 100644 --- a/.github/workflows/test-s3.yml +++ b/.github/workflows/test-s3.yml @@ -109,3 +109,5 @@ jobs: with: name: coverage-s3-${{ inputs.minio-version }}-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} path: reports/* + # https://github.com/actions/upload-artifact/issues/602 + include-hidden-files: true diff --git a/.github/workflows/test-samba.yml b/.github/workflows/test-samba.yml index f7b07131..f493b0a5 100644 --- a/.github/workflows/test-samba.yml +++ b/.github/workflows/test-samba.yml @@ -89,3 +89,5 @@ jobs: with: name: coverage-samba-${{ inputs.server-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} path: reports/* + # https://github.com/actions/upload-artifact/issues/602 + include-hidden-files: true diff --git a/.github/workflows/test-sftp.yml b/.github/workflows/test-sftp.yml index 2ab8de1c..0b92c423 100644 --- a/.github/workflows/test-sftp.yml +++ b/.github/workflows/test-sftp.yml @@ -86,3 +86,5 @@ jobs: with: name: coverage-sftp-${{ inputs.openssh-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} path: reports/* + # https://github.com/actions/upload-artifact/issues/602 + include-hidden-files: true diff --git a/.github/workflows/test-teradata.yml b/.github/workflows/test-teradata.yml index 8ba3ff60..482e5fbb 100644 --- a/.github/workflows/test-teradata.yml +++ b/.github/workflows/test-teradata.yml @@ -81,3 +81,5 @@ jobs: with: name: coverage-teradata-spark-${{ inputs.spark-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} path: reports/* + # https://github.com/actions/upload-artifact/issues/602 + include-hidden-files: true diff --git a/.github/workflows/test-webdav.yml b/.github/workflows/test-webdav.yml index 2ce0e4ef..22c55b44 100644 --- a/.github/workflows/test-webdav.yml +++ b/.github/workflows/test-webdav.yml @@ -91,3 +91,5 @@ jobs: with: name: coverage-webdav-${{ inputs.webdav-version }}-python-${{ inputs.python-version }}-os-${{ inputs.os }} path: reports/* + # https://github.com/actions/upload-artifact/issues/602 + include-hidden-files: true From ec4937dc33f7d73fea5825e54e6c549e6eb775fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Tue, 3 Sep 2024 09:54:46 +0000 Subject: [PATCH 62/64] Update Clickhouse package --- docs/changelog/0.12.0.rst | 2 +- onetl/connection/db_connection/clickhouse/connection.py | 8 ++++---- .../tests_db_connection_unit/test_clickhouse_unit.py | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/changelog/0.12.0.rst b/docs/changelog/0.12.0.rst index 34c9c18b..06491fd2 100644 --- a/docs/changelog/0.12.0.rst +++ b/docs/changelog/0.12.0.rst @@ -9,7 +9,7 @@ Breaking Changes * ``s3://host:port`` -> ``s3://host:port/bucket`` (:github:pull:`304`) - Update DB connectors/drivers to latest versions: - * Clickhouse ``0.6.0-patch5`` → ``0.6.4`` + * Clickhouse ``0.6.0-patch5`` → ``0.6.5`` * MongoDB ``10.3.0`` → ``10.4.0`` * MSSQL ``12.6.2`` → ``12.8.1`` * MySQL ``8.4.0`` → ``9.0.0`` diff --git a/onetl/connection/db_connection/clickhouse/connection.py b/onetl/connection/db_connection/clickhouse/connection.py index 
fad82942..95756e2b 100644 --- a/onetl/connection/db_connection/clickhouse/connection.py +++ b/onetl/connection/db_connection/clickhouse/connection.py @@ -37,7 +37,7 @@ class Config: class Clickhouse(JDBCConnection): """Clickhouse JDBC connection. |support_hooks| - Based on Maven package `com.clickhouse:clickhouse-jdbc:0.6.4 `_ + Based on Maven package `com.clickhouse:clickhouse-jdbc:0.6.5 `_ (`official Clickhouse JDBC driver `_). .. seealso:: @@ -139,7 +139,7 @@ def get_packages( Parameters ---------- package_version : str, optional - ClickHouse JDBC version client packages. Defaults to ``0.6.4``. + ClickHouse JDBC version client packages. Defaults to ``0.6.5``. .. versionadded:: 0.11.0 @@ -158,7 +158,7 @@ def get_packages( Clickhouse.get_packages(package_version="0.6.0", apache_http_client_version="5.3.1") """ - default_jdbc_version = "0.6.4" + default_jdbc_version = "0.6.5" default_http_version = "5.3.1" jdbc_version = Version(package_version or default_jdbc_version).min_digits(3) @@ -177,7 +177,7 @@ def get_packages( @classproperty def package(self) -> str: """Get a single string of package names to be downloaded by Spark for establishing a Clickhouse connection.""" - return "com.clickhouse:clickhouse-jdbc:0.6.4,com.clickhouse:clickhouse-http-client:0.6.4,org.apache.httpcomponents.client5:httpclient5:5.3.1" + return "com.clickhouse:clickhouse-jdbc:0.6.5,com.clickhouse:clickhouse-http-client:0.6.5,org.apache.httpcomponents.client5:httpclient5:5.3.1" @property def jdbc_url(self) -> str: diff --git a/tests/tests_unit/tests_db_connection_unit/test_clickhouse_unit.py b/tests/tests_unit/tests_db_connection_unit/test_clickhouse_unit.py index 9bf7a068..d8ccf7f4 100644 --- a/tests/tests_unit/tests_db_connection_unit/test_clickhouse_unit.py +++ b/tests/tests_unit/tests_db_connection_unit/test_clickhouse_unit.py @@ -11,7 +11,7 @@ def test_clickhouse_driver(): def test_clickhouse_package(): expected_packages = ( - "com.clickhouse:clickhouse-jdbc:0.6.4,com.clickhouse:clickhouse-http-client:0.6.4," + "com.clickhouse:clickhouse-jdbc:0.6.5,com.clickhouse:clickhouse-http-client:0.6.5," "org.apache.httpcomponents.client5:httpclient5:5.3.1" ) assert Clickhouse.package == expected_packages @@ -24,8 +24,8 @@ def test_clickhouse_package(): None, None, [ - "com.clickhouse:clickhouse-jdbc:0.6.4", - "com.clickhouse:clickhouse-http-client:0.6.4", + "com.clickhouse:clickhouse-jdbc:0.6.5", + "com.clickhouse:clickhouse-http-client:0.6.5", "org.apache.httpcomponents.client5:httpclient5:5.3.1", ], ), From dc439c34c02509f2f909f563646f589f3ed31de8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Tue, 3 Sep 2024 12:04:44 +0000 Subject: [PATCH 63/64] [DOP-18743] Update jobDescription format --- onetl/db/db_reader/db_reader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onetl/db/db_reader/db_reader.py b/onetl/db/db_reader/db_reader.py index 4ad3d236..dd79876a 100644 --- a/onetl/db/db_reader/db_reader.py +++ b/onetl/db/db_reader/db_reader.py @@ -542,7 +542,7 @@ def has_data(self) -> bool: """ self._check_strategy() - job_description = f"{self.__class__.__name__}.has_data({self.source})" + job_description = f"{self.connection} -> {self.__class__.__name__}.has_data({self.source})" with override_job_description(self.connection.spark, job_description): if not self._connection_checked: self._log_parameters() From 
fe9048d79d249e480efe7e3a0598ced96ced7b0e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B0=D1=80=D1=82=D1=8B=D0=BD=D0=BE=D0=B2=20=D0=9C?= =?UTF-8?q?=D0=B0=D0=BA=D1=81=D0=B8=D0=BC=20=D0=A1=D0=B5=D1=80=D0=B3=D0=B5?= =?UTF-8?q?=D0=B5=D0=B2=D0=B8=D1=87?= Date: Tue, 3 Sep 2024 12:17:11 +0000 Subject: [PATCH 64/64] [DOP-18743] Update CHANGELOG --- docs/changelog/0.12.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/changelog/0.12.0.rst b/docs/changelog/0.12.0.rst index 06491fd2..8edd4a49 100644 --- a/docs/changelog/0.12.0.rst +++ b/docs/changelog/0.12.0.rst @@ -1,4 +1,4 @@ -0.12.0 (2024-08-29) +0.12.0 (2024-09-03) =================== Breaking Changes