diff --git a/docs/changelog/0.12.3.rst b/docs/changelog/0.12.3.rst index f931d2ee..ca160fa8 100644 --- a/docs/changelog/0.12.3.rst +++ b/docs/changelog/0.12.3.rst @@ -4,4 +4,4 @@ Bug Fixes --------- -- Allow passing table names in format ``schema."table.with.dots"`` to ``DBReader(name=...)`` and ``DBWriter(name=...)``. +- Allow passing table names in format ``schema."table.with.dots"`` to ``DBReader(source=...)`` and ``DBWriter(target=...)``. diff --git a/docs/changelog/0.12.4.rst b/docs/changelog/0.12.4.rst new file mode 100644 index 00000000..71d44b1f --- /dev/null +++ b/docs/changelog/0.12.4.rst @@ -0,0 +1,7 @@ +0.12.4 (2024-11-27) +=================== + +Bug Fixes +--------- + +- Fix ``DBReader(conn=oracle, options={"partitioning_mode": "hash"})`` leading to data skew in the last partition due to wrong ``ora_hash`` usage. (:github:pull:`319`) diff --git a/docs/changelog/index.rst b/docs/changelog/index.rst index bb23a987..647dcc1f 100644 --- a/docs/changelog/index.rst +++ b/docs/changelog/index.rst @@ -3,6 +3,7 @@ :caption: Changelog DRAFT + 0.12.4 0.12.3 0.12.2 0.12.1 diff --git a/onetl/VERSION b/onetl/VERSION index aa22d3ce..e01e0ddd 100644 --- a/onetl/VERSION +++ b/onetl/VERSION @@ -1 +1 @@ -0.12.3 +0.12.4 diff --git a/onetl/connection/db_connection/clickhouse/dialect.py b/onetl/connection/db_connection/clickhouse/dialect.py index 394843b8..1ee213d0 100644 --- a/onetl/connection/db_connection/clickhouse/dialect.py +++ b/onetl/connection/db_connection/clickhouse/dialect.py @@ -10,7 +10,7 @@ class ClickhouseDialect(JDBCDialect): def get_partition_column_hash(self, partition_column: str, num_partitions: int) -> str: - return f"modulo(halfMD5({partition_column}), {num_partitions})" + return f"halfMD5({partition_column}) % {num_partitions}" def get_partition_column_mod(self, partition_column: str, num_partitions: int) -> str: return f"{partition_column} % {num_partitions}" diff --git a/onetl/connection/db_connection/mssql/dialect.py 
b/onetl/connection/db_connection/mssql/dialect.py index 6be43c80..3cb809ad 100644 --- a/onetl/connection/db_connection/mssql/dialect.py +++ b/onetl/connection/db_connection/mssql/dialect.py @@ -10,7 +10,7 @@ class MSSQLDialect(JDBCDialect): # https://docs.microsoft.com/ru-ru/sql/t-sql/functions/hashbytes-transact-sql?view=sql-server-ver16 def get_partition_column_hash(self, partition_column: str, num_partitions: int) -> str: - return f"CONVERT(BIGINT, HASHBYTES ( 'SHA' , {partition_column} )) % {num_partitions}" + return f"CONVERT(BIGINT, HASHBYTES ('SHA', {partition_column})) % {num_partitions}" def get_partition_column_mod(self, partition_column: str, num_partitions: int) -> str: return f"{partition_column} % {num_partitions}" diff --git a/onetl/connection/db_connection/oracle/dialect.py b/onetl/connection/db_connection/oracle/dialect.py index 2f121871..c7a73903 100644 --- a/onetl/connection/db_connection/oracle/dialect.py +++ b/onetl/connection/db_connection/oracle/dialect.py @@ -43,7 +43,9 @@ def get_sql_query( ) def get_partition_column_hash(self, partition_column: str, num_partitions: int) -> str: - return f"ora_hash({partition_column}, {num_partitions})" + # ora_hash returns values from 0 to N including N. + # Balancing N+1 splits to N partitions leads to data skew in last partition. + return f"ora_hash({partition_column}, {num_partitions - 1})" def get_partition_column_mod(self, partition_column: str, num_partitions: int) -> str: return f"MOD({partition_column}, {num_partitions})"