Skip to content

Commit

Permalink
Merge branch 'develop'
Browse files Browse the repository at this point in the history
  • Loading branch information
dolfinus committed Nov 27, 2024
2 parents c4d2caa + bf0796e commit 694a71a
Show file tree
Hide file tree
Showing 7 changed files with 15 additions and 5 deletions.
2 changes: 1 addition & 1 deletion docs/changelog/0.12.3.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@
Bug Fixes
---------

- Allow passing table names in format ``schema."table.with.dots"`` to ``DBReader(name=...)`` and ``DBWriter(name=...)``.
- Allow passing table names in format ``schema."table.with.dots"`` to ``DBReader(source=...)`` and ``DBWriter(target=...)``.
7 changes: 7 additions & 0 deletions docs/changelog/0.12.4.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
0.12.4 (2024-11-27)
===================

Bug Fixes
---------

- Fix ``DBReader(conn=oracle, options={"partitioning_mode": "hash"})`` leading to data skew in the last partition due to incorrect ``ora_hash`` usage. (:github:pull:`319`)
1 change: 1 addition & 0 deletions docs/changelog/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
:caption: Changelog

DRAFT
0.12.4
0.12.3
0.12.2
0.12.1
Expand Down
2 changes: 1 addition & 1 deletion onetl/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.12.3
0.12.4
2 changes: 1 addition & 1 deletion onetl/connection/db_connection/clickhouse/dialect.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

class ClickhouseDialect(JDBCDialect):
def get_partition_column_hash(self, partition_column: str, num_partitions: int) -> str:
return f"modulo(halfMD5({partition_column}), {num_partitions})"
return f"halfMD5({partition_column}) % {num_partitions}"

def get_partition_column_mod(self, partition_column: str, num_partitions: int) -> str:
return f"{partition_column} % {num_partitions}"
Expand Down
2 changes: 1 addition & 1 deletion onetl/connection/db_connection/mssql/dialect.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
class MSSQLDialect(JDBCDialect):
# https://docs.microsoft.com/ru-ru/sql/t-sql/functions/hashbytes-transact-sql?view=sql-server-ver16
def get_partition_column_hash(self, partition_column: str, num_partitions: int) -> str:
return f"CONVERT(BIGINT, HASHBYTES ( 'SHA' , {partition_column} )) % {num_partitions}"
return f"CONVERT(BIGINT, HASHBYTES ('SHA', {partition_column})) % {num_partitions}"

def get_partition_column_mod(self, partition_column: str, num_partitions: int) -> str:
return f"{partition_column} % {num_partitions}"
Expand Down
4 changes: 3 additions & 1 deletion onetl/connection/db_connection/oracle/dialect.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,9 @@ def get_sql_query(
)

def get_partition_column_hash(self, partition_column: str, num_partitions: int) -> str:
return f"ora_hash({partition_column}, {num_partitions})"
# ora_hash returns values from 0 to N including N.
# Balancing N+1 splits to N partitions leads to data skew in last partition.
return f"ora_hash({partition_column}, {num_partitions - 1})"

def get_partition_column_mod(self, partition_column: str, num_partitions: int) -> str:
return f"MOD({partition_column}, {num_partitions})"
Expand Down

0 comments on commit 694a71a

Please sign in to comment.