diff --git a/.github/workflows/data/clickhouse/matrix.yml b/.github/workflows/data/clickhouse/matrix.yml index 6f1d72619..e28344dd8 100644 --- a/.github/workflows/data/clickhouse/matrix.yml +++ b/.github/workflows/data/clickhouse/matrix.yml @@ -1,4 +1,4 @@ -min: &min +2x: &2x # Clickhouse version with proper DateTime > DateTime64 comparison clickhouse-image: yandex/clickhouse-server clickhouse-version: '21.1' @@ -8,7 +8,7 @@ min: &min java-version: 8 os: ubuntu-latest -max: &max +3x: &3x clickhouse-image: clickhouse/clickhouse-server clickhouse-version: 24.6.3.70-alpine spark-version: 3.5.1 @@ -17,6 +17,15 @@ max: &max java-version: 20 os: ubuntu-latest +4x: &4x + clickhouse-image: clickhouse/clickhouse-server + clickhouse-version: 24.6.3.70-alpine + spark-version: 4.0.0 + pydantic-version: 2 + python-version: '3.12' + java-version: 22 + os: ubuntu-latest + latest: &latest clickhouse-image: clickhouse/clickhouse-server clickhouse-version: latest-alpine @@ -27,6 +36,6 @@ latest: &latest os: ubuntu-latest matrix: - small: [*max] - full: [*min, *max] - nightly: [*min, *max, *latest] + small: [*3x] + full: [*2x, *3x, *4x] + nightly: [*2x, *3x, *latest] diff --git a/.github/workflows/data/core/matrix.yml b/.github/workflows/data/core/matrix.yml index d20f074ab..b2f5cdb71 100644 --- a/.github/workflows/data/core/matrix.yml +++ b/.github/workflows/data/core/matrix.yml @@ -1,17 +1,24 @@ -min: &min +2x: &2x spark-version: 2.3.1 pydantic-version: 1 python-version: '3.7' java-version: 8 os: ubuntu-latest -max: &max +3x: &3x spark-version: 3.5.1 pydantic-version: 2 python-version: '3.12' java-version: 20 os: ubuntu-latest +4x: &4x + spark-version: 4.0.0 + pydantic-version: 2 + python-version: '3.12' + java-version: 22 + os: ubuntu-latest + latest: &latest spark-version: latest pydantic-version: latest @@ -20,6 +27,6 @@ latest: &latest os: ubuntu-latest matrix: - small: [*max] - full: [*min, *max] - nightly: [*min, *max, *latest] + small: [*3x] + full: [*2x, *3x, *4x] + nightly: [*2x, *3x, *latest] diff --git a/.github/workflows/data/greenplum/matrix.yml b/.github/workflows/data/greenplum/matrix.yml index 0935a821e..63c0aaa80 100644 --- a/.github/workflows/data/greenplum/matrix.yml +++ b/.github/workflows/data/greenplum/matrix.yml @@ -1,4 +1,4 @@ -min: &min +23: &23 greenplum-version: 6.23.1 package-version: 2.2.0 # Spark 2.3.0 does not support passing ivysettings.xml @@ -8,7 +8,7 @@ min: &min java-version: 8 os: ubuntu-latest -max: &max +32: &32 greenplum-version: 7.0.0 package-version: 2.3.1 # Greenplum connector does not support Spark 3.3+ @@ -29,6 +29,6 @@ latest: &latest os: ubuntu-latest matrix: - small: [*max] - full: [*min, *max] - nightly: [*min, *max, *latest] + small: [*32] + full: [*23, *32] + nightly: [*23, *latest] diff --git a/.github/workflows/data/hdfs/matrix.yml b/.github/workflows/data/hdfs/matrix.yml index af4553f14..e2a401263 100644 --- a/.github/workflows/data/hdfs/matrix.yml +++ b/.github/workflows/data/hdfs/matrix.yml @@ -1,4 +1,4 @@ -min: &min +2x: &2x hadoop-version: hadoop2-hdfs spark-version: 2.3.1 pydantic-version: 1 @@ -6,7 +6,7 @@ min: &min java-version: 8 os: ubuntu-latest -max: &max +3x: &3x hadoop-version: hadoop3-hdfs spark-version: 3.5.1 pydantic-version: 2 @@ -14,6 +14,14 @@ max: &max java-version: 20 os: ubuntu-latest +4x: &4x + hadoop-version: hadoop3-hdfs + spark-version: 4.0.0 + pydantic-version: 2 + python-version: '3.12' + java-version: 22 + os: ubuntu-latest + latest: &latest hadoop-version: hadoop3-hdfs spark-version: latest @@ -23,6 +31,6 @@ latest: &latest os: ubuntu-latest matrix: - small: [*max] - full: [*min, *max] - nightly: [*min, *max, *latest] + small: [*3x] + full: [*2x, *3x, *4x] + nightly: [*2x, *3x, *latest] diff --git a/.github/workflows/data/hive/matrix.yml b/.github/workflows/data/hive/matrix.yml index 6ce0d7a8e..b2f5cdb71 100644 --- a/.github/workflows/data/hive/matrix.yml +++ b/.github/workflows/data/hive/matrix.yml @@ -1,17 +1,24 @@ -min: &min +2x: &2x spark-version: 2.3.1 pydantic-version: 1 python-version: '3.7' java-version: 8 os: ubuntu-latest -max: &max +3x: &3x spark-version: 3.5.1 pydantic-version: 2 python-version: '3.12' java-version: 20 os: ubuntu-latest +4x: &4x + spark-version: 4.0.0 + pydantic-version: 2 + python-version: '3.12' + java-version: 22 + os: ubuntu-latest + latest: &latest spark-version: latest pydantic-version: latest @@ -20,6 +27,6 @@ latest: &latest os: ubuntu-latest matrix: - small: [*max] - full: [*min, *max] - nightly: [*min, *latest] + small: [*3x] + full: [*2x, *3x, *4x] + nightly: [*2x, *3x, *latest] diff --git a/.github/workflows/data/kafka/matrix.yml b/.github/workflows/data/kafka/matrix.yml index 1b9b23367..8193e26a1 100644 --- a/.github/workflows/data/kafka/matrix.yml +++ b/.github/workflows/data/kafka/matrix.yml @@ -1,5 +1,5 @@ -min: &min - # Headers are supported only since 2.x. +2x: &2x + # Headers are supported only since Kafka 2x. # Images before 3.2.3 are not creating kafka_jaas.conf properly, and failing to start # https://github.com/bitnami/containers/blob/9db9064668365cac89bff58259f63eb78bb97e79/bitnami/kafka/README.md?plain=1#L933 kafka-version: 3.2.3 @@ -9,7 +9,7 @@ min: &min java-version: 8 os: ubuntu-latest -max: &max +3x: &3x kafka-version: 3.7.1 pydantic-version: 2 spark-version: 3.5.1 @@ -17,6 +17,14 @@ max: &max java-version: 20 os: ubuntu-latest +4x: &4x + kafka-version: 3.7.1 + pydantic-version: 2 + spark-version: 4.0.0 + python-version: '3.12' + java-version: 22 + os: ubuntu-latest + latest: &latest kafka-version: latest pydantic-version: latest @@ -26,6 +34,6 @@ latest: &latest os: ubuntu-latest matrix: - small: [*max] - full: [*min, *max] - nightly: [*min, *max, *latest] + small: [*3x] + full: [*2x, *3x, *4x] + nightly: [*2x, *3x, *latest] diff --git a/.github/workflows/data/local-fs/matrix.yml b/.github/workflows/data/local-fs/matrix.yml index d1337291e..e2a043163 100644 --- a/.github/workflows/data/local-fs/matrix.yml +++ b/.github/workflows/data/local-fs/matrix.yml @@ -1,25 +1,27 @@ -min: &min +23: &23 spark-version: 2.3.1 pydantic-version: 1 python-version: '3.7' java-version: 8 os: ubuntu-latest -min_avro: &min_avro +24: &24 + # Avro supported only since Spark 2.4 spark-version: 2.4.8 pydantic-version: 1 python-version: '3.7' java-version: 8 os: ubuntu-latest -min_excel: &min_excel +32: &32 + # Excel supported only since Spark 3.2 spark-version: 3.2.4 pydantic-version: 1 python-version: '3.7' java-version: 8 os: ubuntu-latest -max: &max +35: &35 # Excel package currently has no release for 3.5.1 spark-version: 3.5.0 pydantic-version: 2 @@ -27,6 +29,13 @@ max: &max java-version: 20 os: ubuntu-latest +4x: &4x + spark-version: 4.0.0 + pydantic-version: 2 + python-version: '3.12' + java-version: 22 + os: ubuntu-latest + latest: &latest spark-version: latest pydantic-version: latest @@ -35,15 +44,6 @@ latest: &latest os: ubuntu-latest matrix: - small: - - <<: *max - full: - - <<: *min - - <<: *min_avro - - <<: *min_excel - - <<: *max - nightly: - - <<: *min - - <<: *min_avro - - <<: *min_excel - - <<: *latest + small: [*35] + full: [*23, *24, *32, *35, *4x] + nightly: [*23, *24, *32, *35, *latest] diff --git a/.github/workflows/data/mongodb/matrix.yml b/.github/workflows/data/mongodb/matrix.yml index 98e1fe971..f5ab88e09 100644 --- a/.github/workflows/data/mongodb/matrix.yml +++ b/.github/workflows/data/mongodb/matrix.yml @@ -1,13 +1,13 @@ -min: &min +32: &32 mongodb-version: 4.0.0 - # MongoDB connector does not support Spark 2.x + # MongoDB connector does not support Spark 2x spark-version: 3.2.4 pydantic-version: 1 python-version: '3.7' java-version: 8 os: ubuntu-latest -max: &max +35: &35 mongodb-version: 7.0.12 spark-version: 3.5.1 pydantic-version: 2 @@ -15,6 +15,14 @@ max: &max java-version: 20 os: ubuntu-latest +4x: &4x + mongodb-version: 7.0.12 + spark-version: 4.0.0 + pydantic-version: 2 + python-version: '3.12' + java-version: 20 + os: ubuntu-latest + latest: &latest mongodb-version: latest spark-version: latest @@ -24,6 +32,6 @@ latest: &latest os: ubuntu-latest matrix: - small: [*max] - full: [*min, *max] - nightly: [*min, *latest] + small: [*35] + full: [*32, *35, *4x] + nightly: [*32, *35, *latest] diff --git a/.github/workflows/data/mssql/matrix.yml b/.github/workflows/data/mssql/matrix.yml index fad2e738c..2ad892e6f 100644 --- a/.github/workflows/data/mssql/matrix.yml +++ b/.github/workflows/data/mssql/matrix.yml @@ -1,4 +1,4 @@ -min: &min +2x: &2x mssql-version: 2017-GA-ubuntu spark-version: 2.3.1 pydantic-version: 1 @@ -6,7 +6,7 @@ min: &min java-version: 8 os: ubuntu-latest -max: &max +3x: &3x mssql-version: 2022-CU14-ubuntu-22.04 spark-version: 3.5.1 pydantic-version: 2 @@ -14,6 +14,14 @@ max: &max java-version: 20 os: ubuntu-latest +4x: &4x + mssql-version: 2022-CU14-ubuntu-22.04 + spark-version: 4.0.0 + pydantic-version: 2 + python-version: '3.12' + java-version: 22 + os: ubuntu-latest + latest: &latest mssql-version: latest spark-version: latest @@ -23,6 +31,6 @@ latest: &latest os: ubuntu-latest matrix: - small: [*max] - full: [*min, *max] - nightly: [*min, *latest] + small: [*3x] + full: [*2x, *3x, *4x] + nightly: [*2x, *3x, *latest] diff --git a/.github/workflows/data/mysql/matrix.yml b/.github/workflows/data/mysql/matrix.yml index d2e703143..e5e3c275b 100644 --- a/.github/workflows/data/mysql/matrix.yml +++ b/.github/workflows/data/mysql/matrix.yml @@ -1,4 +1,4 @@ -min: &min +2x: &2x # Tags 5.7.6-5.6.12 cannot be downloaded since Docker v26: # "Docker Image Format v1 and Docker Image manifest version 2, schema 1 support is disabled by default" mysql-version: 5.7.13 @@ -8,7 +8,7 @@ min: &min java-version: 8 os: ubuntu-latest -max: &max +3x: &3x mysql-version: 9.0.1 spark-version: 3.5.1 pydantic-version: 2 @@ -16,6 +16,14 @@ max: &max java-version: 20 os: ubuntu-latest +4x: &4x + mysql-version: 9.0.1 + spark-version: 4.0.0 + pydantic-version: 2 + python-version: '3.12' + java-version: 22 + os: ubuntu-latest + latest: &latest mysql-version: latest spark-version: latest @@ -25,6 +33,6 @@ latest: &latest os: ubuntu-latest matrix: - small: [*max] - full: [*min, *max] - nightly: [*min, *latest] + small: [*3x] + full: [*2x, *3x, *4x] + nightly: [*2x, *3x, *latest] diff --git a/.github/workflows/data/oracle/matrix.yml b/.github/workflows/data/oracle/matrix.yml index 7a79c68a7..5cc8b3e9e 100644 --- a/.github/workflows/data/oracle/matrix.yml +++ b/.github/workflows/data/oracle/matrix.yml @@ -1,4 +1,4 @@ -min: &min +2x: &2x oracle-image: gvenzl/oracle-xe oracle-version: 11.2.0.2-slim-faststart db-name: XE @@ -8,7 +8,7 @@ min: &min java-version: 8 os: ubuntu-latest -max: &max +3x: &3x oracle-image: gvenzl/oracle-free oracle-version: 23.4-slim-faststart db-name: FREEPDB1 @@ -18,6 +18,16 @@ max: &max java-version: 20 os: ubuntu-latest +4x: &4x + oracle-image: gvenzl/oracle-free + oracle-version: 23.4-slim-faststart + db-name: FREEPDB1 + spark-version: 4.0.0 + pydantic-version: 2 + python-version: '3.12' + java-version: 22 + os: ubuntu-latest + latest: &latest oracle-image: gvenzl/oracle-free oracle-version: slim-faststart @@ -29,6 +39,6 @@ latest: &latest os: ubuntu-latest matrix: - small: [*max] - full: [*min, *max] - nightly: [*min, *latest] + small: [*3x] + full: [*2x, *3x, *4x] + nightly: [*2x, *3x, *latest] diff --git a/.github/workflows/data/postgres/matrix.yml b/.github/workflows/data/postgres/matrix.yml index 4c5b5f4ef..d22949a18 100644 --- a/.github/workflows/data/postgres/matrix.yml +++ b/.github/workflows/data/postgres/matrix.yml @@ -1,4 +1,4 @@ -min: &min +2x: &2x # Min supported version by JDBC driver is 8.4, but it is too ancient to be used by anyone in real life postgres-version: 9.4.26-alpine spark-version: 2.3.1 @@ -7,7 +7,7 @@ min: &min java-version: 8 os: ubuntu-latest -max: &max +3x: &3x postgres-version: 16.3-alpine spark-version: 3.5.1 pydantic-version: 2 @@ -15,6 +15,14 @@ max: &max java-version: 20 os: ubuntu-latest +4x: &4x + postgres-version: 16.3-alpine + spark-version: 4.0.0 + pydantic-version: 2 + python-version: '3.12' + java-version: 22 + os: ubuntu-latest + latest: &latest postgres-version: alpine spark-version: latest @@ -24,6 +32,6 @@ latest: &latest os: ubuntu-latest matrix: - small: [*max] - full: [*min, *max] - nightly: [*min, *latest] + small: [*3x] + full: [*2x, *3x, *4x] + nightly: [*2x, *3x, *latest] diff --git a/.github/workflows/data/s3/matrix.yml b/.github/workflows/data/s3/matrix.yml index 06d4f7489..65074f2a8 100644 --- a/.github/workflows/data/s3/matrix.yml +++ b/.github/workflows/data/s3/matrix.yml @@ -1,14 +1,14 @@ -min: &min +32: &32 # prior image versions returns empty content of bucket root, some kind of bug minio-version: 2021.3.17 - # Minimal Spark version with Hadoop 3.x support + # Minimal Spark version with Hadoop 3x support spark-version: 3.2.4 pydantic-version: 1 python-version: '3.7' java-version: 8 os: ubuntu-latest -max: &max +35: &35 minio-version: 2024.7.26 spark-version: 3.5.1 pydantic-version: 2 @@ -16,6 +16,14 @@ max: &max java-version: 20 os: ubuntu-latest +4x: &4x + minio-version: 2024.7.26 + spark-version: 4.0.0 + pydantic-version: 2 + python-version: '3.12' + java-version: 22 + os: ubuntu-latest + latest: &latest minio-version: latest spark-version: latest @@ -25,6 +33,6 @@ latest: &latest os: ubuntu-latest matrix: - small: [*max] - full: [*min, *max] - nightly: [*min, *latest] + small: [*35] + full: [*32, *35, *4x] + nightly: [*32, *35, *latest] diff --git a/.github/workflows/data/samba/matrix.yml b/.github/workflows/data/samba/matrix.yml index 045a093ba..19254d231 100644 --- a/.github/workflows/data/samba/matrix.yml +++ b/.github/workflows/data/samba/matrix.yml @@ -1,5 +1,5 @@ min: &min - # elswork/samba image versions does not correlate with smbd version, it is always 4.x + # elswork/samba image versions does not correlate with smbd version, it is always 4x server-version: latest pydantic-version: 1 python-version: '3.7' diff --git a/.github/workflows/data/teradata/matrix.yml b/.github/workflows/data/teradata/matrix.yml index 6c2a55455..ec71ac865 100644 --- a/.github/workflows/data/teradata/matrix.yml +++ b/.github/workflows/data/teradata/matrix.yml @@ -1,4 +1,4 @@ -max: &max +3x: &3x spark-version: 3.5.1 pydantic-version: 2 python-version: '3.12' @@ -13,6 +13,6 @@ latest: &latest os: ubuntu-latest matrix: - small: [*max] - full: [*max] + small: [*3x] + full: [*3x] nightly: [*latest] diff --git a/README.rst b/README.rst index 0a4cbc97d..efe269aba 100644 --- a/README.rst +++ b/README.rst @@ -55,7 +55,7 @@ Requirements ------------ * **Python 3.7 - 3.12** -* PySpark 2.3.x - 3.5.x (depends on used connector) +* PySpark 2.3.x - 4.0.x (depends on used connector) * Java 8+ (required by Spark, see below) * Kerberos libs & GCC (required by ``Hive``, ``HDFS`` and ``SparkHDFS`` connectors) @@ -171,21 +171,23 @@ Firstly, you should install JDK. The exact installation instruction depends on y Compatibility matrix ^^^^^^^^^^^^^^^^^^^^ -+--------------------------------------------------------------+-------------+-------------+-------+ -| Spark | Python | Java | Scala | -+==============================================================+=============+=============+=======+ -| `2.3.x `_ | 3.7 only | 8 only | 2.11 | -+--------------------------------------------------------------+-------------+-------------+-------+ -| `2.4.x `_ | 3.7 only | 8 only | 2.11 | -+--------------------------------------------------------------+-------------+-------------+-------+ -| `3.2.x `_ | 3.7 - 3.10 | 8u201 - 11 | 2.12 | -+--------------------------------------------------------------+-------------+-------------+-------+ -| `3.3.x `_ | 3.7 - 3.10 | 8u201 - 17 | 2.12 | -+--------------------------------------------------------------+-------------+-------------+-------+ -| `3.4.x `_ | 3.7 - 3.12 | 8u362 - 20 | 2.12 | -+--------------------------------------------------------------+-------------+-------------+-------+ -| `3.5.x `_ | 3.8 - 3.12 | 8u371 - 20 | 2.12 | -+--------------------------------------------------------------+-------------+-------------+-------+ ++-----------------------------------------------------------------------+-------------+-------------+-------+ +| Spark | Python | Java | Scala | ++=======================================================================+=============+=============+=======+ +| `2.3.x `_ | 3.7 only | 8 only | 2.11 | ++-----------------------------------------------------------------------+-------------+-------------+-------+ +| `2.4.x `_ | 3.7 only | 8 only | 2.11 | ++-----------------------------------------------------------------------+-------------+-------------+-------+ +| `3.2.x `_ | 3.7 - 3.10 | 8u201 - 11 | 2.12 | ++-----------------------------------------------------------------------+-------------+-------------+-------+ +| `3.3.x `_ | 3.7 - 3.10 | 8u201 - 17 | 2.12 | ++-----------------------------------------------------------------------+-------------+-------------+-------+ +| `3.4.x `_ | 3.7 - 3.12 | 8u362 - 20 | 2.12 | ++-----------------------------------------------------------------------+-------------+-------------+-------+ +| `3.5.x `_ | 3.8 - 3.12 | 8u371 - 20 | 2.12 | ++-----------------------------------------------------------------------+-------------+-------------+-------+ +| `4.0.x `_ | 3.8 - 3.12 | 17 - 22 | 2.13 | ++-----------------------------------------------------------------------+-------------+-------------+-------+ .. _pyspark-install: diff --git a/docs/connection/db_connection/clickhouse/prerequisites.rst b/docs/connection/db_connection/clickhouse/prerequisites.rst index 03384b1a0..1b3666715 100644 --- a/docs/connection/db_connection/clickhouse/prerequisites.rst +++ b/docs/connection/db_connection/clickhouse/prerequisites.rst @@ -7,8 +7,8 @@ Version Compatibility --------------------- * Clickhouse server versions: 21.1 or higher -* Spark versions: 2.3.x - 3.5.x -* Java versions: 8 - 20 +* Spark versions: 2.3.x - 4.0.x +* Java versions: 8 - 22 See `official documentation `_. diff --git a/docs/connection/db_connection/hive/prerequisites.rst b/docs/connection/db_connection/hive/prerequisites.rst index d690f918f..fe5cf5536 100644 --- a/docs/connection/db_connection/hive/prerequisites.rst +++ b/docs/connection/db_connection/hive/prerequisites.rst @@ -15,8 +15,8 @@ Version Compatibility --------------------- * Hive Metastore version: 0.12 - 3.1.3 (may require to add proper .jar file explicitly) -* Spark versions: 2.3.x - 3.5.x -* Java versions: 8 - 20 +* Spark versions: 2.3.x - 4.0.x +* Java versions: 8 - 22 See `official documentation `_. diff --git a/docs/connection/db_connection/kafka/prerequisites.rst b/docs/connection/db_connection/kafka/prerequisites.rst index 29f5885b5..308094311 100644 --- a/docs/connection/db_connection/kafka/prerequisites.rst +++ b/docs/connection/db_connection/kafka/prerequisites.rst @@ -7,8 +7,8 @@ Version Compatibility --------------------- * Kafka server versions: 0.10 or higher -* Spark versions: 2.4.x - 3.5.x -* Java versions: 8 - 17 +* Spark versions: 2.4.x - 4.0.x +* Java versions: 8 - 22 See `official documentation `_. diff --git a/docs/connection/db_connection/mongodb/prerequisites.rst b/docs/connection/db_connection/mongodb/prerequisites.rst index 7df5f5022..0455deeae 100644 --- a/docs/connection/db_connection/mongodb/prerequisites.rst +++ b/docs/connection/db_connection/mongodb/prerequisites.rst @@ -7,8 +7,8 @@ Version Compatibility --------------------- * MongoDB server versions: 4.0 or higher -* Spark versions: 3.2.x - 3.5.x -* Java versions: 8 - 20 +* Spark versions: 3.2.x - 4.0.x +* Java versions: 8 - 22 See `official documentation `_. diff --git a/docs/connection/db_connection/mssql/prerequisites.rst b/docs/connection/db_connection/mssql/prerequisites.rst index 8dde0f6c3..19b50f26d 100644 --- a/docs/connection/db_connection/mssql/prerequisites.rst +++ b/docs/connection/db_connection/mssql/prerequisites.rst @@ -7,8 +7,8 @@ Version Compatibility --------------------- * SQL Server versions: 2014 - 2022 -* Spark versions: 2.3.x - 3.5.x -* Java versions: 8 - 20 +* Spark versions: 2.3.x - 4.0.x +* Java versions: 8 - 22 See `official documentation `_ and `official compatibility matrix `_. diff --git a/docs/connection/db_connection/mysql/prerequisites.rst b/docs/connection/db_connection/mysql/prerequisites.rst index b92f33208..e62d93720 100644 --- a/docs/connection/db_connection/mysql/prerequisites.rst +++ b/docs/connection/db_connection/mysql/prerequisites.rst @@ -7,8 +7,8 @@ Version Compatibility --------------------- * MySQL server versions: 5.7 - 9.0 -* Spark versions: 2.3.x - 3.5.x -* Java versions: 8 - 20 +* Spark versions: 2.3.x - 4.0.x +* Java versions: 8 - 22 See `official documentation `_. diff --git a/docs/connection/db_connection/oracle/prerequisites.rst b/docs/connection/db_connection/oracle/prerequisites.rst index b5b64e437..2e168f424 100644 --- a/docs/connection/db_connection/oracle/prerequisites.rst +++ b/docs/connection/db_connection/oracle/prerequisites.rst @@ -7,8 +7,8 @@ Version Compatibility --------------------- * Oracle Server versions: 23, 21, 19, 18, 12.2 and __probably__ 11.2 (tested, but it's not mentioned in official docs). -* Spark versions: 2.3.x - 3.5.x -* Java versions: 8 - 20 +* Spark versions: 2.3.x - 4.0.x +* Java versions: 8 - 22 See `official documentation `_. diff --git a/docs/connection/db_connection/postgres/prerequisites.rst b/docs/connection/db_connection/postgres/prerequisites.rst index ef83144f5..d183ec89b 100644 --- a/docs/connection/db_connection/postgres/prerequisites.rst +++ b/docs/connection/db_connection/postgres/prerequisites.rst @@ -7,8 +7,8 @@ Version Compatibility --------------------- * PostgreSQL server versions: 8.2 - 16 -* Spark versions: 2.3.x - 3.5.x -* Java versions: 8 - 20 +* Spark versions: 2.3.x - 4.0.x +* Java versions: 8 - 22 See `official documentation `_. diff --git a/onetl/_util/scala.py b/onetl/_util/scala.py index 397a91576..c4069e5a3 100644 --- a/onetl/_util/scala.py +++ b/onetl/_util/scala.py @@ -9,6 +9,8 @@ def get_default_scala_version(spark_version: Version) -> Version: """ Get default Scala version for specific Spark version """ - if spark_version.major < 3: + if spark_version.major == 2: return Version("2.11") - return Version("2.12") + if spark_version.major == 3: + return Version("2.12") + return Version("2.13") diff --git a/onetl/connection/db_connection/db_connection/dialect.py b/onetl/connection/db_connection/db_connection/dialect.py index 73efba338..8ce644acc 100644 --- a/onetl/connection/db_connection/db_connection/dialect.py +++ b/onetl/connection/db_connection/db_connection/dialect.py @@ -17,7 +17,7 @@ class DBDialect(BaseDBDialect): def detect_hwm_class(self, field: StructField) -> type[HWM] | None: - return SparkTypeToHWM.get(field.dataType.typeName()) # type: ignore + return SparkTypeToHWM.get(field.dataType) # type: ignore def get_sql_query( self, diff --git a/onetl/connection/db_connection/jdbc_mixin/connection.py b/onetl/connection/db_connection/jdbc_mixin/connection.py index e8c19e38b..8578c5ff8 100644 --- a/onetl/connection/db_connection/jdbc_mixin/connection.py +++ b/onetl/connection/db_connection/jdbc_mixin/connection.py @@ -431,10 +431,11 @@ def _execute_on_driver( statement_args = self._get_statement_args() jdbc_statement = self._build_statement(statement, statement_type, jdbc_connection, statement_args) - return self._execute_statement(jdbc_statement, statement, options, callback, read_only) + return self._execute_statement(jdbc_connection, jdbc_statement, statement, options, callback, read_only) def _execute_statement( self, + jdbc_connection, jdbc_statement, statement: str, options: JDBCFetchOptions | JDBCExecuteOptions, @@ -472,7 +473,7 @@ def _execute_statement( else: jdbc_statement.executeUpdate(statement) - return callback(jdbc_statement) + return callback(jdbc_connection, jdbc_statement) @staticmethod def _build_statement( @@ -501,11 +502,11 @@ def _build_statement( return jdbc_connection.createStatement(*statement_args) - def _statement_to_dataframe(self, jdbc_statement) -> DataFrame: + def _statement_to_dataframe(self, jdbc_connection, jdbc_statement) -> DataFrame: result_set = jdbc_statement.getResultSet() - return self._resultset_to_dataframe(result_set) + return self._resultset_to_dataframe(jdbc_connection, result_set) - def _statement_to_optional_dataframe(self, jdbc_statement) -> DataFrame | None: + def _statement_to_optional_dataframe(self, jdbc_connection, jdbc_statement) -> DataFrame | None: """ Returns ``org.apache.spark.sql.DataFrame`` or ``None``, if ResultSet is does not contain any columns. @@ -522,9 +523,9 @@ def _statement_to_optional_dataframe(self, jdbc_statement) -> DataFrame | None: if not result_column_count: return None - return self._resultset_to_dataframe(result_set) + return self._resultset_to_dataframe(jdbc_connection, result_set) - def _resultset_to_dataframe(self, result_set) -> DataFrame: + def _resultset_to_dataframe(self, jdbc_connection, result_set) -> DataFrame: """ Converts ``java.sql.ResultSet`` to ``org.apache.spark.sql.DataFrame`` using Spark's internal methods. @@ -545,13 +546,27 @@ def _resultset_to_dataframe(self, result_set) -> DataFrame: java_converters = self.spark._jvm.scala.collection.JavaConverters # type: ignore - if get_spark_version(self.spark) >= Version("3.4"): + spark_version = get_spark_version(self.spark) + + if spark_version >= Version("4.0"): + result_schema = jdbc_utils.getSchema( + jdbc_connection, + result_set, + jdbc_dialect, + False, # noqa: WPS425 + False, # noqa: WPS425 + ) + elif spark_version >= Version("3.4"): # https://github.com/apache/spark/commit/2349175e1b81b0a61e1ed90c2d051c01cf78de9b result_schema = jdbc_utils.getSchema(result_set, jdbc_dialect, False, False) # noqa: WPS425 else: result_schema = jdbc_utils.getSchema(result_set, jdbc_dialect, False) # noqa: WPS425 - result_iterator = jdbc_utils.resultSetToRows(result_set, result_schema) + if spark_version.major >= 4: + result_iterator = jdbc_utils.resultSetToRows(result_set, result_schema, jdbc_dialect) + else: + result_iterator = jdbc_utils.resultSetToRows(result_set, result_schema) + result_list = java_converters.seqAsJavaListConverter(result_iterator.toSeq()).asJava() jdf = self.spark._jsparkSession.createDataFrame(result_list, result_schema) # type: ignore diff --git a/onetl/connection/db_connection/kafka/connection.py b/onetl/connection/db_connection/kafka/connection.py index ce3829e49..6d8b377c5 100644 --- a/onetl/connection/db_connection/kafka/connection.py +++ b/onetl/connection/db_connection/kafka/connection.py @@ -432,8 +432,13 @@ def get_packages( raise ValueError(f"Spark version must be at least 2.4, got {spark_ver}") scala_ver = Version(scala_version).min_digits(2) if scala_version else get_default_scala_version(spark_ver) + + if spark_ver.major < 4: + version = spark_ver.format("{0}.{1}.{2}") + else: + version = "4.0.0-preview1" return [ - f"org.apache.spark:spark-sql-kafka-0-10_{scala_ver.format('{0}.{1}')}:{spark_ver.format('{0}.{1}.{2}')}", + f"org.apache.spark:spark-sql-kafka-0-10_{scala_ver.format('{0}.{1}')}:{version}", ] def __enter__(self): diff --git a/onetl/connection/file_df_connection/spark_s3/connection.py b/onetl/connection/file_df_connection/spark_s3/connection.py index 1efe39d4e..8b9e76c36 100644 --- a/onetl/connection/file_df_connection/spark_s3/connection.py +++ b/onetl/connection/file_df_connection/spark_s3/connection.py @@ -246,9 +246,14 @@ def get_packages( # https://issues.apache.org/jira/browse/SPARK-23977 raise ValueError(f"Spark version must be at least 3.x, got {spark_ver}") + if spark_ver.major < 4: + version = spark_ver.format("{0}.{1}.{2}") + else: + version = "4.0.0-preview1" + scala_ver = Version(scala_version).min_digits(2) if scala_version else get_default_scala_version(spark_ver) # https://mvnrepository.com/artifact/org.apache.spark/spark-hadoop-cloud - return [f"org.apache.spark:spark-hadoop-cloud_{scala_ver.format('{0}.{1}')}:{spark_ver.format('{0}.{1}.{2}')}"] + return [f"org.apache.spark:spark-hadoop-cloud_{scala_ver.format('{0}.{1}')}:{version}"] @slot def path_from_string(self, path: os.PathLike | str) -> RemotePath: diff --git a/onetl/file/format/avro.py b/onetl/file/format/avro.py index 3699620b0..0205c2451 100644 --- a/onetl/file/format/avro.py +++ b/onetl/file/format/avro.py @@ -163,7 +163,12 @@ def get_packages( if scala_ver < Version("2.11"): raise ValueError(f"Scala version should be at least 2.11, got {scala_ver.format('{0}.{1}')}") - return [f"org.apache.spark:spark-avro_{scala_ver.format('{0}.{1}')}:{spark_ver.format('{0}.{1}.{2}')}"] + if spark_ver.major < 4: + version = spark_ver.format("{0}.{1}.{2}") + else: + version = "4.0.0-preview1" + + return [f"org.apache.spark:spark-avro_{scala_ver.format('{0}.{1}')}:{version}"] @slot def check_if_supported(self, spark: SparkSession) -> None: diff --git a/onetl/file/format/xml.py b/onetl/file/format/xml.py index cc7cd4777..7fe2d4b67 100644 --- a/onetl/file/format/xml.py +++ b/onetl/file/format/xml.py @@ -193,6 +193,10 @@ def get_packages( # noqa: WPS231 ) """ + spark_ver = Version(spark_version) + if spark_ver.major >= 4: + # since Spark 4.0, XML is bundled with Spark + return [] if package_version: version = Version(package_version).min_digits(3) @@ -202,7 +206,6 @@ def get_packages( # noqa: WPS231 else: version = Version("0.18.0").min_digits(3) - spark_ver = Version(spark_version) scala_ver = Version(scala_version).min_digits(2) if scala_version else get_default_scala_version(spark_ver) # Ensure compatibility with Spark and Scala versions @@ -216,8 +219,12 @@ def get_packages( # noqa: WPS231 @slot def check_if_supported(self, spark: SparkSession) -> None: - java_class = "com.databricks.spark.xml.XmlReader" + version = get_spark_version(spark) + if version.major >= 4: + # since Spark 4.0, XML is bundled with Spark + return + java_class = "com.databricks.spark.xml.XmlReader" try: try_import_java_class(spark, java_class) except Exception as e: @@ -332,12 +339,12 @@ def parse_column(self, column: str | Column, schema: StructType) -> Column: | |-- name: string (nullable = true) | |-- age: integer (nullable = true) """ + from pyspark import __version__ as spark_version from pyspark.sql import Column, SparkSession # noqa: WPS442 spark = SparkSession._instantiatedSession # noqa: WPS437 self.check_if_supported(spark) - from pyspark.sql.column import _to_java_column # noqa: WPS450 from pyspark.sql.functions import col if isinstance(column, Column): @@ -345,6 +352,13 @@ def parse_column(self, column: str | Column, schema: StructType) -> Column: else: column_name, column = column, col(column).cast("string") + if spark_version > "4": + from pyspark.sql.functions import from_xml # noqa: WPS450 + + return from_xml(column, schema, self.dict()).alias(column_name) + + from pyspark.sql.column import _to_java_column # noqa: WPS450 + java_column = _to_java_column(column) java_schema = spark._jsparkSession.parseDataType(schema.json()) # noqa: WPS437 scala_options = spark._jvm.org.apache.spark.api.python.PythonUtils.toScalaMap( # noqa: WPS219, WPS437 diff --git a/onetl/hwm/store/hwm_class_registry.py b/onetl/hwm/store/hwm_class_registry.py index 82b0eef2e..05c3289bd 100644 --- a/onetl/hwm/store/hwm_class_registry.py +++ b/onetl/hwm/store/hwm_class_registry.py @@ -2,10 +2,13 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations -from typing import ClassVar +from typing import TYPE_CHECKING, ClassVar from etl_entities.hwm import HWM, ColumnDateHWM, ColumnDateTimeHWM, ColumnIntHWM +if TYPE_CHECKING: + from pyspark.sql.types import DataType + class SparkTypeToHWM: """Registry class for HWM types @@ -14,43 +17,67 @@ class SparkTypeToHWM: -------- >>> from etl_entities.hwm import ColumnIntHWM, ColumnDateHWM + >>> from pyspark.sql.types import IntegerType, ShortType, DateType, StringType >>> from onetl.hwm.store import SparkTypeToHWM - >>> SparkTypeToHWM.get("integer") + >>> SparkTypeToHWM.get(IntegerType()) >>> # multiple type names are supported - >>> SparkTypeToHWM.get("short") + >>> SparkTypeToHWM.get(ShortType()) - >>> SparkTypeToHWM.get("date") + >>> SparkTypeToHWM.get(DateType()) - >>> SparkTypeToHWM.get("unknown") + >>> SparkTypeToHWM.get(StringType()) """ - _mapping: ClassVar[dict[str, type[HWM]]] = { - "byte": ColumnIntHWM, - "integer": ColumnIntHWM, - "short": ColumnIntHWM, - "long": ColumnIntHWM, - "date": ColumnDateHWM, - "timestamp": ColumnDateTimeHWM, - # for Oracle which does not differ between int and float/double - everything is Decimal - "float": ColumnIntHWM, - "double": ColumnIntHWM, - "fractional": ColumnIntHWM, - "decimal": ColumnIntHWM, - "numeric": ColumnIntHWM, - } + _mapping: ClassVar[dict[DataType | type[DataType], type[HWM]]] = {} @classmethod - def get(cls, type_name: str) -> type[HWM] | None: - return cls._mapping.get(type_name) + def get(cls, spark_type: DataType) -> type[HWM] | None: + # avoid importing pyspark in the module + from pyspark.sql.types import ( # noqa: WPS235 + ByteType, + DateType, + DecimalType, + DoubleType, + FloatType, + FractionalType, + IntegerType, + LongType, + NumericType, + ShortType, + TimestampType, + ) + + default_mapping: dict[type[DataType], type[HWM]] = { + ByteType: ColumnIntHWM, + IntegerType: ColumnIntHWM, + ShortType: ColumnIntHWM, + LongType: ColumnIntHWM, + DateType: ColumnDateHWM, + TimestampType: ColumnDateTimeHWM, + # for Oracle which does not differ between int and float/double - everything is Decimal + FloatType: ColumnIntHWM, + DoubleType: ColumnIntHWM, + DecimalType: ColumnIntHWM, + FractionalType: ColumnIntHWM, + NumericType: ColumnIntHWM, + } + + return ( + cls._mapping.get(spark_type) + or cls._mapping.get(spark_type.__class__) + or default_mapping.get(spark_type.__class__) + ) @classmethod - def add(cls, type_name: str, klass: type[HWM]) -> None: - cls._mapping[type_name] = klass + def add(cls, spark_type: DataType | type[DataType], klass: type[HWM]) -> None: + cls._mapping[spark_type] = klass -def register_spark_type_to_hwm_type_mapping(*type_names: str): - """Decorator for registering some HWM class with a type name or names +def register_spark_type_to_hwm_type_mapping(*spark_types: DataType | type[DataType]): + """Decorator for registering mapping between Spark data type and HWM type. + + Accepts both data type class and instance. Examples -------- @@ -58,17 +85,20 @@ def register_spark_type_to_hwm_type_mapping(*type_names: str): >>> from etl_entities.hwm import ColumnHWM >>> from onetl.hwm.store import SparkTypeToHWM >>> from onetl.hwm.store import SparkTypeToHWM, register_spark_type_to_hwm_type_mapping - >>> @register_spark_type_to_hwm_type_mapping("somename", "anothername") + >>> from pyspark.sql.types import IntegerType, DecimalType + >>> @register_spark_type_to_hwm_type_mapping(IntegerType, DecimalType(38, 0)) ... class MyHWM(ColumnHWM): ... - >>> SparkTypeToHWM.get("somename") + >>> SparkTypeToHWM.get(IntegerType()) - >>> SparkTypeToHWM.get("anothername") + >>> SparkTypeToHWM.get(DecimalType(38, 0)) + >>> SparkTypeToHWM.get(DecimalType(38, 10)) + """ def wrapper(cls: type[HWM]): - for type_name in type_names: - SparkTypeToHWM.add(type_name, cls) + for spark_type in spark_types: + SparkTypeToHWM.add(spark_type, cls) return cls return wrapper diff --git a/requirements/tests/spark-4.0.0.txt b/requirements/tests/spark-4.0.0.txt new file mode 100644 index 000000000..038d47de7 --- /dev/null +++ b/requirements/tests/spark-4.0.0.txt @@ -0,0 +1,5 @@ +numpy>=1.16 +pandas>=1.0 +pyarrow>=1.0 +pyspark==4.0.0.dev1 +sqlalchemy diff --git a/tests/fixtures/spark.py b/tests/fixtures/spark.py index e7248e84f..e5f52724f 100644 --- a/tests/fixtures/spark.py +++ b/tests/fixtures/spark.py @@ -103,7 +103,7 @@ def maven_packages(request): # There is no MongoDB connector for Spark less than 3.2 packages.extend(MongoDB.get_packages(spark_version=str(pyspark_version))) - if "excel" in markers: + if "excel" in markers and pyspark_version.major < 4: # There is no Excel files support for Spark less than 3.2 packages.extend(Excel.get_packages(spark_version=str(pyspark_version))) diff --git a/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_hive.py b/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_hive.py index 6cc860f56..40ff79b06 100644 --- a/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_hive.py +++ b/tests/tests_integration/tests_strategy_integration/tests_incremental_strategy_integration/test_strategy_increment_hive.py @@ -200,7 +200,7 @@ def test_hive_strategy_incremental_nothing_to_read(spark, processing, prepare_sc [ ("float_value", ValueError, "Expression 'float_value' returned values"), ("text_string", RuntimeError, "Cannot detect HWM type for"), - ("unknown_column", Exception, r"column .* cannot be resolved|cannot resolve .* given input columns"), + ("unknown_column", Exception, r".*cannot.*resolve.*"), ], ) def test_hive_strategy_incremental_wrong_hwm(