diff --git a/.github/workflows/spark_deployment/spark-defaults.conf b/.github/workflows/spark_deployment/spark-defaults.conf
index 7c99df8..0dd1205 100644
--- a/.github/workflows/spark_deployment/spark-defaults.conf
+++ b/.github/workflows/spark_deployment/spark-defaults.conf
@@ -1,41 +1,37 @@
 # Spark Master Configuration
 spark.master local[3]
 
-# Storage and Catalog Configuration
-spark.sql.warehouse.dir s3a://dbt-spark-iceberg/github-integration-testing
+# Catalog Configuration
+spark.sql.defaultCatalog glue
+spark.sql.catalog.spark_catalog org.apache.iceberg.spark.SparkSessionCatalog
+spark.sql.catalog.spark_catalog.type hive
 spark.sql.catalog.glue org.apache.iceberg.spark.SparkCatalog
 spark.sql.catalog.glue.catalog-impl org.apache.iceberg.aws.glue.GlueCatalog
 spark.sql.catalog.glue.warehouse s3a://dbt-spark-iceberg/github-integration-testing
 spark.sql.catalog.glue.io-impl org.apache.iceberg.aws.s3.S3FileIO
-spark.sql.defaultCatalog glue
-spark.sql.catalog.glue.database dbt-spark-iceberg
+spark.sql.catalog.glue.lock-impl org.apache.iceberg.aws.glue.DynamoLockManager
+spark.sql.catalog.glue.lock.table myGlueLockTable
 
-# Iceberg Specific Configuration
+# Default Schema/Database Configuration
+spark.sql.catalog.glue.default-namespace default_snowplow_manifest
+spark.sql.database.default default_snowplow_manifest
+
+# Session Extensions
 spark.sql.extensions org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions
+
+# Iceberg Specific Configuration
 spark.sql.iceberg.handle-timestamp-without-timezone true
-spark.sql.catalog.spark_catalog org.apache.iceberg.spark.SparkSessionCatalog
-spark.sql.catalog.spark_catalog.type hive
-spark.sql.catalog.glue.default-namespace default_snowplow_manifest
-spark.sql.catalog.glue.table-default.write.format.default iceberg
-spark.sql.catalog.glue.table-default.write.merge.mode merge-on-read
-spark.sql.catalog.glue.table-default.write.distribution-mode data
-spark.sql.catalog.glue.table-default.write.metadata.delete-after-commit.enabled true
-spark.sql.catalog.glue.table-default.write.metadata.previous-versions-max 10
+spark.sql.catalog.glue.table-default.format-version 2
+spark.sql.catalogImplementation hive
+spark.sql.warehouse.dir s3a://dbt-spark-iceberg/github-integration-testing
 
 # AWS S3 Configuration
 spark.hadoop.fs.s3a.impl org.apache.hadoop.fs.s3a.S3AFileSystem
 spark.hadoop.fs.s3a.aws.credentials.provider org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider
 spark.hadoop.fs.s3a.endpoint s3.eu-west-1.amazonaws.com
-spark.hadoop.fs.s3a.region eu-west-1
-spark.hadoop.fs.s3a.connection.maximum 200
-spark.hadoop.fs.s3a.connection.timeout 1200000
-spark.hadoop.fs.s3a.connection.establish.timeout 1200000
-spark.hadoop.fs.s3a.attempts.maximum 20
-spark.hadoop.fs.s3a.retry.limit 20
-spark.hadoop.fs.s3a.retry.interval 1000
-spark.hadoop.fs.s3a.fast.upload true
-spark.hadoop.fs.s3a.fast.upload.buffer disk
-spark.hadoop.fs.s3a.multipart.size 64M
+spark.hadoop.fs.s3a.path.style.access true
+spark.hadoop.fs.s3a.connection.ssl.enabled true
+spark.hadoop.fs.s3a.experimental.input.fadvise random
 
 # Memory and Resource Configuration
 spark.driver.memory 10g
@@ -51,41 +47,8 @@ spark.sql.adaptive.enabled true
 spark.sql.adaptive.coalescePartitions.enabled true
 spark.sql.adaptive.skewJoin.enabled true
 spark.sql.adaptive.localShuffleReader.enabled true
-spark.sql.adaptive.fetchShuffleBlocksInBatch true
-spark.sql.files.maxPartitionBytes 134217728
 spark.serializer org.apache.spark.serializer.KryoSerializer
-spark.kryoserializer.buffer.max 256m
-spark.sql.inMemoryColumnarStorage.compressed true
-spark.sql.inMemoryColumnarStorage.batchSize 10000
-spark.sql.shuffle.file.buffer 1m
-spark.shuffle.file.buffer 1m
-spark.shuffle.compress true
-spark.shuffle.spill.compress true
 
-# Transaction and Table Management
+# Transaction Management
 spark.sql.sources.partitionOverwriteMode dynamic
-spark.sql.iceberg.vectorization.enabled true
-spark.sql.iceberg.maintain.metadata.snapshots true
-spark.sql.iceberg.handle-timestamp-without-timezone true
-spark.sql.parquet.compression.codec snappy
-spark.sql.parquet.mergeSchema true
-spark.sql.parquet.filterPushdown true
-
-# I/O Optimization
-spark.sql.files.openCostInBytes 134217728
-spark.sql.broadcastTimeout 600s
-spark.sql.autoBroadcastJoinThreshold 10485760
-spark.sql.files.maxRecordsPerFile 50000000
-
-# Timeout and Network Configuration
-spark.network.timeout 600s
-spark.executor.heartbeatInterval 60s
-spark.storage.blockManagerSlaveTimeoutMs 180s
-
-# Off-heap Memory Configuration
-spark.memory.offHeap.enabled true
-spark.memory.offHeap.size 2g
-
-# Debug Logging
-spark.driver.extraJavaOptions -Dlog4j.rootCategory=INFO,console -Dlog4j.logger.org.apache.spark.network.netty=INFO -XX:+UseG1GC -XX:+UnlockDiagnosticVMOptions
-spark.executor.extraJavaOptions -XX:+UseG1GC -XX:+UnlockDiagnosticVMOptions
\ No newline at end of file
+spark.sql.iceberg.handle-timestamp-without-timezone true
\ No newline at end of file
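For reviewers who want to sanity-check the new catalog wiring, below is a minimal PySpark sketch (not part of this diff) that builds a session with the same Glue/Iceberg settings and round-trips one row through an Iceberg table. It assumes the Iceberg Spark runtime and hadoop-aws jars are on the classpath and that temporary AWS credentials are exported in the environment (the conf uses `TemporaryAWSCredentialsProvider`); the table name `smoke_test` is made up for illustration.

```python
# Hypothetical smoke test for the Glue/Iceberg catalog settings above.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("glue-iceberg-smoke-test")
    # Mirror the catalog settings from spark-defaults.conf.
    .config("spark.sql.defaultCatalog", "glue")
    .config("spark.sql.catalog.glue", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.glue.catalog-impl", "org.apache.iceberg.aws.glue.GlueCatalog")
    .config("spark.sql.catalog.glue.warehouse", "s3a://dbt-spark-iceberg/github-integration-testing")
    .config("spark.sql.catalog.glue.io-impl", "org.apache.iceberg.aws.s3.S3FileIO")
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .getOrCreate()
)

# Round-trip one row through the default namespace from the conf.
spark.sql("CREATE NAMESPACE IF NOT EXISTS glue.default_snowplow_manifest")
spark.sql(
    "CREATE TABLE IF NOT EXISTS glue.default_snowplow_manifest.smoke_test "
    "(id BIGINT, loaded_at TIMESTAMP) USING iceberg"
)
spark.sql("INSERT INTO glue.default_snowplow_manifest.smoke_test VALUES (1, current_timestamp())")
spark.table("glue.default_snowplow_manifest.smoke_test").show()
```

Because `spark.sql.defaultCatalog` is `glue`, the `glue.` prefix on the identifiers is optional in practice; it is spelled out here to make the catalog routing explicit.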