
Commit

Lets try that
ilias1111 committed Nov 11, 2024
1 parent cd075f6 commit 846632f
Showing 1 changed file with 21 additions and 58 deletions.
79 changes: 21 additions & 58 deletions .github/workflows/spark_deployment/spark-defaults.conf
@@ -1,41 +1,37 @@
# Spark Master Configuration
spark.master local[3]

# Storage and Catalog Configuration
spark.sql.warehouse.dir s3a://dbt-spark-iceberg/github-integration-testing
# Catalog Configuration
spark.sql.defaultCatalog glue
spark.sql.catalog.spark_catalog org.apache.iceberg.spark.SparkSessionCatalog
spark.sql.catalog.spark_catalog.type hive
spark.sql.catalog.glue org.apache.iceberg.spark.SparkCatalog
spark.sql.catalog.glue.catalog-impl org.apache.iceberg.aws.glue.GlueCatalog
spark.sql.catalog.glue.warehouse s3a://dbt-spark-iceberg/github-integration-testing
spark.sql.catalog.glue.io-impl org.apache.iceberg.aws.s3.S3FileIO
spark.sql.defaultCatalog glue
spark.sql.catalog.glue.database dbt-spark-iceberg
spark.sql.catalog.glue.lock-impl org.apache.iceberg.aws.glue.DynamoLockManager
spark.sql.catalog.glue.lock.table myGlueLockTable

# Iceberg Specific Configuration
# Default Schema/Database Configuration
spark.sql.catalog.glue.default-namespace default_snowplow_manifest
spark.sql.database.default default_snowplow_manifest

# Session Extensions
spark.sql.extensions org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions

# Iceberg Specific Configuration
spark.sql.iceberg.handle-timestamp-without-timezone true
spark.sql.catalog.spark_catalog org.apache.iceberg.spark.SparkSessionCatalog
spark.sql.catalog.spark_catalog.type hive
spark.sql.catalog.glue.default-namespace default_snowplow_manifest
spark.sql.catalog.glue.table-default.write.format.default iceberg
spark.sql.catalog.glue.table-default.write.merge.mode merge-on-read
spark.sql.catalog.glue.table-default.write.distribution-mode data
spark.sql.catalog.glue.table-default.write.metadata.delete-after-commit.enabled true
spark.sql.catalog.glue.table-default.write.metadata.previous-versions-max 10
spark.sql.catalog.glue.table-default.format-version 2
spark.sql.catalogImplementation hive
spark.sql.warehouse.dir s3a://dbt-spark-iceberg/github-integration-testing

# AWS S3 Configuration
spark.hadoop.fs.s3a.impl org.apache.hadoop.fs.s3a.S3AFileSystem
spark.hadoop.fs.s3a.aws.credentials.provider org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider
spark.hadoop.fs.s3a.endpoint s3.eu-west-1.amazonaws.com
spark.hadoop.fs.s3a.region eu-west-1
spark.hadoop.fs.s3a.connection.maximum 200
spark.hadoop.fs.s3a.connection.timeout 1200000
spark.hadoop.fs.s3a.connection.establish.timeout 1200000
spark.hadoop.fs.s3a.attempts.maximum 20
spark.hadoop.fs.s3a.retry.limit 20
spark.hadoop.fs.s3a.retry.interval 1000
spark.hadoop.fs.s3a.fast.upload true
spark.hadoop.fs.s3a.fast.upload.buffer disk
spark.hadoop.fs.s3a.multipart.size 64M
spark.hadoop.fs.s3a.path.style.access true
spark.hadoop.fs.s3a.connection.ssl.enabled true
spark.hadoop.fs.s3a.experimental.input.fadvise random

# Memory and Resource Configuration
spark.driver.memory 10g
@@ -51,41 +47,8 @@ spark.sql.adaptive.enabled true
spark.sql.adaptive.coalescePartitions.enabled true
spark.sql.adaptive.skewJoin.enabled true
spark.sql.adaptive.localShuffleReader.enabled true
spark.sql.adaptive.fetchShuffleBlocksInBatch true
spark.sql.files.maxPartitionBytes 134217728
spark.serializer org.apache.spark.serializer.KryoSerializer
spark.kryoserializer.buffer.max 256m
spark.sql.inMemoryColumnarStorage.compressed true
spark.sql.inMemoryColumnarStorage.batchSize 10000
spark.sql.shuffle.file.buffer 1m
spark.shuffle.file.buffer 1m
spark.shuffle.compress true
spark.shuffle.spill.compress true

# Transaction and Table Management
# Transaction Management
spark.sql.sources.partitionOverwriteMode dynamic
spark.sql.iceberg.vectorization.enabled true
spark.sql.iceberg.maintain.metadata.snapshots true
spark.sql.iceberg.handle-timestamp-without-timezone true
spark.sql.parquet.compression.codec snappy
spark.sql.parquet.mergeSchema true
spark.sql.parquet.filterPushdown true

# I/O Optimization
spark.sql.files.openCostInBytes 134217728
spark.sql.broadcastTimeout 600s
spark.sql.autoBroadcastJoinThreshold 10485760
spark.sql.files.maxRecordsPerFile 50000000

# Timeout and Network Configuration
spark.network.timeout 600s
spark.executor.heartbeatInterval 60s
spark.storage.blockManagerSlaveTimeoutMs 180s

# Off-heap Memory Configuration
spark.memory.offHeap.enabled true
spark.memory.offHeap.size 2g

# Debug Logging
spark.driver.extraJavaOptions -Dlog4j.rootCategory=INFO,console -Dlog4j.logger.org.apache.spark.network.netty=INFO -XX:+UseG1GC -XX:+UnlockDiagnosticVMOptions
spark.executor.extraJavaOptions -XX:+UseG1GC -XX:+UnlockDiagnosticVMOptions
spark.sql.iceberg.handle-timestamp-without-timezone true
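
The configuration above wires a Glue-backed Iceberg catalog named `glue` into every Spark session started against this spark-defaults.conf. Below is a minimal sketch of how a job might exercise it, assuming the Iceberg runtime and AWS bundle jars are already on the classpath; the builder keys mirror the conf file, the namespace comes from `default-namespace` above, and the table name `events_smoke_test` is made up for illustration.

```python
from pyspark.sql import SparkSession

# Sketch only: a session whose config mirrors the catalog settings in the conf file above.
spark = (
    SparkSession.builder
    .appName("glue-iceberg-smoke-test")
    .config("spark.sql.extensions",
            "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .config("spark.sql.catalog.glue", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.glue.catalog-impl", "org.apache.iceberg.aws.glue.GlueCatalog")
    .config("spark.sql.catalog.glue.warehouse",
            "s3a://dbt-spark-iceberg/github-integration-testing")
    .config("spark.sql.catalog.glue.io-impl", "org.apache.iceberg.aws.s3.S3FileIO")
    .config("spark.sql.defaultCatalog", "glue")
    .getOrCreate()
)

# Namespace taken from the conf file; the table name is hypothetical.
spark.sql("CREATE NAMESPACE IF NOT EXISTS glue.default_snowplow_manifest")
spark.sql("""
    CREATE TABLE IF NOT EXISTS glue.default_snowplow_manifest.events_smoke_test (
        event_id STRING,
        collector_tstamp TIMESTAMP
    ) USING iceberg
""")
spark.sql("SHOW TABLES IN glue.default_snowplow_manifest").show()
```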

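The `table-default.write.*` properties make new tables created through the `glue` catalog default to merge-on-read, which matters for MERGE statements issued through the Iceberg session extensions. A hedged sketch of such an upsert, reusing the hypothetical table from the previous snippet:

```python
# Hypothetical source of updates; column names are illustrative.
spark.sql("""
    CREATE OR REPLACE TEMPORARY VIEW events_updates AS
    SELECT 'e-1' AS event_id, current_timestamp() AS collector_tstamp
""")

# With write.merge.mode=merge-on-read, matched rows are handled via delete files
# instead of rewriting whole data files.
spark.sql("""
    MERGE INTO glue.default_snowplow_manifest.events_smoke_test AS t
    USING events_updates AS s
    ON t.event_id = s.event_id
    WHEN MATCHED THEN UPDATE SET t.collector_tstamp = s.collector_tstamp
    WHEN NOT MATCHED THEN INSERT *
""")
```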
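The S3A block selects `org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider`, which resolves short-lived credentials from `fs.s3a.access.key`, `fs.s3a.secret.key` and `fs.s3a.session.token`. One way to supply those at session build time is via `spark.hadoop.*` properties read from the standard AWS environment variables; this is a sketch under the assumption that the CI job exports them, not part of this commit.

```python
import os
from pyspark.sql import SparkSession

# Sketch only: spark.hadoop.* properties are copied into the Hadoop configuration,
# where TemporaryAWSCredentialsProvider picks up the session credentials.
spark = (
    SparkSession.builder
    .config("spark.hadoop.fs.s3a.access.key", os.environ["AWS_ACCESS_KEY_ID"])
    .config("spark.hadoop.fs.s3a.secret.key", os.environ["AWS_SECRET_ACCESS_KEY"])
    .config("spark.hadoop.fs.s3a.session.token", os.environ["AWS_SESSION_TOKEN"])
    .getOrCreate()
)
```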