feat: Spark Operator Blueprint update (awslabs#359)
askulkarni2 authored Dec 14, 2023
2 parents 9244219 + 51e7912 commit 2015f24
Showing 22 changed files with 159 additions and 252 deletions.
3 changes: 1 addition & 2 deletions analytics/terraform/spark-k8s-operator/README.md
@@ -58,7 +58,6 @@ Checkout the [documentation website](https://awslabs.github.io/data-on-eks/docs/
| [aws_availability_zones.available](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/availability_zones) | data source |
| [aws_caller_identity.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/caller_identity) | data source |
| [aws_ecrpublic_authorization_token.token](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/ecrpublic_authorization_token) | data source |
| [aws_eks_cluster_auth.this](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/eks_cluster_auth) | data source |
| [aws_iam_policy_document.grafana](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source |
| [aws_iam_policy_document.spark_operator](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/iam_policy_document) | data source |
| [aws_partition.current](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/partition) | data source |
@@ -70,7 +69,7 @@ Checkout the [documentation website](https://awslabs.github.io/data-on-eks/docs/

| Name | Description | Type | Default | Required |
|------|-------------|------|---------|:--------:|
| <a name="input_eks_cluster_version"></a> [eks\_cluster\_version](#input\_eks\_cluster\_version) | EKS Cluster version | `string` | `"1.26"` | no |
| <a name="input_eks_cluster_version"></a> [eks\_cluster\_version](#input\_eks\_cluster\_version) | EKS Cluster version | `string` | `"1.28"` | no |
| <a name="input_eks_data_plane_subnet_secondary_cidr"></a> [eks\_data\_plane\_subnet\_secondary\_cidr](#input\_eks\_data\_plane\_subnet\_secondary\_cidr) | Secondary CIDR blocks. 32766 IPs per Subnet per Subnet/AZ for EKS Node and Pods | `list(string)` | <pre>[<br> "100.64.0.0/17",<br> "100.64.128.0/17"<br>]</pre> | no |
| <a name="input_enable_amazon_prometheus"></a> [enable\_amazon\_prometheus](#input\_enable\_amazon\_prometheus) | Enable AWS Managed Prometheus service | `bool` | `true` | no |
| <a name="input_enable_vpc_endpoints"></a> [enable\_vpc\_endpoints](#input\_enable\_vpc\_endpoints) | Enable VPC Endpoints | `bool` | `false` | no |
8 changes: 6 additions & 2 deletions analytics/terraform/spark-k8s-operator/addons.tf
@@ -88,13 +88,17 @@ module "eks_blueprints_addons" {
repository_username = data.aws_ecrpublic_authorization_token.token.user_name
repository_password = data.aws_ecrpublic_authorization_token.token.password
}

karpenter_node = {
iam_role_additional_policies = {
AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore"
}
}
#---------------------------------------
# CloudWatch metrics for EKS
#---------------------------------------
enable_aws_cloudwatch_metrics = true
aws_cloudwatch_metrics = {
values = [templatefile("${path.module}/helm-values/aws-cloudwatch-metrics-valyes.yaml", {})]
values = [templatefile("${path.module}/helm-values/aws-cloudwatch-metrics-values.yaml", {})]
}

#---------------------------------------
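The corrected filename above points `templatefile` at the Helm values file for the aws-cloudwatch-metrics chart. For orientation only, a minimal values file of that shape is sketched below; the keys follow common conventions for this chart, and the blueprint's actual `helm-values/aws-cloudwatch-metrics-values.yaml` may differ:

```yaml
# Hypothetical minimal aws-cloudwatch-metrics-values.yaml; a sketch, not the
# blueprint's actual file. Resource figures are placeholders.
resources:
  requests:
    cpu: 200m
    memory: 200Mi
  limits:
    memory: 1Gi
```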
4 changes: 0 additions & 4 deletions analytics/terraform/spark-k8s-operator/data.tf
@@ -1,7 +1,3 @@
data "aws_eks_cluster_auth" "this" {
name = module.eks.cluster_name
}

data "aws_ecrpublic_authorization_token" "token" {
provider = aws.ecr
}
@@ -9,6 +9,11 @@ kind: SparkApplication
metadata:
name: tpcds-benchmark-3tb
namespace: spark-team-a
labels:
app: "tpcds-benchmark"
applicationId: "tpcds-benchmark-3t"
# Assign the job to a Yunikorn Queue via label.
queue: root.prod
spec:
type: Scala
mode: cluster
@@ -65,52 +70,32 @@ spec:
spark.kubernetes.driver.requestTimeout: "120000"
# spark.kubernetes.allocation.batch.size: "20" # default 5 but adjust according to your cluster size
# -----------------------------------------------------
volumes:
- name: spark-local-dir-1
hostPath:
path: /local1
driver:
volumeMounts:
- name: spark-local-dir-1
mountPath: /ossdata1
readOnly: false
initContainers:
- name: volume-permission
image: public.ecr.aws/y4g4v0z7/busybox
command: ['sh', '-c', 'mkdir /ossdata1; chown -R 1000:1000 /ossdata1']
volumeMounts:
- name: spark-local-dir-1
mountPath: /ossdata1
cores: 4
coreLimit: "4.1"
memory: "5g"
memoryOverhead: "1000"
serviceAccount: spark-team-a
# the c5d instances that Karpenter will launch will have the NVMe storage preformatted and available to the pod
# we do not need to leverage a hostPath mount or volume to leverage that storage.
# ephemeral-storage requests and limits can be used to manage the storage utilization
nodeSelector:
provisioner: spark-compute-optimized
tolerations:
- key: "spark-compute-optimized"
operator: "Exists"
effect: "NoSchedule"
executor:
volumeMounts:
- name: spark-local-dir-1
mountPath: /ossdata1
readOnly: false
initContainers:
- name: volume-permission
image: public.ecr.aws/y4g4v0z7/busybox
command: ['sh', '-c', 'mkdir /ossdata1; chown -R 1000:1000 /ossdata1']
volumeMounts:
- name: spark-local-dir-1
mountPath: /ossdata1
cores: 4
coreLimit: "4.3"
memory: "6g"
memoryOverhead: "2g"
# 8 executors per node
instances: 47 # reduce to ~20 for a smaller demo run
serviceAccount: spark-team-a
# the c5d instances that Karpenter will launch will have the NVMe storage preformatted and available to the pod
# we do not need to leverage a hostPath mount or volume to leverage that storage.
# ephemeral-storage requests and limits can be used to manage the storage utilization
nodeSelector:
provisioner: spark-compute-optimized
tolerations:
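The comments added above explain why the hostPath volumes and permission-fixing init containers were deleted: the c5d NVMe instance store arrives preformatted, so scratch usage is bounded with ephemeral-storage requests and limits instead. As illustration only (this stanza is not part of the commit, and the sizes are placeholders), a standard Kubernetes resources block of that shape looks like:

```yaml
# Illustrative only: plain Kubernetes container resources, not a field added
# by this diff. Sizes are placeholders, not tuned values.
resources:
  requests:
    ephemeral-storage: "100Gi"
  limits:
    ephemeral-storage: "150Gi"
```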
@@ -8,6 +8,11 @@ kind: SparkApplication
metadata:
name: tpcds-data-generation-3t
namespace: spark-team-a
labels:
app: "tpcds-data-generation"
applicationId: "tpcds-data-generation-3t"
# Assign the job to a Yunikorn Queue via label.
queue: root.prod
spec:
type: Scala
mode: cluster
@@ -64,50 +69,30 @@ spec:

restartPolicy:
type: Never
volumes:
- name: spark-local-dir-1
hostPath:
path: /local1
driver:
volumeMounts:
- name: spark-local-dir-1
mountPath: /ossdata1
readOnly: false
initContainers:
- name: volume-permission
image: public.ecr.aws/y4g4v0z7/busybox
command: ['sh', '-c', 'mkdir /ossdata1; chown -R 1000:1000 /ossdata1']
volumeMounts:
- name: spark-local-dir-1
mountPath: /ossdata1
cores: 10
coreLimit: "10.1"
memory: "10g"
serviceAccount: spark-team-a
# the c5d instances that Karpenter will launch will have the NVMe storage preformatted and available to the pod
# we do not need to leverage a hostPath mount or volume to leverage that storage.
# ephemeral-storage requests and limits can be used to manage the storage utilization
nodeSelector:
provisioner: spark-compute-optimized
tolerations:
- key: "spark-compute-optimized"
operator: "Exists"
effect: "NoSchedule"
executor:
volumeMounts:
- name: spark-local-dir-1
mountPath: /ossdata1
readOnly: false
initContainers:
- name: volume-permission
image: public.ecr.aws/y4g4v0z7/busybox
command: ['sh', '-c', 'mkdir /ossdata1; chown -R 1000:1000 /ossdata1']
volumeMounts:
- name: spark-local-dir-1
mountPath: /ossdata1
cores: 11
coreLimit: "11.1"
memory: "15g"
# 3 executors per node, 9 nodes
instances: 26
serviceAccount: spark-team-a
# the c5d instances that Karpenter will launch will have the NVMe storage preformatted and available to the pod
# we do not need to leverage a hostPath mount or volume to leverage that storage.
# the data generation can utilize a large amount of storage
nodeSelector:
provisioner: spark-compute-optimized
tolerations:
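Both TPC-DS examples are now tagged with the `queue: root.prod` label that Yunikorn uses for queue placement, and the smaller examples below use `root.test`. For context, a hypothetical Yunikorn queue configuration backing those names is sketched here; the blueprint's real scheduler config may define additional limits:

```yaml
# Hypothetical Yunikorn queues.yaml backing the root.prod / root.test labels
# used in these examples; guaranteed/max resources omitted for brevity.
partitions:
  - name: default
    queues:
      - name: root
        submitacl: "*"
        queues:
          - name: prod
          - name: test
```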
@@ -79,51 +79,24 @@ spec:
onFailureRetryInterval: 10
onSubmissionFailureRetries: 5
onSubmissionFailureRetryInterval: 20
volumes: # using NVMe instance storage mounted on /local1
- name: spark-local-dir-1
hostPath:
path: /local1
type: Directory

driver:
volumeMounts: # Points to InstanceStore 150GB NVMe SSD for shuffle spill over from memory
- name: spark-local-dir-1
mountPath: /data1
readOnly: false
initContainers:
- name: volume-permissions
image: public.ecr.aws/y4g4v0z7/busybox
command: [ 'sh', '-c', 'chown -R 185 /local1' ]
volumeMounts:
- mountPath: "/local1"
name: "spark-local-dir-1"
cores: 1
coreLimit: "1200m"
memory: "4g"
memoryOverhead: "4g"
serviceAccount: spark-team-a
labels:
version: 3.2.1
# the r5d instances that Karpenter will launch will have the NVMe storage preformatted and available to the pod
# we do not need to leverage a hostPath mount or volume to leverage that storage.
# ephemeral-storage requests and limits can be used to manage the storage utilization
nodeSelector:
NodeGroupType: "spark-on-demand-ca"
tolerations:
- key: "spark-on-demand-ca"
operator: "Exists"
effect: "NoSchedule"
executor:
podSecurityContext:
fsGroup: 185
volumeMounts:
- name: spark-local-dir-1
mountPath: /data1
readOnly: false
initContainers:
- name: volume-permissions
image: public.ecr.aws/y4g4v0z7/busybox
command: [ 'sh', '-c', 'chown -R 185 /local1' ]
volumeMounts:
- mountPath: "/local1"
name: "spark-local-dir-1"
cores: 1
coreLimit: "1200m"
instances: 4
@@ -132,6 +105,9 @@ spec:
serviceAccount: spark-team-a
labels:
version: 3.2.1
# the r5d instances that Karpenter will launch will have the NVMe storage preformatted and available to the pod
# we do not need to leverage a hostPath mount or volume to leverage that storage.
# ephemeral-storage requests and limits can be used to manage the storage utilization
nodeSelector:
NodeGroupType: "spark-spot-ca"
tolerations:
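With the hostPath volume and its init container removed, this example also relies on the node's preformatted NVMe storage. As a purely illustrative alternative (not something this commit adds), Spark scratch space can instead be declared as a size-capped emptyDir, which lands on the instance store when the node's container filesystem lives there; Spark treats volumes named `spark-local-dir-*` as local directories:

```yaml
# Illustrative alternative, not part of this commit: emptyDir-backed Spark
# scratch space with a size cap (the sizeLimit value is a placeholder).
volumes:
  - name: spark-local-dir-1
    emptyDir:
      sizeLimit: 100Gi
```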
@@ -12,6 +12,7 @@ metadata:
labels:
app: "taxi-trip"
applicationId: "taxi-trip-yunikorn"
# Assign the job to a Yunikorn Queue via label.
queue: root.test
spec:
# To create Ingress object for Spark driver.
@@ -79,24 +80,7 @@ spec:
onFailureRetryInterval: 10
onSubmissionFailureRetries: 5
onSubmissionFailureRetryInterval: 20
volumes: # using NVMe instance storage mounted on /local1
- name: spark-local-dir-1
hostPath:
path: /local1
type: Directory

driver:
volumeMounts: # Points to InstanceStore 150GB NVMe SSD for shuffle spill over from memory
- name: spark-local-dir-1
mountPath: /data1
readOnly: false
initContainers:
- name: volume-permissions
image: public.ecr.aws/y4g4v0z7/busybox
command: [ 'sh', '-c', 'chown -R 185 /local1' ]
volumeMounts:
- mountPath: "/local1"
name: "spark-local-dir-1"
cores: 1
coreLimit: "1200m"
memory: "4g"
@@ -134,26 +118,16 @@ spec:
},
"tolerations": [{"key": "spark-spot-ca", "operator": "Exists", "effect": "NoSchedule"}]
}]
# the r5d instances that Karpenter will launch will have the NVMe storage preformatted and available to the pod
# we do not need to leverage a hostPath mount or volume to leverage that storage.
# ephemeral-storage requests and limits can be used to manage the storage utilization
nodeSelector:
NodeGroupType: "spark-on-demand-ca"
tolerations:
- key: "spark-on-demand-ca"
operator: "Exists"
effect: "NoSchedule"
executor:
podSecurityContext:
fsGroup: 185
volumeMounts:
- name: spark-local-dir-1
mountPath: /data1
readOnly: false
initContainers:
- name: volume-permissions
image: public.ecr.aws/y4g4v0z7/busybox
command: [ 'sh', '-c', 'chown -R 185 /local1' ]
volumeMounts:
- mountPath: "/local1"
name: "spark-local-dir-1"
cores: 1
coreLimit: "1200m"
instances: 4
@@ -164,6 +138,9 @@ spec:
version: 3.2.1
annotations:
yunikorn.apache.org/task-group-name: "spark-executor"
# the r5d instances that Karpenter will launch will have the NVMe storage preformatted and available to the pod
# we do not need to leverage a hostPath mount or volume to leverage that storage.
# ephemeral-storage requests and limits can be used to manage the storage utilization
nodeSelector:
NodeGroupType: "spark-spot-ca"
tolerations:
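The executor pods in this example carry the `yunikorn.apache.org/task-group-name` annotation shown above. For orientation, a hedged sketch of the companion `task-groups` annotation that defines those groups for gang scheduling (typically placed on the driver pod, which originates the gang) follows; member counts and resources are placeholders, not values from this commit:

```yaml
# Hedged sketch of Yunikorn gang-scheduling annotations; minMember and
# minResource figures are placeholders.
annotations:
  yunikorn.apache.org/task-group-name: "spark-driver"
  yunikorn.apache.org/task-groups: |-
    [{
      "name": "spark-driver",
      "minMember": 1,
      "minResource": {"cpu": "1", "memory": "4Gi"}
    }, {
      "name": "spark-executor",
      "minMember": 4,
      "minResource": {"cpu": "1", "memory": "4Gi"}
    }]
```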
@@ -7,6 +7,11 @@ kind: SparkApplication
metadata:
name: pyspark-pi
namespace: spark-team-a
labels:
app: "pyspark-pi"
applicationId: "pyspark-pi-ca"
# Assign the job to a Yunikorn Queue via label.
queue: root.test
spec:
type: Python
pythonVersion: "3"
@@ -28,6 +33,9 @@ spec:
labels:
version: 3.1.1
serviceAccount: spark-team-a
# the r5d instances that Karpenter will launch will have the NVMe storage preformatted and available to the pod
# we do not need to leverage a hostPath mount or volume to leverage that storage.
# ephemeral-storage requests and limits can be used to manage the storage utilization
nodeSelector:
NodeGroupType: "spark-on-demand-ca"
tolerations:
@@ -41,6 +49,9 @@ spec:
serviceAccount: spark-team-a
labels:
version: 3.1.1
# the r5d instances that Karpenter will launch will have the NVMe storage preformatted and available to the pod
# we do not need to leverage a hostPath mount or volume to leverage that storage.
# ephemeral-storage requests and limits can be used to manage the storage utilization
nodeSelector:
NodeGroupType: "spark-spot-ca"
tolerations: