From d4ca2d8daaf8aac4360f26ef608506601655ff60 Mon Sep 17 00:00:00 2001 From: Ed Date: Wed, 20 Nov 2024 14:42:03 -0500 Subject: [PATCH 1/9] fix(etl): added legacy support and added options --- helm/etl/Chart.yaml | 4 +- helm/etl/README.md | 13 ++-- helm/etl/templates/etl-job.yaml | 36 +++++----- helm/etl/templates/etl-secret.yaml | 104 +++++++++++++++++++++++++++++ helm/etl/values.yaml | 18 ++--- 5 files changed, 137 insertions(+), 38 deletions(-) create mode 100644 helm/etl/templates/etl-secret.yaml diff --git a/helm/etl/Chart.yaml b/helm/etl/Chart.yaml index f2a606bd..2e07c889 100644 --- a/helm/etl/Chart.yaml +++ b/helm/etl/Chart.yaml @@ -15,11 +15,11 @@ type: application # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. # Versions are expected to follow Semantic Versioning (https://semver.org/) -version: 0.1.5 +version: 0.1.6 # This is the version number of the application being deployed. This version number should be # incremented each time you make changes to the application. Versions are not expected to # follow Semantic Versioning. They should reflect the version the application is using. appVersion: "master" -dependencies: [] +dependencies: [] \ No newline at end of file diff --git a/helm/etl/README.md b/helm/etl/README.md index 58c187f0..f6d5e8a8 100644 --- a/helm/etl/README.md +++ b/helm/etl/README.md @@ -1,6 +1,6 @@ # etl -![Version: 0.1.5](https://img.shields.io/badge/Version-0.1.5-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: master](https://img.shields.io/badge/AppVersion-master-informational?style=flat-square) +![Version: 0.1.6](https://img.shields.io/badge/Version-0.1.6-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: master](https://img.shields.io/badge/AppVersion-master-informational?style=flat-square) A Helm chart for gen3 etl @@ -93,17 +93,14 @@ A Helm chart for gen3 etl | image.tube.repository | string | `"quay.io/cdis/tube"` | The Docker image repository for the fence service | | image.tube.tag | string | `"master"` | Overrides the image tag whose default is the chart appVersion. | | imagePullSecrets | list | `[]` | Docker image pull secrets. | +| legacySupport | bool | `false` | | | podAnnotations | map | `{}` | Annotations to add to the pod | -| resources | map | `{"spark":{"limits":{"cpu":1,"memory":"2Gi"},"requests":{"cpu":0.3,"memory":"128Mi"}},"tube":{"limits":{"cpu":1,"memory":"2Gi"},"requests":{"cpu":0.3,"memory":"128Mi"}}}` | Resource requests and limits for the containers in the pod | -| resources.spark.limits | map | `{"cpu":1,"memory":"2Gi"}` | The maximum amount of resources that the container is allowed to use | -| resources.spark.limits.cpu | string | `1` | The maximum amount of CPU the container can use | -| resources.spark.limits.memory | string | `"2Gi"` | The maximum amount of memory the container can use | +| resources | map | `{"spark":{"requests":{"cpu":0.3,"memory":"128Mi"}},"tube":{"requests":{"cpu":0.3,"memory":"128Mi"}}}` | Resource requests and limits for the containers in the pod | | resources.spark.requests | map | `{"cpu":0.3,"memory":"128Mi"}` | The amount of resources that the container requests | | resources.spark.requests.cpu | string | `0.3` | The amount of CPU requested | | resources.spark.requests.memory | string | `"128Mi"` | The amount of memory requested | -| resources.tube.limits | map | `{"cpu":1,"memory":"2Gi"}` | The maximum amount of resources that the container is allowed to use | -| resources.tube.limits.cpu | string | `1` | The maximum amount of CPU the container can use | -| resources.tube.limits.memory | string | `"2Gi"` | The maximum amount of memory the container can use | | resources.tube.requests | map | `{"cpu":0.3,"memory":"128Mi"}` | The amount of resources that the container requests | | resources.tube.requests.cpu | string | `0.3` | The amount of CPU requested | | resources.tube.requests.memory | string | `"128Mi"` | The amount of memory requested | +| schedule | string | `"*/30 * * * *"` | | +| suspendCronjob | bool | `true` | | diff --git a/helm/etl/templates/etl-job.yaml b/helm/etl/templates/etl-job.yaml index 0b306d07..8cb1d9e2 100644 --- a/helm/etl/templates/etl-job.yaml +++ b/helm/etl/templates/etl-job.yaml @@ -3,7 +3,8 @@ kind: CronJob metadata: name: etl-cronjob spec: - schedule: "0 0 1 1 */5" + suspend: {{ .Values.suspendCronjob }} + schedule: {{ .Values.schedule }} jobTemplate: spec: backoffLimit: 0 @@ -35,6 +36,12 @@ spec: values: - ONDEMAND volumes: + {{- if .Values.legacySupport }} + - name: config-volume + secret: + defaultMode: 420 + secretName: etl-secret + {{- end }} - name: signal-volume emptyDir: {} - name: creds-volume @@ -80,9 +87,6 @@ spec: requests: cpu: {{ .Values.resources.spark.requests.cpu }} memory: {{ .Values.resources.spark.requests.memory }} - # limits: - # cpu: {{ .Values.resources.spark.limits.cpu }} - # memory: {{ .Values.resources.spark.limits.memory }} command: ["/bin/bash" ] args: - "-c" @@ -105,7 +109,6 @@ spec: while true; do sleep 5; done - name: tube imagePullPolicy: IfNotPresent - # image: quay.io/cdis/tube:feat_helm_test image: {{ .Values.image.tube.repository }}:{{ .Values.image.tube.tag }} ports: - containerPort: 80 @@ -166,11 +169,11 @@ spec: key: slack_webhook optional: true volumeMounts: - # - name: "creds-volume" - # readOnly: true - # mountPath: "/gen3/tube/creds.json" - # subPath: creds.json - # Volume to signal when to kill spark + {{- if .Values.legacySupport }} + - mountPath: /tube/tube/settings.py + name: config-volume + subPath: settings.py + {{- end }} - mountPath: /usr/share/pod name: signal-volume - name: "etl-mapping" @@ -185,9 +188,6 @@ spec: requests: cpu: {{ .Values.resources.tube.requests.cpu }} memory: {{ .Values.resources.tube.requests.memory }} - # limits: - # cpu: {{ .Values.resources.tube.limits.cpu }} - # memory: {{ .Values.resources.tube.limits.memory }} command: ["/bin/bash"] args: - "-c" @@ -199,9 +199,13 @@ spec: # Port 9000 is open, continue with the rest of the script echo "Port 9000 is now open. Continuing with the script..." - - echo "python run_config.py && python run_etl.py" - python run_config.py && python run_etl.py + if [[ $ETL_FORCED != "false" ]]; then + echo "python run_config.py && python run_etl.py --force" + python run_config.py && python run_etl.py --force + else + echo "python run_config.py && python run_etl.py" + python run_config.py && python run_etl.py + fi exitcode=$? # Kill sidecar and all processes diff --git a/helm/etl/templates/etl-secret.yaml b/helm/etl/templates/etl-secret.yaml new file mode 100644 index 00000000..21a2b117 --- /dev/null +++ b/helm/etl/templates/etl-secret.yaml @@ -0,0 +1,104 @@ +{{ if .Values.legacySupport }} +kind: Secret +apiVersion: v1 +metadata: + name: etl-secret +stringData: + settings.py: |- + import os + import tube.enums as enums + + from cdislogging import get_logger + from tube.config_helper import find_paths, load_json + from .utils.general import get_resource_paths_from_yaml + + + logger = get_logger("__name__", log_level="warn") + + LIST_TABLES_FILES = "tables.txt" + + # + # Load db credentials from a creds.json file. + # See config_helper.py for paths searched for creds.json + # ex: export XDG_DATA_HOME="$HOME/.local/share" + # and setup $XDG_DATA_HOME/.local/share/gen3/tube/creds.json + # + conf_data = load_json("creds.json", "tube") + DB_HOST = os.getenv("DB_HOST") or conf_data.get("db_host", "localhost") + DB_PORT = os.getenv("DB_PORT") or conf_data.get("db_port", "5432") + DB_DATABASE = os.getenv("DB_DATABASE") or conf_data.get("db_database", "sheepdog") + DB_USERNAME = os.getenv("DB_USERNAME") or conf_data.get("db_username", "peregrine") + DB_PASSWORD = os.getenv("DB_PASSWORD") or conf_data.get("db_password", "unknown") + + DB_USE_SSL = os.getenv("DB_USE_SSL") or conf_data.get( + "db_use_ssl", False + ) # optional property to db_use_ssl + JDBC = ( + "jdbc:postgresql://{}:{}/{}".format(DB_HOST, DB_PORT, DB_DATABASE) + if DB_USE_SSL is False + else "jdbc:postgresql://{}:{}/{}?sslmode=require".format( + DB_HOST, DB_PORT, DB_DATABASE + ) + ) + PYDBC = "postgresql://{}:{}@{}:{}/{}".format( + DB_USERNAME, DB_PASSWORD, DB_HOST, DB_PORT, DB_DATABASE + ) + DICTIONARY_URL = os.getenv( + "DICTIONARY_URL", + "https://s3.amazonaws.com/dictionary-artifacts/datadictionary/develop/schema.json", + ) + ES_URL = os.getenv("ES_URL", "esproxy-service") + + HDFS_DIR = "/result" + # Three modes: Test, Dev, Prod + RUNNING_MODE = os.getenv("RUNNING_MODE", enums.RUNNING_MODE_DEV) # 'Prod' or 'Dev' + + PARALLEL_JOBS = 1 + LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO") + + ES = { + "es.nodes": ES_URL, + "es.port": "9200", + "es.input.json": "yes", + "es.nodes.client.only": "false", + "es.nodes.discovery": "false", + "es.nodes.data.only": "false", + "es.nodes.wan.only": "true", + } + + HADOOP_HOME = os.getenv("HADOOP_HOME", "/usr/local/Cellar/hadoop/3.1.0/libexec/") + JAVA_HOME = os.getenv( + "JAVA_HOME", "/Library/Java/JavaVirtualMachines/jdk1.8.0_131.jdk/Contents/Home" + ) + HADOOP_URL = os.getenv("HADOOP_URL", "http://spark-service:9000") + ES_HADOOP_VERSION = os.getenv("ES_HADOOP_VERSION", "") + ES_HADOOP_HOME_BIN = "{}/elasticsearch-hadoop-{}".format( + os.getenv("ES_HADOOP_HOME", ""), os.getenv("ES_HADOOP_VERSION", "") + ) + HADOOP_HOST = os.getenv("HADOOP_HOST", "spark-service") + # Searches same folders as load_json above + + try: + MAPPING_FILE = find_paths("etlMapping.yaml", "tube")[0] + except: + MAPPING_FILE = None + + try: + USERYAML_FILE = find_paths("user.yaml", "tube")[0] + except IndexError: + USERYAML_FILE = None + PROJECT_TO_RESOURCE_PATH = get_resource_paths_from_yaml(USERYAML_FILE) + + SPARK_MASTER = os.getenv("SPARK_MASTER", "local[1]") # 'spark-service' + SPARK_EXECUTOR_MEMORY = os.getenv("SPARK_EXECUTOR_MEMORY", "2g") + SPARK_DRIVER_MEMORY = os.getenv("SPARK_DRIVER_MEMORY", "512m") + APP_NAME = "Gen3 ETL" + + os.environ[ + "PYSPARK_SUBMIT_ARGS" + ] = "--jars {}/dist/elasticsearch-spark-20_2.11-{}.jar pyspark-shell".format( + ES_HADOOP_HOME_BIN, ES_HADOOP_VERSION + ) + os.environ["HADOOP_CLIENT_OPTS"] = os.getenv("HADOOP_CLIENT_OPTS", "") + +{{- end }} \ No newline at end of file diff --git a/helm/etl/values.yaml b/helm/etl/values.yaml index 718310b5..916a544b 100644 --- a/helm/etl/values.yaml +++ b/helm/etl/values.yaml @@ -33,12 +33,6 @@ resources: cpu: 0.3 # -- (string) The amount of memory requested memory: 128Mi - # -- (map) The maximum amount of resources that the container is allowed to use - limits: - # -- (string) The maximum amount of CPU the container can use - cpu: 1.0 - # -- (string) The maximum amount of memory the container can use - memory: 2Gi spark: # -- (map) The amount of resources that the container requests requests: @@ -46,12 +40,6 @@ resources: cpu: 0.3 # -- (string) The amount of memory requested memory: 128Mi - # -- (map) The maximum amount of resources that the container is allowed to use - limits: - # -- (string) The maximum amount of CPU the container can use - cpu: 1.0 - # -- (string) The maximum amount of memory the container can use - memory: 2Gi esEndpoint: gen3-elasticsearch-master @@ -154,3 +142,9 @@ esGarbageCollect: custom_image: # -- (string) Slack webhook endpoint to use for cronjob. slack_webhook: None + +schedule: "*/30 * * * *" + +suspendCronjob: true + +legacySupport: false \ No newline at end of file From 766891356d93660b4ec6eb7f4f551143b2a270a5 Mon Sep 17 00:00:00 2001 From: Ed Date: Wed, 20 Nov 2024 14:43:04 -0500 Subject: [PATCH 2/9] fix(etl): added legacy support and added options --- helm/gen3/Chart.yaml | 4 ++-- helm/gen3/README.md | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/helm/gen3/Chart.yaml b/helm/gen3/Chart.yaml index d45a3d16..ea58260a 100644 --- a/helm/gen3/Chart.yaml +++ b/helm/gen3/Chart.yaml @@ -28,7 +28,7 @@ dependencies: version: 0.1.16 repository: file://../common - name: etl - version: 0.1.5 + version: 0.1.6 repository: file://../etl condition: etl.enabled - name: frontend-framework @@ -128,7 +128,7 @@ type: application # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. # Versions are expected to follow Semantic Versioning (https://semver.org/) -version: 0.1.49 +version: 0.1.50 # This is the version number of the application being deployed. This version number should be # incremented each time you make changes to the application. Versions are not expected to diff --git a/helm/gen3/README.md b/helm/gen3/README.md index b30ab5d4..7671aa5c 100644 --- a/helm/gen3/README.md +++ b/helm/gen3/README.md @@ -1,6 +1,6 @@ # gen3 -![Version: 0.1.49](https://img.shields.io/badge/Version-0.1.49-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: master](https://img.shields.io/badge/AppVersion-master-informational?style=flat-square) +![Version: 0.1.50](https://img.shields.io/badge/Version-0.1.50-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: master](https://img.shields.io/badge/AppVersion-master-informational?style=flat-square) Helm chart to deploy Gen3 Data Commons @@ -24,7 +24,7 @@ Helm chart to deploy Gen3 Data Commons | file://../audit | audit | 0.1.16 | | file://../aws-es-proxy | aws-es-proxy | 0.1.13 | | file://../common | common | 0.1.16 | -| file://../etl | etl | 0.1.5 | +| file://../etl | etl | 0.1.6 | | file://../fence | fence | 0.1.26 | | file://../frontend-framework | frontend-framework | 0.1.5 | | file://../gen3-network-policies | gen3-network-policies | 0.1.2 | From b8ca2701a1695bcacf2c1f6ea6b87b7b3e36c773 Mon Sep 17 00:00:00 2001 From: Ed Date: Wed, 20 Nov 2024 14:48:22 -0500 Subject: [PATCH 3/9] fix(etl): added legacy support and added options --- helm/etl/README.md | 1 + helm/etl/templates/etl-job.yaml | 2 +- helm/etl/values.yaml | 4 +++- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/helm/etl/README.md b/helm/etl/README.md index f6d5e8a8..4c1c5995 100644 --- a/helm/etl/README.md +++ b/helm/etl/README.md @@ -14,6 +14,7 @@ A Helm chart for gen3 etl | esGarbageCollect.enabled | bool | `false` | Whether to create es garbage collect cronjob. | | esGarbageCollect.schedule | string | `"0 0 * * *"` | The cron schedule expression to use in the es garbage collect cronjob. Runs once a day by default. | | esGarbageCollect.slack_webhook | string | `"None"` | Slack webhook endpoint to use for cronjob. | +| etlForced | string | `"TRUE"` | | | etlMapping.mappings[0].aggregated_props[0].fn | string | `"count"` | | | etlMapping.mappings[0].aggregated_props[0].name | string | `"_samples_count"` | | | etlMapping.mappings[0].aggregated_props[0].path | string | `"samples"` | | diff --git a/helm/etl/templates/etl-job.yaml b/helm/etl/templates/etl-job.yaml index 8cb1d9e2..4167f5df 100644 --- a/helm/etl/templates/etl-job.yaml +++ b/helm/etl/templates/etl-job.yaml @@ -156,7 +156,7 @@ spec: - name: SPARK_DRIVER_MEMORY value: 6g - name: ETL_FORCED - value: "TRUE" + value: {{ .Values.etlForced }} - name: gen3Env valueFrom: configMapKeyRef: diff --git a/helm/etl/values.yaml b/helm/etl/values.yaml index 916a544b..b24f9b3f 100644 --- a/helm/etl/values.yaml +++ b/helm/etl/values.yaml @@ -147,4 +147,6 @@ schedule: "*/30 * * * *" suspendCronjob: true -legacySupport: false \ No newline at end of file +legacySupport: false + +etlForced: "TRUE" \ No newline at end of file From 31f0a07ddea8ac5ad971ef356660f4550ab8aec1 Mon Sep 17 00:00:00 2001 From: emalinowski Date: Thu, 21 Nov 2024 09:00:32 -0500 Subject: [PATCH 4/9] Update Chart.yaml --- helm/etl/Chart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/etl/Chart.yaml b/helm/etl/Chart.yaml index 2e07c889..d55e0945 100644 --- a/helm/etl/Chart.yaml +++ b/helm/etl/Chart.yaml @@ -22,4 +22,4 @@ version: 0.1.6 # follow Semantic Versioning. They should reflect the version the application is using. appVersion: "master" -dependencies: [] \ No newline at end of file +dependencies: [] From aa1e9b32d59739fd193c75400a0eaef9c9af00d4 Mon Sep 17 00:00:00 2001 From: emalinowski Date: Mon, 25 Nov 2024 09:02:23 -0600 Subject: [PATCH 5/9] Update values.yaml --- helm/etl/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/etl/values.yaml b/helm/etl/values.yaml index b24f9b3f..ebf11925 100644 --- a/helm/etl/values.yaml +++ b/helm/etl/values.yaml @@ -149,4 +149,4 @@ suspendCronjob: true legacySupport: false -etlForced: "TRUE" \ No newline at end of file +etlForced: "TRUE" From 835a5a38cb749c77f98e17abf9468ad643505230 Mon Sep 17 00:00:00 2001 From: emalinowski Date: Mon, 25 Nov 2024 09:11:41 -0600 Subject: [PATCH 6/9] Update values.yaml --- helm/etl/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/etl/values.yaml b/helm/etl/values.yaml index ebf11925..5d7e2cdb 100644 --- a/helm/etl/values.yaml +++ b/helm/etl/values.yaml @@ -145,7 +145,7 @@ esGarbageCollect: schedule: "*/30 * * * *" -suspendCronjob: true +suspendCronjob: "true" legacySupport: false From 5129d775a37768cf255f61128f0872807de0177f Mon Sep 17 00:00:00 2001 From: emalinowski Date: Mon, 25 Nov 2024 09:13:48 -0600 Subject: [PATCH 7/9] Update etl-job.yaml --- helm/etl/templates/etl-job.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/helm/etl/templates/etl-job.yaml b/helm/etl/templates/etl-job.yaml index 4167f5df..76e34190 100644 --- a/helm/etl/templates/etl-job.yaml +++ b/helm/etl/templates/etl-job.yaml @@ -4,7 +4,7 @@ metadata: name: etl-cronjob spec: suspend: {{ .Values.suspendCronjob }} - schedule: {{ .Values.schedule }} + schedule: "{{ .Values.schedule }}" jobTemplate: spec: backoffLimit: 0 @@ -212,4 +212,4 @@ spec: echo "Exit code: $exitcode" pkill -u root && exit $exitcode exit "$exitcode" & - restartPolicy: Never \ No newline at end of file + restartPolicy: Never From ee417abf0eb273a185a718007d2760f09f1574be Mon Sep 17 00:00:00 2001 From: emalinowski Date: Mon, 25 Nov 2024 09:14:01 -0600 Subject: [PATCH 8/9] Update values.yaml --- helm/etl/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/etl/values.yaml b/helm/etl/values.yaml index 5d7e2cdb..ebf11925 100644 --- a/helm/etl/values.yaml +++ b/helm/etl/values.yaml @@ -145,7 +145,7 @@ esGarbageCollect: schedule: "*/30 * * * *" -suspendCronjob: "true" +suspendCronjob: true legacySupport: false From c24545bf907278248423c4db8b18cda0fac14235 Mon Sep 17 00:00:00 2001 From: emalinowski Date: Mon, 25 Nov 2024 09:30:32 -0600 Subject: [PATCH 9/9] Update etl-job.yaml --- helm/etl/templates/etl-job.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/etl/templates/etl-job.yaml b/helm/etl/templates/etl-job.yaml index 76e34190..9272940c 100644 --- a/helm/etl/templates/etl-job.yaml +++ b/helm/etl/templates/etl-job.yaml @@ -4,7 +4,7 @@ metadata: name: etl-cronjob spec: suspend: {{ .Values.suspendCronjob }} - schedule: "{{ .Values.schedule }}" + schedule: {{ .Values.schedule | quote }} jobTemplate: spec: backoffLimit: 0