From 055181f82f9865a5ec0567f802a9aac9c12378cd Mon Sep 17 00:00:00 2001 From: rashmichandrashekar Date: Fri, 13 Sep 2024 15:34:40 -0700 Subject: [PATCH] Changes to enable HPA for ama-metrics deployment (#968) [comment]: # (Note that your PR title should follow the conventional commit format: https://conventionalcommits.org/en/v1.0.0/#summary) # PR Description Changes to enable HPA for ama-metrics deployment. Update ME version to include changes for memory deallocation Updating ME config to tune memory deallocation Telemetry to track HPA enablement [comment]: # (The below checklist is for PRs adding new features. If a box is not checked, add a reason why it's not needed.) # New Feature Checklist - [X] List telemetry added about the feature - Added CollectorHpaEnabled in the custom dimensions - [X] Link to the one-pager about the feature - https://msazure.visualstudio.com/InfrastructureInsights/_wiki/wikis/InfrastructureInsights.wiki/686240/HPA-PRD --- .trivyignore | 3 ++ internal/docs/Operator-CRD.md | 9 ++--- internal/docs/Operator-Sharding.md | 2 +- otelcollector/build/windows/scripts/setup.ps1 | 6 +-- .../templates/_ama-metrics-helpers.tpl | 39 +++++++++++++++++++ .../templates/ama-metrics-collector-hpa.yaml | 26 +++++++++++++ .../templates/ama-metrics-deployment.yaml | 23 +++++++++-- .../values-template.yaml | 1 + otelcollector/metricextension/me.config | 4 +- otelcollector/metricextension/me_ds.config | 4 +- .../metricextension/me_ds_internal.config | 4 +- .../metricextension/me_ds_internal_win.config | 4 +- .../metricextension/me_ds_win.config | 4 +- .../metricextension/me_internal.config | 4 +- otelcollector/scripts/setup.sh | 2 +- ...egraf-prometheus-collector-ta-enabled.conf | 1 + .../telegraf-prometheus-collector.conf | 1 + 17 files changed, 118 insertions(+), 19 deletions(-) create mode 100644 otelcollector/deploy/addon-chart/azure-monitor-metrics-addon/templates/_ama-metrics-helpers.tpl create mode 100644 
otelcollector/deploy/addon-chart/azure-monitor-metrics-addon/templates/ama-metrics-collector-hpa.yaml diff --git a/.trivyignore b/.trivyignore index 63fec304c..ae4cf5e53 100644 --- a/.trivyignore +++ b/.trivyignore @@ -62,6 +62,9 @@ CVE-2023-39318 CVE-2023-39319 CVE-2023-39326 CVE-2023-45284 +CVE-2024-34156 +CVE-2024-34155 +CVE-2024-34158 # MEDIUM - telegraf GHSA-jq35-85cj-fj4p GHSA-7ww5-4wqc-m92c diff --git a/internal/docs/Operator-CRD.md b/internal/docs/Operator-CRD.md index 13be796c2..6e0b2d699 100644 --- a/internal/docs/Operator-CRD.md +++ b/internal/docs/Operator-CRD.md @@ -1,4 +1,4 @@ -## Managed Prometheus support for CRD (In private preview) +## Managed Prometheus support for CRD ### Use Prometheus Pod and Service Monitor Custom Resources The Azure Monitor metrics add-on supports scraping Prometheus metrics using Prometheus - Pod Monitors and Service Monitors, similar to the OSS Prometheus operator. Enabling the add-on will deploy the Pod and Service Monitor custom resource definitions to allow you to create your own custom resources. @@ -7,12 +7,11 @@ Creating these custom resources allows for easy configuration of scrape jobs in This document illustrates the steps need to setup custom resources (pod monitors and service monitors) with Azure Managed Prometheus to setup scrape jobs for the workloads running in your AKS clusters. ### Pre-requisites -1. You have Azure Managed Prometheus Operator model configured in the AKS cluster. Currently this feature is in private preview – please send us an email to ciprometheus@microsoft.com to enable the feature for your cluster or subscription. -2. Azure Monitor Workspace is configured and receiving Azure Managed Prometheus metrics. -3. The workload that you want to scrape metrics from is deployed and running on the AKS cluster. +1. Azure Monitor Workspace is configured and receiving Azure Managed Prometheus metrics. +2. The workload that you want to scrape metrics from is deployed and running on the AKS cluster. 
### Enable Azure Managed Prometheus with Operator/CRD support -Once your cluster/subscription is enabled with preview, you can enable Managed Prometheus for the AKS cluster. This will deploy the Azure Monitor metrics add-on and will automatically install the custom resource definition (CRD) for pod and service monitors. The add-on will use the same custom resource definition (CRD) for pod and service monitors as open-source Prometheus, except for a change in the group name and API version. If you have existing Prometheus CRDs and custom resources on your cluster, these will not conflict with the CRDs created by the add-on. +Once your cluster has Managed Prometheus enabled, the Azure Monitor metrics add-on will automatically install the custom resource definition (CRD) for pod and service monitors. The add-on will use the same custom resource definition (CRD) for pod and service monitors as open-source Prometheus, except for a change in the group name and API version. If you have existing Prometheus CRDs and custom resources on your cluster, these will not conflict with the CRDs created by the add-on. At the same time, the CRDs created for the OSS Prometheus will not be picked up by the managed Prometheus addon. This is intentional for the purposes of isolation of scrape jobs. 
### Create a Pod or Service Monitor diff --git a/internal/docs/Operator-Sharding.md b/internal/docs/Operator-Sharding.md index 1f449f6ba..0cb08b3f0 100644 --- a/internal/docs/Operator-Sharding.md +++ b/internal/docs/Operator-Sharding.md @@ -1,4 +1,4 @@ -## Managed Prometheus support for Sharding (In private preview) +## Managed Prometheus support for Sharding ### Overview diff --git a/otelcollector/build/windows/scripts/setup.ps1 b/otelcollector/build/windows/scripts/setup.ps1 index 4c1d1e676..6d8db0783 100644 --- a/otelcollector/build/windows/scripts/setup.ps1 +++ b/otelcollector/build/windows/scripts/setup.ps1 @@ -15,8 +15,8 @@ New-Item -Type Directory -Path /etc/genevamonitoringagent ############################################################################################ Write-Host ('Installing Metrics Extension'); try { - Invoke-WebRequest -Uri "https://github.com/Azure/prometheus-collector/releases/download/v6.8.9-main-05-02-2024-9facd0f8/MdmMetricsExtension.2.2024.419.1535.nupkg" -OutFile /installation/ME/mdmmetricsextension.2.2024.419.1535.zip - Expand-Archive -Path /installation/ME/mdmmetricsextension.2.2024.419.1535.zip -Destination /installation/ME/ + Invoke-WebRequest -Uri "https://github.com/Azure/prometheus-collector/releases/download/metricsext2-2.2024.823.1539/MdmMetricsExtension.2.2024.823.1539.nupkg" -OutFile /installation/ME/mdmmetricsextension.2.2024.823.1539.zip + Expand-Archive -Path /installation/ME/mdmmetricsextension.2.2024.823.1539.zip -Destination /installation/ME/ Move-Item /installation/ME/MetricsExtension /opt/metricextension/ } catch { @@ -83,7 +83,7 @@ If (Test-Path -Path $gemfile ) { ############################################################################################ Write-Host ('Installing GenevaMonitoringAgent'); try { - $genevamonitoringagentUri='https://github.com/Azure/prometheus-collector/releases/download/Promtheus-MA-Windows-4.1.2024/GenevaMonitoringAgent.46.15.4.zip' + $genevamonitoringagentUri = 
'https://github.com/Azure/prometheus-collector/releases/download/Promtheus-MA-Windows-4.1.2024/GenevaMonitoringAgent.46.15.4.zip' Invoke-WebRequest -Uri $genevamonitoringagentUri -OutFile /installation/genevamonitoringagent.zip Expand-Archive -Path /installation/genevamonitoringagent.zip -Destination /installation/genevamonitoringagent Move-Item -Path /installation/genevamonitoringagent -Destination /opt/genevamonitoringagent/ -ErrorAction SilentlyContinue diff --git a/otelcollector/deploy/addon-chart/azure-monitor-metrics-addon/templates/_ama-metrics-helpers.tpl b/otelcollector/deploy/addon-chart/azure-monitor-metrics-addon/templates/_ama-metrics-helpers.tpl new file mode 100644 index 000000000..69409443d --- /dev/null +++ b/otelcollector/deploy/addon-chart/azure-monitor-metrics-addon/templates/_ama-metrics-helpers.tpl @@ -0,0 +1,39 @@ +{{/* HPA merge. */}} +{{/* + 1. Set the default HPA values for minReplicas, maxReplicas, and metrics. + 2. If the current HPA already exists, override the default HPA values to the current values. +*/}} +{{ define "ama-metrics-merge-custom-hpa" }} + +{{/* Set the default HPA values for minReplicas, maxReplicas, and metrics. */}} +{{- $amaMetricsHpaName := "ama-metrics-hpa" }} +{{- $amaMetricsAutoscaleMin := 2 -}} +{{- $amaMetricsAutoscaleMax := 8 -}} + +amaMetricsMinReplicasFromHelper: 2 +amaMetricsMaxReplicasFromHelper: 8 + +{{/* If the current HPA already exists, set the HPA values to the current + HPA spec to preserve those values. 
*/}} + +{{- $amaMetricsCurrentHPA := lookup "autoscaling/v2" "HorizontalPodAutoscaler" "kube-system" $amaMetricsHpaName }} +{{- if and $amaMetricsCurrentHPA $amaMetricsCurrentHPA.spec }} +{{- $amaMetricsMinReplicasFromCurrentSpec := $amaMetricsCurrentHPA.spec.minReplicas -}} +{{- $amaMetricsMaxReplicasFromCurrentSpec := $amaMetricsCurrentHPA.spec.maxReplicas -}} + + {{- if and ($amaMetricsMinReplicasFromCurrentSpec) (gt (int $amaMetricsMinReplicasFromCurrentSpec) 0) }} + {{- if ge (int $amaMetricsMinReplicasFromCurrentSpec) $amaMetricsAutoscaleMin }} +amaMetricsMinReplicasFromHelper: {{ $amaMetricsMinReplicasFromCurrentSpec }} + {{- end }} + {{- end }} + + {{- if and ($amaMetricsMaxReplicasFromCurrentSpec) (gt (int $amaMetricsMaxReplicasFromCurrentSpec) 0) }} + {{- if le (int $amaMetricsMaxReplicasFromCurrentSpec) $amaMetricsAutoscaleMax }} +amaMetricsMaxReplicasFromHelper: {{ $amaMetricsMaxReplicasFromCurrentSpec }} + {{- end }} + {{- end }} + +{{- end }} + +{{- end }} + diff --git a/otelcollector/deploy/addon-chart/azure-monitor-metrics-addon/templates/ama-metrics-collector-hpa.yaml b/otelcollector/deploy/addon-chart/azure-monitor-metrics-addon/templates/ama-metrics-collector-hpa.yaml new file mode 100644 index 000000000..c88f3efad --- /dev/null +++ b/otelcollector/deploy/addon-chart/azure-monitor-metrics-addon/templates/ama-metrics-collector-hpa.yaml @@ -0,0 +1,26 @@ +{{- if .Values.AzureMonitorMetrics.CollectorHPAEnabled}} +{{- $amaMetricsHpa := include "ama-metrics-merge-custom-hpa" . 
| fromYaml }} +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: ama-metrics-hpa + namespace: kube-system + labels: + component: ama-metrics-hpa + kubernetes.azure.com/managedby: aks +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: ama-metrics + minReplicas: {{ $amaMetricsHpa.amaMetricsMinReplicasFromHelper }} + maxReplicas: {{ $amaMetricsHpa.amaMetricsMaxReplicasFromHelper }} + metrics: + - type: ContainerResource + containerResource: + name: memory + container: prometheus-collector + target: + averageValue: 10Gi + type: AverageValue +{{- end }} \ No newline at end of file diff --git a/otelcollector/deploy/addon-chart/azure-monitor-metrics-addon/templates/ama-metrics-deployment.yaml b/otelcollector/deploy/addon-chart/azure-monitor-metrics-addon/templates/ama-metrics-deployment.yaml index 09893df4f..4509a1553 100644 --- a/otelcollector/deploy/addon-chart/azure-monitor-metrics-addon/templates/ama-metrics-deployment.yaml +++ b/otelcollector/deploy/addon-chart/azure-monitor-metrics-addon/templates/ama-metrics-deployment.yaml @@ -7,11 +7,22 @@ metadata: component: ama-metrics kubernetes.azure.com/managedby: aks spec: - {{- if .Values.AzureMonitorMetrics.TargetAllocatorEnabled }} +# New TargetAllocator Enabled scenario +{{- if .Values.AzureMonitorMetrics.TargetAllocatorEnabled }} + {{- if .Values.AzureMonitorMetrics.CollectorHPAEnabled }} # If HPA toggle is enabled, allow HPA to modify the deployment spec + {{- $currentSpec := (lookup "apps/v1" "Deployment" "kube-system" "ama-metrics").spec }} + {{- if $currentSpec }} + # in if check + replicas: {{ $currentSpec.replicas }} + {{- else}} # If current spec cannot be found, set to default + replicas: 2 + {{- end }} + {{- else }} # HPA is not enabled, set the replicas via helm adapter values replicas: {{ .Values.AzureMonitorMetrics.DeploymentReplicas }} - {{- else}} - replicas: 1 {{- end }} +{{- else }} # Legacy with no TargetAllocator Enabled + replicas: 1 +{{- end }} 
revisionHistoryLimit: 2 paused: false selector: @@ -71,6 +82,12 @@ spec: {{- else }} value: "false" {{- end }} + - name: AZMON_COLLECTOR_HPA_ENABLED + {{- if eq .Values.AzureMonitorMetrics.CollectorHPAEnabled true }} + value: "true" + {{- else }} + value: "false" + {{- end }} - name: customEnvironment {{- if .Values.AzureMonitorMetrics.isArcACluster }} value: "arcautonomous" diff --git a/otelcollector/deploy/addon-chart/azure-monitor-metrics-addon/values-template.yaml b/otelcollector/deploy/addon-chart/azure-monitor-metrics-addon/values-template.yaml index 8a41d6245..c45e13473 100644 --- a/otelcollector/deploy/addon-chart/azure-monitor-metrics-addon/values-template.yaml +++ b/otelcollector/deploy/addon-chart/azure-monitor-metrics-addon/values-template.yaml @@ -51,6 +51,7 @@ AzureMonitorMetrics: ImageTagTargetAllocator: ${IMAGE_TAG}-targetallocator ImageTagCfgReader: ${IMAGE_TAG}-cfg TargetAllocatorEnabled: true + CollectorHPAEnabled: true DeploymentReplicas: 2 CfgReaderCPULimit: 1 CfgReaderMemoryLimit: 1Gi diff --git a/otelcollector/metricextension/me.config b/otelcollector/metricextension/me.config index 8d8725631..66524b7f9 100644 --- a/otelcollector/metricextension/me.config +++ b/otelcollector/metricextension/me.config @@ -23,5 +23,7 @@ "maxStringInternCacheSizeMb":5000, "interningSwapPeriodInMin":10000, "internalQueueSizeManagementPeriodInSec":10000, - "proxyDefinitionMode":1 + "proxyDefinitionMode":1, + "memoryPoolReleasePeriodInMin":10, + "memoryPoolReleaseTriggerPercentage": 70 } diff --git a/otelcollector/metricextension/me_ds.config b/otelcollector/metricextension/me_ds.config index 8d8725631..66524b7f9 100644 --- a/otelcollector/metricextension/me_ds.config +++ b/otelcollector/metricextension/me_ds.config @@ -23,5 +23,7 @@ "maxStringInternCacheSizeMb":5000, "interningSwapPeriodInMin":10000, "internalQueueSizeManagementPeriodInSec":10000, - "proxyDefinitionMode":1 + "proxyDefinitionMode":1, + "memoryPoolReleasePeriodInMin":10, + 
"memoryPoolReleaseTriggerPercentage": 70 } diff --git a/otelcollector/metricextension/me_ds_internal.config b/otelcollector/metricextension/me_ds_internal.config index a7d155e91..c8a6e4ca5 100644 --- a/otelcollector/metricextension/me_ds_internal.config +++ b/otelcollector/metricextension/me_ds_internal.config @@ -23,5 +23,7 @@ "maxStringInternCacheSizeMb":5000, "interningSwapPeriodInMin":10000, "internalQueueSizeManagementPeriodInSec":10000, - "proxyDefinitionMode":1 + "proxyDefinitionMode":1, + "memoryPoolReleasePeriodInMin":10, + "memoryPoolReleaseTriggerPercentage": 70 } diff --git a/otelcollector/metricextension/me_ds_internal_win.config b/otelcollector/metricextension/me_ds_internal_win.config index a7d155e91..c8a6e4ca5 100644 --- a/otelcollector/metricextension/me_ds_internal_win.config +++ b/otelcollector/metricextension/me_ds_internal_win.config @@ -23,5 +23,7 @@ "maxStringInternCacheSizeMb":5000, "interningSwapPeriodInMin":10000, "internalQueueSizeManagementPeriodInSec":10000, - "proxyDefinitionMode":1 + "proxyDefinitionMode":1, + "memoryPoolReleasePeriodInMin":10, + "memoryPoolReleaseTriggerPercentage": 70 } diff --git a/otelcollector/metricextension/me_ds_win.config b/otelcollector/metricextension/me_ds_win.config index 48c437ef6..8171da495 100644 --- a/otelcollector/metricextension/me_ds_win.config +++ b/otelcollector/metricextension/me_ds_win.config @@ -23,5 +23,7 @@ "maxStringInternCacheSizeMb":5000, "interningSwapPeriodInMin":10000, "internalQueueSizeManagementPeriodInSec":10000, - "proxyDefinitionMode":1 + "proxyDefinitionMode":1, + "memoryPoolReleasePeriodInMin":10, + "memoryPoolReleaseTriggerPercentage": 70 } diff --git a/otelcollector/metricextension/me_internal.config b/otelcollector/metricextension/me_internal.config index a7d155e91..c8a6e4ca5 100644 --- a/otelcollector/metricextension/me_internal.config +++ b/otelcollector/metricextension/me_internal.config @@ -23,5 +23,7 @@ "maxStringInternCacheSizeMb":5000, "interningSwapPeriodInMin":10000, 
"internalQueueSizeManagementPeriodInSec":10000, - "proxyDefinitionMode":1 + "proxyDefinitionMode":1, + "memoryPoolReleasePeriodInMin":10, + "memoryPoolReleaseTriggerPercentage": 70 } diff --git a/otelcollector/scripts/setup.sh b/otelcollector/scripts/setup.sh index 762f50c57..87e913f8e 100644 --- a/otelcollector/scripts/setup.sh +++ b/otelcollector/scripts/setup.sh @@ -64,7 +64,7 @@ cp /etc/cron.daily/logrotate /etc/cron.hourly/ # Install ME echo "Installing Metrics Extension..." -sudo tdnf install -y metricsext2-2.2024.419.1535 +sudo tdnf install -y metricsext2-2.2024.823.1539 sudo tdnf list installed | grep metricsext2 | awk '{print $2}' > metricsextversion.txt # tdnf does not have an autoremove feature. Only necessary packages are copied over to distroless build. Below reduces the image size if using non-distroless diff --git a/otelcollector/telegraf/telegraf-prometheus-collector-ta-enabled.conf b/otelcollector/telegraf/telegraf-prometheus-collector-ta-enabled.conf index 2d1ea8923..11547a903 100644 --- a/otelcollector/telegraf/telegraf-prometheus-collector-ta-enabled.conf +++ b/otelcollector/telegraf/telegraf-prometheus-collector-ta-enabled.conf @@ -36,6 +36,7 @@ operatormodel = "$AZMON_OPERATOR_ENABLED" operatormodelcfgmapsetting = "$AZMON_OPERATOR_ENABLED_CFG_MAP_SETTING" operatormodelchartsetting = "$AZMON_OPERATOR_ENABLED_CHART_SETTING" + collectorHpaEnabled = "$AZMON_COLLECTOR_HPA_ENABLED" # Configuration for telegraf agent [agent] diff --git a/otelcollector/telegraf/telegraf-prometheus-collector.conf b/otelcollector/telegraf/telegraf-prometheus-collector.conf index eadc46267..315ad58be 100644 --- a/otelcollector/telegraf/telegraf-prometheus-collector.conf +++ b/otelcollector/telegraf/telegraf-prometheus-collector.conf @@ -37,6 +37,7 @@ operatormodel = "$AZMON_OPERATOR_ENABLED" operatormodelcfgmapsetting = "$AZMON_OPERATOR_ENABLED_CFG_MAP_SETTING" operatormodelchartsetting = "$AZMON_OPERATOR_ENABLED_CHART_SETTING" + collectorHpaEnabled = 
"$AZMON_COLLECTOR_HPA_ENABLED" # Configuration for telegraf agent [agent]