From f8711332bcc97aeedcaca7ca49b62eae362cbbcb Mon Sep 17 00:00:00 2001
From: Grace Wehner
Date: Thu, 17 Oct 2024 12:18:58 -0700
Subject: [PATCH] ci/cd: pipeline reliability and testkube fixes (#998)

[comment]: # (Note that your PR title should follow the conventional commit format: https://conventionalcommits.org/en/v1.0.0/#summary)
# PR Description
- Skip some SDL tasks for branch builds. Still run for PRs and merges to main
- Add retries to trivy task if failing to pull from the DB. Do not retry if the scan actually ran and failed because of vulnerabilities. The scan keeps `--exit-code 1` so that detected vulnerabilities still fail the step; without it trivy exits 0 even when findings exist.
- Enable backup DB that trivy has added through an env var.
- Check for arc proxy cluster to be ready and add retries.
- Fix testkube configmap yaml to scrape correct node-exporter port
---
 .pipelines/azure-pipeline-build.yml           | 126 +++++++++++++++---
 ...rics-prometheus-config-node-configmap.yaml |   2 +-
 2 files changed, 111 insertions(+), 17 deletions(-)

diff --git a/.pipelines/azure-pipeline-build.yml b/.pipelines/azure-pipeline-build.yml
index 463080cb2..18ba81571 100644
--- a/.pipelines/azure-pipeline-build.yml
+++ b/.pipelines/azure-pipeline-build.yml
@@ -512,6 +512,7 @@ stages:
       name: Azure-Pipelines-Windows-CI-Test-EO
     variables:
       skipComponentGovernanceDetection: true
+    condition: and(succeeded(), or(eq(variables.IS_PR, true), eq(variables.IS_MAIN_BRANCH, true)))
     steps:
     - checkout: self
       submodules: true
@@ -529,6 +530,7 @@ stages:
       name: Azure-Pipelines-CI-Test-EO
     variables:
       skipComponentGovernanceDetection: true
+    condition: and(succeeded(), or(eq(variables.IS_PR, true), eq(variables.IS_MAIN_BRANCH, true)))
     steps:
     - checkout: self
       submodules: true
@@ -693,18 +695,29 @@ stages:
 
     - bash: |
         curl -sfL https://raw.githubusercontent.com/aquasecurity/trivy/main/contrib/install.sh | sh -s -- -b /usr/local/bin
-        trivy image --ignore-unfixed --no-progress --severity HIGH,CRITICAL,MEDIUM --exit-code 1 $(LINUX_FULL_IMAGE_NAME)
-        if [ $? -ne 0 ]; then
-          exit 1
-        fi
-        trivy image --ignore-unfixed --no-progress --severity HIGH,CRITICAL,MEDIUM --exit-code 1 $(KUBE_STATE_METRICS_IMAGE)
-        if [ $? -ne 0 ]; then
-          exit 1
-        fi
-        trivy image --ignore-unfixed --no-progress --severity HIGH,CRITICAL,MEDIUM --exit-code 1 $(NODE_EXPORTER_IMAGE)
-        if [ $? -ne 0 ]; then
-          exit 1
-        fi
+        export TRIVY_DB_REPOSITORY="ghcr.io/aquasecurity/trivy-db,public.ecr.aws/aquasecurity/trivy-db"
+        export TRIVY_JAVA_DB_REPOSITORY="ghcr.io/aquasecurity/trivy-java-db,public.ecr.aws/aquasecurity/trivy-java-db"
+        for image in $(LINUX_FULL_IMAGE_NAME) $(KUBE_STATE_METRICS_IMAGE) $(NODE_EXPORTER_IMAGE); do
+          for i in {1..5}; do
+            trivy image --ignore-unfixed --no-progress --severity HIGH,CRITICAL,MEDIUM --exit-code 1 $image > trivy_output.log 2>&1
+            TRIVY_EXIT_CODE=$?
+            if [ $TRIVY_EXIT_CODE -eq 0 ]; then
+              cat trivy_output.log
+              break
+            fi
+            if grep -q "TOOMANYREQUESTS" trivy_output.log; then
+              echo "Error: Too many requests to the Trivy server. Retrying... ($i/5)"
+              sleep 5
+            else
+              cat trivy_output.log
+              exit 1
+            fi
+          done
+          if [ $TRIVY_EXIT_CODE -ne 0 ]; then
+            echo "Error: Trivy scan failed after 5 retries."
+            exit 1
+          fi
+        done
       workingDirectory: $(Build.SourcesDirectory)
       displayName: "Build: run trivy scan"
 
@@ -869,8 +882,25 @@ stages:
 
     - bash: |
         curl -sfL https://raw.githubusercontent.com/aquasecurity/trivy/main/contrib/install.sh | sh -s -- -b /usr/local/bin
-        trivy image --ignore-unfixed --no-progress --severity HIGH,CRITICAL,MEDIUM --exit-code 1 $(LINUX_CCP_FULL_IMAGE_NAME)
-        if [ $? -ne 0 ]; then
+        export TRIVY_DB_REPOSITORY="ghcr.io/aquasecurity/trivy-db,public.ecr.aws/aquasecurity/trivy-db"
+        export TRIVY_JAVA_DB_REPOSITORY="ghcr.io/aquasecurity/trivy-java-db,public.ecr.aws/aquasecurity/trivy-java-db"
+        for i in {1..5}; do
+          trivy image --ignore-unfixed --no-progress --severity HIGH,CRITICAL,MEDIUM --exit-code 1 $(LINUX_CCP_FULL_IMAGE_NAME) > trivy_output.log 2>&1
+          TRIVY_EXIT_CODE=$?
+          if [ $TRIVY_EXIT_CODE -eq 0 ]; then
+            cat trivy_output.log
+            break
+          fi
+          if grep -q "TOOMANYREQUESTS" trivy_output.log; then
+            echo "Error: Too many requests to the Trivy server. Retrying... ($i/5)"
+            sleep 5
+          else
+            cat trivy_output.log
+            exit 1
+          fi
+        done
+        if [ $TRIVY_EXIT_CODE -ne 0 ]; then
+          echo "Error: Trivy scan failed after 5 retries."
           exit 1
         fi
       workingDirectory: $(Build.SourcesDirectory)
@@ -979,7 +1009,27 @@ stages:
 
     - bash: |
         curl -sfL https://raw.githubusercontent.com/aquasecurity/trivy/main/contrib/install.sh | sh -s -- -b /usr/local/bin
-        trivy image --ignore-unfixed --no-progress --severity HIGH,CRITICAL,MEDIUM --exit-code 1 $(TARGET_ALLOCATOR_FULL_IMAGE_NAME)
+        export TRIVY_DB_REPOSITORY="ghcr.io/aquasecurity/trivy-db,public.ecr.aws/aquasecurity/trivy-db"
+        export TRIVY_JAVA_DB_REPOSITORY="ghcr.io/aquasecurity/trivy-java-db,public.ecr.aws/aquasecurity/trivy-java-db"
+        for i in {1..5}; do
+          trivy image --ignore-unfixed --no-progress --severity HIGH,CRITICAL,MEDIUM --exit-code 1 $(TARGET_ALLOCATOR_FULL_IMAGE_NAME) > trivy_output.log 2>&1
+          TRIVY_EXIT_CODE=$?
+          if [ $TRIVY_EXIT_CODE -eq 0 ]; then
+            cat trivy_output.log
+            break
+          fi
+          if grep -q "TOOMANYREQUESTS" trivy_output.log; then
+            echo "Error: Too many requests to the Trivy server. Retrying... ($i/5)"
+            sleep 5
+          else
+            cat trivy_output.log
+            exit 1
+          fi
+        done
+        if [ $TRIVY_EXIT_CODE -ne 0 ]; then
+          echo "Error: Trivy scan failed after 5 retries."
+          exit 1
+        fi
       workingDirectory: $(Build.SourcesDirectory)
       displayName: "Build: run trivy scan"
 
@@ -1074,7 +1124,27 @@ stages:
 
     - bash: |
         curl -sfL https://raw.githubusercontent.com/aquasecurity/trivy/main/contrib/install.sh | sh -s -- -b /usr/local/bin
-        trivy image --ignore-unfixed --no-progress --severity HIGH,CRITICAL,MEDIUM --exit-code 1 $(LINUX_CONFIG_READER_FULL_IMAGE_NAME)
+        export TRIVY_DB_REPOSITORY="ghcr.io/aquasecurity/trivy-db,public.ecr.aws/aquasecurity/trivy-db"
+        export TRIVY_JAVA_DB_REPOSITORY="ghcr.io/aquasecurity/trivy-java-db,public.ecr.aws/aquasecurity/trivy-java-db"
+        for i in {1..5}; do
+          trivy image --ignore-unfixed --no-progress --severity HIGH,CRITICAL,MEDIUM --exit-code 1 $(LINUX_CONFIG_READER_FULL_IMAGE_NAME) > trivy_output.log 2>&1
+          TRIVY_EXIT_CODE=$?
+          if [ $TRIVY_EXIT_CODE -eq 0 ]; then
+            cat trivy_output.log
+            break
+          fi
+          if grep -q "TOOMANYREQUESTS" trivy_output.log; then
+            echo "Error: Too many requests to the Trivy server. Retrying... ($i/5)"
+            sleep 5
+          else
+            cat trivy_output.log
+            exit 1
+          fi
+        done
+        if [ $TRIVY_EXIT_CODE -ne 0 ]; then
+          echo "Error: Trivy scan failed after 5 retries."
+          exit 1
+        fi
       workingDirectory: $(Build.SourcesDirectory)
       displayName: "Build: run trivy scan"
 
@@ -1525,6 +1595,29 @@ stages:
               inlineScript: |
                 az config set extension.use_dynamic_install=yes_without_prompt
                 az k8s-extension update --name azuremonitor-metrics --resource-group ci-dev-arc-wcus --cluster-name ci-dev-arc-wcus --cluster-type connectedClusters --version $HELM_SEMVER --release-train pipeline
+            retryCountOnTaskFailure: 2
+
+          - task: AzureCLI@2
+            displayName: "Deploy: wait for ci-dev-arc-proxy cluster to be ready"
+            inputs:
+              azureSubscription: 'ContainerInsights_Build_Subscription(9b96ebbd-c57a-42d1-bbe9-b69296e4c7fb)'
+              scriptType: 'bash'
+              scriptLocation: 'inlineScript'
+              inlineScript: |
+                for i in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
+                do
+                  state=$(az k8s-extension show --name azuremonitor-metrics --resource-group ci-dev-arc-proxy --cluster-name ci-dev-arc-proxy --cluster-type connectedClusters | jq -r '.provisioningState')
+                  # We want to wait in case the status is 'Creating' or 'Updating' because of another PR merged shortly before the current one.
+                  if [ "$state" = "Succeeded" ] || [ "$state" = "Failed" ]
+                  then
+                    echo "Cluster is ready to install extension"
+                    exit 0
+                  fi
+                  sleep 30
+                done
+                echo "Cluster is installing a different version of the extension"
+                exit 1
+            retryCountOnTaskFailure: 5
 
           - task: AzureCLI@2
             displayName: "Deploy: ci-dev-arc-proxy cluster"
@@ -1535,6 +1628,7 @@ stages:
               inlineScript: |
                 az config set extension.use_dynamic_install=yes_without_prompt
                 az k8s-extension update --name azuremonitor-metrics --resource-group ci-dev-arc-proxy --cluster-name ci-dev-arc-proxy --cluster-type connectedClusters --version $HELM_SEMVER --release-train pipeline
+            retryCountOnTaskFailure: 2
 
   - deployment: Testkube_ARC
     displayName: "Test: Arc testkube tests"
diff --git a/otelcollector/test/test-cluster-yamls/configmaps/ama-metrics-prometheus-config-node-configmap.yaml b/otelcollector/test/test-cluster-yamls/configmaps/ama-metrics-prometheus-config-node-configmap.yaml
index ff9917814..148aab23d 100644
--- a/otelcollector/test/test-cluster-yamls/configmaps/ama-metrics-prometheus-config-node-configmap.yaml
+++ b/otelcollector/test/test-cluster-yamls/configmaps/ama-metrics-prometheus-config-node-configmap.yaml
@@ -38,7 +38,7 @@ data:
         replacement: $$NODE_IP
         target_label: node_ip_double_dollar_sign
       static_configs:
-      - targets: ['$NODE_IP:9100']
+      - targets: ['$NODE_IP:19100']
 metadata:
   name: ama-metrics-prometheus-config-node
   namespace: kube-system