diff --git a/kubernetes/main/apps/observability/kube-prometheus-stack/app/helmrelease.yaml b/kubernetes/main/apps/observability/kube-prometheus-stack/app/helmrelease.yaml index bda223888..444254cf4 100644 --- a/kubernetes/main/apps/observability/kube-prometheus-stack/app/helmrelease.yaml +++ b/kubernetes/main/apps/observability/kube-prometheus-stack/app/helmrelease.yaml @@ -128,13 +128,6 @@ spec: registry: quay.io repository: prometheus/prometheus tag: v2.51.0-dedupelabels - thanos: - image: quay.io/thanos/thanos:${THANOS_VERSION} - version: "${THANOS_VERSION#v}" - objectStorageConfig: - existingSecret: - name: *secret - key: config retention: 2d retentionSize: 15GB externalLabels: diff --git a/kubernetes/storage/apps/observability/kube-prometheus-stack/app/externalsecret.yaml b/kubernetes/storage/apps/observability/kube-prometheus-stack/app/externalsecret.yaml deleted file mode 100644 index db04143df..000000000 --- a/kubernetes/storage/apps/observability/kube-prometheus-stack/app/externalsecret.yaml +++ /dev/null @@ -1,91 +0,0 @@ ---- -# yaml-language-server: $schema=https://kubernetes-schemas.pages.dev/external-secrets.io/externalsecret_v1beta1.json -apiVersion: external-secrets.io/v1beta1 -kind: ExternalSecret -metadata: - name: alertmanager -spec: - refreshInterval: 5m - secretStoreRef: - kind: ClusterSecretStore - name: onepassword-connect - target: - name: alertmanager-secret - template: - engineVersion: v2 - data: - # Yo dawg I heard you like go templating so I put go templates in your go templates - ## Such a classic - alertmanager.yaml: | - global: - resolve_timeout: 5m - route: - group_by: ["alertname", "job"] - group_interval: 10m - group_wait: 1m - receiver: pushover - repeat_interval: 12h - routes: - - receiver: heartbeat - group_interval: 5m - group_wait: 0s - matchers: - - alertname =~ "Watchdog" - repeat_interval: 5m - - receiver: "null" - matchers: - - alertname =~ "InfoInhibitor" - - receiver: pushover - continue: true - matchers: - - severity = "critical" - inhibit_rules: - - equal: ["alertname", "namespace"] - source_matchers: - - severity = "critical" - target_matchers: - - severity = "warning" - receivers: - - name: heartbeat - webhook_configs: - - send_resolved: true - url: "{{ .ALERTMANAGER_UPTIMEROBOT_HEARTBEAT_URL }}" - - name: "null" - - name: pushover - pushover_configs: - - html: true - message: |- - {{ "{{-" }} range .Alerts {{ "}}" }} - {{ "{{-" }} if ne .Annotations.description "" {{ "}}" }} - {{ "{{" }} .Annotations.description {{ "}}" }} - {{ "{{-" }} else if ne .Annotations.summary "" {{ "}}" }} - {{ "{{" }} .Annotations.summary {{ "}}" }} - {{ "{{-" }} else if ne .Annotations.message "" {{ "}}" }} - {{ "{{" }} .Annotations.message {{ "}}" }} - {{ "{{-" }} else {{ "}}" }} - Alert description not available - {{ "{{-" }} end {{ "}}" }} - {{ "{{-" }} if gt (len .Labels.SortedPairs) 0 {{ "}}" }} - - {{ "{{-" }} range .Labels.SortedPairs {{ "}}" }} - {{ "{{" }} .Name {{ "}}" }}: {{ "{{" }} .Value {{ "}}" }} - {{ "{{-" }} end {{ "}}" }} - - {{ "{{-" }} end {{ "}}" }} - {{ "{{-" }} end {{ "}}" }} - priority: |- - {{ "{{" }} if eq .Status "firing" {{ "}}" }}1{{ "{{" }} else {{ "}}" }}0{{ "{{" }} end {{ "}}" }} - send_resolved: true - sound: gamelan - # ttl: 1d - title: >- - [{{ "{{" }} .Status | toUpper {{ "}}" }}{{ "{{" }} if eq .Status "firing" {{ "}}" }}:{{ "{{" }} .Alerts.Firing | len {{ "}}" }}{{ "{{" }} end {{ "}}" }}] - {{ "{{" }} .CommonLabels.alertname {{ "}}" }} - token: "{{ .ALERTMANAGER_PUSHOVER_TOKEN }}" - url_title: View in Alertmanager - user_key: "{{ .PUSHOVER_USER_KEY }}" - dataFrom: - - extract: - key: pushover - - extract: - key: alertmanager diff --git a/kubernetes/storage/apps/observability/kube-prometheus-stack/app/gatus.yaml b/kubernetes/storage/apps/observability/kube-prometheus-stack/app/gatus.yaml deleted file mode 100644 index 11a50726a..000000000 --- a/kubernetes/storage/apps/observability/kube-prometheus-stack/app/gatus.yaml +++ /dev/null @@ -1,44 +0,0 @@ ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: alertmanager-gatus-ep - namespace: observability - labels: - gatus.io/enabled: "true" -data: - config.yaml: | - endpoints: - - name: alertmanager - group: internal - url: 1.1.1.1 - interval: 1m - dns: - query-name: alert-manager.ok8.sh - query-type: A - conditions: - - "len([BODY]) == 0" - alerts: - - type: pushover ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: prometheus-gatus-ep - namespace: observability - labels: - gatus.io/enabled: "true" -data: - config.yaml: | - endpoints: - - name: prometheus - group: internal - url: 1.1.1.1 - interval: 1m - dns: - query-name: prometheus.ok8.sh - query-type: A - conditions: - - "len([BODY]) == 0" - alerts: - - type: pushover diff --git a/kubernetes/storage/apps/observability/kube-prometheus-stack/app/helmrelease.yaml b/kubernetes/storage/apps/observability/kube-prometheus-stack/app/helmrelease.yaml index 4a002bc60..1f51034f4 100644 --- a/kubernetes/storage/apps/observability/kube-prometheus-stack/app/helmrelease.yaml +++ b/kubernetes/storage/apps/observability/kube-prometheus-stack/app/helmrelease.yaml @@ -1,5 +1,7 @@ --- -# yaml-language-server: $schema=https://kubernetes-schemas.pages.dev/helm.toolkit.fluxcd.io/helmrelease_v2.json +# yaml-language-server: $schema=https://kubernetes-schemas.ok8.sh/helm.toolkit.fluxcd.io/helmrelease_v2beta2.json +--- +# yaml-language-server: $schema=https://kubernetes-schemas.ok8.sh/helm.toolkit.fluxcd.io/helmrelease_v2.json apiVersion: helm.toolkit.fluxcd.io/v2 kind: HelmRelease metadata: @@ -10,52 +12,41 @@ spec: chart: spec: chart: kube-prometheus-stack - version: 61.3.2 + version: 60.1.0 sourceRef: kind: HelmRepository name: prometheus-community namespace: flux-system install: - crds: Skip + crds: CreateReplace remediation: retries: 3 upgrade: cleanupOnFail: true - crds: Skip + crds: CreateReplace remediation: strategy: rollback retries: 3 dependsOn: - - name: prometheus-operator-crds - namespace: observability - name: openebs namespace: openebs-system values: - crds: - enabled: false cleanPrometheusOperatorObjectNames: true + crds: + enabled: true alertmanager: - ingress: - enabled: true - annotations: - external-dns.alpha.kubernetes.io/target: internal.ok8.sh - ingressClassName: internal - hosts: ["alertmanager.ok8.sh"] - pathType: Prefix - alertmanagerSpec: - useExistingSecret: true - configSecret: alertmanager-secret - storage: - volumeClaimTemplate: - spec: - storageClassName: openebs-hostpath - resources: - requests: - storage: 1Gi + enabled: false kubelet: enabled: true serviceMonitor: metricRelabelings: + # Remove duplicate labels provided by k3s + - action: keep + sourceLabels: ["__name__"] + regex: (apiserver_audit|apiserver_client|apiserver_delegated|apiserver_envelope|apiserver_storage|apiserver_webhooks|authentication_token|cadvisor_version|container_blkio|container_cpu|container_fs|container_last|container_memory|container_network|container_oom|container_processes|container|csi_operations|disabled_metric|get_token|go|hidden_metric|kubelet_certificate|kubelet_cgroup|kubelet_container|kubelet_containers|kubelet_cpu|kubelet_device|kubelet_graceful|kubelet_http|kubelet_lifecycle|kubelet_managed|kubelet_node|kubelet_pleg|kubelet_pod|kubelet_run|kubelet_running|kubelet_runtime|kubelet_server|kubelet_started|kubelet_volume|kubernetes_build|kubernetes_feature|machine_cpu|machine_memory|machine_nvm|machine_scrape|node_namespace|plugin_manager|prober_probe|process_cpu|process_max|process_open|process_resident|process_start|process_virtual|registered_metric|rest_client|scrape_duration|scrape_samples|scrape_series|storage_operation|volume_manager|volume_operation|workqueue)_(.+) + - action: replace + sourceLabels: ["node"] + targetLabel: instance # Drop high cardinality labels - action: labeldrop regex: (uid) @@ -68,6 +59,10 @@ spec: enabled: true serviceMonitor: metricRelabelings: + # Remove duplicate labels provided by k3s + - action: keep + sourceLabels: ["__name__"] + regex: (aggregator_openapi|aggregator_unavailable|apiextensions_openapi|apiserver_admission|apiserver_audit|apiserver_cache|apiserver_cel|apiserver_client|apiserver_crd|apiserver_current|apiserver_envelope|apiserver_flowcontrol|apiserver_init|apiserver_kube|apiserver_longrunning|apiserver_request|apiserver_requested|apiserver_response|apiserver_selfrequest|apiserver_storage|apiserver_terminated|apiserver_tls|apiserver_watch|apiserver_webhooks|authenticated_user|authentication|disabled_metric|etcd_bookmark|etcd_lease|etcd_request|field_validation|get_token|go|grpc_client|hidden_metric|kube_apiserver|kubernetes_build|kubernetes_feature|node_authorizer|pod_security|process_cpu|process_max|process_open|process_resident|process_start|process_virtual|registered_metric|rest_client|scrape_duration|scrape_samples|scrape_series|serviceaccount_legacy|serviceaccount_stale|serviceaccount_valid|watch_cache|workqueue)_(.+) # Drop high cardinality labels - action: drop sourceLabels: ["__name__"] @@ -79,23 +74,46 @@ spec: enabled: true endpoints: &cp - 10.69.0.69 + serviceMonitor: + metricRelabelings: + # Remove duplicate labels provided by k3s + - action: keep + sourceLabels: ["__name__"] + regex: "(apiserver_audit|apiserver_client|apiserver_delegated|apiserver_envelope|apiserver_storage|apiserver_webhooks|attachdetach_controller|authenticated_user|authentication|cronjob_controller|disabled_metric|endpoint_slice|ephemeral_volume|garbagecollector_controller|get_token|go|hidden_metric|job_controller|kubernetes_build|kubernetes_feature|leader_election|node_collector|node_ipam|process_cpu|process_max|process_open|process_resident|process_start|process_virtual|pv_collector|registered_metric|replicaset_controller|rest_client|retroactive_storageclass|root_ca|running_managed|scrape_duration|scrape_samples|scrape_series|service_controller|storage_count|storage_operation|ttl_after|volume_operation|workqueue)_(.+)" kubeEtcd: enabled: true endpoints: *cp kubeScheduler: enabled: true endpoints: *cp + serviceMonitor: + metricRelabelings: + # Remove duplicate labels provided by k3s + - action: keep + sourceLabels: ["__name__"] + regex: "(apiserver_audit|apiserver_client|apiserver_delegated|apiserver_envelope|apiserver_storage|apiserver_webhooks|authenticated_user|authentication|disabled_metric|go|hidden_metric|kubernetes_build|kubernetes_feature|leader_election|process_cpu|process_max|process_open|process_resident|process_start|process_virtual|registered_metric|rest_client|scheduler|scrape_duration|scrape_samples|scrape_series|workqueue)_(.+)" kubeProxy: enabled: false prometheus: ingress: enabled: true - annotations: - external-dns.alpha.kubernetes.io/target: internal.ok8.sh ingressClassName: internal - hosts: ["prometheus.ok8.sh"] pathType: Prefix + hosts: + - &host prometheus.outsideour.casa + tls: + - hosts: + - *host prometheusSpec: + podMetadata: + annotations: + secret.reloader.stakater.com/reload: &secret kube-prometheus-stack-secret + additionalAlertManagerConfigs: + - static_configs: + - targets: + - alert-manager.ok8.sh + replicas: 1 + replicaExternalLabelName: __replica__ scrapeInterval: 1m # Must match interval in Grafana Helm chart ruleSelectorNilUsesHelmValues: false serviceMonitorSelectorNilUsesHelmValues: false @@ -105,23 +123,26 @@ spec: enableAdminAPI: true walCompression: true enableFeatures: - - auto-gomemlimit + - auto-gomaxprocs - memory-snapshot-on-shutdown - new-service-discovery-manager - retention: 14d - retentionSize: 70GB - resources: - requests: - cpu: 100m - limits: - memory: 1500Mi + retention: 2d + retentionSize: 15GB + externalLabels: + cluster: storage storageSpec: volumeClaimTemplate: spec: storageClassName: openebs-hostpath resources: requests: - storage: 75Gi + storage: 20Gi + thanosService: + enabled: false + thanosServiceExternal: + enabled: false + thanosServiceMonitor: + enabled: false nodeExporter: enabled: true prometheus-node-exporter: @@ -133,7 +154,8 @@ spec: - action: replace regex: (.*) replacement: $1 - sourceLabels: ["__meta_kubernetes_pod_node_name"] + sourceLabels: + - __meta_kubernetes_pod_node_name targetLabel: kubernetes_node kubeStateMetrics: enabled: true @@ -150,8 +172,8 @@ spec: - action: replace regex: (.*) replacement: $1 - sourceLabels: ["__meta_kubernetes_pod_node_name"] + sourceLabels: + - __meta_kubernetes_pod_node_name targetLabel: kubernetes_node grafana: enabled: false - forceDeployDashboards: true diff --git a/kubernetes/storage/apps/observability/kube-prometheus-stack/app/kustomization.yaml b/kubernetes/storage/apps/observability/kube-prometheus-stack/app/kustomization.yaml index 6c0f9acef..17cbc72b2 100644 --- a/kubernetes/storage/apps/observability/kube-prometheus-stack/app/kustomization.yaml +++ b/kubernetes/storage/apps/observability/kube-prometheus-stack/app/kustomization.yaml @@ -3,8 +3,4 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: - - ./externalsecret.yaml - ./helmrelease.yaml - - ./prometheusrule.yaml - - ./gatus.yaml - - ./scrapeconfig.yaml diff --git a/kubernetes/storage/apps/observability/kube-prometheus-stack/app/scrapeconfig.yaml b/kubernetes/storage/apps/observability/kube-prometheus-stack/app/scrapeconfig.yaml deleted file mode 100644 index d2bb86dc5..000000000 --- a/kubernetes/storage/apps/observability/kube-prometheus-stack/app/scrapeconfig.yaml +++ /dev/null @@ -1,12 +0,0 @@ ---- -# yaml-language-server: $schema=https://kubernetes-schemas.ok8.sh/monitoring.coreos.com/scrapeconfig_v1alpha1.json -apiVersion: monitoring.coreos.com/v1alpha1 -kind: ScrapeConfig -metadata: - name: airgradient - namespace: observability -spec: - staticConfigs: - - targets: - - 10.42.42.128:9926 - metricsPath: /metrics