diff --git a/resources/monitoring/helm/agent.yaml.liquid b/resources/monitoring/helm/agent.yaml.liquid
new file mode 100644
index 0000000..1a18a87
--- /dev/null
+++ b/resources/monitoring/helm/agent.yaml.liquid
@@ -0,0 +1,54 @@
+vmagent:
+  enabled: true
+
+  additionalRemoteWrites:
+    - url: https://{{ context.host }}/insert/0/prometheus/api/v1/write
+      basicAuth:
+        username:
+          name: vm-auth
+          key: user
+        password:
+          name: vm-auth
+          key: password
+
+  {% raw %}
+  spec:
+    externalLabels:
+      cluster: {{ cluster.handle }}
+  {% endraw %}
+
+prometheus-operator-crds:
+  enabled: true
+
+vmalert:
+  enabled: false
+
+alertmanager:
+  enabled: false
+
+vmsingle:
+  enabled: false
+
+grafana:
+  enabled: false
+
+vmcluster:
+  enabled: false
+
+kubeApiServer:
+  enabled: false
+
+victoria-metrics-operator:
+  operator:
+    enable_converter_ownership: true
+
+extraObjects:
+- apiVersion: v1
+  kind: Secret
+  metadata:
+    name: vm-auth
+  stringData:
+    {% raw %}
+    password: {{ configuration.password }}
+    user: {{ configuration.user }}
+    {% endraw %}
\ No newline at end of file
diff --git a/resources/monitoring/helm/kps-agent.yaml b/resources/monitoring/helm/kps-agent.yaml
deleted file mode 100644
index 3477152..0000000
--- a/resources/monitoring/helm/kps-agent.yaml
+++ /dev/null
@@ -1,115 +0,0 @@
-fullnameOverride: monitoring
-
-defaultRules:
-  create: false
-  rules:
-    alertmanager: true
-    etcd: true
-    configReloaders: true
-    general: true
-    k8sContainerCpuUsageSecondsTotal: true
-    k8sContainerMemoryCache: true
-    k8sContainerMemoryRss: true
-    k8sContainerMemorySwap: true
-    k8sContainerResource: true
-    k8sContainerMemoryWorkingSetBytes: true
-    k8sPodOwner: true
-    kubeApiserverAvailability: true
-    kubeApiserverBurnrate: true
-    kubeApiserverHistogram: true
-    kubeApiserverSlos: true
-    kubeControllerManager: true
-    kubelet: true
-    kubeProxy: true
-    kubePrometheusGeneral: true
-    kubePrometheusNodeRecording: true
-    kubernetesApps: true
-    kubernetesResources: true
-    kubernetesStorage: true
-    kubernetesSystem: true
-    kubeSchedulerAlerting: true
-    kubeSchedulerRecording: true
-    kubeStateMetrics: true
-    network: true
-    node: true
-    nodeExporterAlerting: true
-    nodeExporterRecording: true
-    prometheus: true
-    prometheusOperator: true
-    windows: true
-
-
-alertmanager:
-  enabled: false
-  fullnameOverride: kps-alertmanager
-
-
-prometheusOperator:
-  tls:
-    enabled: false
-  admissionWebhooks:
-    enabled: false
-  prometheusConfigReloader:
-    resources:
-      requests:
-        cpu: 200m
-        memory: 50Mi
-      limits:
-        memory: 100Mi
-grafana:
-  enabled: false
-
-# monitored k8s components
-kubeApiServer:
-  enabled: true
-
-kubelet:
-  enabled: true
-
-coreDns:
-  enabled: true
-
-# already monitored with coreDns
-kubeDns:
-  enabled: false
-
-kubeProxy:
-  enabled: true
-
-kubeStateMetrics:
-  enabled: true
-
-kube-state-metrics:
-  fullnameOverride: kps-kube-state-metrics
-  selfMonitor:
-    enabled: true
-
-nodeExporter:
-  enabled: true
-
-prometheus-node-exporter:
-  fullnameOverride: kps-node-exporter
-  prometheus:
-    monitor:
-      enabled: true
-  resources:
-    requests:
-      memory: 512Mi
-      cpu: 250m
-    limits:
-      memory: 2048Mi
-
-# EKS hides metrics for controller manager, scheduler, and etcd
-# https://github.com/aws/eks-anywhere/issues/4405
-# disable kube controller manager scraping
-kubeControllerManager:
-  enabled: false
-
-# disable kube scheduler scraping
-kubeScheduler:
-  enabled: false
-
-kubeEtcd:
-  enabled: false
-
-
diff --git a/resources/monitoring/helm/kps-agent.yaml.liquid b/resources/monitoring/helm/kps-agent.yaml.liquid
deleted file mode 100644
index bf2bc21..0000000
--- a/resources/monitoring/helm/kps-agent.yaml.liquid
+++ /dev/null
@@ -1,28 +0,0 @@
-prometheus:
-  enabled: true
-  agentMode: true
-  extraSecret:
-    name: basic-auth-remote
-    data:
-      {% raw %}
-      user: {{ configuration.basicAuthUser }}
-      password: {{ configuration.basicAuthPassword }}
-      {% endraw %}
-  prometheusSpec:
-    remoteWrite:
-      - url: {{ context.prometheusHost }}
-        name: mgmt-cluster-prometheus
-        basicAuth:
-          username:
-            name: basic-auth-remote
-            key: user
-          password:
-            name: basic-auth-remote
-            key: password
-        writeRelabelConfigs:
-          - sourceLabels: []
-            targetLabel: 'cluster'
-            {% raw %}
-            replacement: {{ cluster.Handle }}
-            {% endraw %}
-
diff --git a/resources/monitoring/helm/kps-mgmt.liquid b/resources/monitoring/helm/kps-mgmt.liquid
deleted file mode 100644
index 410f0fd..0000000
--- a/resources/monitoring/helm/kps-mgmt.liquid
+++ /dev/null
@@ -1,132 +0,0 @@
-{% raw %}
-prometheus:
-  prometheusSpec:
-    prometheusExternalLabelName: {{ cluster.Handle }}
-    # incoming metrics from workload clusters will have a cluster label set to the cluster handle, but that's only assigned at the remote write push
-    # the mgmt cluster itself will not have a cluster label set, one can use an external label, but that's only added on push time
-    # ideally we would have a scrape class that would add a cluster label to all targets scraped by prometheus, this is currently not supported, but will be with probably the next release of the prometheus operator
-    # in the meantime we add relabel config to all servicemonitors individually (see below)
-    # this is how it would look like:
-    # use scrape classes in the future to add a cluster label to all targets scraped by prometheus
-    # this will make sure we can always identify the cluster a target belongs to, even for the mgmt cluster prometheus
-    # https://github.com/prometheus-operator/prometheus-operator/pull/5978
-    # https://github.com/prometheus-operator/prometheus-operator/pull/6379
-    #additionalConfig:
-    #  scrapeClasses:
-    #    - name: cluster_label
-    #      default: true
-    #      relabelings:
-    #        - sourceLabels: []
-    #          targetLabel: cluster
-    #          replacement: {{ cluster.Handle }}
-  serviceMonitor:
-    enabled: true
-    relabelings:
-      - sourceLabels: []
-        targetLabel: cluster
-        replacement: {{ cluster.Handle }}
-
-alertmanager:
-  serviceMonitor:
-    relabelings:
-      - sourceLabels: []
-        targetLabel: cluster
-        replacement: {{ cluster.Handle }}
-
-grafana:
-  serviceMonitor:
-    relabelings:
-      - sourceLabels: []
-        targetLabel: cluster
-        replacement: {{ cluster.Handle }}
-
-kubeApiServer:
-  serviceMonitor:
-    relabelings:
-      - sourceLabels: []
-        targetLabel: cluster
-        replacement: {{ cluster.Handle }}
-
-
-kubelet:
-  serviceMonitor:
-    relabelings:
-      - sourceLabels: []
-        targetLabel: cluster
-        replacement: {{ cluster.Handle }}
-
-
-coreDns:
-  serviceMonitor:
-    enabled: true
-    relabelings:
-      - sourceLabels: []
-        targetLabel: cluster
-        replacement: {{ cluster.Handle }}
-
-# already monitored with coreDns
-kubeDns:
-  serviceMonitor:
-    enabled: true
-    relabelings:
-      - sourceLabels: []
-        targetLabel: cluster
-        replacement: {{ cluster.Handle }}
-
-kubeProxy:
-  enabled: true
-  serviceMonitor:
-    enabled: true
-    relabelings:
-      - sourceLabels: []
-        targetLabel: cluster
-        replacement: {{ cluster.Handle }}
-
-kubeStateMetrics:
-  enabled: true
-
-kube-state-metrics:
-  fullnameOverride: kps-kube-state-metrics
-  selfMonitor:
-    enabled: true
-  prometheus:
-    monitor:
-      enabled: true
-      relabelings:
-        - sourceLabels: []
-          targetLabel: cluster
-          replacement: {{ cluster.Handle }}
-
-nodeExporter:
-  enabled: true
-
-prometheus-node-exporter:
-  fullnameOverride: kps-node-exporter
-  prometheus:
-    monitor:
-      enabled: true
-      relabelings:
-        - sourceLabels: []
-          targetLabel: cluster
-          replacement: {{ cluster.Handle }}
-
-  resources:
-    requests:
-      memory: 512Mi
-      cpu: 250m
-    limits:
-      memory: 2048Mi
-
-# EKS hides metrics for controller manager, scheduler, and etcd
-# https://github.com/aws/eks-anywhere/issues/4405
-# disable kube controller manager scraping
-kubeControllerManager:
-  enabled: false
-
-# disable kube scheduler scraping
-kubeScheduler:
-  enabled: false
-
-kubeEtcd:
-  enabled: false
-{% endraw %}
\ No newline at end of file
diff --git a/resources/monitoring/helm/kps-mgmt.yaml b/resources/monitoring/helm/kps-mgmt.yaml
deleted file mode 100644
index 5276986..0000000
--- a/resources/monitoring/helm/kps-mgmt.yaml
+++ /dev/null
@@ -1,158 +0,0 @@
-fullnameOverride: monitoring
-
-defaultRules:
-  create: true
-  rules:
-    alertmanager: true
-    etcd: true
-    configReloaders: true
-    general: true
-    k8sContainerCpuUsageSecondsTotal: true
-    k8sContainerMemoryCache: true
-    k8sContainerMemoryRss: true
-    k8sContainerMemorySwap: true
-    k8sContainerResource: true
-    k8sContainerMemoryWorkingSetBytes: true
-    k8sPodOwner: true
-    kubeApiserverAvailability: true
-    kubeApiserverBurnrate: true
-    kubeApiserverHistogram: true
-    kubeApiserverSlos: true
-    kubeControllerManager: true
-    kubelet: true
-    kubeProxy: true
-    kubePrometheusGeneral: true
-    kubePrometheusNodeRecording: true
-    kubernetesApps: true
-    kubernetesResources: true
-    kubernetesStorage: true
-    kubernetesSystem: true
-    kubeSchedulerAlerting: true
-    kubeSchedulerRecording: true
-    kubeStateMetrics: true
-    network: true
-    node: true
-    nodeExporterAlerting: true
-    nodeExporterRecording: true
-    prometheus: true
-    prometheusOperator: true
-    windows: true
-
-
-alertmanager:
-  enabled: true
-  fullnameOverride: kps-alertmanager
-
-
-prometheusOperator:
-  tls:
-    enabled: false
-  admissionWebhooks:
-    enabled: false
-  prometheusConfigReloader:
-    resources:
-      requests:
-        cpu: 200m
-        memory: 50Mi
-      limits:
-        memory: 100Mi
-additionalScrapeConfigs:
-  - job_name: opencost
-    honor_labels: true
-    scrape_interval: 1m
-    scrape_timeout: 10s
-    metrics_path: /metrics
-    scheme: http
-    dns_sd_configs:
-      - names:
-          - opencost.monitoring
-        type: 'A'
-        port: 9003
-
-grafana:
-  enabled: false
-
-prometheus:
-  enabled: true
-  ingress:
-    enabled: true
-    ingressClassName: nginx
-    annotations:
-      kubernetes.io/tls-acme: "true"
-      cert-manager.io/cluster-issuer: letsencrypt-prod
-      nginx.ingress.kubernetes.io/force-ssl-redirect: 'true'
-      nginx.ingress.kubernetes.io/auth-type: basic
-      nginx.ingress.kubernetes.io/auth-secret: basic-auth-prom
-      nginx.ingress.kubernetes.io/auth-realm: 'Authentication Required'
-    hosts:
-      - {{ context.prometheusHost }}
-    tls:
-      - secretName: prometheus-tls
-        hosts:
-          - {{ context.prometheusHost }}
-    servicePort: 9090
-    pathType: Prefix
-    paths:
-      - /
-      - /.*
-  prometheusSpec:
-    enableFeatures:
-    enableFeatures:
-      - remote-write-receiver
-    enableRemoteWriteReceiver: true
-    remoteWriteDashboards:
-      enabled: true
-
-# monitored k8s components
-kubeApiServer:
-  enabled: true
-
-kubelet:
-  enabled: true
-
-coreDns:
-  enabled: true
-
-# already monitored with coreDns
-kubeDns:
-  enabled: false
-
-kubeProxy:
-  enabled: true
-
-kubeStateMetrics:
-  enabled: true
-
-kube-state-metrics:
-  fullnameOverride: kps-kube-state-metrics
-  selfMonitor:
-    enabled: true
-
-nodeExporter:
-  enabled: true
-
-prometheus-node-exporter:
-  fullnameOverride: kps-node-exporter
-  prometheus:
-    monitor:
-      enabled: true
-  resources:
-    requests:
-      memory: 512Mi
-      cpu: 250m
-    limits:
-      memory: 2048Mi
-
-# EKS hides metrics for controller manager, scheduler, and etcd
-# https://github.com/aws/eks-anywhere/issues/4405
-# disable kube controller manager scraping
-kubeControllerManager:
-  enabled: false
-
-# disable kube scheduler scraping
-kubeScheduler:
-  enabled: false
-
-kubeEtcd:
-  enabled: false
-
-
diff --git a/resources/monitoring/helm/mgmt.yaml.liquid b/resources/monitoring/helm/mgmt.yaml.liquid
new file mode 100644
index 0000000..f260424
--- /dev/null
+++ b/resources/monitoring/helm/mgmt.yaml.liquid
@@ -0,0 +1,71 @@
+vmcluster:
+  enabled: true
+
+  spec:
+    vminsert:
+      extraArgs:
+        maxLabelsPerTimeseries: '50'
+
+    vmstorage:
+      resources:
+        limits:
+          cpu: "1"
+          memory: 5Gi
+
+victoria-metrics-operator:
+  enabled: false
+
+prometheus-node-exporter:
+  enabled: false
+
+vmagent:
+  enabled: false
+
+vmsingle:
+  enabled: false
+
+grafana:
+  enabled: false
+
+kubeApiServer:
+  enabled: false
+
+fullnameOverride: vm-cluster
+
+extraObjects:
+- apiVersion: operator.victoriametrics.com/v1beta1
+  kind: VMAuth
+  metadata:
+    name: vm-auth
+  spec:
+    selectAllByDefault: true
+    ingress:
+      class_name: nginx # <-- change this to your ingress class
+      host: {{ context.host }}
+      tlsHosts:
+        - {{ context.host }}
+      tlsSecretName: vmetrics-tls
+      annotations:
+        cert-manager.io/cluster-issuer: letsencrypt-prod
+- apiVersion: operator.victoriametrics.com/v1beta1
+  kind: VMUser
+  {% raw %}
+  metadata:
+    name: {{ configuration.user }}
+  spec:
+    password: {{ configuration.password }}
+  {% endraw %}
+    targetRefs:
+    - crd:
+        kind: VMCluster/vminsert
+        name: vm-cluster
+        namespace: monitoring
+      paths:
+        - "/insert/.*"
+    - crd:
+        kind: VMCluster/vmselect
+        name: vm-cluster
+        namespace: monitoring
+      paths:
+        - "/select/.*"
+
\ No newline at end of file
diff --git a/resources/monitoring/services/agent.yaml b/resources/monitoring/services/agent.yaml
index 1e1bd7e..dbc2f7e 100644
--- a/resources/monitoring/services/agent.yaml
+++ b/resources/monitoring/services/agent.yaml
@@ -1,12 +1,12 @@
 apiVersion: deployments.plural.sh/v1alpha1
 kind: ServiceDeployment
 metadata:
-  name: kps-agent
+  name: vmetrics-agent
   namespace: infra
 spec:
-  namespace: prometheus
+  namespace: monitoring
   git:
-    folder: helm/prometheus
+    folder: helm/monitoring
     ref: main
   configurationRef:
     name: basic-auth-prom
@@ -16,12 +16,11 @@ spec:
     name: infra
     namespace: infra
   helm:
-    url: https://prometheus-community.github.io/helm-charts
-    version: x.x.x
-    chart: kube-prometheus-stack
+    url: https://victoriametrics.github.io/helm-charts/
+    version: 0.25.9
+    chart: victoria-metrics-k8s-stack
     valuesFiles:
-    - kps-agent.yaml
-    - kps-agent.yaml.liquid
+    - agent.yaml.liquid
   clusterRef:
     kind: Cluster
     name: {{ context.cluster }}
diff --git a/resources/monitoring/services/mgmt.yaml b/resources/monitoring/services/mgmt.yaml
index c13d697..0fd0885 100644
--- a/resources/monitoring/services/mgmt.yaml
+++ b/resources/monitoring/services/mgmt.yaml
@@ -1,12 +1,12 @@
 apiVersion: deployments.plural.sh/v1alpha1
 kind: ServiceDeployment
 metadata:
-  name: kps-mgmt
+  name: vmetrics-server
   namespace: infra
 spec:
   namespace: monitoring
   git:
-    folder: helm/prometheus
+    folder: helm/monitoring
     ref: main
   configurationRef:
     name: basic-auth-prom
@@ -16,12 +16,11 @@ spec:
     name: infra
     namespace: infra
   helm:
-    url: https://prometheus-community.github.io/helm-charts
-    version: x.x.x
-    chart: kube-prometheus-stack
+    url: https://victoriametrics.github.io/helm-charts/
+    version: 0.25.9
+    chart: victoria-metrics-k8s-stack
     valuesFiles:
-    - kps-mgmt.yaml
-    - kps-mgmt.yaml.liquid
+    - mgmt.yaml.liquid
   clusterRef:
     kind: Cluster
     name: {{ context.cluster }}
diff --git a/setup/pr-automation/prometheus/prom-agent-creator.yaml b/setup/pr-automation/prometheus/prom-agent-creator.yaml
index 2626272..eb452c8 100644
--- a/setup/pr-automation/prometheus/prom-agent-creator.yaml
+++ b/setup/pr-automation/prometheus/prom-agent-creator.yaml
@@ -1,15 +1,15 @@
 apiVersion: deployments.plural.sh/v1alpha1
 kind: PrAutomation
 metadata:
-  name: prom-agent-creator
+  name: metrics-agent-creator
 spec:
-  name: prom-agent-creator
+  name: metrics-agent-creator
   documentation: |
-    Sets up a prometheus agent for shipping metrics, pairs with `prom-creator`
+    Sets up a VictoriaMetrics agent for shipping metrics; pairs with `prom-creator`
   creates:
     templates:
     - source: resources/monitoring/helm
-      destination: "helm/prometheus"
+      destination: "helm/monitoring"
       external: false
     - source: resources/monitoring/mgmt.yaml
       destination: "bootstrap/monitoring/mgmt.yaml"
@@ -23,9 +23,9 @@ spec:
   - name: cluster
     type: STRING
     documentation: CRD name for this cluster, use `mgmt` to place it in the management cluster
-  - name: prometheusHost
+  - name: host
     type: STRING
-    documentation: the FQDN of your prometheus instance
+    documentation: the FQDN of your VictoriaMetrics instance
   - name: tagName
     type: STRING
     documentation: The cluster tag name you want to use to control the global service for installation of prometheus agent
diff --git a/setup/pr-automation/prometheus/prom-creator.yaml b/setup/pr-automation/prometheus/prom-creator.yaml
index 3c0bf44..1dc1533 100644
--- a/setup/pr-automation/prometheus/prom-creator.yaml
+++ b/setup/pr-automation/prometheus/prom-creator.yaml
@@ -5,11 +5,11 @@ metadata:
 spec:
   name: prom-creator
   documentation: |
-    Sets up a prometheus instance for metrics storage, pairs with prom-agent-creator to ship metrics from workload clusters
+    Sets up a VictoriaMetrics instance for metrics storage; pairs with prom-agent-creator to ship metrics from workload clusters
   creates:
     templates:
     - source: resources/monitoring/helm
-      destination: "helm/prometheus"
+      destination: "helm/monitoring"
       external: false
     - source: resources/monitoring/mgmt.yaml
       destination: "bootstrap/monitoring/mgmt.yaml"
@@ -23,6 +23,6 @@ spec:
   - name: cluster
     type: STRING
     documentation: CRD name for this cluster, use `mgmt` to place it in the management cluster
-  - name: prometheusHost
+  - name: host
    type: STRING
-    documentation: the FQDN you want prometheus to be hosted on, you'll need external-dns and cert manager already configured
+    documentation: the FQDN you want VictoriaMetrics to be hosted on; you'll need external-dns and cert-manager already configured
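
For reference, a minimal sketch (not part of the patch) of what agent.yaml.liquid renders to after the PR automation's first Liquid pass, assuming a hypothetical `metrics.example.com` for `context.host`. Under standard Liquid semantics the `{% raw %}` markers are consumed by that pass, so the cluster and configuration tags survive into the committed file and are only resolved later, when the service syncs against the target cluster:

    vmagent:
      enabled: true

      additionalRemoteWrites:
        - url: https://metrics.example.com/insert/0/prometheus/api/v1/write
          basicAuth:
            username:
              name: vm-auth   # Secret created by extraObjects in the same values file
              key: user
            password:
              name: vm-auth
              key: password

      spec:
        externalLabels:
          cluster: {{ cluster.handle }}   # still templated; resolved at service sync time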