From 1cd42a999964d0d9295e55dc9b54f4b613a73243 Mon Sep 17 00:00:00 2001 From: michaeljguarino Date: Sat, 30 Mar 2024 00:38:16 -0400 Subject: [PATCH] Add monitoring setup example --- .gitignore | 1 - resources/monitoring/README.md | 26 +++ .../monitoring/helm-repositories/grafana.yaml | 8 + .../helm-repositories/opencost.yaml | 8 + .../prometheuscommunity.yaml | 8 + .../monitoring/helm-values/kps-agent.yaml | 115 ++++++++++ .../helm-values/kps-agent.yaml.liquid | 24 ++ .../monitoring/helm-values/kps-mgmt.liquid | 130 +++++++++++ .../monitoring/helm-values/kps-mgmt.yaml | 206 ++++++++++++++++++ resources/monitoring/helm-values/loki.yaml | 53 +++++ .../monitoring/helm-values/loki.yaml.liquid | 19 ++ .../monitoring/helm-values/opencost.yaml | 65 ++++++ .../monitoring/helm-values/promtail.yaml | 23 ++ .../helm-values/promtail.yaml.liquid | 6 + .../services/kps-agent-fleet/secret.yaml | 7 + .../kps-agent-fleet/servicedeployment.yaml | 43 ++++ .../kps-mgmt/basicauth-prometheus.yaml | 11 + .../services/kps-mgmt/grafana-datasource.yaml | 24 ++ .../services/kps-mgmt/servicedeployment.yaml | 38 ++++ .../services/loki-mgmt/servicedeployment.yaml | 32 +++ .../opencost-mgmt/servicedeployment.yaml | 29 +++ .../promtail-fleet/basicauth-loki.yaml | 12 + .../promtail-fleet/servicedeployment.yaml | 43 ++++ resources/monitoring/services/setup.yaml | 7 + resources/monitoring/terraform/cd.tf | 74 +++++++ resources/monitoring/terraform/persistence.tf | 88 ++++++++ resources/monitoring/terraform/provider.tf | 54 +++++ 27 files changed, 1153 insertions(+), 1 deletion(-) create mode 100644 resources/monitoring/helm-repositories/grafana.yaml create mode 100644 resources/monitoring/helm-repositories/opencost.yaml create mode 100644 resources/monitoring/helm-repositories/prometheuscommunity.yaml create mode 100644 resources/monitoring/helm-values/kps-agent.yaml create mode 100644 resources/monitoring/helm-values/kps-agent.yaml.liquid create mode 100644 resources/monitoring/helm-values/kps-mgmt.liquid create mode 100644 resources/monitoring/helm-values/kps-mgmt.yaml create mode 100644 resources/monitoring/helm-values/loki.yaml create mode 100644 resources/monitoring/helm-values/loki.yaml.liquid create mode 100644 resources/monitoring/helm-values/opencost.yaml create mode 100644 resources/monitoring/helm-values/promtail.yaml create mode 100644 resources/monitoring/helm-values/promtail.yaml.liquid create mode 100644 resources/monitoring/services/kps-agent-fleet/secret.yaml create mode 100644 resources/monitoring/services/kps-agent-fleet/servicedeployment.yaml create mode 100644 resources/monitoring/services/kps-mgmt/basicauth-prometheus.yaml create mode 100644 resources/monitoring/services/kps-mgmt/grafana-datasource.yaml create mode 100644 resources/monitoring/services/kps-mgmt/servicedeployment.yaml create mode 100644 resources/monitoring/services/loki-mgmt/servicedeployment.yaml create mode 100644 resources/monitoring/services/opencost-mgmt/servicedeployment.yaml create mode 100644 resources/monitoring/services/promtail-fleet/basicauth-loki.yaml create mode 100644 resources/monitoring/services/promtail-fleet/servicedeployment.yaml create mode 100644 resources/monitoring/services/setup.yaml create mode 100644 resources/monitoring/terraform/cd.tf create mode 100644 resources/monitoring/terraform/persistence.tf create mode 100644 resources/monitoring/terraform/provider.tf diff --git a/.gitignore b/.gitignore index 9b898cc..bd1c40e 100644 --- a/.gitignore +++ b/.gitignore @@ -33,7 +33,6 @@ override.tf.json .terraformrc 
terraform.rc
-helm-values
 test/helm-values
 
 # IDE
diff --git a/resources/monitoring/README.md b/resources/monitoring/README.md
index e69de29..ef251fa 100644
--- a/resources/monitoring/README.md
+++ b/resources/monitoring/README.md
@@ -0,0 +1,26 @@
+# Prometheus Monitoring Setup
+
+This gives an overview of a production-ready observability setup, with Prometheus for time-series metrics collection and Loki for log aggregation. It deploys central instances of Prometheus and Loki (on your management cluster in this case, though they could live elsewhere), plus Promtail and Prometheus agent installs on each cluster to collect logs and metrics and ship them back.
+
+A quick overview of the repository structure:
+
+* `/terraform` - example Terraform you can rework to set up cloud resources. In this case it creates the S3 bucket Loki needs for persistence and registers the initial services that kick off the service-of-services process to provision everything else
+* `/helm-values` - values files for all the charts needed; note the `.liquid` variants support templating both configuration values and other contextual information, which is especially useful in global service contexts
+* `/services` - the service-of-services that sets up all the main components, in order:
+  - Prometheus agent, replicated across clusters as a global service
+  - Prometheus itself, deployed to the mgmt cluster via kube-prometheus-stack
+  - Loki, deployed on the mgmt cluster
+  - Promtail, replicated as a global service
+* `/helm-repositories` - Flux HelmRepository CRDs needed to register the chart repositories the services above pull from
+
+## Adopting this setup
+
+We'd recommend copy-pasting this into a repo you own to make customization easier. A few things you'll need to customize:
+
+* URLs for your Prometheus/Loki domains, in the `kps-*` and `loki-*` helm values. They usually live in ingress configuration, but appear elsewhere too. Our defaults were `loki.boot-aws.onplural.sh`, `prometheus.boot-aws.onplural.sh`, etc.
+* cluster names in the `servicedeployment.yaml` files, e.g. `services/kps-agent-fleet/servicedeployment.yaml` is wired to our default cluster name, `boot-staging`
+* you currently need to manually set `basicAuthUser` and `basicAuthPassword` in your root service-of-services' secrets to configure basic auth for both Loki and Prometheus (see the sketch just after this README)
+
+## Configure Prometheus and Loki for your Console UI
+
+The Console can take Prometheus and Loki connection information and surface log aggregation and metrics views in useful places in-UI. The configuration lives under the deployment settings tab, at `/cd/settings/observability`. Be sure to use the same values as in the basic auth configuration above.
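
A minimal sketch of that last customization point, with purely illustrative values rather than real credentials: the templated secrets under `/services` render those root-service configuration attributes into Kubernetes secrets roughly as follows (the htpasswd line is the kind of output `htpasswd -nb admin s3cret` prints).

```yaml
# Hypothetical rendered form of services/kps-mgmt/basicauth-prometheus.yaml once
# basicAuthUser / basicAuthPassword / basicAuthHtpasswd are set on the root service.
apiVersion: v1
kind: Secret
metadata:
  name: basic-auth-prom
stringData:
  user: admin                    # from {{ configuration.basicAuthUser }}
  password: s3cret               # from {{ configuration.basicAuthPassword }}
  .htpasswd: "admin:$apr1$..."   # from {{ configuration.basicAuthHtpasswd }}
  auth: "admin:$apr1$..."        # same htpasswd value, read by the nginx ingress basic auth annotations
```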
\ No newline at end of file diff --git a/resources/monitoring/helm-repositories/grafana.yaml b/resources/monitoring/helm-repositories/grafana.yaml new file mode 100644 index 0000000..d19fa6d --- /dev/null +++ b/resources/monitoring/helm-repositories/grafana.yaml @@ -0,0 +1,8 @@ +apiVersion: source.toolkit.fluxcd.io/v1beta1 +kind: HelmRepository +metadata: + name: grafana + namespace: {{ configuration.namespace }} +spec: + interval: 5m0s + url: https://grafana.github.io/helm-charts diff --git a/resources/monitoring/helm-repositories/opencost.yaml b/resources/monitoring/helm-repositories/opencost.yaml new file mode 100644 index 0000000..baa6eee --- /dev/null +++ b/resources/monitoring/helm-repositories/opencost.yaml @@ -0,0 +1,8 @@ +apiVersion: source.toolkit.fluxcd.io/v1beta1 +kind: HelmRepository +metadata: + name: opencost + namespace: {{ configuration.namespace }} +spec: + interval: 5m0s + url: https://opencost.github.io/opencost-helm-chart \ No newline at end of file diff --git a/resources/monitoring/helm-repositories/prometheuscommunity.yaml b/resources/monitoring/helm-repositories/prometheuscommunity.yaml new file mode 100644 index 0000000..0a12618 --- /dev/null +++ b/resources/monitoring/helm-repositories/prometheuscommunity.yaml @@ -0,0 +1,8 @@ +apiVersion: source.toolkit.fluxcd.io/v1beta1 +kind: HelmRepository +metadata: + name: prometheus-community + namespace: {{ configuration.namespace }} +spec: + interval: 5m0s + url: https://prometheus-community.github.io/helm-charts diff --git a/resources/monitoring/helm-values/kps-agent.yaml b/resources/monitoring/helm-values/kps-agent.yaml new file mode 100644 index 0000000..3477152 --- /dev/null +++ b/resources/monitoring/helm-values/kps-agent.yaml @@ -0,0 +1,115 @@ +fullnameOverride: monitoring + +defaultRules: + create: false + rules: + alertmanager: true + etcd: true + configReloaders: true + general: true + k8sContainerCpuUsageSecondsTotal: true + k8sContainerMemoryCache: true + k8sContainerMemoryRss: true + k8sContainerMemorySwap: true + k8sContainerResource: true + k8sContainerMemoryWorkingSetBytes: true + k8sPodOwner: true + kubeApiserverAvailability: true + kubeApiserverBurnrate: true + kubeApiserverHistogram: true + kubeApiserverSlos: true + kubeControllerManager: true + kubelet: true + kubeProxy: true + kubePrometheusGeneral: true + kubePrometheusNodeRecording: true + kubernetesApps: true + kubernetesResources: true + kubernetesStorage: true + kubernetesSystem: true + kubeSchedulerAlerting: true + kubeSchedulerRecording: true + kubeStateMetrics: true + network: true + node: true + nodeExporterAlerting: true + nodeExporterRecording: true + prometheus: true + prometheusOperator: true + windows: true + + +alertmanager: + enabled: false + fullnameOverride: kps-alertmanager + + +prometheusOperator: + tls: + enabled: false + admissionWebhooks: + enabled: false + prometheusConfigReloader: + resources: + requests: + cpu: 200m + memory: 50Mi + limits: + memory: 100Mi +grafana: + enabled: false + +# monitored k8s components +kubeApiServer: + enabled: true + +kubelet: + enabled: true + +coreDns: + enabled: true + +# already monitored with coreDns +kubeDns: + enabled: false + +kubeProxy: + enabled: true + +kubeStateMetrics: + enabled: true + +kube-state-metrics: + fullnameOverride: kps-kube-state-metrics + selfMonitor: + enabled: true + +nodeExporter: + enabled: true + +prometheus-node-exporter: + fullnameOverride: kps-node-exporter + prometheus: + monitor: + enabled: true + resources: + requests: + memory: 512Mi + cpu: 250m + limits: + 
memory: 2048Mi + +# EKS hides metrics for controller manager, scheduler, and etcd +# https://github.com/aws/eks-anywhere/issues/4405 +# disable kube controller manager scraping +kubeControllerManager: + enabled: false + +# disable kube scheduler scraping +kubeScheduler: + enabled: false + +kubeEtcd: + enabled: false + + diff --git a/resources/monitoring/helm-values/kps-agent.yaml.liquid b/resources/monitoring/helm-values/kps-agent.yaml.liquid new file mode 100644 index 0000000..70ea342 --- /dev/null +++ b/resources/monitoring/helm-values/kps-agent.yaml.liquid @@ -0,0 +1,24 @@ +prometheus: + enabled: true + agentMode: true + extraSecret: + name: basic-auth-remote + data: + user: {{ configuration.basicAuthUser }} + password: {{ configuration.basicAuthPassword }} + prometheusSpec: + remoteWrite: + - url: https://prometheus.boot-aws.onplural.sh/api/v1/write + name: mgmt-cluster-prometheus + basicAuth: + username: + name: basic-auth-remote + key: user + password: + name: basic-auth-remote + key: password + writeRelabelConfigs: + - sourceLabels: [] + targetLabel: 'cluster' + replacement: {{ cluster.Handle }} + diff --git a/resources/monitoring/helm-values/kps-mgmt.liquid b/resources/monitoring/helm-values/kps-mgmt.liquid new file mode 100644 index 0000000..2f4f7b1 --- /dev/null +++ b/resources/monitoring/helm-values/kps-mgmt.liquid @@ -0,0 +1,130 @@ +prometheus: + prometheusSpec: + prometheusExternalLabelName: {{ cluster.Handle }} + # incoming metrics from workload clusters will have a cluster label set to the cluster handle, but that's only assigned at the remote write push + # the mgmt cluster itself will not have a cluster label set, one can use an external label, but that's only added on push time + # ideally we would have a scrape class that would add a cluster label to all targets scraped by prometheus, this is currently not supported, but will be with probably the next release of the prometheus operator + # in the meantime we add relabel config to all servicemonitors individually (see below) + # this is how it would look like: + # use scrape classes in the future to add a cluster label to all targets scraped by prometheus + # this will make sure we can always identify the cluster a target belongs to, even for the mgmt cluster prometheus + # https://github.com/prometheus-operator/prometheus-operator/pull/5978 + # https://github.com/prometheus-operator/prometheus-operator/pull/6379 + #additionalConfig: + # scrapeClasses: + # - name: cluster_label + # default: true + # relabelings: + # - sourceLabels: [] + # targetLabel: cluster + # replacement: {{ cluster.Handle }} + serviceMonitor: + enabled: true + relabelings: + - sourceLabels: [] + targetLabel: cluster + replacement: {{ cluster.Handle }} + +alertmanager: + serviceMonitor: + relabelings: + - sourceLabels: [] + targetLabel: cluster + replacement: {{ cluster.Handle }} + +grafana: + serviceMonitor: + relabelings: + - sourceLabels: [] + targetLabel: cluster + replacement: {{ cluster.Handle }} + +kubeApiServer: + serviceMonitor: + relabelings: + - sourceLabels: [] + targetLabel: cluster + replacement: {{ cluster.Handle }} + + +kubelet: + serviceMonitor: + relabelings: + - sourceLabels: [] + targetLabel: cluster + replacement: {{ cluster.Handle }} + + +coreDns: + serviceMonitor: + enabled: true + relabelings: + - sourceLabels: [] + targetLabel: cluster + replacement: {{ cluster.Handle }} + +# already monitored with coreDns +kubeDns: + serviceMonitor: + enabled: true + relabelings: + - sourceLabels: [] + targetLabel: cluster + replacement: {{ 
cluster.Handle }} + +kubeProxy: + enabled: true + serviceMonitor: + enabled: true + relabelings: + - sourceLabels: [] + targetLabel: cluster + replacement: {{ cluster.Handle }} + +kubeStateMetrics: + enabled: true + +kube-state-metrics: + fullnameOverride: kps-kube-state-metrics + selfMonitor: + enabled: true + prometheus: + monitor: + enabled: true + relabelings: + - sourceLabels: [] + targetLabel: cluster + replacement: {{ cluster.Handle }} + +nodeExporter: + enabled: true + +prometheus-node-exporter: + fullnameOverride: kps-node-exporter + prometheus: + monitor: + enabled: true + relabelings: + - sourceLabels: [] + targetLabel: cluster + replacement: {{ cluster.Handle }} + + resources: + requests: + memory: 512Mi + cpu: 250m + limits: + memory: 2048Mi + +# EKS hides metrics for controller manager, scheduler, and etcd +# https://github.com/aws/eks-anywhere/issues/4405 +# disable kube controller manager scraping +kubeControllerManager: + enabled: false + +# disable kube scheduler scraping +kubeScheduler: + enabled: false + +kubeEtcd: + enabled: false diff --git a/resources/monitoring/helm-values/kps-mgmt.yaml b/resources/monitoring/helm-values/kps-mgmt.yaml new file mode 100644 index 0000000..11be227 --- /dev/null +++ b/resources/monitoring/helm-values/kps-mgmt.yaml @@ -0,0 +1,206 @@ +fullnameOverride: monitoring + +defaultRules: + create: true + rules: + alertmanager: true + etcd: true + configReloaders: true + general: true + k8sContainerCpuUsageSecondsTotal: true + k8sContainerMemoryCache: true + k8sContainerMemoryRss: true + k8sContainerMemorySwap: true + k8sContainerResource: true + k8sContainerMemoryWorkingSetBytes: true + k8sPodOwner: true + kubeApiserverAvailability: true + kubeApiserverBurnrate: true + kubeApiserverHistogram: true + kubeApiserverSlos: true + kubeControllerManager: true + kubelet: true + kubeProxy: true + kubePrometheusGeneral: true + kubePrometheusNodeRecording: true + kubernetesApps: true + kubernetesResources: true + kubernetesStorage: true + kubernetesSystem: true + kubeSchedulerAlerting: true + kubeSchedulerRecording: true + kubeStateMetrics: true + network: true + node: true + nodeExporterAlerting: true + nodeExporterRecording: true + prometheus: true + prometheusOperator: true + windows: true + + +alertmanager: + enabled: true + fullnameOverride: kps-alertmanager + + +prometheusOperator: + tls: + enabled: false + admissionWebhooks: + enabled: false + prometheusConfigReloader: + resources: + requests: + cpu: 200m + memory: 50Mi + limits: + memory: 100Mi +additionalScrapeConfigs: + - job_name: opencost + honor_labels: true + scrape_interval: 1m + scrape_timeout: 10s + metrics_path: /metrics + scheme: http + dns_sd_configs: + - names: + - opencost.monitoring + type: 'A' + port: 9003 +grafana: + enabled: true + fullnameOverride: kps-grafana + envValueFrom: + LOKI_DATASOURCE_PASSWORD: + secretKeyRef: + name: basic-auth-loki + key: password + admin: + password: admin # placeholder will get overwritten + user: admin + datasources.yaml: + apiVersion: 1 + deleteDatasources: + - name: Loki + orgId: 1 + service: + type: ClusterIP + ingress: + enabled: true + path: /.* + annotations: + kubernetes.io/tls-acme: "true" + kubernetes.io/ingress.class: "nginx" + cert-manager.io/cluster-issuer: letsencrypt-prod + nginx.ingress.kubernetes.io/force-ssl-redirect: 'true' + nginx.ingress.kubernetes.io/use-regex: "true" + hosts: + - grafana.boot-aws.onplural.sh + tls: + - hosts: + - grafana.boot-aws.onplural.sh + secretName: grafana-tls + sidecar: + dashboards: + enabled: true + 
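+      # the sidecar watches all namespaces for ConfigMaps labeled
+      # grafana_dashboard: "1" (see searchNamespace below), so dashboards can
+      # ship alongside the services that own them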
label: grafana_dashboard
+      labelValue: "1"
+      provider:
+        name: sidecar
+        allowUiUpdates: true
+        foldersFromFilesStructure: true
+      searchNamespace: ALL
+      folderAnnotation: k8s-sidecar-target-directory
+      annotations:
+        k8s-sidecar-target-directory: "/tmp/dashboards/kubernetes"
+    datasources:
+      enabled: true
+      searchNamespace: ALL
+    plugins:
+      enabled: false
+      searchNamespace: ALL
+prometheus:
+  enabled: true
+  ingress:
+    enabled: true
+    ingressClassName: nginx
+    annotations:
+      kubernetes.io/tls-acme: "true"
+      cert-manager.io/cluster-issuer: letsencrypt-prod
+      nginx.ingress.kubernetes.io/force-ssl-redirect: 'true'
+      nginx.ingress.kubernetes.io/auth-type: basic
+      nginx.ingress.kubernetes.io/auth-secret: basic-auth-prom
+      nginx.ingress.kubernetes.io/auth-realm: 'Authentication Required'
+    hosts:
+      - prometheus.boot-aws.onplural.sh
+    tls:
+      - secretName: prometheus-tls
+        hosts:
+          - prometheus.boot-aws.onplural.sh
+    servicePort: 9090
+    pathType: Prefix
+    paths:
+      - /
+      - /.*
+  prometheusSpec:
+    # expose the remote-write receiver endpoint so the workload clusters'
+    # prometheus agents can push their metrics to this instance
+    enableRemoteWriteReceiver: true
+    # and install the accompanying remote-write grafana dashboards
+    remoteWriteDashboards: true
+
+# monitored k8s components
+kubeApiServer:
+  enabled: true
+
+kubelet:
+  enabled: true
+
+coreDns:
+  enabled: true
+
+# already monitored with coreDns
+kubeDns:
+  enabled: false
+
+kubeProxy:
+  enabled: true
+
+kubeStateMetrics:
+  enabled: true
+
+kube-state-metrics:
+  fullnameOverride: kps-kube-state-metrics
+  selfMonitor:
+    enabled: true
+
+nodeExporter:
+  enabled: true
+
+prometheus-node-exporter:
+  fullnameOverride: kps-node-exporter
+  prometheus:
+    monitor:
+      enabled: true
+  resources:
+    requests:
+      memory: 512Mi
+      cpu: 250m
+    limits:
+      memory: 2048Mi
+
+# EKS hides metrics for controller manager, scheduler, and etcd
+# https://github.com/aws/eks-anywhere/issues/4405
+# disable kube controller manager scraping
+kubeControllerManager:
+  enabled: false
+
+# disable kube scheduler scraping
+kubeScheduler:
+  enabled: false
+
+kubeEtcd:
+  enabled: false
+
+
diff --git a/resources/monitoring/helm-values/loki.yaml b/resources/monitoring/helm-values/loki.yaml
new file mode 100644
index 0000000..f488a63
--- /dev/null
+++ b/resources/monitoring/helm-values/loki.yaml
@@ -0,0 +1,53 @@
+test:
+  enabled: false
+monitoring:
+  serviceMonitor:
+    enabled: true
+  selfMonitoring:
+    # disable self monitoring as we can't run it without the grafana agent operator, and we don't use grafana agent in this setup
+    enabled: false
+    grafanaAgent:
+      installOperator: false
+  lokiCanary:
+    enabled: false
+
+# Grafana Loki is a multi-tenant system; requests and data for tenant A are isolated from tenant B.
+# Requests to the Loki API should include an HTTP header (X-Scope-OrgID) that identifies the tenant for the request.
+# Tenant IDs can be any alphanumeric string that fits within the Go HTTP header limit (1MB).
+# Operators are recommended to use a reasonable limit for uniquely identifying tenants; 20 bytes is usually enough.
+# Loki defaults to running in multi-tenant mode.
+# Multi-tenant mode is set in the configuration with auth_enabled: true.
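+# In this setup promtail pushes logs with tenant_id "1" (helm-values/promtail.yaml)
+# and the grafana datasource sends the matching X-Scope-OrgID: '1' header
+# (services/kps-mgmt/grafana-datasource.yaml), so all logs land in a single tenant.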
+loki: + auth_enabled: true + +gateway: + enabled: true + image: + registry: docker.io + repository: nginxinc/nginx-unprivileged + tag: 1.24-alpine + pullPolicy: IfNotPresent + basicAuth: + enabled: true + existingSecret: basic-auth-loki + ingress: + enabled: true + ingressClassName: nginx + annotations: + kubernetes.io/tls-acme: "true" + cert-manager.io/cluster-issuer: letsencrypt-prod + nginx.ingress.kubernetes.io/force-ssl-redirect: 'true' + # can be used to enable basic auth at the ingress level as well + # nginx.ingress.kubernetes.io/auth-type: basic + # nginx.ingress.kubernetes.io/auth-secret: basic-auth + # nginx.ingress.kubernetes.io/auth-realm: 'Authentication Required' + hosts: + - host: loki.boot-aws.onplural.sh + paths: + - path: / + pathType: Prefix + tls: + - hosts: + - loki.boot-aws.onplural.sh + secretName: loki-tls + diff --git a/resources/monitoring/helm-values/loki.yaml.liquid b/resources/monitoring/helm-values/loki.yaml.liquid new file mode 100644 index 0000000..4e03469 --- /dev/null +++ b/resources/monitoring/helm-values/loki.yaml.liquid @@ -0,0 +1,19 @@ +loki: + storage: + bucketNames: + chunks: {{ contexts.loki.bucketName }} + ruler: {{ contexts.loki.bucketName }} + admin: {{ contexts.loki.bucketName }} + type: s3 + s3: + s3: s3://us-east-2 + region: us-east-2 + gateway: + basicAuth: + enabled: "true" + username: {{ configuration.user }} + password: {{ configuration.password }} + +serviceAccount: + annotations: + eks.amazonaws.com/role-arn: {{ contexts.loki.roleArn }} diff --git a/resources/monitoring/helm-values/opencost.yaml b/resources/monitoring/helm-values/opencost.yaml new file mode 100644 index 0000000..107e8ca --- /dev/null +++ b/resources/monitoring/helm-values/opencost.yaml @@ -0,0 +1,65 @@ +namespaceOverride: monitoring + +loglevel: info + +serviceAccount: + create: true + annotations: {} + # eks.amazonaws.com/role-arn: arn:aws:iam::123456789012:role/eksctl-opencost + +service: + enabled: true + +opencost: + metrics: + serviceMonitor: + # -- Create ServiceMonitor resource for scraping metrics using PrometheusOperator + enabled: true + + prometheus: + internal: + # -- Use in-cluster Prometheus + enabled: true + # -- Service name of in-cluster Prometheus + serviceName: monitoring-prometheus + # -- Namespace of in-cluster Prometheus + namespaceName: monitoring + # -- Service port of in-cluster Prometheus + port: 9090 + + ui: + # -- Enable OpenCost UI + enabled: true + resources: + # -- CPU/Memory resource requests + requests: + cpu: '10m' + memory: '55Mi' + # -- CPU/Memory resource limits + limits: + cpu: '999m' + memory: '1Gi' + uiPort: 9090 + ingress: + # -- Ingress for OpenCost UI + enabled: true + # -- Ingress controller which implements the resource + ingressClassName: nginx + # -- Annotations for Ingress resource + annotations: + kubernetes.io/tls-acme: "true" + cert-manager.io/cluster-issuer: letsencrypt-prod + nginx.ingress.kubernetes.io/force-ssl-redirect: 'true' + nginx.ingress.kubernetes.io/auth-type: basic + nginx.ingress.kubernetes.io/auth-secret: basic-auth + nginx.ingress.kubernetes.io/auth-realm: 'Authentication Required' + hosts: + - host: opencost.boot-aws.onplural.sh + paths: + - / + servicePort: http-ui + tls: + - secretName: opencost-tls + hosts: + - opencost.boot-aws.onplural.sh + \ No newline at end of file diff --git a/resources/monitoring/helm-values/promtail.yaml b/resources/monitoring/helm-values/promtail.yaml new file mode 100644 index 0000000..07e433d --- /dev/null +++ b/resources/monitoring/helm-values/promtail.yaml @@ -0,0 +1,23 
@@ +podAnnotations: + prometheus.io/scrape: "true" + prometheus.io/port: "3101" + prometheus.io/path: /metrics + prometheus.io/scheme: http +tolerations: +- effect: NoSchedule + operator: Exists +- effect: NoExecute + operator: Exists +serviceMonitor: + enabled: false # ignore for now so you can ensure installability +daemonset: +# -- Deploys Promtail as a DaemonSet + enabled: true +config: + clients: + - url: https://loki.boot-aws.onplural.sh/loki/api/v1/push + tenant_id: 1 + basic_auth: + username: "{{ .Values.lokiUsername }}" + password: "{{ .Values.lokiPassword }}" + diff --git a/resources/monitoring/helm-values/promtail.yaml.liquid b/resources/monitoring/helm-values/promtail.yaml.liquid new file mode 100644 index 0000000..d137999 --- /dev/null +++ b/resources/monitoring/helm-values/promtail.yaml.liquid @@ -0,0 +1,6 @@ +lokiUsername: {{ configuration.user }} +lokiPassword: {{ configuration.password }} + + +extraArgs: +- -client.external-labels=cluster={{ cluster.Handle }} \ No newline at end of file diff --git a/resources/monitoring/services/kps-agent-fleet/secret.yaml b/resources/monitoring/services/kps-agent-fleet/secret.yaml new file mode 100644 index 0000000..b6e53e3 --- /dev/null +++ b/resources/monitoring/services/kps-agent-fleet/secret.yaml @@ -0,0 +1,7 @@ +apiVersion: v1 +kind: Secret +metadata: + name: basic-auth-prom-agent +stringData: + basicAuthUser: {{ configuration.basicAuthUser }} + basicAuthPassword: {{ configuration.basicAuthPassword }} diff --git a/resources/monitoring/services/kps-agent-fleet/servicedeployment.yaml b/resources/monitoring/services/kps-agent-fleet/servicedeployment.yaml new file mode 100644 index 0000000..2b5400e --- /dev/null +++ b/resources/monitoring/services/kps-agent-fleet/servicedeployment.yaml @@ -0,0 +1,43 @@ +apiVersion: deployments.plural.sh/v1alpha1 +kind: ServiceDeployment +metadata: + name: kps-agent-fleet + namespace: monitoring +spec: + namespace: monitoring + git: + folder: helm-values + ref: logstack + configurationRef: + name: basic-auth-prom-agent + namespace: monitoring + repositoryRef: + kind: GitRepository + name: monitoring + namespace: monitoring + helm: + version: 57.1.0 + chart: kube-prometheus-stack + valuesFiles: + - kps-agent.yaml + - kps-agent.yaml.liquid + repository: + name: prometheus-community + namespace: monitoring + clusterRef: + kind: Cluster + name: boot-staging + namespace: infra +--- +# make global so it's also deployed on the other clusters +apiVersion: deployments.plural.sh/v1alpha1 +kind: GlobalService +metadata: + name: kps-agent-fleet + namespace: monitoring +spec: + tags: + stage: dev + serviceRef: + name: kps-agent-fleet + namespace: monitoring diff --git a/resources/monitoring/services/kps-mgmt/basicauth-prometheus.yaml b/resources/monitoring/services/kps-mgmt/basicauth-prometheus.yaml new file mode 100644 index 0000000..14dc392 --- /dev/null +++ b/resources/monitoring/services/kps-mgmt/basicauth-prometheus.yaml @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: Secret +metadata: + name: basic-auth-prom +stringData: + .htpasswd: {{ configuration.basicAuthHtpasswd }} + user: {{ configuration.basicAuthUser }} + password: {{ configuration.basicAuthPassword }} + # auth key used by prometheus ingress basic auth + auth: {{ configuration.basicAuthHtpasswd }} + diff --git a/resources/monitoring/services/kps-mgmt/grafana-datasource.yaml b/resources/monitoring/services/kps-mgmt/grafana-datasource.yaml new file mode 100644 index 0000000..a7152af --- /dev/null +++ 
b/resources/monitoring/services/kps-mgmt/grafana-datasource.yaml
@@ -0,0 +1,24 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: loki-grafana-datasource
+  namespace: monitoring
+  labels:
+    grafana_datasource: "1"
+data:
+  datasource-loki.yaml: |-
+    apiVersion: 1
+    datasources:
+      - name: Loki
+        type: loki
+        access: proxy
+        orgId: 1
+        uid: loki
+        url: http://loki-gateway.monitoring
+        basicAuth: true
+        basicAuthUser: user
+        secureJsonData:
+          basicAuthPassword: $LOKI_DATASOURCE_PASSWORD
+          httpHeaderValue1: '1'
+        jsonData:
+          httpHeaderName1: 'X-Scope-OrgID'
diff --git a/resources/monitoring/services/kps-mgmt/servicedeployment.yaml b/resources/monitoring/services/kps-mgmt/servicedeployment.yaml
new file mode 100644
index 0000000..0a9027f
--- /dev/null
+++ b/resources/monitoring/services/kps-mgmt/servicedeployment.yaml
@@ -0,0 +1,38 @@
+apiVersion: deployments.plural.sh/v1alpha1
+kind: ServiceDeployment
+metadata:
+  name: kps-mgmt
+  namespace: monitoring
+spec:
+  namespace: monitoring
+  git:
+    folder: helm-values
+    ref: logstack
+  repositoryRef:
+    kind: GitRepository
+    name: monitoring
+    namespace: monitoring
+  helm:
+    version: 57.1.0
+    chart: kube-prometheus-stack
+    valuesFiles:
+      - kps-mgmt.yaml
+      - kps-mgmt.liquid
+    repository:
+      name: prometheus-community
+      namespace: monitoring
+  clusterRef:
+    kind: Cluster
+    name: mgmt
+    namespace: infra
+#---
+## make global so it's also deployed on the other clusters
+#apiVersion: deployments.plural.sh/v1alpha1
+#kind: GlobalService
+#metadata:
+#  name: kube-prometheus-stack
+#  namespace: infra
+#spec:
+#  serviceRef:
+#    name: kube-prometheus-stack
+#    namespace: infra
diff --git a/resources/monitoring/services/loki-mgmt/servicedeployment.yaml b/resources/monitoring/services/loki-mgmt/servicedeployment.yaml
new file mode 100644
index 0000000..dc6cb08
--- /dev/null
+++ b/resources/monitoring/services/loki-mgmt/servicedeployment.yaml
@@ -0,0 +1,32 @@
+apiVersion: deployments.plural.sh/v1alpha1
+kind: ServiceDeployment
+metadata:
+  name: loki
+  namespace: monitoring
+spec:
+  namespace: monitoring
+  git:
+    folder: helm-values
+    ref: logstack
+  configurationRef:
+    name: basic-auth
+    namespace: monitoring
+  repositoryRef:
+    kind: GitRepository
+    name: monitoring
+    namespace: monitoring
+  contexts:
+    - loki # binds the loki context from the tf stack to this service
+  helm:
+    version: 5.43.6
+    chart: loki
+    valuesFiles:
+      - loki.yaml
+      - loki.yaml.liquid
+    repository:
+      name: grafana
+      namespace: monitoring
+  clusterRef:
+    kind: Cluster
+    name: mgmt
+    namespace: infra
diff --git a/resources/monitoring/services/opencost-mgmt/servicedeployment.yaml b/resources/monitoring/services/opencost-mgmt/servicedeployment.yaml
new file mode 100644
index 0000000..1ccec58
--- /dev/null
+++ b/resources/monitoring/services/opencost-mgmt/servicedeployment.yaml
@@ -0,0 +1,29 @@
+apiVersion: deployments.plural.sh/v1alpha1
+kind: ServiceDeployment
+metadata:
+  name: opencost
+  namespace: monitoring
+spec:
+  namespace: monitoring
+  git:
+    folder: helm-values
+    ref: logstack
+  repositoryRef:
+    kind: GitRepository
+    name: monitoring
+    namespace: monitoring
+  #contexts:
+  #  - opencost # binds the opencost context from the tf stack to this service
+  helm:
+    version: 1.31.0
+    chart: opencost
+    valuesFiles:
+      - opencost.yaml
+      #- opencost.yaml.liquid # no such file is included here; add one if you need templated values
+    repository:
+      name: opencost
+      namespace: monitoring
+  clusterRef:
+    kind: Cluster
+    name: mgmt
+    namespace: infra
diff --git a/resources/monitoring/services/promtail-fleet/basicauth-loki.yaml
b/resources/monitoring/services/promtail-fleet/basicauth-loki.yaml
new file mode 100644
index 0000000..7a2b256
--- /dev/null
+++ b/resources/monitoring/services/promtail-fleet/basicauth-loki.yaml
@@ -0,0 +1,12 @@
+apiVersion: v1
+kind: Secret
+metadata:
+  name: basic-auth-loki
+stringData:
+  # .htpasswd key used by loki
+  .htpasswd: {{ configuration.basicAuthHtpasswd }}
+  user: {{ configuration.basicAuthUser }}
+  password: {{ configuration.basicAuthPassword }}
+  # auth key used if you enable basic auth at the loki ingress
+  auth: {{ configuration.basicAuthHtpasswd }}
+
diff --git a/resources/monitoring/services/promtail-fleet/servicedeployment.yaml b/resources/monitoring/services/promtail-fleet/servicedeployment.yaml
new file mode 100644
index 0000000..efbf6a8
--- /dev/null
+++ b/resources/monitoring/services/promtail-fleet/servicedeployment.yaml
@@ -0,0 +1,43 @@
+apiVersion: deployments.plural.sh/v1alpha1
+kind: ServiceDeployment
+metadata:
+  name: promtail
+  namespace: monitoring
+spec:
+  namespace: monitoring
+  git:
+    folder: helm-values
+    ref: logstack
+  repositoryRef:
+    kind: GitRepository
+    name: monitoring
+    namespace: monitoring
+  configurationRef:
+    name: basic-auth
+    namespace: monitoring
+  helm:
+    version: 6.15.5
+    chart: promtail
+    valuesFiles:
+      - promtail.yaml
+      - promtail.yaml.liquid
+    repository:
+      name: grafana
+      namespace: monitoring
+  clusterRef:
+    kind: Cluster
+    name: mgmt
+    namespace: infra
+---
+# make global so it's also deployed on the other clusters
+apiVersion: deployments.plural.sh/v1alpha1
+kind: GlobalService
+metadata:
+  name: promtail
+  namespace: monitoring
+spec:
+  tags:
+    stage: dev
+  serviceRef:
+    name: promtail
+    namespace: monitoring
diff --git a/resources/monitoring/services/setup.yaml b/resources/monitoring/services/setup.yaml
new file mode 100644
index 0000000..1ecb69d
--- /dev/null
+++ b/resources/monitoring/services/setup.yaml
@@ -0,0 +1,7 @@
+apiVersion: deployments.plural.sh/v1alpha1
+kind: GitRepository
+metadata:
+  name: monitoring
+  namespace: monitoring
+spec:
+  url: https://github.com/pluralsh/bootstrap.git
diff --git a/resources/monitoring/terraform/cd.tf b/resources/monitoring/terraform/cd.tf
new file mode 100644
index 0000000..63c8566
--- /dev/null
+++ b/resources/monitoring/terraform/cd.tf
@@ -0,0 +1,74 @@
+locals {
+  #context  = yamldecode(data.local_sensitive_file.context.content)
+  repo_url = "https://github.com/pluralsh/bootstrap.git"
+}
+
+#data "local_sensitive_file" "context" {
+#  filename = "${path.module}/../../context.yaml"
+#}
+
+data "plural_cluster" "mgmt" {
+  handle = "mgmt"
+}
+
+// create the kubernetes namespace manually here so it can be used elsewhere w/in terraform w/o race conditions
+resource "kubernetes_namespace" "monitoring" {
+  metadata {
+    name = "monitoring"
+  }
+}
+
+resource "plural_git_repository" "monitoring" {
+  url     = local.repo_url
+  decrypt = false
+}
+
+resource "plural_service_deployment" "monitoring-helm-repositories" {
+  name      = "monitoring-helm-repositories"
+  namespace = kubernetes_namespace.monitoring.metadata[0].name
+  repository = {
+    id     = plural_git_repository.monitoring.id
+    ref    = "main"
+    folder = "resources/monitoring/helm-repositories"
+  }
+  cluster = {
+    id = data.plural_cluster.mgmt.id
+  }
+  configuration = {
+    namespace = kubernetes_namespace.monitoring.metadata[0].name
+  }
+  protect = false
+
+  depends_on = [kubernetes_namespace.monitoring]
+}
+
+resource "plural_service_deployment" "monitoring" {
+  name      = "monitoring"
+  namespace = kubernetes_namespace.monitoring.metadata[0].name
+  repository = {
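+    # point the root service-of-services at this repo's services folder; the
+    # configuration map below is what the {{ configuration.* }} references in
+    # the templated service manifests render from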
+ id = plural_git_repository.monitoring.id + ref = "main" + folder = "resources/monitoring/services" + } + cluster = { + id = data.plural_cluster.mgmt.id + } + + protect = false + + depends_on = [kubernetes_namespace.monitoring] + configuration = { + monitoringRepo = plural_git_repository.monitoring.id + repoUrl = local.repo_url + namespace = kubernetes_namespace.monitoring.metadata[0].name + } + + lifecycle { + ignore_changes = [ + configuration["basicAuthPassword"], + configuration["basicAuthUser"], + configuration["basicAuthHtpasswd"], + ] + } +} + diff --git a/resources/monitoring/terraform/persistence.tf b/resources/monitoring/terraform/persistence.tf new file mode 100644 index 0000000..7755ec6 --- /dev/null +++ b/resources/monitoring/terraform/persistence.tf @@ -0,0 +1,88 @@ +variable "acl" { + type = string + default = "private" +} + +variable "prefix" { + type = string + default = "loki" +} + +variable "enable_versioning" { + type = bool + default = false +} + +variable "cluster_name" { + type = string + default = "boot-test" +} + +variable "loki_service_account" { + type = string + default = "loki" +} + +variable "namespace" { + type = string + default = "monitoring" +} + +############################################### + +resource "aws_s3_bucket" "bucket" { + bucket = "${var.cluster_name}-${var.prefix}-storage" + #acl = var.acl # deprecated + force_destroy = true +} + + +resource "aws_iam_policy" "iam_policy" { + name_prefix = var.prefix + description = "policy for ${var.prefix} s3 access" + policy = data.aws_iam_policy_document.admin.json +} + +resource "aws_s3_bucket_versioning" "version" { + bucket = aws_s3_bucket.bucket.id + + versioning_configuration { + status = var.enable_versioning ? "Enabled" : "Disabled" + } +} + +data "aws_iam_policy_document" "admin" { + statement { + sid = "admin" + effect = "Allow" + actions = ["s3:*"] + + resources = concat( + ["arn:aws:s3:::${aws_s3_bucket.bucket.id}"], + ["arn:aws:s3:::${aws_s3_bucket.bucket.id}/*"] + ) + } +} + +#data "aws_eks_cluster" "cluster" { +# name = var.cluster_name +#} + +module "assumable_role_loki" { + source = "terraform-aws-modules/iam/aws//modules/iam-assumable-role-with-oidc" + version = "5.37.0" + create_role = true + role_name = "${var.cluster_name}-${var.loki_service_account}" + oidc_fully_qualified_subjects = ["system:serviceaccount:${var.namespace}:${var.loki_service_account}"] + provider_url = replace(data.aws_eks_cluster.cluster.identity[0].oidc[0].issuer, "https://", "") + role_policy_arns = [aws_iam_policy.iam_policy.arn] +} + +resource "plural_service_context" "loki" { + name = "loki" + configuration = { + roleArn = module.assumable_role_loki.iam_role_arn + bucketName = aws_s3_bucket.bucket.id + } +} + diff --git a/resources/monitoring/terraform/provider.tf b/resources/monitoring/terraform/provider.tf new file mode 100644 index 0000000..0da4b21 --- /dev/null +++ b/resources/monitoring/terraform/provider.tf @@ -0,0 +1,54 @@ +terraform { + required_version = ">= 1.0" + + #backend "s3" { + # bucket = "plrlupdem-tf-state" + # key = "up-demo/apps/terraform.tfstate" + # region = "us-east-2" + #} + + required_providers { + aws = { + source = "hashicorp/aws" + version = ">= 4.57" + } + kubernetes = { + source = "hashicorp/kubernetes" + version = ">= 2.10" + } + plural = { + source = "pluralsh/plural" + version = ">= 0.2.0" + } + } +} + +provider "aws" { + region = "us-east-2" +} + +data "aws_eks_cluster" "cluster" { + name = "boot-test" +} + +data "aws_eks_cluster_auth" "cluster" { + name = "boot-test" +} + 
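+# the kubernetes provider is only needed so terraform can read the console
+# token secret below, which the plural provider then authenticates with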
+provider "kubernetes" { + host = data.aws_eks_cluster.cluster.endpoint + cluster_ca_certificate = base64decode(data.aws_eks_cluster.cluster.certificate_authority.0.data) + token = data.aws_eks_cluster_auth.cluster.token +} + +data "kubernetes_secret" "console-auth" { + metadata { + name = "console-auth-token" + namespace = "plrl-console" + } +} + +provider "plural" { + console_url = "https://console.boot-aws.onplural.sh" + access_token = data.kubernetes_secret.console-auth.data.access-token +}
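
One assumption worth calling out at the end: the `plural` provider above reads its token from a pre-existing `console-auth-token` secret on the mgmt cluster, which is normally created when the Console itself is bootstrapped. If you're adapting this, that secret needs to look roughly like the following sketch; the token value is a placeholder, not a real credential.

```yaml
# Hypothetical shape of the secret consumed by data.kubernetes_secret.console-auth
# in provider.tf above.
apiVersion: v1
kind: Secret
metadata:
  name: console-auth-token
  namespace: plrl-console
stringData:
  access-token: <your plural console access token>   # placeholder
```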