From 4e5372dbb2d251ac57e139bfdcffdc9a0663399d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nils=20M=C3=BCller?= Date: Thu, 16 Mar 2023 22:26:55 +0100 Subject: [PATCH] feat: implemented pod deletion cronjob --- .editorconfig | 16 +++++ .gitignore | 14 ++++ .markdownlint.yaml | 23 ++++++ .pre-commit-config.yaml | 104 ++++++++++++++++++++++++++++ .prettierignore | 0 .prettierrc.yaml | 5 ++ .yamllint.yaml | 30 ++++++++ README.md | 33 ++++++++- manifests/cluster-role-binding.yaml | 13 ++++ manifests/cluster-role.yaml | 11 +++ manifests/cronjob.yaml | 30 ++++++++ manifests/flux-helmrelease.yaml | 58 ++++++++++++++++ manifests/service-account.yaml | 6 ++ 13 files changed, 342 insertions(+), 1 deletion(-) create mode 100644 .editorconfig create mode 100644 .gitignore create mode 100644 .markdownlint.yaml create mode 100644 .pre-commit-config.yaml create mode 100644 .prettierignore create mode 100644 .prettierrc.yaml create mode 100644 .yamllint.yaml create mode 100644 manifests/cluster-role-binding.yaml create mode 100644 manifests/cluster-role.yaml create mode 100644 manifests/cronjob.yaml create mode 100644 manifests/flux-helmrelease.yaml create mode 100644 manifests/service-account.yaml diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..bf56a9c --- /dev/null +++ b/.editorconfig @@ -0,0 +1,16 @@ +# Editor configuration, see https://editorconfig.org +root = true + +[*] +charset = utf-8 +indent_style = space +indent_size = 2 +insert_final_newline = true +trim_trailing_whitespace = true + +[*.md] +max_line_length = off +trim_trailing_whitespace = false + +[Makefile] +indent_style = tab diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..cc3c40a --- /dev/null +++ b/.gitignore @@ -0,0 +1,14 @@ +# Editors +.vscode/ +.idea/ + +# OS artifacts +.DS_Store +Thumbs.db + +# vscode-sops +.decrypted~*.yaml + +# Env files +*.envrc +*.env diff --git a/.markdownlint.yaml b/.markdownlint.yaml new file mode 100644 index 0000000..f34ac61 --- 
/dev/null +++ b/.markdownlint.yaml @@ -0,0 +1,23 @@ +--- +default: true + +# MD013/line-length - Line length +MD013: + # Number of characters + line_length: 240 + # Number of characters for headings + heading_line_length: 80 + # Number of characters for code blocks + code_block_line_length: 120 + # Include code blocks + code_blocks: true + # Include tables + tables: true + # Include headings + headings: true + # Include headings + headers: true + # Strict length checking + strict: false + # Stern length checking + stern: false diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..2e385b9 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,104 @@ +--- +fail_fast: false +default_stages: + - commit + - push + +repos: + - repo: https://github.com/thlorenz/doctoc + rev: v2.2.0 + hooks: + - id: doctoc + args: + - --update-only + - --maxlevel + - "3" + - --github + - --notitle + + - repo: https://github.com/antonbabenko/pre-commit-terraform + rev: v1.77.1 + hooks: + - id: terraform_docs + args: + - --hook-config=--path-to-file=README.md + - --args=--config=.terraform-docs.yaml + - ./infra/terraform + - id: terraform_fmt + - id: terraform_tflint + args: + - --args=--config=__GIT_WORKING_DIR__/.tflint.hcl + # - id: terraform_tfsec + # args: + # - --args=--config-file=__GIT_WORKING_DIR__/.tfsec.yaml + + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.4.0 + hooks: + - id: check-merge-conflict + - id: check-added-large-files + args: + - --maxkb=100 + - id: check-case-conflict + - id: check-executables-have-shebangs + - id: check-json + - id: check-symlinks + - id: check-xml + - id: detect-private-key + - id: end-of-file-fixer + - id: fix-byte-order-marker + - id: mixed-line-ending + args: + - --fix=auto + - id: trailing-whitespace + args: + - --markdown-linebreak-ext=md + + - repo: https://github.com/adrienverge/yamllint + rev: v1.29.0 + hooks: + - id: yamllint + args: + - --config-file + - .yamllint.yaml + + - 
repo: https://github.com/Lucas-C/pre-commit-hooks + rev: v1.4.2 + hooks: + - id: remove-crlf + - id: remove-tabs + + - repo: https://github.com/sirosen/texthooks + rev: 0.5.0 + hooks: + - id: fix-smartquotes + - id: fix-ligatures + - id: forbid-bidi-controls + + - repo: https://github.com/igorshubovych/markdownlint-cli + rev: v0.33.0 + hooks: + - id: markdownlint-fix + args: + - --config + - .markdownlint.yaml + + - repo: https://github.com/pre-commit/mirrors-prettier + rev: v3.0.0-alpha.4 + hooks: + - id: prettier + args: + - --ignore-path + - .prettierignore + - --config + - .prettierrc.yaml + + - repo: https://github.com/k8s-at-home/sops-pre-commit + rev: v2.1.1 + hooks: + - id: forbid-secrets + + - repo: https://github.com/zricethezav/gitleaks + rev: v8.12.0 + hooks: + - id: gitleaks diff --git a/.prettierignore b/.prettierignore new file mode 100644 index 0000000..e69de29 diff --git a/.prettierrc.yaml b/.prettierrc.yaml new file mode 100644 index 0000000..e30d9f9 --- /dev/null +++ b/.prettierrc.yaml @@ -0,0 +1,5 @@ +--- +trailingComma: "es5" +tabWidth: 2 +semi: false +singleQuote: false diff --git a/.yamllint.yaml b/.yamllint.yaml new file mode 100644 index 0000000..100b666 --- /dev/null +++ b/.yamllint.yaml @@ -0,0 +1,30 @@ +--- +# see https://yamllint.readthedocs.io/en/stable/index.html for more options +extends: default +rules: + truthy: + allowed-values: ["true", "false", "on", "yes"] + + line-length: + max: 120 + level: warning + + braces: + min-spaces-inside: 0 + max-spaces-inside: 1 + + brackets: + min-spaces-inside: 0 + max-spaces-inside: 0 + + indentation: + spaces: 2 + indent-sequences: consistent + check-multi-line-strings: false + + document-start: + present: true + level: error + + comments: + min-spaces-from-content: 1 diff --git a/README.md b/README.md index da48bfc..b67dded 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,33 @@ -# i-see-dead-pods +# I see dead Pods + Get rid of `Pod was terminated in response to imminent node shutdown.` Pods 
forever. + +## Story + +In Kubernetes with graceful node shutdown enabled, Pods can stay in a `broken` state for long periods of time. This state causes alerts to be fired by kube-prometheus-stack. + +Most of the `solutions` on the internet describe an uncontrolled deletion of all Pods in `Error` or `Terminated` state. +I consider this a bad idea, because you will no longer see whether real `Error` Pods are in your system. + +These manifests provide a Kubernetes `CronJob` that continuously deletes all Pods matching the given criteria. + +## Setup + +### kubectl + +You can apply the manifests manually: + +```console +kubectl apply -f https://raw.githubusercontent.com/tyriis/i-see-dead-pods/main/manifests/service-account.yaml +kubectl apply -f https://raw.githubusercontent.com/tyriis/i-see-dead-pods/main/manifests/cluster-role.yaml +kubectl apply -f https://raw.githubusercontent.com/tyriis/i-see-dead-pods/main/manifests/cluster-role-binding.yaml +kubectl apply -f https://raw.githubusercontent.com/tyriis/i-see-dead-pods/main/manifests/cronjob.yaml +``` + +### kustomize + +or with kustomize + +### flux helmrelease + +or with flux diff --git a/manifests/cluster-role-binding.yaml b/manifests/cluster-role-binding.yaml new file mode 100644 index 0000000..09f44c4 --- /dev/null +++ b/manifests/cluster-role-binding.yaml @@ -0,0 +1,13 @@ +--- +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: system:i-see-dead-pods +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: system:i-see-dead-pods +subjects: + - kind: ServiceAccount + name: i-see-dead-pods + namespace: kube-system diff --git a/manifests/cluster-role.yaml b/manifests/cluster-role.yaml new file mode 100644 index 0000000..b42207f --- /dev/null +++ b/manifests/cluster-role.yaml @@ -0,0 +1,11 @@ +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + annotations: + rbac.authorization.kubernetes.io/autoupdate: "true" + name: system:i-see-dead-pods +rules: 
+ - apiGroups: [""] + resources: ["pods"] + verbs: ["list", "get", "delete"] diff --git a/manifests/cronjob.yaml b/manifests/cronjob.yaml new file mode 100644 index 0000000..a7599cc --- /dev/null +++ b/manifests/cronjob.yaml @@ -0,0 +1,30 @@ +--- +apiVersion: batch/v1 +kind: CronJob +metadata: + name: i-see-dead-pods + namespace: kube-system +spec: + schedule: "* * * * *" + concurrencyPolicy: Forbid + jobTemplate: + spec: + backoffLimit: 0 + template: + spec: + serviceAccountName: i-see-dead-pods + containers: + - name: kubectl + image: ghcr.io/k8s-at-home/kubectl:v1.25.4 + command: + - /bin/sh + - -ec + - | + kubectl get pods \ + --all-namespaces \ + -o go-template \ + --template='{{range .items}}{{printf "%s %s %s\n" .metadata.namespace .metadata.name .status.message}}{{end}}' \ + | grep "Pod was terminated in response to imminent node shutdown." \ + | awk '{print $1, $2}' \ + | xargs -n2 kubectl delete pod -n || true + restartPolicy: OnFailure diff --git a/manifests/flux-helmrelease.yaml b/manifests/flux-helmrelease.yaml new file mode 100644 index 0000000..0b20893 --- /dev/null +++ b/manifests/flux-helmrelease.yaml @@ -0,0 +1,58 @@ +--- +apiVersion: helm.toolkit.fluxcd.io/v2beta1 +kind: HelmRelease +metadata: + name: i-see-dead-pods + namespace: kube-system +spec: + interval: 15m + chart: + spec: + chart: app-template + version: 1.3.2 + interval: 15m + sourceRef: + kind: HelmRepository + name: bjw-s-charts + namespace: flux-system + maxHistory: 15 + install: + createNamespace: true + remediation: + retries: 3 + upgrade: + cleanupOnFail: true + remediation: + retries: 3 + uninstall: + keepHistory: false + values: + controller: + type: cronjob + cronjob: + schedule: "* * * * *" + ttlSecondsAfterFinished: 60 + restartPolicy: OnFailure + concurrencyPolicy: Forbid + successfulJobsHistoryLimit: 1 + failedJobsHistoryLimit: 1 + image: + repository: ghcr.io/k8s-at-home/kubectl + tag: v1.25.4 + command: + - /bin/sh + - -ec + - | + kubectl get pods \ + --all-namespaces \ + 
-o go-template \ + --template='{{range .items}}{{printf "%s %s %s\n" .metadata.namespace .metadata.name .status.message}}{{end}}' \ + | grep "Pod was terminated in response to imminent node shutdown." \ + | awk '{print $1, $2}' \ + | xargs -n2 kubectl delete pod -n || true + resources: + requests: + cpu: 10m + memory: 10Mi + limits: + memory: 10Mi diff --git a/manifests/service-account.yaml b/manifests/service-account.yaml new file mode 100644 index 0000000..6adf569 --- /dev/null +++ b/manifests/service-account.yaml @@ -0,0 +1,6 @@ +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: i-see-dead-pods + namespace: kube-system