From 913051163bc2dc16e42a73533e7074b4d8e1135d Mon Sep 17 00:00:00 2001 From: MoritzWeber Date: Tue, 24 Oct 2023 15:31:06 +0200 Subject: [PATCH] feat(k8s): Add Pod disruption budgets Kubernetes allows a lot flexibility when it comes to cluster and node updates. Cluster updates can happen at any time and can lead to massive interruptions (Guacamole not available) and session termination with potential data loss. Pod discruption budgets define under which circumstances the cluster operator can terminate or restart Pods. Our new policy is now: - For the management portal, at least one Pod has to be up&running at any time. - Session termination by the control plane is not allowed. The cluster operator has to contact the system administrator or wait until the session is terminated. Resolves #1089 --- .../capellacollab/sessions/operators/k8s.py | 64 ++++++++ .../tests/sessions/k8s_operator/conftest.py | 10 ++ .../test_session_k8s_download.py} | 42 +---- .../k8s_operator/test_session_k8s_operator.py | 153 ++++++++++++++++++ .../backend/backend.disruptionsbudget.yml | 12 ++ .../backend/backend.serviceaccount.yaml | 3 + .../backend/postgres.disruptionbudget.yml | 12 ++ .../templates/docs/docs.discruptionbudget.yml | 12 ++ .../frontend/frontend.disruptionbudget.yml | 12 ++ .../grafana/grafana.disruptionbudget.yml | 12 ++ .../grafana/nginx.disruptionbudget.yml | 12 ++ .../guacamole/guacamole.disruptionbudget.yml | 12 ++ .../guacamole/guacd.disruptionbudget.yml | 12 ++ .../guacamole/postgres.disruptionbudget.yml | 12 ++ .../prometheus/nginx.disruptionbudget.yml | 12 ++ .../prometheus.disruptionbudget.yml | 12 ++ .../routing/nginx.disruptionbudget.yml | 12 ++ 17 files changed, 375 insertions(+), 41 deletions(-) create mode 100644 backend/tests/sessions/k8s_operator/conftest.py rename backend/tests/sessions/{test_session_operator_k8s.py => k8s_operator/test_session_k8s_download.py} (57%) create mode 100644 backend/tests/sessions/k8s_operator/test_session_k8s_operator.py create mode 100644 helm/templates/backend/backend.disruptionsbudget.yml create mode 100644 helm/templates/backend/postgres.disruptionbudget.yml create mode 100644 helm/templates/docs/docs.discruptionbudget.yml create mode 100644 helm/templates/frontend/frontend.disruptionbudget.yml create mode 100644 helm/templates/grafana/grafana.disruptionbudget.yml create mode 100644 helm/templates/grafana/nginx.disruptionbudget.yml create mode 100644 helm/templates/guacamole/guacamole.disruptionbudget.yml create mode 100644 helm/templates/guacamole/guacd.disruptionbudget.yml create mode 100644 helm/templates/guacamole/postgres.disruptionbudget.yml create mode 100644 helm/templates/prometheus/nginx.disruptionbudget.yml create mode 100644 helm/templates/prometheus/prometheus.disruptionbudget.yml create mode 100644 helm/templates/routing/nginx.disruptionbudget.yml diff --git a/backend/capellacollab/sessions/operators/k8s.py b/backend/capellacollab/sessions/operators/k8s.py index 5fe8e51ef..e53edae60 100644 --- a/backend/capellacollab/sessions/operators/k8s.py +++ b/backend/capellacollab/sessions/operators/k8s.py @@ -96,6 +96,7 @@ def __init__(self) -> None: self.v1_apps = client.AppsV1Api(api_client=self.client) self.v1_batch = client.BatchV1Api(api_client=self.client) self.v1_networking = client.NetworkingV1Api(api_client=self.client) + self.v1_policy = client.PolicyV1Api(api_client=self.client) self._openshift = None @property @@ -200,6 +201,11 @@ def start_session( limits=limits, ) + self._create_disruption_budget( + name=_id, + deployment_name=_id, + ) + service = self._create_service( name=_id, deployment_name=_id, @@ -226,6 +232,13 @@ def kill_session(self, _id: str): "Deleted deployment %s with status %s", _id, dep_status.status ) + if disrupt_status := self._delete_disruptionbudget(name=_id): + log.info( + "Deleted Pod discruption budget %s with status %s", + _id, + disrupt_status.status, + ) + if loki_enabled and (conf_status := self._delete_config_map(name=_id)): log.info( "Deleted config map %s with status %s", _id, conf_status.status @@ -572,6 +585,7 @@ def _create_deployment( metadata=client.V1ObjectMeta(name=name), spec=client.V1DeploymentSpec( replicas=1, + strategy=client.V1DeploymentStrategy(type="Recreate"), selector=client.V1LabelSelector(match_labels={"app": name}), template=client.V1PodTemplateSpec( metadata=client.V1ObjectMeta( @@ -663,6 +677,41 @@ def _create_job( ) return self.v1_batch.create_namespaced_job(namespace, job) + def _create_disruption_budget( + self, + name: str, + deployment_name: str, + ) -> client.V1PodDisruptionBudget: + """Disallow any pod discription for the deployment + + If the deployment uses the recreate strategy together with + this budget, the cluster operator shall consult the administrator before + termination of the deployment. + + More information: + https://kubernetes.io/docs/tasks/run-application/configure-pdb/ + """ + + discruption_budget: client.V1PodDisruptionBudget = ( + client.V1PodDisruptionBudget( + kind="PodDisruptionBudget", + api_version="policy/v1", + metadata=client.V1ObjectMeta( + name=name, + labels={"app": name}, + ), + spec=client.V1PodDisruptionBudgetSpec( + max_unavailable=0, + selector=client.V1LabelSelector( + match_labels={"app": deployment_name} + ), + ), + ) + ) + return self.v1_policy.create_namespaced_pod_disruption_budget( + namespace, discruption_budget + ) + def _create_service( self, name: str, @@ -942,6 +991,21 @@ def _delete_service(self, name: str) -> client.V1Status | None: log.exception("Error deleting service with name: %s", name) return None + def _delete_disruptionbudget(self, name: str) -> client.V1Status | None: + try: + return self.v1_policy.delete_namespaced_pod_disruption_budget( + name, namespace + ) + except exceptions.ApiException as e: + # Pod disruption budge doesn't exist or was already deleted + # Nothing to do + if not e.status == http.HTTPStatus.NOT_FOUND: + log.exception( + "Error deleting discruptionbudget with name: %s", name + ) + + return None + def _delete_ingress(self, name: str) -> client.V1Status | None: try: return self.v1_networking.delete_namespaced_ingress( diff --git a/backend/tests/sessions/k8s_operator/conftest.py b/backend/tests/sessions/k8s_operator/conftest.py new file mode 100644 index 000000000..e1304d391 --- /dev/null +++ b/backend/tests/sessions/k8s_operator/conftest.py @@ -0,0 +1,10 @@ +# SPDX-FileCopyrightText: Copyright DB Netz AG and the capella-collab-manager contributors +# SPDX-License-Identifier: Apache-2.0 + +import kubernetes.config +import pytest + + +@pytest.fixture(autouse=True) +def mock_k8s_load_config(monkeypatch): + monkeypatch.setattr(kubernetes.config, "load_config", lambda **_: None) diff --git a/backend/tests/sessions/test_session_operator_k8s.py b/backend/tests/sessions/k8s_operator/test_session_k8s_download.py similarity index 57% rename from backend/tests/sessions/test_session_operator_k8s.py rename to backend/tests/sessions/k8s_operator/test_session_k8s_download.py index 968f3cd2a..c4d03e914 100644 --- a/backend/tests/sessions/test_session_operator_k8s.py +++ b/backend/tests/sessions/k8s_operator/test_session_k8s_download.py @@ -2,9 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 import base64 -import os -import kubernetes.config import pytest from capellacollab.sessions.operators.k8s import ( @@ -12,12 +10,6 @@ lazy_b64decode, ) - -@pytest.fixture(autouse=True) -def mock_k8s_load_config(monkeypatch): - monkeypatch.setattr(kubernetes.config, "load_config", lambda **_: None) - - hello = base64.b64encode(b"hello") # aGVsbG8= @@ -36,7 +28,7 @@ def test_lazy_b64_decode(): ) -def test_download_file(monkeypatch): +def test_download_file(monkeypatch: pytest.MonkeyPatch): mock_stream = MockStream([hello.decode("utf-8")]) monkeypatch.setattr( "kubernetes.stream.stream", lambda *a, **ka: mock_stream @@ -65,35 +57,3 @@ def is_open(self): def read_stdout(self, timeout=None): return self._blocks.pop(0) - - -def test_create_job(monkeypatch): - monkeypatch.setattr("kubernetes.config.load_config", lambda **_: None) - operator = KubernetesOperator() - monkeypatch.setattr( - operator.v1_batch, "create_namespaced_job", lambda namespace, job: None - ) - result = operator.create_job( - image="fakeimage", - command="fakecmd", - labels={"key": "value"}, - environment={"ENVVAR": "value"}, - ) - - assert result - - -def test_create_cronjob(monkeypatch): - operator = KubernetesOperator() - monkeypatch.setattr( - operator.v1_batch, - "create_namespaced_cron_job", - lambda namespace, job: None, - ) - result = operator.create_cronjob( - image="fakeimage", - command="fakecmd", - environment={"ENVVAR": "value"}, - ) - - assert result diff --git a/backend/tests/sessions/k8s_operator/test_session_k8s_operator.py b/backend/tests/sessions/k8s_operator/test_session_k8s_operator.py new file mode 100644 index 000000000..3c1680d80 --- /dev/null +++ b/backend/tests/sessions/k8s_operator/test_session_k8s_operator.py @@ -0,0 +1,153 @@ +# SPDX-FileCopyrightText: Copyright DB Netz AG and the capella-collab-manager contributors +# SPDX-License-Identifier: Apache-2.0 + + +import datetime + +import pytest +from kubernetes import client +from kubernetes.client import exceptions + +from capellacollab.sessions.operators import k8s + + +def test_start_session(monkeypatch: pytest.MonkeyPatch): + operator = k8s.KubernetesOperator() + monkeypatch.setattr(k8s, "loki_enabled", False) + + name = "testname" + creation_timestamp = datetime.datetime.now() + + deployment_counter = 0 + service_counter = 0 + disruption_budget_counter = 0 + + def create_namespaced_deployment(namespace, deployment): + nonlocal deployment_counter + deployment_counter += 1 + return client.V1Deployment( + metadata=client.V1ObjectMeta( + name=name, creation_timestamp=creation_timestamp + ) + ) + + monkeypatch.setattr( + operator.v1_apps, + "create_namespaced_deployment", + create_namespaced_deployment, + ) + + def create_namespaced_service(namespace, service): + nonlocal service_counter + service_counter += 1 + return client.V1Service(metadata=client.V1ObjectMeta(name=name)) + + monkeypatch.setattr( + operator.v1_core, + "create_namespaced_service", + create_namespaced_service, + ) + + def create_namespaced_pod_disruption_budget(namespace, budget): + nonlocal disruption_budget_counter + disruption_budget_counter += 1 + + monkeypatch.setattr( + operator.v1_policy, + "create_namespaced_pod_disruption_budget", + create_namespaced_pod_disruption_budget, + ) + + session = operator.start_session( + image="hello-world", + username="testuser", + session_type="persistent", + tool_name="test tool", + version_name="test version", + environment={}, + ports={"rdp": 3389}, + volumes=[], + ) + + assert deployment_counter == 1 + assert service_counter == 1 + assert disruption_budget_counter == 1 + + assert session["id"] == "testname" + + +def test_kill_session(monkeypatch: pytest.MonkeyPatch): + operator = k8s.KubernetesOperator() + monkeypatch.setattr(k8s, "loki_enabled", False) + + monkeypatch.setattr( + operator.v1_apps, + "delete_namespaced_deployment", + lambda namespace, name: client.V1Status(), + ) + + monkeypatch.setattr( + operator.v1_core, + "delete_namespaced_service", + lambda namespace, name: client.V1Status(), + ) + + monkeypatch.setattr( + operator.v1_policy, + "delete_namespaced_pod_disruption_budget", + lambda namespace, name: client.V1Status(), + ) + + operator.kill_session("testname") + + +def test_create_job(monkeypatch: pytest.MonkeyPatch): + operator = k8s.KubernetesOperator() + monkeypatch.setattr( + operator.v1_batch, "create_namespaced_job", lambda namespace, job: None + ) + result = operator.create_job( + image="fakeimage", + command="fakecmd", + labels={"key": "value"}, + environment={"ENVVAR": "value"}, + ) + + assert result + + +def test_create_cronjob(monkeypatch: pytest.MonkeyPatch): + operator = k8s.KubernetesOperator() + monkeypatch.setattr( + operator.v1_batch, + "create_namespaced_cron_job", + lambda namespace, job: None, + ) + result = operator.create_cronjob( + image="fakeimage", + command="fakecmd", + environment={"ENVVAR": "value"}, + ) + + assert result + + +def test_delete_disruption_budget_with_api_error( + monkeypatch: pytest.MonkeyPatch, +): + """Test that _delete_disruptionbudget does not raise an exception if the + Pod disruption budget does not exist. + """ + + operator = k8s.KubernetesOperator() + + def raise_api_exception(*args, **kwargs): + raise exceptions.ApiException(status=404) + + monkeypatch.setattr( + operator.v1_policy, + "delete_namespaced_pod_disruption_budget", + raise_api_exception, + ) + result = operator._delete_disruptionbudget("testname") + assert result is None diff --git a/helm/templates/backend/backend.disruptionsbudget.yml b/helm/templates/backend/backend.disruptionsbudget.yml new file mode 100644 index 000000000..337999fdb --- /dev/null +++ b/helm/templates/backend/backend.disruptionsbudget.yml @@ -0,0 +1,12 @@ +# SPDX-FileCopyrightText: Copyright DB Netz AG and the capella-collab-manager contributors +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: {{ .Release.Name }}-backend +spec: + minAvailable: 1 + selector: + matchLabels: + id: {{ .Release.Name }}-deployment-backend diff --git a/helm/templates/backend/backend.serviceaccount.yaml b/helm/templates/backend/backend.serviceaccount.yaml index f2b4bc2b7..042b00aa5 100644 --- a/helm/templates/backend/backend.serviceaccount.yaml +++ b/helm/templates/backend/backend.serviceaccount.yaml @@ -39,6 +39,9 @@ rules: - apiGroups: [""] resources: ["secrets"] verbs: ["create", "delete"] +- apiGroups: ["policy"] + resources: ["poddisruptionbudgets"] + verbs: ["create", "delete"] {{- if eq .Values.cluster.kind "Kubernetes" }} - apiGroups: ["networking.k8s.io"] resources: ["ingresses"] diff --git a/helm/templates/backend/postgres.disruptionbudget.yml b/helm/templates/backend/postgres.disruptionbudget.yml new file mode 100644 index 000000000..5a6b4e13a --- /dev/null +++ b/helm/templates/backend/postgres.disruptionbudget.yml @@ -0,0 +1,12 @@ +# SPDX-FileCopyrightText: Copyright DB Netz AG and the capella-collab-manager contributors +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: {{ .Release.Name }}-backend-postgres +spec: + minAvailable: 1 + selector: + matchLabels: + id: {{ .Release.Name }}-deployment-backend-postgres diff --git a/helm/templates/docs/docs.discruptionbudget.yml b/helm/templates/docs/docs.discruptionbudget.yml new file mode 100644 index 000000000..296c588bd --- /dev/null +++ b/helm/templates/docs/docs.discruptionbudget.yml @@ -0,0 +1,12 @@ +# SPDX-FileCopyrightText: Copyright DB Netz AG and the capella-collab-manager contributors +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: {{ .Release.Name }}-docs +spec: + minAvailable: 1 + selector: + matchLabels: + id: {{ .Release.Name }}-deployment-docs diff --git a/helm/templates/frontend/frontend.disruptionbudget.yml b/helm/templates/frontend/frontend.disruptionbudget.yml new file mode 100644 index 000000000..607e5c0ad --- /dev/null +++ b/helm/templates/frontend/frontend.disruptionbudget.yml @@ -0,0 +1,12 @@ +# SPDX-FileCopyrightText: Copyright DB Netz AG and the capella-collab-manager contributors +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: {{ .Release.Name }}-frontend +spec: + minAvailable: 1 + selector: + matchLabels: + id: {{ .Release.Name }}-deployment-frontend diff --git a/helm/templates/grafana/grafana.disruptionbudget.yml b/helm/templates/grafana/grafana.disruptionbudget.yml new file mode 100644 index 000000000..fc5e64faa --- /dev/null +++ b/helm/templates/grafana/grafana.disruptionbudget.yml @@ -0,0 +1,12 @@ +# SPDX-FileCopyrightText: Copyright DB Netz AG and the capella-collab-manager contributors +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: {{ .Release.Name }}-grafana +spec: + minAvailable: 1 + selector: + matchLabels: + id: {{ .Release.Name }}-deployment-grafana-server diff --git a/helm/templates/grafana/nginx.disruptionbudget.yml b/helm/templates/grafana/nginx.disruptionbudget.yml new file mode 100644 index 000000000..b8491f519 --- /dev/null +++ b/helm/templates/grafana/nginx.disruptionbudget.yml @@ -0,0 +1,12 @@ +# SPDX-FileCopyrightText: Copyright DB Netz AG and the capella-collab-manager contributors +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: {{ .Release.Name }}-grafana-nginx +spec: + minAvailable: 1 + selector: + matchLabels: + id: {{ .Release.Name }}-deployment-grafana-nginx diff --git a/helm/templates/guacamole/guacamole.disruptionbudget.yml b/helm/templates/guacamole/guacamole.disruptionbudget.yml new file mode 100644 index 000000000..c1ea47979 --- /dev/null +++ b/helm/templates/guacamole/guacamole.disruptionbudget.yml @@ -0,0 +1,12 @@ +# SPDX-FileCopyrightText: Copyright DB Netz AG and the capella-collab-manager contributors +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: {{ .Release.Name }}-guacamole-guacamole +spec: + minAvailable: 1 + selector: + matchLabels: + id: {{ .Release.Name }}-deployment-guacamole-guacamole diff --git a/helm/templates/guacamole/guacd.disruptionbudget.yml b/helm/templates/guacamole/guacd.disruptionbudget.yml new file mode 100644 index 000000000..11d188c95 --- /dev/null +++ b/helm/templates/guacamole/guacd.disruptionbudget.yml @@ -0,0 +1,12 @@ +# SPDX-FileCopyrightText: Copyright DB Netz AG and the capella-collab-manager contributors +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: {{ .Release.Name }}-guacamole-guacd +spec: + minAvailable: 1 + selector: + matchLabels: + id: {{ .Release.Name }}-deployment-guacamole-guacd diff --git a/helm/templates/guacamole/postgres.disruptionbudget.yml b/helm/templates/guacamole/postgres.disruptionbudget.yml new file mode 100644 index 000000000..eddb1d2ec --- /dev/null +++ b/helm/templates/guacamole/postgres.disruptionbudget.yml @@ -0,0 +1,12 @@ +# SPDX-FileCopyrightText: Copyright DB Netz AG and the capella-collab-manager contributors +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: {{ .Release.Name }}-guacamole-postgres +spec: + minAvailable: 1 + selector: + matchLabels: + id: {{ .Release.Name }}-deployment-guacamole-postgres diff --git a/helm/templates/prometheus/nginx.disruptionbudget.yml b/helm/templates/prometheus/nginx.disruptionbudget.yml new file mode 100644 index 000000000..9c8b96719 --- /dev/null +++ b/helm/templates/prometheus/nginx.disruptionbudget.yml @@ -0,0 +1,12 @@ +# SPDX-FileCopyrightText: Copyright DB Netz AG and the capella-collab-manager contributors +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: {{ .Release.Name }}-prometheus-nginx +spec: + minAvailable: 1 + selector: + matchLabels: + id: {{ .Release.Name }}-deployment-prometheus-nginx diff --git a/helm/templates/prometheus/prometheus.disruptionbudget.yml b/helm/templates/prometheus/prometheus.disruptionbudget.yml new file mode 100644 index 000000000..c1ffa72e2 --- /dev/null +++ b/helm/templates/prometheus/prometheus.disruptionbudget.yml @@ -0,0 +1,12 @@ +# SPDX-FileCopyrightText: Copyright DB Netz AG and the capella-collab-manager contributors +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: {{ .Release.Name }}-prometheus-server +spec: + minAvailable: 1 + selector: + matchLabels: + id: {{ .Release.Name }}-deployment-prometheus-server diff --git a/helm/templates/routing/nginx.disruptionbudget.yml b/helm/templates/routing/nginx.disruptionbudget.yml new file mode 100644 index 000000000..59e13594e --- /dev/null +++ b/helm/templates/routing/nginx.disruptionbudget.yml @@ -0,0 +1,12 @@ +# SPDX-FileCopyrightText: Copyright DB Netz AG and the capella-collab-manager contributors +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: {{ .Release.Name }}-nginx +spec: + minAvailable: 1 + selector: + matchLabels: + id: {{ .Release.Name }}-deployment-nginx