Skip to content

Commit

Permalink
feat(backend): 故障自愈框架 #3742
Browse files Browse the repository at this point in the history
  • Loading branch information
zhangzhw8 committed Apr 2, 2024
1 parent 04c825b commit debc9ca
Show file tree
Hide file tree
Showing 12 changed files with 348 additions and 1 deletion.
6 changes: 6 additions & 0 deletions dbm-ui/backend/db_monitor/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,9 @@ class BkMonitorDeleteAlarmException(DBMonitorBaseException):
ERROR_CODE = "202"
MESSAGE = _("监控策略删除失败")
MESSAGE_TPL = _("监控策略删除失败: {message}")


class AutofixException(DBMonitorBaseException):
    """Error raised when handling a fault self-healing (autofix) request fails."""

    # Module-local error code; the preceding code visible in this file is "202".
    ERROR_CODE = "203"
    # Default message, and the template used when a detail message is supplied.
    MESSAGE = _("故障自愈异常")
    MESSAGE_TPL = _("故障自愈异常: {message}")
173 changes: 173 additions & 0 deletions dbm-ui/backend/db_monitor/mock_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -397,3 +397,176 @@
],
"rule_count": 1,
}

# Sample body of an alarm callback request (a disk-usage anomaly notice).
# The labels include "need_autofix" plus a ticket type name ("REDIS_CLUSTER_AUTOFIX").
# NOTE(review): event.dimensions.bk_host_id is 1 while every other bk_host_id in this
# payload is 387 -- confirm whether the mismatch is intentional.
CALLBACK_REQUEST = {
    "appointees": "admin,leader",
    "callback_message": {
        "type": "anomaly_notice",
        "scenario": "os",
        "bk_biz_id": 3,
        "bk_biz_name": "dba",
        "event": {
            "id": "171170504652154",
            "event_id": "171170504652154",
            "is_shielded": False,
            "begin_time": "2024-03-29 09:33:00",
            "create_time": "2024-03-29 09:37:26",
            "end_time": None,
            "level": 1,
            "level_name": "致命",
            "agg_dimensions": [
                "instance_role",
                "device_name",
                "app",
                "bk_target_ip",
                "cluster_domain",
                "bk_target_cloud_id",
                "appid",
            ],
            "dimensions": {
                "cluster_domain": "example.domain.db",
                "bk_target_ip": "127.0.0.1",
                "appid": "3",
                "instance_role": "backend_slave",
                "app": "dba",
                "device_name": "/dev/vda1",
                "bk_target_cloud_id": "0",
                "bk_topo_node": ["biz|3", "set|5", "module|1195"],
                "bk_host_id": 1,
            },
            "dimension_translation": {
                "cluster_domain": {
                    "value": "example.domain.db",
                    "display_name": "dbm_meta cluster domain",
                    "display_value": "example.domain.db",
                },
                "bk_target_ip": {"value": "127.0.0.1", "display_name": "目标ip", "display_value": "127.0.0.1"},
                "appid": {"value": "3", "display_name": "dbm_meta app id", "display_value": "3"},
                "instance_role": {
                    "value": "backend_slave",
                    "display_name": "dbm_meta instance role",
                    "display_value": "backend_slave",
                },
                "app": {"value": "dba", "display_name": "dbm_meta app", "display_value": "dba"},
                "device_name": {"value": "/dev/vda1", "display_name": "设备名", "display_value": "/dev/vda1"},
                "bk_target_cloud_id": {"value": "0", "display_name": "云区域id", "display_value": "0"},
                "bk_topo_node": {
                    "value": ["biz|3", "set|5", "module|1195"],
                    "display_name": "拓扑节点",
                    "display_value": [
                        {"bk_obj_name": "业务", "bk_inst_name": "dba"},
                        {"bk_obj_name": "集群", "bk_inst_name": "db.mysql.mysql"},
                        {"bk_obj_name": "模块", "bk_inst_name": "example.domain.db"},
                    ],
                },
                "bk_host_id": {"value": 387, "display_name": "主机", "display_value": "127.0.0.1"},
            },
        },
        "strategy": {
            "id": 46650,
            "name": "mysql 主机磁盘空间使用率cc",
            "scenario": "os",
            "item_list": [
                {
                    "metric_field": "in_use",
                    "metric_field_name": "avg(磁盘空间使用率)",
                    "data_source_label": "bk_monitor",
                    "data_source_name": "监控平台",
                    "data_type_label": "time_series",
                    "data_type_name": "时序",
                    "metric_id": "bk_monitor.dbm_system.disk.in_use",
                }
            ],
        },
        "latest_anomaly_record": {
            "anomaly_id": "cc3d7ce5fb78f1f0b928b65f5e536bfa",
            "source_time": "2024-03-29 09:36:00",
            "create_time": "2024-03-29 09:37:25",
            "origin_alarm": {
                "trigger_time": 1711705045,
                "data": {
                    "time": 1711704960,
                    "value": 9.759688,
                    "values": {"in_use": 0, "_result_": 9.759688, "time": 1711704960},
                    "dimensions": {
                        "cluster_domain": "example.domain.db",
                        "bk_target_ip": "127.0.0.1",
                        "appid": "3",
                        "instance_role": "backend_slave",
                        "app": "dba",
                        "device_name": "/dev/vda1",
                        "bk_target_cloud_id": "0",
                        "bk_topo_node": ["biz|3", "set|5", "module|1195"],
                        "bk_host_id": 387,
                    },
                    "record_id": "ce266d85bd01a4a50ba33d0acde845a0.1711704960",
                    "dimension_fields": [
                        "cluster_domain",
                        "bk_target_ip",
                        "appid",
                        "instance_role",
                        "app",
                        "device_name",
                        "bk_target_cloud_id",
                    ],
                    "access_time": 1711705045.0090528,
                    "detect_time": 1711705045.618694,
                },
                "trigger": {
                    "level": "1",
                    "anomaly_ids": [
                        "ce266d85bd01a4a50ba33d0acde845a0.1711704720.46650.46650.1",
                        "ce266d85bd01a4a50ba33d0acde845a0.1711704780.46650.46650.1",
                        "ce266d85bd01a4a50ba33d0acde845a0.1711704840.46650.46650.1",
                        "ce266d85bd01a4a50ba33d0acde845a0.1711704900.46650.46650.1",
                        "ce266d85bd01a4a50ba33d0acde845a0.1711704960.46650.46650.1",
                    ],
                },
                "anomaly": {
                    "1": {
                        "anomaly_message": "avg(磁盘空间使用率) >= 2.0%, 当前值9.759688%",
                        "anomaly_id": "ce266d85bd01a4a50ba33d0acde845a0.1711704960.46650.46650.1",
                        "anomaly_time": "2024-03-29 09:37:25",
                    },
                    "2": {
                        "anomaly_message": "avg(磁盘空间使用率) >= 1.0%, 当前值9.759688%",
                        "anomaly_id": "ce266d85bd01a4a50ba33d0acde845a0.1711704960.46650.46650.2",
                        "anomaly_time": "2024-03-29 09:37:25",
                    },
                },
                "dimension_translation": {
                    "cluster_domain": {
                        "value": "example.domain.db",
                        "display_name": "dbm_meta cluster domain",
                        "display_value": "example.domain.db",
                    },
                    "bk_target_ip": {"value": "127.0.0.1", "display_name": "目标ip", "display_value": "127.0.0.1"},
                    "appid": {"value": "3", "display_name": "dbm_meta app id", "display_value": "3"},
                    "instance_role": {
                        "value": "backend_slave",
                        "display_name": "dbm_meta instance role",
                        "display_value": "backend_slave",
                    },
                    "app": {"value": "dba", "display_name": "dbm_meta app", "display_value": "dba"},
                    "device_name": {"value": "/dev/vda1", "display_name": "设备名", "display_value": "/dev/vda1"},
                    "bk_target_cloud_id": {"value": "0", "display_name": "云区域id", "display_value": "0"},
                    "bk_topo_node": {
                        "value": ["biz|3", "set|5", "module|1195"],
                        "display_name": "拓扑节点",
                        "display_value": [
                            {"bk_obj_name": "业务", "bk_inst_name": "dba"},
                            {"bk_obj_name": "集群", "bk_inst_name": "db.mysql.mysql"},
                            {"bk_obj_name": "模块", "bk_inst_name": "example.domain.db"},
                        ],
                    },
                    "bk_host_id": {"value": 387, "display_name": "主机", "display_value": "127.0.0.1"},
                },
                "strategy_snapshot_key": "bk_monitorv3.ce.cache.strategy.snapshot.46650.1711704968",
            },
        },
        "current_value": 9.759688,
        "description": "已持续7m, avg(磁盘空间使用率) >= 2.0%, 当前值9.759688%",
        "related_info": "",
        "labels": ["dbm_mysql", "mysql", "dbm", "need_autofix", "REDIS_CLUSTER_AUTOFIX"],
    },
}
36 changes: 36 additions & 0 deletions dbm-ui/backend/db_monitor/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,12 @@
from backend.db_meta.enums import ClusterType
from backend.db_monitor import mock_data
from backend.db_monitor.constants import AlertLevelEnum, DetectAlgEnum, OperatorEnum, TargetLevel
from backend.db_monitor.exceptions import AutofixException
from backend.db_monitor.mock_data import CALLBACK_REQUEST
from backend.db_monitor.models import CollectTemplate, MonitorPolicy, NoticeGroup, RuleTemplate
from backend.db_monitor.models.alarm import DutyRule
from backend.db_periodic_task.constants import NoticeSignalEnum
from backend.ticket.constants import TicketType


class GetDashboardSerializer(serializers.Serializer):
Expand Down Expand Up @@ -209,3 +212,36 @@ class ListClusterSerializer(serializers.Serializer):

# Declares no fields of its own: everything is inherited unchanged from ListClusterSerializer.
class ListModuleSerializer(ListClusterSerializer):
    pass


class AlarmCallBackDataSerializer(serializers.Serializer):
    """Validate an alarm callback body and derive the autofix ticket creator and type.

    After the standard field validation, ``to_internal_value`` adds two keys:

    * ``creator``: the first entry of the comma-separated ``appointees``.
    * ``ticket_type``: the label that is also a ``TicketType`` value.

    Raises:
        AutofixException: if the labels do not contain ``need_autofix``, or if
            no label matches a ``TicketType`` value.
    """

    class CallBackMessageSerializer(serializers.Serializer):
        """The ``callback_message`` part of the callback body."""

        event = serializers.DictField(help_text=_("告警事件"))
        strategy = serializers.DictField(help_text=_("监控策略"))
        latest_anomaly_record = serializers.DictField(help_text=_("最新异常点信息"))
        labels = serializers.ListSerializer(help_text=_("标签"), child=serializers.CharField())

    appointees = serializers.CharField(help_text=_("告警负责人"))
    callback_message = CallBackMessageSerializer(help_text=_("回调消息体"))

    class Meta:
        swagger_schema_fields = {"example": CALLBACK_REQUEST}

    def to_internal_value(self, data):
        data = super().to_internal_value(data)

        # The first alarm appointee becomes the ticket creator.
        data["creator"] = data["appointees"].split(",")[0]

        # Only strategies explicitly labelled "need_autofix" are self-healed.
        labels = data["callback_message"].get("labels") or []
        if "need_autofix" not in labels:
            raise AutofixException(_("此策略无需进行故障自愈"))

        # Resolve the ticket type from the labels. The known types are fetched
        # once instead of once per label; when several labels match, the last
        # one wins (same outcome as assigning inside a loop without a break).
        known_ticket_types = set(TicketType.get_values())
        matched_types = [label for label in labels if label in known_ticket_types]
        if not matched_types:
            raise AutofixException(_("未匹配到对应的故障自愈处理单据,请确认"))
        data["ticket_type"] = matched_types[-1]
        return data
19 changes: 19 additions & 0 deletions dbm-ui/backend/db_monitor/views/policy.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
)
from ...iam_app.handlers.drf_perm.monitor import MonitorPolicyPermission
from ...iam_app.handlers.permission import Permission
from ...ticket.models import Ticket
from .. import constants
from ..models import MonitorPolicy

Expand Down Expand Up @@ -261,3 +262,21 @@ def db_module_list(self, request, *args, **kwargs):
"db_module_id", "db_module_name"
)
)

@common_swagger_auto_schema(
    operation_summary=_("告警策略回调(处理套餐、故障自愈)"),
    tags=[constants.SWAGGER_TAG],
    request_body=serializers.AlarmCallBackDataSerializer,
)
@action(
    methods=["POST"],
    detail=False,
    serializer_class=serializers.AlarmCallBackDataSerializer,
)
def callback(self, request, *args, **kwargs):
    # Alarm strategy callback: builds a ticket from the validated callback body
    # and returns whatever Ticket.create_ticket_from_bk_monitor produces.
    #
    # TODO: the monitoring callback is meant to be authenticated with the Bearer
    # token sent in the Authorization header; that check is not enforced yet.
    # The token is a credential, so it must never be printed or logged.
    return Response(Ticket.create_ticket_from_bk_monitor(self.validated_data))
10 changes: 10 additions & 0 deletions dbm-ui/backend/homepage/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
"""
import json
import logging

from blueapps.account.decorators import login_exempt
from django.conf import settings
from django.contrib import auth
Expand All @@ -23,6 +26,8 @@
from backend import env
from backend.version_log.utils import get_latest_version

logger = logging.getLogger("root")


class HomeView(APIView):
template_name = "index.html"
Expand All @@ -49,6 +54,11 @@ def get(self, request):

@login_exempt
def ping(request):
    """Liveness probe that always answers ``pong``.

    For POST requests the body is parsed as JSON and written to the log on a
    best-effort basis; a body that cannot be parsed is ignored so the probe
    itself never fails because of it.
    """
    if request.method == "POST":
        try:
            logger.info(json.loads(request.body))
        except (TypeError, ValueError):
            # json.JSONDecodeError (malformed JSON) and UnicodeDecodeError
            # (undecodable bytes) are both ValueError subclasses.
            pass
    return HttpResponse("pong")


Expand Down
9 changes: 9 additions & 0 deletions dbm-ui/backend/ticket/builders/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,7 @@ class TicketFlowBuilder:
ticket_type = None
group = None
serializer = None
alarm_transform_serializer = None

# 默认的参数构造器
inner_flow_name: str = ""
Expand Down Expand Up @@ -387,10 +388,18 @@ def init_ticket_flows(self):
Flow.objects.bulk_create(flows)
return list(Flow.objects.filter(ticket=self.ticket))

def transform_alarm_to_ticket_details(self):
    """Convert a monitoring event into ticket details.

    No-op here. NOTE(review): the original wording said "monitoring time",
    presumably a typo for "monitoring event" -- confirm.
    """

def patch_ticket_detail(self):
    """Hook for supplementing the ticket details; left for subclasses to implement."""

def alarm_callback_to_ticket_detail(self):
    """Convert an alarm callback into ticket details; does nothing in this class."""

@classmethod
def _add_itsm_pause_describe(cls, flow_desc, flow_config_map):
if flow_config_map[cls.ticket_type]["need_itsm"]:
Expand Down
10 changes: 10 additions & 0 deletions dbm-ui/backend/ticket/builders/cloud/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# -*- coding: utf-8 -*-
"""
TencentBlueKing is pleased to support the open source community by making 蓝鲸智云-DB管理系统(BlueKing-BK-DBM) available.
Copyright (C) 2017-2023 THL A29 Limited, a Tencent company. All rights reserved.
Licensed under the MIT License (the "License"); you may not use this file except in compliance with the License.
You may obtain a copy of the License at https://opensource.org/licenses/MIT
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
"""
35 changes: 34 additions & 1 deletion dbm-ui/backend/ticket/builders/redis/redis_toolbox_autofix.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@
from django.utils.translation import ugettext_lazy as _
from rest_framework import serializers

from backend.db_meta.enums import InstanceRole
from backend.db_meta.models import Cluster, Machine
from backend.db_monitor.serializers import AlarmCallBackDataSerializer
from backend.db_services.dbbase.constants import IpSource
from backend.flow.engine.controller.redis import RedisController
from backend.ticket import builders
Expand All @@ -36,10 +39,39 @@ class HostInfoSerializer(serializers.Serializer):
proxy = serializers.ListField(help_text=_("proxy列表"), child=HostInfoSerializer(), required=False)
redis_slave = serializers.ListField(help_text=_("slave列表"), child=HostInfoSerializer(), required=False)

ip_source = serializers.ChoiceField(help_text=_("主机来源"), choices=IpSource.get_choices())
ip_source = serializers.ChoiceField(
help_text=_("主机来源"), choices=IpSource.get_choices(), default=IpSource.RESOURCE_POOL.value
)
infos = serializers.ListField(help_text=_("批量操作参数列表"), child=InfoSerializer())


class RedisClusterAutofixAlarmTransformSerializer(AlarmCallBackDataSerializer):
    """Turn a validated alarm callback into the details of a Redis autofix ticket.

    The cluster is looked up by the ``cluster_domain`` dimension and the machine
    by the ``bk_host_id`` dimension of the alarm event. The machine is reported
    under ``proxy`` when the ``instance_role`` dimension is the Redis proxy role,
    otherwise under ``redis_slave``.
    """

    # TODO: this is only an example; the concrete logic and scenarios are to be
    # implemented by the DBAs of each component.
    def to_internal_value(self, data):
        """Return ``{"infos": [...]}`` built from the alarm event dimensions.

        Raises:
            serializers.ValidationError: if a required dimension is missing, or
                if no cluster / machine matches the alarm dimensions.
        """
        data = super().to_internal_value(data)

        # The payload comes from an external callback, so missing keys and
        # unknown objects are reported as validation errors rather than
        # escaping as KeyError / DoesNotExist.
        try:
            dimensions = data["callback_message"]["event"]["dimensions"]
            cluster = Cluster.objects.get(immute_domain=dimensions["cluster_domain"])
            machine = Machine.objects.get(bk_host_id=dimensions["bk_host_id"])
            is_proxy = dimensions["instance_role"] == InstanceRole.REDIS_PROXY
        except KeyError as err:
            raise serializers.ValidationError(_("告警维度缺少必要字段: {}").format(err)) from err
        except (Cluster.DoesNotExist, Machine.DoesNotExist) as err:
            raise serializers.ValidationError(_("未找到告警维度对应的集群或主机: {}").format(err)) from err

        # Exactly one of the two lists carries the host; each gets its own list
        # object instead of both names aliasing a single shared one.
        host_info = [{"ip": machine.ip, "spec_id": machine.spec_id}]
        proxy = host_info if is_proxy else []
        redis_slave = [] if is_proxy else host_info

        return {
            "infos": [
                {
                    "cluster_id": cluster.id,
                    "bk_cloud_id": cluster.bk_cloud_id,
                    "proxy": proxy,
                    "redis_slave": redis_slave,
                }
            ]
        }


class RedisClusterAutofixParamBuilder(builders.FlowParamBuilder):
controller = RedisController.redis_cluster_auotfix_scene

Expand All @@ -56,6 +88,7 @@ def post_callback(self):
@builders.BuilderFactory.register(TicketType.REDIS_CLUSTER_AUTOFIX, is_apply=True)
class RedisClusterAutofixFlowBuilder(RedisClusterCutOffFlowBuilder):
serializer = RedisClusterAutofixDetailSerializer
alarm_transform_serializer = RedisClusterAutofixAlarmTransformSerializer
inner_flow_builder = RedisClusterAutofixParamBuilder
inner_flow_name = _("故障自愈")
resource_batch_apply_builder = RedisClusterAutofixResourceParamBuilder
Expand Down
Loading

0 comments on commit debc9ca

Please sign in to comment.