Skip to content

Commit

Permalink
feat(redis): redis 元数据巡检 close #2177
Browse files Browse the repository at this point in the history
  • Loading branch information
mikluo authored and zhangzhw8 committed Nov 29, 2023
1 parent 4c66cb4 commit f7666f6
Show file tree
Hide file tree
Showing 9 changed files with 312 additions and 1 deletion.
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Generated by Django 3.2.19 on 2023-11-28 09:39

from django.db import migrations, models


class Migration(migrations.Migration):
    # Schema migration that redefines the `name` field of the
    # `asymmetriccipherkey` model.

    # Must be applied after the initial migration of the `encrypt` app.
    dependencies = [
        ("encrypt", "0001_initial"),
    ]

    operations = [
        # `name` becomes a CharField (max_length=128) restricted to three
        # choices: "password", "proxypass" and "cloud".
        migrations.AlterField(
            model_name="asymmetriccipherkey",
            name="name",
            field=models.CharField(
                choices=[("password", "平台密码的非对称秘钥"), ("proxypass", "透传接口的非对称秘钥"), ("cloud", "云区域服务的非对称秘钥")],
                max_length=128,
                verbose_name="密钥名称",
            ),
        ),
    ]
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
# -*- coding: utf-8 -*-
"""
TencentBlueKing is pleased to support the open source community by making 蓝鲸智云-DB管理系统(BlueKing-BK-DBM) available.
Copyright (C) 2017-2023 THL A29 Limited, a Tencent company. All rights reserved.
Licensed under the MIT License (the "License"); you may not use this file except in compliance with the License.
You may obtain a copy of the License at https://opensource.org/licenses/MIT
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
"""

import logging
from collections import defaultdict

from django.core.exceptions import ObjectDoesNotExist
from django.db.models import Q
from django.utils.translation import ugettext_lazy as _

from backend.db_meta.enums import ClusterType, InstanceRole, InstanceStatus
from backend.db_meta.models import Cluster
from backend.db_report.enums import MetaCheckSubType
from backend.db_report.models import MetaCheckReport

logger = logging.getLogger("root")


def check_redis_instance():
    """
    Public entry point of the Redis metadata inspection.

    Delegates all work to ``_check_redis_instance`` and, like it, yields no
    meaningful return value.
    """
    return _check_redis_instance()


def _check_redis_instance():
    """
    Inspect the metadata of every Redis cluster and record anomalies.

    Two families of anomalies are written to ``MetaCheckReport``:

    * isolated instances (subtype ``AloneInstance``): a cluster with fewer
      than two proxies, a master without a usable slave, a slave without a
      usable master;
    * abnormal status (subtype ``StatusAbnormal``): a storage or proxy
      instance whose status is not RUNNING.

    An anomaly is recorded and logged, and the inspection then carries on with
    the next instance. One broken cluster therefore never prevents the
    remaining instances and clusters from being inspected.
    """
    # tendisplus, ssd and cache clusters are inspected in the same pass
    # (original note: the inspection is launched at 00:00).
    query = (
        Q(cluster_type=ClusterType.TendisPredixyTendisplusCluster)
        | Q(cluster_type=ClusterType.TwemproxyTendisSSDInstance)
        | Q(cluster_type=ClusterType.TendisTwemproxyRedisInstance)
    )
    for c in Cluster.objects.filter(query):
        logger.info("+===+++++=== start check %s db meta +++++===++++ ", c.immute_domain)
        logger.info("+===+++++=== cluster type is: %s +++++===++++ ", c.cluster_type)

        _check_proxy_count(c)
        _check_masters_have_slaves(c)
        _check_slaves_have_masters(c)

        # Status check for storage instances, then for proxies.
        for instance_obj in c.storageinstance_set.filter():
            create_meta_statue_report(c, instance_obj)
        for instance_obj in c.proxyinstance_set.filter():
            create_meta_statue_report(c, instance_obj)


def _check_proxy_count(c):
    """Record an isolated-instance report when cluster ``c`` has fewer than two proxies."""
    proxy_count = c.proxyinstance_set.count()  # queried once, reused in the message
    if proxy_count >= 2:
        return
    msg = _("集群 {} proxy numbers 小于2, only {}").format(c.immute_domain, proxy_count)
    MetaCheckReport.objects.create(
        bk_biz_id=c.bk_biz_id,
        bk_cloud_id=c.bk_cloud_id,
        ip="none",
        cluster=c.immute_domain,
        cluster_type=c.cluster_type,
        status=False,
        msg=msg,
        subtype=MetaCheckSubType.AloneInstance.value,
    )


def _check_masters_have_slaves(c):
    """Record every master of cluster ``c`` whose slave is missing, ambiguous or on another port."""
    master_slave_map = {}  # master machine ip -> slave machine ip seen so far
    for master_obj in c.storageinstance_set.filter(instance_role=InstanceRole.REDIS_MASTER.value):
        try:
            slave_obj = master_obj.as_ejector.get().receiver
        except ObjectDoesNotExist:
            logger.error("Error occurred while getting slave_obj")
            msg = _("集群{}的master:{} 获取slave失败").format(c.immute_domain, master_obj)
            create_meta_alone_report(c, master_obj, msg)
            continue

        master_ip = master_obj.machine.ip
        slave_ip = slave_obj.machine.ip

        # One master machine replicating to several slave machines is not supported.
        known_slave_ip = master_slave_map.get(master_ip)
        if known_slave_ip and known_slave_ip != slave_ip:
            # Translate the template first, then fill it in; translating an
            # already-formatted string can never match a catalog entry.
            msg = _("unsupport mutil slave with cluster {} 4:{}").format(c.immute_domain, master_ip)
            logger.error(msg)
            create_meta_alone_report(c, master_obj, msg)
            continue
        master_slave_map[master_ip] = slave_ip

        # Master and slave are expected to listen on the same port.
        if master_obj.port != slave_obj.port:
            msg = _("集群{}的master实例:{} 没有slave").format(c.immute_domain, master_obj)
            create_meta_alone_report(c, master_obj, msg)


def _check_slaves_have_masters(c):
    """Record every slave of cluster ``c`` whose master is missing, ambiguous or on another port."""
    slave_master_map = {}  # slave machine ip -> master machine ip seen so far
    for slave_obj in c.storageinstance_set.filter(instance_role=InstanceRole.REDIS_SLAVE.value):
        try:
            master_obj = slave_obj.as_receiver.get().ejector
        except ObjectDoesNotExist:
            logger.error("Error occurred while getting master_obj")
            msg = _("集群{}的slave:{} 获取master失败").format(c.immute_domain, slave_obj)
            create_meta_alone_report(c, slave_obj, msg)
            continue

        slave_ip = slave_obj.machine.ip
        master_ip = master_obj.machine.ip

        # One slave machine replicating from several master machines is not supported.
        known_master_ip = slave_master_map.get(slave_ip)
        if known_master_ip and known_master_ip != master_ip:
            msg = _("unsupport mutil master with cluster {} 4:{}").format(c.immute_domain, slave_ip)
            logger.error(msg)
            create_meta_alone_report(c, slave_obj, msg)
            continue
        slave_master_map[slave_ip] = master_ip

        # Slave and master are expected to listen on the same port.
        if slave_obj.port != master_obj.port:
            msg = _("集群{}的slave实例:{} 没有master").format(c.immute_domain, slave_obj)
            create_meta_alone_report(c, slave_obj, msg)


def create_meta_statue_report(c, instance_obj):
    """
    Persist a status-abnormal report for an instance that is not RUNNING.

    :param c: cluster the instance belongs to
    :param instance_obj: storage or proxy instance to examine
    Nothing is written when the instance status equals RUNNING.
    """
    is_abnormal = instance_obj.status != InstanceStatus.RUNNING
    if not is_abnormal:
        return

    detail = _("集群{}的实例:{}实例状态异常:{}").format(c.immute_domain, instance_obj.ip_port, instance_obj.status)
    report_fields = {
        "bk_biz_id": c.bk_biz_id,
        "bk_cloud_id": c.bk_cloud_id,
        "ip": instance_obj.machine.ip,
        "port": instance_obj.port,
        "cluster": c.immute_domain,
        "cluster_type": c.cluster_type,
        "status": False,
        "msg": detail,
        "subtype": MetaCheckSubType.StatusAbnormal.value,
    }
    MetaCheckReport.objects.create(**report_fields)


def create_meta_alone_report(c, instance_obj, msg):
    """
    Persist an isolated-instance report row.

    :param c: cluster the instance belongs to
    :param instance_obj: instance whose machine ip and port are recorded
    :param msg: human-readable description stored with the report
    """
    report_fields = {
        "bk_biz_id": c.bk_biz_id,
        "bk_cloud_id": c.bk_cloud_id,
        "ip": instance_obj.machine.ip,
        "port": instance_obj.port,
        "cluster": c.immute_domain,
        "cluster_type": c.cluster_type,
        "status": False,
        "msg": msg,
        "subtype": MetaCheckSubType.AloneInstance.value,
    }
    MetaCheckReport.objects.create(**report_fields)
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from backend.db_periodic_task.local_tasks.register import register_periodic_task

from .check_instance_belong import check_instance_belong
from .check_redis_instance import check_redis_instance
from .check_replicate_role import check_replicate_role

logger = logging.getLogger("celery")
Expand All @@ -26,5 +27,6 @@ def db_meta_check_task():
"""
巡检校验元数据
"""
check_redis_instance()
check_instance_belong()
check_replicate_role()
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ def _check_tendis_binlog_backup():
# 如果节点维度没有数据,就不用在进行下面的了
# 这里如果提升为集群维度的话,一般会有40*10*24*3=28800个文件,所以按节点维度来查
if not bklogs:
msg = _("无法查找到在时间范围内{}-{},集群{}:{}的全备份日志").format(start_time, end_time, c.immute_domain, instance)
msg = _("无法查找到在时间范围内{}-{},集群{}:{}的binlog备份日志").format(start_time, end_time, c.immute_domain, instance)
logger.error(msg)
RedisBackupCheckReport.objects.create(
creator=c.creator,
Expand Down
2 changes: 2 additions & 0 deletions dbm-ui/backend/db_report/enums/meta_check_sub_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,5 @@ class MetaCheckSubType(str, StructuredEnum):
InstanceBelong = EnumField("instance_belong", _("实例集群归属"))
ReplicateRole = EnumField("replicate_role", _("数据同步实例角色"))
ClusterTopo = EnumField("cluster_topo", _("集群结构"))
AloneInstance = EnumField("alone_instance", _("孤立的实例"))
StatusAbnormal = EnumField("status_abnormal", _("不属于RUNNING状态"))
18 changes: 18 additions & 0 deletions dbm-ui/backend/db_report/mock_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,3 +126,21 @@
{"name": "create_at", "display_name": "心跳超时时间", "format": "text"},
],
}

# Example payload for the Redis-specific metadata check report (the generic
# metadata check needs these additional Redis checks): a paginated response
# with one result row plus the report name and column titles.
REDIS_META_CHECK_DATA = {
"count": 1,
"next": None,
"previous": None,
"results": [
{"bk_biz_id": 3, "cluster": "xx.xx.xx.xx", "cluster_type": "TwemproxyRedisInstance", "status": True, "msg": ""}
],
"name": "redis 元数据检查",
"title": [
{"name": "bk_biz_id", "display_name": "业务", "format": "text"},
{"name": "cluster", "display_name": "集群名", "format": "text"},
{"name": "cluster_type", "display_name": "集群类型", "format": "text"},
{"name": "status", "display_name": "元数据状态", "format": "status"},
{"name": "msg", "display_name": "详情", "format": "text"},
],
}
2 changes: 2 additions & 0 deletions dbm-ui/backend/db_report/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,6 @@
url("^redis_check/full_backup$", views.RedisFullBackupCheckReportViewSet.as_view({"get": "list"})),
url("^redis_check/binlog_backup$", views.RedisBinlogBackupCheckReportViewSet.as_view({"get": "list"})),
url("^dbmon/heartbeat$", views.DbmonHeatbeartCheckReportBaseViewSet.as_view({"get": "list"})),
url("^redis_meta_check/status_abnormal$", views.RedisStatusAbnormalCheckReportViewSet.as_view({"get": "list"})),
url("^redis_meta_check/alone_instance$", views.RedisAloneInstanceCheckReportViewSet.as_view({"get": "list"})),
]
1 change: 1 addition & 0 deletions dbm-ui/backend/db_report/views/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,5 @@
from .dbmon_heartbeat_view import DbmonHeatbeartCheckReportBaseViewSet
from .meta_check_view import MetaCheckReportInstanceBelongViewSet
from .mysqlbackup_check_view import MysqlBinlogBackupCheckReportViewSet, MysqlFullBackupCheckReportViewSet
from .redis_dbmeta_check_view import RedisAloneInstanceCheckReportViewSet, RedisStatusAbnormalCheckReportViewSet
from .redisbackup_check_view import RedisBinlogBackupCheckReportViewSet, RedisFullBackupCheckReportViewSet
106 changes: 106 additions & 0 deletions dbm-ui/backend/db_report/views/redis_dbmeta_check_view.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
"""
TencentBlueKing is pleased to support the open source community by making 蓝鲸智云-DB管理系统(BlueKing-BK-DBM) available.
Copyright (C) 2017-2023 THL A29 Limited, a Tencent company. All rights reserved.
Licensed under the MIT License (the "License"); you may not use this file except in compliance with the License.
You may obtain a copy of the License at https://opensource.org/licenses/MIT
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
"""


import logging

from django.utils.translation import ugettext_lazy as _
from rest_framework import serializers, status

from backend.bk_web.swagger import common_swagger_auto_schema
from backend.db_report import mock_data
from backend.db_report.enums import SWAGGER_TAG, MetaCheckSubType, ReportFieldFormat
from backend.db_report.models import MetaCheckReport
from backend.db_report.report_baseview import ReportBaseViewSet

logger = logging.getLogger("root")


class RedisDbmetaCheckReportSerializer(serializers.ModelSerializer):
    # Serializes MetaCheckReport rows for the Redis metadata check reports.
    class Meta:
        model = MetaCheckReport
        # Only these five columns are exposed; they match the entries of
        # `report_title` declared on the viewsets below.
        fields = ("bk_biz_id", "cluster", "cluster_type", "status", "msg")
        # Example shown in the generated swagger schema.
        swagger_schema_fields = {"example": mock_data.REDIS_META_CHECK_DATA}


class RedisDbmetaCheckReportBaseViewSet(ReportBaseViewSet):
    # Base viewset for the Redis metadata check reports; subclasses narrow
    # `queryset` to a single report subtype.
    queryset = MetaCheckReport.objects.all()
    serializer_class = RedisDbmetaCheckReportSerializer
    filter_fields = {  # in most cases the default filter does not need to be overridden
        "bk_biz_id": ["exact"],
        "cluster_type": ["exact", "in"],
        "create_at": ["gte", "lte"],
        "status": ["exact", "in"],
    }
    report_name = _("redis 元数据检查")
    # Column descriptors of the report: field name, translated display name
    # and rendering format.
    report_title = [
        {
            "name": "bk_biz_id",
            "display_name": _("业务"),
            "format": ReportFieldFormat.TEXT.value,
        },
        {
            "name": "cluster",
            "display_name": _("集群域名"),
            "format": ReportFieldFormat.TEXT.value,
        },
        {
            "name": "cluster_type",
            "display_name": _("集群类型"),
            "format": ReportFieldFormat.TEXT.value,
        },
        {
            "name": "status",
            "display_name": _("元数据状态"),
            "format": ReportFieldFormat.STATUS.value,
        },
        {
            "name": "msg",
            "display_name": _("详情"),
            "format": ReportFieldFormat.TEXT.value,
        },
    ]

    # Overridden so the swagger decorator can be attached; the listing itself
    # is delegated to the parent class after an info-level log line.
    @common_swagger_auto_schema(
        operation_summary=_("redis 元数据检查报告"),
        responses={status.HTTP_200_OK: RedisDbmetaCheckReportSerializer()},
        tags=[SWAGGER_TAG],
    )
    def list(self, request, *args, **kwargs):
        logger.info("list")
        return super().list(request, *args, **kwargs)


class RedisAloneInstanceCheckReportViewSet(RedisDbmetaCheckReportBaseViewSet):
    # Report restricted to rows whose subtype is AloneInstance (isolated instances).
    queryset = MetaCheckReport.objects.filter(subtype=MetaCheckSubType.AloneInstance.value)
    serializer_class = RedisDbmetaCheckReportSerializer
    report_name = _("孤立节点检查")

    # Overridden only to attach a report-specific swagger summary.
    @common_swagger_auto_schema(
        operation_summary=_("孤立节点检查报告"),
        responses={status.HTTP_200_OK: RedisDbmetaCheckReportSerializer()},
        tags=[SWAGGER_TAG],
    )
    def list(self, request, *args, **kwargs):
        return super().list(request, *args, **kwargs)


class RedisStatusAbnormalCheckReportViewSet(RedisDbmetaCheckReportBaseViewSet):
    # Report restricted to rows whose subtype is StatusAbnormal (instances not RUNNING).
    queryset = MetaCheckReport.objects.filter(subtype=MetaCheckSubType.StatusAbnormal.value)
    serializer_class = RedisDbmetaCheckReportSerializer
    report_name = _("实例状态异常检查")

    # Overridden only to attach a report-specific swagger summary.
    @common_swagger_auto_schema(
        operation_summary=_("实例状态异常检查"),
        responses={status.HTTP_200_OK: RedisDbmetaCheckReportSerializer()},
        tags=[SWAGGER_TAG],
    )
    def list(self, request, *args, **kwargs):
        return super().list(request, *args, **kwargs)

0 comments on commit f7666f6

Please sign in to comment.