Skip to content

Commit

Permalink
fix: riak监控增加指标_增加实例重启_调整dbconfig获取方式 #1576
Browse files Browse the repository at this point in the history
  • Loading branch information
fanfanyangyang authored and xfwduke committed Nov 6, 2023
1 parent 2553309 commit c24e765
Show file tree
Hide file tree
Showing 24 changed files with 439 additions and 39 deletions.
1 change: 1 addition & 0 deletions dbm-services/mysql/db-tools/dbactuator/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ require (
github.com/TylerBrock/colorjson v0.0.0-20200706003622-8a50f05110d2
github.com/dustin/go-humanize v1.0.1
github.com/go-sql-driver/mysql v1.7.1
github.com/golang/glog v1.1.2
github.com/jmoiron/sqlx v1.3.5
github.com/mitchellh/go-ps v1.0.0
github.com/pkg/errors v0.9.1
Expand Down
2 changes: 2 additions & 0 deletions dbm-services/mysql/db-tools/dbactuator/go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ github.com/go-test/deep v1.0.3 h1:ZrJSEWsXzPOxaZnFteGEfooLba+ju3FYIbOrS+rQd68=
github.com/go-test/deep v1.0.3/go.mod h1:wGDj63lr65AM2AQyKZd/NYHGb0R+1RLqB8NKt3aSFNA=
github.com/golang-jwt/jwt/v4 v4.5.0 h1:7cYmW1XlMY7h7ii7UhUyChSgS5wUJEnm9uZVTGqOWzg=
github.com/golang-jwt/jwt/v4 v4.5.0/go.mod h1:m21LjoU+eqJr34lmDMbreY2eSTRJ1cv77w39/MY0Ch0=
github.com/golang/glog v1.1.2 h1:DVjP2PbBOzHyzA+dn3WhHIq4NdVu3Q+pvivFICf/7fo=
github.com/golang/glog v1.1.2/go.mod h1:zR+okUeTbrL6EL3xHUDxZuEtGv04p5shwip1+mL/rLQ=
github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38=
github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ func NewRiakCommand() *cobra.Command {
NewUninstallCommand(),
NewStopCommand(),
NewStartCommand(),
NewRestartCommand(),
NewDeployMonitorCommand(),
NewStartMonitorCommand(),
NewStopMonitorCommand(),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,11 @@ func (d *RemoveNodeAct) Run() error {
FunName: "环境预检查",
Func: d.Payload.PreCheck,
},
// 在剔除状态异常的状态为"down!"节点前,先down节点
{
FunName: "down异常节点,保障集群ring正常",
Func: d.Payload.MarkInvalidNodeDown,
},
{
FunName: "集群剔除节点",
Func: d.Payload.RemoveNode,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
package riakcmd

import (
"fmt"

"dbm-services/common/go-pubpkg/logger"
"dbm-services/riak/db-tools/dbactuator/internal/subcmd"
"dbm-services/riak/db-tools/dbactuator/pkg/components/riak"
"dbm-services/riak/db-tools/dbactuator/pkg/util"

"github.com/spf13/cobra"
)

// RestartAct bundles the dbactuator options and payload used by the
// "riak restart" subcommand.
type RestartAct struct {
// Embedded base options; provides Validate and DeserializeAndValidate.
*subcmd.BaseOptions
// Payload is the restart component that is filled by Init and executed by Run.
Payload riak.RestartComp
}

// NewRestartCommand builds the cobra "restart" subcommand for riak.
//
// When executed, the command validates the base options, deserializes the
// payload and runs the restart steps, handing each step's error to
// util.CheckErr in that order.
func NewRestartCommand() *cobra.Command {
	restartAct := RestartAct{BaseOptions: subcmd.GBaseOptions}

	// The handler ignores cobra's arguments; all input comes from the base options.
	runRestart := func(_ *cobra.Command, _ []string) {
		util.CheckErr(restartAct.Validator())
		util.CheckErr(restartAct.Init())
		util.CheckErr(restartAct.Run())
	}

	return &cobra.Command{
		Use:     "restart",
		Short:   "重启节点",
		Example: fmt.Sprintf("dbactuator riak restart %s", subcmd.CmdBaseExampleStr),
		Run:     runRestart,
	}
}

// Validator checks the command's base options by delegating to
// BaseOptions.Validate and returns whatever error that call reports.
func (d *RestartAct) Validator() error {
return d.BaseOptions.Validate()
}

// Init deserializes the command input into d.Payload and validates it.
// A failure is logged and then returned unchanged to the caller.
func (d *RestartAct) Init() error {
	err := d.DeserializeAndValidate(&d.Payload)
	if err == nil {
		return nil
	}
	logger.Error("DeserializeAndValidate err %s", err.Error())
	return err
}

// Run executes the restart workflow, which consists of a single step that
// calls Payload.Restart. It returns the step error, or nil after logging
// success.
func (d *RestartAct) Run() error {
	restartSteps := subcmd.Steps{
		{FunName: "重启节点", Func: d.Payload.Restart},
	}

	err := restartSteps.Run()
	if err != nil {
		return err
	}

	logger.Info("restart success")
	return nil
}
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"dbm-services/riak/db-tools/dbactuator/pkg/core/cst"
"dbm-services/riak/db-tools/dbactuator/pkg/util/osutil"
"fmt"
"strings"

"golang.org/x/exp/slog"
)
Expand All @@ -29,10 +30,23 @@ type CheckConnectionsRunTimeCtx struct {

// CheckConnections 集群数据搬迁进度检查
func (i *CheckConnectionsComp) CheckConnections() error {
cmd := fmt.Sprintf(`netstat -anpl|grep %d | grep "beam.smp" | grep -E -v '0.0.0.0:*'`, cst.DefaultProtobufPort)
localIp, err := osutil.GetLocalIP()
if err != nil {
logger.Error("get local ip error: %s", err.Error())
return err
}
// 剔除蓝鲸监控的探活进程
cmd := fmt.Sprintf(`netstat -anpl|grep ':%d' | grep "beam.smp" | grep -E -v '0.0.0.0:*'`, cst.DefaultProtobufPort)
cmd = fmt.Sprintf(`%s | awk '{ if (index($5,"%s:") == 0) { print $0 } }' `, cmd, localIp)
res, err := osutil.ExecShellCommand(false, cmd)
// 没有连接
if err != nil {
	// Wrap with %w rather than passing the message as the format string:
	// fmt.Errorf(errInfo) would misinterpret any '%' in the command or error text.
	wrapped := fmt.Errorf("execute [ %s ] error: %w", cmd, err)
	slog.Error(wrapped.Error())
	return wrapped
}
// Strip spaces first, then strip newlines from that result. The second call
// must operate on content (not res), otherwise the space removal is discarded.
content := strings.Replace(res, " ", "", -1)
content = strings.Replace(content, "\n", "", -1)
if content == "" {
slog.Info("check success, no connection")
return nil
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"dbm-services/riak/db-tools/dbactuator/pkg/util/osutil"
"fmt"
"strings"
"time"
)

// RemoveNodeComp TODO
Expand Down Expand Up @@ -76,6 +77,7 @@ func (i *RemoveNodeComp) MarkInvalidNodeDown() error {
return fmt.Errorf("execute shell [%s] error: %s", cmd, err.Error())
}
logger.Info("execute shell [%s] success", cmd)
time.Sleep(10 * time.Second)
}
}
return nil
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
// Package riak TODO
/*
* @Description: 重启 Riak (restart a Riak node)
*/
package riak

import (
"dbm-services/common/go-pubpkg/logger"
"dbm-services/riak/db-tools/dbactuator/pkg/util/osutil"
"fmt"
"time"
)

// RestartComp is the component that restarts a riak node.
type RestartComp struct {
// Params is read from the "extend" key of the JSON input.
Params *RestartParam `json:"extend"`
// Embedded runtime context; excluded from JSON (de)serialization.
RestartRunTimeCtx `json:"-"`
}

// RestartParam holds the input parameters of the restart action.
// It currently declares no fields.
type RestartParam struct {
}

// RestartRunTimeCtx is the runtime context of the restart action.
// It currently declares no fields.
type RestartRunTimeCtx struct {
}

// Restart restarts the local riak node by running "riak restart" through the
// shell, then sleeps for one minute before reporting success.
//
// When the shell command fails, the failure is logged and returned as a
// wrapped error, so callers can still inspect the underlying cause with
// errors.Is / errors.As. The message text is unchanged.
func (i *RestartComp) Restart() error {
	cmd := "riak restart"
	if _, err := osutil.ExecShellCommand(false, cmd); err != nil {
		logger.Error("execute shell [%s] error: %s", cmd, err.Error())
		// %w keeps the original error in the chain; %s would flatten it to text.
		return fmt.Errorf("execute shell [%s] error: %w", cmd, err)
	}
	// Fixed pause after the command returns — presumably to let the node come
	// back up before the step is reported as finished; TODO confirm.
	time.Sleep(time.Minute)
	logger.Info("restart riak success")
	return nil
}
46 changes: 32 additions & 14 deletions dbm-services/riak/db-tools/riak-monitor/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,13 +57,14 @@ type MonitorItem struct {

# 监控项

| 监控项 |调度计划| 机器类型 | 实例角色 |级别| 说明 |自定义|
|--------------------------|-----|----------------|-----------------|-----|-----------------------------|-----|
| riak-err-notice |@every 1m| riak | | 预警 | 预警错误日志 |schedule, enable
| riak-db-up |@every 10s| backend, proxy | | 致命 | db 连通性. 硬编码, 不可配置, 无需录入配置系统 |enable
| riak_monitor_heart_beat |@every 10s| riak | | 致命 | 监控心跳. 硬编码, 不可配置, 无需录入配置系统 |enable
| riak-load-health |@every 1m| riak | | 致命 | 检查负载与响应情况 |enable
| riak-ring-status |@every 10s| riak | | 致命 | 检查ring status, 发现集群中所有的故障节点 |enable
| 监控项 |调度计划| 机器类型 | 实例角色 | 级别 | 说明 |自定义|
|-----------------------------|-----|----------------|-----------------|----|------------------------------------|-----|
| riak-err-notice |@every 1m| riak | | 预警 | 预警错误日志 |schedule, enable
| riak-db-up |@every 30s| riak | | 致命 | db连通性事件以及连通心跳. 硬编码, 不可配置, 无需录入配置系统 |enable
| riak_monitor_heart_beat |@every 30s| riak | | 致命 | 监控心跳. 硬编码, 不可配置, 无需录入配置系统 |enable
| riak-load-health |@every 1m| riak | | 致命 | 检查负载与响应情况 |enable
| riak-ring-status |@every 30s| riak | | 致命 | 检查ring status, 发现集群中所有的故障节点 |enable
| riak_connections_heart_beat |@every 1m| riak | | 预警 | 实例连接数目心跳 |enable


## 示例文件:
Expand Down Expand Up @@ -109,6 +110,18 @@ jobs_config: /data/monitor/riak-crond/jobs-config.yaml
schedule: '@every 1m'
creator: admin
work_dir: ""
- name: riak_connections_heart_beat@every 1m
enable: true
command: /data/monitor/riak-monitor/riak-monitor
args:
- run
- --items
- riak_connections_heart_beat
- -c
- /data/monitor/riak-monitor/runtime.yaml
schedule: '@every 1m'
creator: admin
work_dir: ""
- name: riak-load-health@every 1m
enable: true
command: /data/monitor/riak-monitor/riak-monitor
Expand All @@ -121,7 +134,7 @@ jobs_config: /data/monitor/riak-crond/jobs-config.yaml
schedule: '@every 1m'
creator: admin
work_dir: ""
- name: riak-ring-status@every 10s
- name: riak-ring-status@every 30s
enable: true
command: /data/monitor/riak-monitor/riak-monitor
args:
Expand All @@ -130,10 +143,10 @@ jobs_config: /data/monitor/riak-crond/jobs-config.yaml
- riak-ring-status
- -c
- /data/monitor/riak-monitor/runtime.yaml
schedule: '@every 10s'
schedule: '@every 30s'
creator: admin
work_dir: ""
- name: riak-monitor-hardcode@every 10s
- name: riak-monitor-hardcode@every 30s
enable: true
command: /data/monitor/riak-monitor/riak-monitor
args:
Expand All @@ -142,7 +155,7 @@ jobs_config: /data/monitor/riak-crond/jobs-config.yaml
- riak-db-up,riak_monitor_heart_beat
- -c
- /data/monitor/riak-monitor/runtime.yaml
schedule: '@every 10s'
schedule: '@every 30s'
creator: admin
work_dir: ""
bk_biz_id: xxx
Expand Down Expand Up @@ -177,18 +190,23 @@ default_schedule: '@every 1m'
schedule: '@every 1m'
machine_type:
- riak
- name: riak_connections_heart_beat
enable: true
schedule: '@every 1m'
machine_type:
- riak
- name: riak-ring-status
enable: true
schedule: '@every 10s'
schedule: '@every 30s'
machine_type:
- riak
- name: riak-db-up
enable: true
schedule: '@every 10s'
schedule: '@every 30s'
machine_type:
- riak
- name: riak_monitor_heart_beat
enable: true
schedule: '@every 10s'
schedule: '@every 30s'
machine_type:
- riak
11 changes: 8 additions & 3 deletions dbm-services/riak/db-tools/riak-monitor/items-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,21 @@
- riak
- name: riak-ring-status
enable: true
schedule: '@every 10s'
schedule: '@every 30s'
machine_type:
- riak
- name: riak_connections_heart_beat
enable: true
schedule: '@every 1m'
machine_type:
- riak
- name: riak-db-up
enable: true
schedule: '@every 10s'
schedule: '@every 30s'
machine_type:
- riak
- name: riak_monitor_heart_beat
enable: true
schedule: '@every 10s'
schedule: '@every 30s'
machine_type:
- riak
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
package connections

import (
"dbm-services/riak/db-tools/riak-monitor/pkg/utils"
"fmt"
"strings"

"golang.org/x/exp/slog"
)

// Connections reports how many connections exist on the riak instance.
//
// It lists sockets with netstat, keeps the lines that match ':8087' and
// "beam.smp", drops lines matching '0.0.0.0:*', and drops lines whose fifth
// column contains the local IP (connections from the BlueKing monitor running
// on this machine). The remaining lines are counted with `wc -l`.
//
// The count is returned as a string with all spaces and newlines removed.
// An error is returned (and logged) when the local IP cannot be determined or
// when the shell pipeline fails; the underlying error stays in the chain.
func Connections() (string, error) {
	localIP, err := utils.GetLocalIP()
	if err != nil {
		// Use a constant format string with %w: passing the message itself as
		// the format would mangle any '%' contained in the error text.
		wrapped := fmt.Errorf("get local ip error: %w", err)
		slog.Error(wrapped.Error())
		return "", wrapped
	}
	// Inspect the connections on the port with netstat.
	cmd := `netstat -anpl|grep ':8087' | grep "beam.smp" | grep -E -v '0.0.0.0:*'`
	// Exclude the BlueKing monitor connections from this machine and return the count.
	cmd = fmt.Sprintf(`%s | awk '{ if (index($5,"%s:") == 0) { print $0 } }' | wc -l`, cmd, localIP)
	resp, err := utils.ExecShellCommand(false, cmd)
	if err != nil {
		wrapped := fmt.Errorf("execute [ %s ] error: %w", cmd, err)
		slog.Error(wrapped.Error())
		return "", wrapped
	}
	// Remove spaces and newlines in a single pass over the output.
	return strings.NewReplacer(" ", "", "\n", "").Replace(resp), nil
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
package connections

import (
"dbm-services/riak/db-tools/riak-monitor/pkg/monitoriteminterface"
"fmt"

"github.com/pkg/errors"
)

// NameConnections is the monitor item name of the riak connection-count check.
var NameConnections = "riak_connections_heart_beat"

// init performs no work; its body is empty.
func init() {}

// Checker is a monitor item that pairs a name with the function producing its result.
type Checker struct {
// name is the monitor item name returned by Name.
name string
// f is invoked by Run and yields the message to report, or an error.
f func() (string, error)
}

// Run invokes the checker's function and returns its message.
// When the function fails, the message is empty and the error is wrapped with
// "run <name>" for context.
func (c *Checker) Run() (msg string, err error) {
	result, runErr := c.f()
	if runErr == nil {
		return result, nil
	}
	return "", errors.Wrap(runErr, fmt.Sprintf("run %s", c.name))
}

// Name returns the monitor item name stored in the checker.
func (c *Checker) Name() string {
return c.name
}

// NewConnections constructs the connection-count monitor item: a Checker named
// NameConnections whose function is Connections. The cc argument is not used.
func NewConnections(cc *monitoriteminterface.ConnectionCollect) monitoriteminterface.MonitorItemInterface {
	checker := new(Checker)
	checker.name = NameConnections
	checker.f = Connections
	return checker
}

// RegisterConnections returns the monitor item name together with its
// constructor (NewConnections), for registering the connection-count check.
func RegisterConnections() (string, monitoriteminterface.MonitorItemConstructorFuncType) {
return NameConnections, NewConnections
}
Loading

0 comments on commit c24e765

Please sign in to comment.