From 8aa1e0face0b28e3e2640b22edd4c0c2e767784c Mon Sep 17 00:00:00 2001 From: ulya-sidorina Date: Wed, 6 Nov 2024 18:01:50 +0100 Subject: [PATCH] feat(ydbcp): add metrics --- cmd/ydbcp/main.go | 17 +-- internal/connectors/s3/connector.go | 12 ++- internal/handlers/delete_backup.go | 41 +++++-- internal/handlers/restore_backup.go | 29 +++-- internal/handlers/schedule_backup.go | 5 +- internal/handlers/take_backup.go | 47 +++++--- internal/handlers/take_backup_retry.go | 5 +- internal/handlers/utils.go | 10 +- internal/metrics/metrics.go | 102 +++++++++++++++++- internal/processor/processor.go | 27 +---- .../server/services/backup/backup_service.go | 71 +++++++++--- .../backup_schedule_service.go | 77 ++++++++++--- .../services/operation/operation_service.go | 45 ++++++-- 13 files changed, 386 insertions(+), 102 deletions(-) diff --git a/cmd/ydbcp/main.go b/cmd/ydbcp/main.go index 8b0609c3..5d74cd8a 100644 --- a/cmd/ydbcp/main.go +++ b/cmd/ydbcp/main.go @@ -121,9 +121,10 @@ func main() { authProvider, configInstance.ClientConnection.AllowedEndpointDomains, configInstance.ClientConnection.AllowInsecureEndpoint, + metrics, ).Register(server) - operation.NewOperationService(dbConnector, authProvider).Register(server) - backup_schedule.NewBackupScheduleService(dbConnector, clientConnector, authProvider).Register(server) + operation.NewOperationService(dbConnector, authProvider, metrics).Register(server) + backup_schedule.NewBackupScheduleService(dbConnector, clientConnector, authProvider, metrics).Register(server) if err := server.Start(ctx, &wg); err != nil { xlog.Error(ctx, "Error start GRPC server", zap.Error(err)) os.Exit(1) @@ -133,7 +134,7 @@ func main() { if err := handlersRegistry.Add( types.OperationTypeTB, handlers.NewTBOperationHandler( - dbConnector, clientConnector, s3Connector, configInstance, queries.NewWriteTableQuery, + dbConnector, clientConnector, s3Connector, configInstance, queries.NewWriteTableQuery, metrics, ), ); err != nil { xlog.Error(ctx, "failed to register TB handler", zap.Error(err)) @@ -142,7 +143,7 @@ func main() { if err := handlersRegistry.Add( types.OperationTypeRB, - handlers.NewRBOperationHandler(dbConnector, clientConnector, configInstance), + handlers.NewRBOperationHandler(dbConnector, clientConnector, configInstance, metrics), ); err != nil { xlog.Error(ctx, "failed to register RB handler", zap.Error(err)) os.Exit(1) @@ -150,7 +151,7 @@ func main() { if err := handlersRegistry.Add( types.OperationTypeDB, - handlers.NewDBOperationHandler(dbConnector, s3Connector, configInstance, queries.NewWriteTableQuery), + handlers.NewDBOperationHandler(dbConnector, s3Connector, configInstance, queries.NewWriteTableQuery, metrics), ); err != nil { xlog.Error(ctx, "failed to register DB handler", zap.Error(err)) os.Exit(1) @@ -164,7 +165,9 @@ func main() { configInstance.S3, configInstance.ClientConnection, queries.NewWriteTableQuery, - clockwork.NewRealClock()), + clockwork.NewRealClock(), + metrics, + ), ); err != nil { xlog.Error(ctx, "failed to register TBWR handler", zap.Error(err)) os.Exit(1) @@ -174,7 +177,7 @@ func main() { ttl_watcher.NewTtlWatcher(ctx, &wg, dbConnector, queries.NewWriteTableQuery) backupScheduleHandler := handlers.NewBackupScheduleHandler( - queries.NewWriteTableQuery, clockwork.NewRealClock(), + queries.NewWriteTableQuery, clockwork.NewRealClock(), metrics, ) schedule_watcher.NewScheduleWatcher(ctx, &wg, dbConnector, backupScheduleHandler) xlog.Info(ctx, "YDBCP started") diff --git a/internal/connectors/s3/connector.go b/internal/connectors/s3/connector.go index e3c5c87f..e42f6c5e 100644 --- a/internal/connectors/s3/connector.go +++ b/internal/connectors/s3/connector.go @@ -13,7 +13,7 @@ import ( ) type S3Connector interface { - ListObjects(pathPrefix string, bucket string) ([]string, error) + ListObjects(pathPrefix string, bucket string) ([]string, int64, error) GetSize(pathPrefix string, bucket string) (int64, error) DeleteObjects(keys []string, bucket string) error } @@ -49,7 +49,8 @@ func NewS3Connector(config config.S3Config) (*ClientS3Connector, error) { return &ClientS3Connector{s3: s3Client}, nil } -func (c *ClientS3Connector) ListObjects(pathPrefix string, bucket string) ([]string, error) { +func (c *ClientS3Connector) ListObjects(pathPrefix string, bucket string) ([]string, int64, error) { + var size int64 objects := make([]string, 0) objectsPtr := &objects @@ -65,6 +66,9 @@ func (c *ClientS3Connector) ListObjects(pathPrefix string, bucket string) ([]str func(p *s3.ListObjectsOutput, last bool) (shouldContinue bool) { for _, object := range p.Contents { *objectsPtr = append(*objectsPtr, *object.Key) + if object.Size != nil { + size += *object.Size + } } return true @@ -72,10 +76,10 @@ func (c *ClientS3Connector) ListObjects(pathPrefix string, bucket string) ([]str ) if err != nil { - return nil, err + return nil, 0, err } - return *objectsPtr, nil + return *objectsPtr, size, nil } func (c *ClientS3Connector) GetSize(pathPrefix string, bucket string) (int64, error) { diff --git a/internal/handlers/delete_backup.go b/internal/handlers/delete_backup.go index 2083ee08..8a4cfb61 100644 --- a/internal/handlers/delete_backup.go +++ b/internal/handlers/delete_backup.go @@ -3,6 +3,7 @@ package handlers import ( "context" "fmt" + "ydbcp/internal/metrics" "ydbcp/internal/config" "ydbcp/internal/connectors/db" @@ -21,9 +22,10 @@ func NewDBOperationHandler( s3 s3.S3Connector, config config.Config, queryBulderFactory queries.WriteQueryBulderFactory, + mon metrics.MetricsRegistry, ) types.OperationHandler { return func(ctx context.Context, op types.Operation) error { - return DBOperationHandler(ctx, op, db, s3, config, queryBulderFactory) + return DBOperationHandler(ctx, op, db, s3, config, queryBulderFactory, mon) } } @@ -34,6 +36,7 @@ func DBOperationHandler( s3 s3.S3Connector, config config.Config, queryBuilderFactory queries.WriteQueryBulderFactory, + mon metrics.MetricsRegistry, ) error { xlog.Info(ctx, "DBOperationHandler", zap.String("OperationMessage", operation.GetMessage())) @@ -59,9 +62,15 @@ func DBOperationHandler( operation.SetState(types.OperationStateError) operation.SetMessage("Operation deadline exceeded") operation.GetAudit().CompletedAt = timestamppb.Now() - return db.ExecuteUpsert( + err := db.ExecuteUpsert( ctx, queryBuilderFactory().WithUpdateOperation(operation).WithUpdateBackup(backupToWrite), ) + + if err == nil { + mon.ObserveOperationDuration(operation) + } + + return err } backups, err := db.SelectBackups( @@ -84,7 +93,13 @@ func DBOperationHandler( operation.SetState(types.OperationStateError) operation.SetMessage("Backup not found") operation.GetAudit().CompletedAt = timestamppb.Now() - return db.UpdateOperation(ctx, operation) + err = db.UpdateOperation(ctx, operation) + + if err == nil { + mon.ObserveOperationDuration(operation) + } + + return err } backup := backups[0] @@ -92,15 +107,23 @@ func DBOperationHandler( operation.SetState(types.OperationStateError) operation.SetMessage(fmt.Sprintf("Unexpected backup status: %s", backup.Status)) operation.GetAudit().CompletedAt = timestamppb.Now() - return db.UpdateOperation(ctx, operation) + err = db.UpdateOperation(ctx, operation) + + if err == nil { + mon.ObserveOperationDuration(operation) + } + + return err } deleteBackup := func(pathPrefix string, bucket string) error { - err := DeleteBackupData(s3, pathPrefix, bucket) + size, err := DeleteBackupData(s3, pathPrefix, bucket) if err != nil { return fmt.Errorf("failed to delete backup data: %v", err) } + mon.IncBytesDeletedCounter(backup.ContainerID, backup.S3Bucket, size) + backupToWrite.Status = types.BackupStateDeleted operation.SetState(types.OperationStateDone) operation.SetMessage("Success") @@ -133,7 +156,13 @@ func DBOperationHandler( return fmt.Errorf("unexpected operation state %s", dbOp.State) } - return db.ExecuteUpsert( + err = db.ExecuteUpsert( ctx, queryBuilderFactory().WithUpdateOperation(operation).WithUpdateBackup(backupToWrite), ) + + if err == nil { + mon.ObserveOperationDuration(operation) + } + + return err } diff --git a/internal/handlers/restore_backup.go b/internal/handlers/restore_backup.go index 9a8e10c5..c1988cbc 100644 --- a/internal/handlers/restore_backup.go +++ b/internal/handlers/restore_backup.go @@ -3,6 +3,7 @@ package handlers import ( "context" "fmt" + "ydbcp/internal/metrics" "ydbcp/internal/config" "ydbcp/internal/connectors/client" @@ -16,10 +17,10 @@ import ( ) func NewRBOperationHandler( - db db.DBConnector, client client.ClientConnector, config config.Config, + db db.DBConnector, client client.ClientConnector, config config.Config, mon metrics.MetricsRegistry, ) types.OperationHandler { return func(ctx context.Context, op types.Operation) error { - return RBOperationHandler(ctx, op, db, client, config) + return RBOperationHandler(ctx, op, db, client, config, mon) } } @@ -29,6 +30,7 @@ func RBOperationHandler( db db.DBConnector, client client.ClientConnector, config config.Config, + mon metrics.MetricsRegistry, ) error { xlog.Info(ctx, "RBOperationHandler", zap.String("OperationMessage", operation.GetMessage())) @@ -64,7 +66,12 @@ func RBOperationHandler( operation.SetState(ydbOpResponse.opState) operation.SetMessage(ydbOpResponse.opMessage) operation.GetAudit().CompletedAt = timestamppb.Now() - return db.UpdateOperation(ctx, operation) + err = db.UpdateOperation(ctx, operation) + if err == nil { + mon.ObserveOperationDuration(operation) + } + + return err } if ydbOpResponse.opResponse == nil { @@ -114,9 +121,14 @@ func RBOperationHandler( operation.SetState(types.OperationStateError) operation.SetMessage("Operation deadline exceeded") operation.GetAudit().CompletedAt = timestamppb.Now() - } - return db.UpdateOperation(ctx, operation) + err := db.UpdateOperation(ctx, operation) + if err == nil { + mon.ObserveOperationDuration(operation) + } + + return err + } } if opResponse.GetOperation().Status == Ydb.StatusIds_SUCCESS { operation.SetState(types.OperationStateDone) @@ -160,5 +172,10 @@ func RBOperationHandler( } operation.GetAudit().CompletedAt = timestamppb.Now() - return db.UpdateOperation(ctx, operation) + err = db.UpdateOperation(ctx, operation) + if err == nil { + mon.ObserveOperationDuration(operation) + } + + return err } diff --git a/internal/handlers/schedule_backup.go b/internal/handlers/schedule_backup.go index a692b51e..de7e742d 100644 --- a/internal/handlers/schedule_backup.go +++ b/internal/handlers/schedule_backup.go @@ -9,6 +9,7 @@ import ( "google.golang.org/protobuf/types/known/timestamppb" "ydbcp/internal/connectors/db" "ydbcp/internal/connectors/db/yql/queries" + "ydbcp/internal/metrics" "ydbcp/internal/types" "ydbcp/internal/util/xlog" pb "ydbcp/pkg/proto/ydbcp/v1alpha1" @@ -19,11 +20,12 @@ type BackupScheduleHandlerType func(context.Context, db.DBConnector, types.Backu func NewBackupScheduleHandler( queryBuilderFactory queries.WriteQueryBulderFactory, clock clockwork.Clock, + mon metrics.MetricsRegistry, ) BackupScheduleHandlerType { return func(ctx context.Context, driver db.DBConnector, schedule types.BackupSchedule) error { return BackupScheduleHandler( ctx, driver, schedule, - queryBuilderFactory, clock, + queryBuilderFactory, clock, mon, ) } } @@ -34,6 +36,7 @@ func BackupScheduleHandler( schedule types.BackupSchedule, queryBuilderFactory queries.WriteQueryBulderFactory, clock clockwork.Clock, + mon metrics.MetricsRegistry, ) error { if schedule.Status != types.BackupScheduleStateActive { xlog.Error(ctx, "backup schedule is not active", zap.String("scheduleID", schedule.ID)) diff --git a/internal/handlers/take_backup.go b/internal/handlers/take_backup.go index d1604b66..95ee479f 100644 --- a/internal/handlers/take_backup.go +++ b/internal/handlers/take_backup.go @@ -8,6 +8,7 @@ import ( "ydbcp/internal/connectors/db" "ydbcp/internal/connectors/db/yql/queries" "ydbcp/internal/connectors/s3" + "ydbcp/internal/metrics" "ydbcp/internal/types" "ydbcp/internal/util/xlog" pb "ydbcp/pkg/proto/ydbcp/v1alpha1" @@ -20,10 +21,10 @@ import ( func NewTBOperationHandler( db db.DBConnector, client client.ClientConnector, s3 s3.S3Connector, config config.Config, - queryBuilderFactory queries.WriteQueryBulderFactory, + queryBuilderFactory queries.WriteQueryBulderFactory, mon metrics.MetricsRegistry, ) types.OperationHandler { return func(ctx context.Context, op types.Operation) error { - return TBOperationHandler(ctx, op, db, client, s3, config, queryBuilderFactory) + return TBOperationHandler(ctx, op, db, client, s3, config, queryBuilderFactory, mon) } } @@ -35,6 +36,7 @@ func TBOperationHandler( s3 s3.S3Connector, config config.Config, queryBuilderFactory queries.WriteQueryBulderFactory, + mon metrics.MetricsRegistry, ) error { xlog.Info(ctx, "TBOperationHandler", zap.String("OperationMessage", operation.GetMessage())) @@ -74,9 +76,15 @@ func TBOperationHandler( backupToWrite.Status = types.BackupStateError backupToWrite.Message = operation.GetMessage() backupToWrite.AuditInfo.CompletedAt = now - return db.ExecuteUpsert( + err = db.ExecuteUpsert( ctx, queryBuilderFactory().WithUpdateOperation(operation).WithUpdateBackup(backupToWrite), ) + + if err == nil { + mon.ObserveOperationDuration(operation) + } + + return err } if ydbOpResponse.opResponse == nil { return nil @@ -123,14 +131,15 @@ func TBOperationHandler( operation.SetMessage("Operation deadline exceeded") } return db.UpdateOperation(ctx, operation) - } else if opResponse.GetOperation().Status == Ydb.StatusIds_SUCCESS { - size, err := getBackupSize(backup.S3PathPrefix, backup.S3Bucket) - if err != nil { - return err - } + } + size, err := getBackupSize(backup.S3PathPrefix, backup.S3Bucket) + if err != nil { + return err + } + + if opResponse.GetOperation().Status == Ydb.StatusIds_SUCCESS { backupToWrite.Status = types.BackupStateAvailable - backupToWrite.Size = size operation.SetState(types.OperationStateDone) operation.SetMessage("Success") } else if opResponse.GetOperation().Status == Ydb.StatusIds_CANCELLED { @@ -147,6 +156,8 @@ func TBOperationHandler( operation.SetMessage(ydbOpResponse.IssueString()) } backupToWrite.Message = operation.GetMessage() + backupToWrite.Size = size + mon.IncBytesWrittenCounter(backup.ContainerID, backup.S3Bucket, size) } case types.OperationStateStartCancelling: { @@ -178,30 +189,36 @@ func TBOperationHandler( return db.UpdateOperation(ctx, operation) } - if opResponse.GetOperation().Status == Ydb.StatusIds_SUCCESS { - size, err := getBackupSize(backup.S3PathPrefix, backup.S3Bucket) - if err != nil { - return err - } + size, err := getBackupSize(backup.S3PathPrefix, backup.S3Bucket) + if err != nil { + return err + } + + if opResponse.GetOperation().Status == Ydb.StatusIds_SUCCESS { backupToWrite.Status = types.BackupStateAvailable backupToWrite.Size = size operation.SetState(types.OperationStateDone) operation.SetMessage("Operation was completed despite cancellation: " + tb.Message) } else if opResponse.GetOperation().Status == Ydb.StatusIds_CANCELLED { - err = DeleteBackupData(s3, backup.S3PathPrefix, backup.S3Bucket) + size, err = DeleteBackupData(s3, backup.S3PathPrefix, backup.S3Bucket) if err != nil { return err } + + mon.IncBytesDeletedCounter(backup.ContainerID, backup.S3Bucket, size) + backupToWrite.Status = types.BackupStateCancelled operation.SetState(types.OperationStateCancelled) operation.SetMessage(tb.Message) } else { backupToWrite.Status = types.BackupStateError + backupToWrite.Size = size operation.SetState(types.OperationStateError) operation.SetMessage(ydbOpResponse.IssueString()) } backupToWrite.Message = operation.GetMessage() + mon.IncBytesWrittenCounter(backup.ContainerID, backup.S3Bucket, size) } default: return fmt.Errorf("unexpected operation state %s", tb.State) diff --git a/internal/handlers/take_backup_retry.go b/internal/handlers/take_backup_retry.go index a298f254..5e1c7a0b 100644 --- a/internal/handlers/take_backup_retry.go +++ b/internal/handlers/take_backup_retry.go @@ -14,6 +14,7 @@ import ( "ydbcp/internal/connectors/client" "ydbcp/internal/connectors/db" "ydbcp/internal/connectors/db/yql/queries" + "ydbcp/internal/metrics" "ydbcp/internal/types" "ydbcp/internal/util/xlog" pb "ydbcp/pkg/proto/ydbcp/v1alpha1" @@ -26,9 +27,10 @@ func NewTBWROperationHandler( clientConfig config.ClientConnectionConfig, queryBuilderFactory queries.WriteQueryBulderFactory, clock clockwork.Clock, + mon metrics.MetricsRegistry, ) types.OperationHandler { return func(ctx context.Context, op types.Operation) error { - return TBWROperationHandler(ctx, op, db, client, s3, clientConfig, queryBuilderFactory, clock) + return TBWROperationHandler(ctx, op, db, client, s3, clientConfig, queryBuilderFactory, clock, mon) } } @@ -144,6 +146,7 @@ func TBWROperationHandler( clientConfig config.ClientConnectionConfig, queryBuilderFactory queries.WriteQueryBulderFactory, clock clockwork.Clock, + mon metrics.MetricsRegistry, ) error { ctx = xlog.With(ctx, zap.String("OperationID", operation.GetID())) diff --git a/internal/handlers/utils.go b/internal/handlers/utils.go index e8807476..7672f33b 100644 --- a/internal/handlers/utils.go +++ b/internal/handlers/utils.go @@ -131,18 +131,18 @@ func CancelYdbOperation( return nil } -func DeleteBackupData(s3 s3.S3Connector, s3PathPrefix string, s3Bucket string) error { - objects, err := s3.ListObjects(s3PathPrefix, s3Bucket) +func DeleteBackupData(s3 s3.S3Connector, s3PathPrefix string, s3Bucket string) (int64, error) { + objects, size, err := s3.ListObjects(s3PathPrefix, s3Bucket) if err != nil { - return fmt.Errorf("failed to list S3 objects: %v", err) + return 0, fmt.Errorf("failed to list S3 objects: %v", err) } if len(objects) != 0 { err = s3.DeleteObjects(objects, s3Bucket) if err != nil { - return fmt.Errorf("failed to delete S3 objects: %v", err) + return 0, fmt.Errorf("failed to delete S3 objects: %v", err) } } - return nil + return size, nil } diff --git a/internal/metrics/metrics.go b/internal/metrics/metrics.go index 2ed37ee5..f47bd90f 100644 --- a/internal/metrics/metrics.go +++ b/internal/metrics/metrics.go @@ -7,6 +7,7 @@ import ( "net/http" "sync" "time" + "ydbcp/internal/types" "ydbcp/internal/config" "ydbcp/internal/util/xlog" @@ -18,17 +19,69 @@ import ( ) type MetricsRegistry interface { - Factory() promauto.Factory + IncApiCallsCounter(serviceName string, methodName string, status string) + IncBytesWrittenCounter(containerId string, bucketId string, bytes int64) + IncBytesDeletedCounter(containerId string, bucketId string, bytes int64) + ObserveOperationDuration(operation types.Operation) + IncHandlerRunsCount(containerId string, operationType string) + IncFailedHandlerRunsCount(containerId string, operationType string) + IncSuccessfulHandlerRunsCount(containerId string, operationType string) } type MetricsRegistryImpl struct { server *http.Server reg *prometheus.Registry cfg config.MetricsServerConfig + + // api metrics + apiCallsCounter *prometheus.CounterVec + + // storage metrics + bytesWrittenCounter *prometheus.CounterVec + bytesDeletedCounter *prometheus.CounterVec + + // operation metrics + operationsDuration *prometheus.HistogramVec + + // operation processor metrics + handlerRunsCount *prometheus.CounterVec + handlerFailedCount *prometheus.CounterVec + handlerSuccessfulCount *prometheus.CounterVec +} + +func (s *MetricsRegistryImpl) IncApiCallsCounter(serviceName string, methodName string, code string) { + s.apiCallsCounter.WithLabelValues(serviceName, methodName, code).Inc() +} + +func (s *MetricsRegistryImpl) IncBytesWrittenCounter(containerId string, bucketId string, bytes int64) { + s.bytesWrittenCounter.WithLabelValues(containerId, bucketId).Add(float64(bytes)) +} + +func (s *MetricsRegistryImpl) IncBytesDeletedCounter(containerId string, bucketId string, bytes int64) { + s.bytesDeletedCounter.WithLabelValues(containerId, bucketId).Add(float64(bytes)) +} + +func (s *MetricsRegistryImpl) ObserveOperationDuration(operation types.Operation) { + if operation.GetAudit() != nil && operation.GetAudit().CompletedAt != nil { + duration := operation.GetAudit().CompletedAt.AsTime().Sub(operation.GetAudit().CreatedAt.AsTime()) + s.operationsDuration.WithLabelValues( + operation.GetContainerID(), + operation.GetType().String(), + operation.GetState().String(), + ).Observe(duration.Seconds()) + } +} + +func (s *MetricsRegistryImpl) IncHandlerRunsCount(containerId string, operationType string) { + s.handlerRunsCount.WithLabelValues(containerId, operationType).Inc() } -func (s *MetricsRegistryImpl) Factory() promauto.Factory { - return promauto.With(s.reg) +func (s *MetricsRegistryImpl) IncFailedHandlerRunsCount(containerId string, operationType string) { + s.handlerFailedCount.WithLabelValues(containerId, operationType).Inc() +} + +func (s *MetricsRegistryImpl) IncSuccessfulHandlerRunsCount(containerId string, operationType string) { + s.handlerSuccessfulCount.WithLabelValues(containerId, operationType).Inc() } func NewMetricsRegistry(ctx context.Context, wg *sync.WaitGroup, cfg *config.MetricsServerConfig) *MetricsRegistryImpl { @@ -37,6 +90,49 @@ func NewMetricsRegistry(ctx context.Context, wg *sync.WaitGroup, cfg *config.Met cfg: *cfg, } + s.apiCallsCounter = promauto.With(s.reg).NewCounterVec(prometheus.CounterOpts{ + Subsystem: "api", + Name: "api_calls_total", + Help: "Total count of API calls", + }, []string{"service", "method", "status"}) + + s.bytesWrittenCounter = promauto.With(s.reg).NewCounterVec(prometheus.CounterOpts{ + Subsystem: "storage", + Name: "bytes_written", + Help: "Count of bytes written to storage", + }, []string{"container_id", "bucket_id"}) + + s.bytesDeletedCounter = promauto.With(s.reg).NewCounterVec(prometheus.CounterOpts{ + Subsystem: "storage", + Name: "bytes_deleted", + Help: "Count of bytes deleted from storage", + }, []string{"container_id", "bucket_id"}) + + s.operationsDuration = promauto.With(s.reg).NewHistogramVec(prometheus.HistogramOpts{ + Subsystem: "operations", + Name: "duration_seconds", + Help: "Duration of operations in seconds", + Buckets: prometheus.ExponentialBuckets(10, 2, 8), + }, []string{"container_id", "type", "status"}) + + s.handlerRunsCount = promauto.With(s.reg).NewCounterVec(prometheus.CounterOpts{ + Subsystem: "operation_processor", + Name: "operation_handler_runs_count", + Help: "Total count of operation handler runs", + }, []string{"container_id", "operation_type"}) + + s.handlerFailedCount = promauto.With(s.reg).NewCounterVec(prometheus.CounterOpts{ + Subsystem: "operation_processor", + Name: "operation_handler_runs_failed_count", + Help: "Total count of failed operation handler runs", + }, []string{"container_id", "operation_type"}) + + s.handlerSuccessfulCount = promauto.With(s.reg).NewCounterVec(prometheus.CounterOpts{ + Subsystem: "operation_processor", + Name: "operation_handler_runs_successful_count", + Help: "Total count of successful operation handler runs", + }, []string{"container_id", "operation_type"}) + mux := http.NewServeMux() mux.Handle("/metrics", promhttp.HandlerFor(s.reg, promhttp.HandlerOpts{Registry: s.reg})) diff --git a/internal/processor/processor.go b/internal/processor/processor.go index dcc58ee3..f9a5e7df 100644 --- a/internal/processor/processor.go +++ b/internal/processor/processor.go @@ -12,7 +12,6 @@ import ( "ydbcp/internal/util/xlog" "github.com/google/uuid" - "github.com/prometheus/client_golang/prometheus" "go.uber.org/zap" ) @@ -33,9 +32,7 @@ type OperationProcessorImpl struct { runningOperations map[string]bool results chan string - runsCount *prometheus.CounterVec - failedCount *prometheus.CounterVec - successfulCount *prometheus.CounterVec + mon metrics.MetricsRegistry } type Option func(*OperationProcessorImpl) @@ -73,22 +70,6 @@ func NewOperationProcessor( tickerProvider: ticker.NewRealTicker, runningOperations: make(map[string]bool), results: make(chan string), - - runsCount: mon.Factory().NewCounterVec(prometheus.CounterOpts{ - Subsystem: metricsSubsystem, - Name: "operations_runs_count", - Help: "Total count of runs of the operation", - }, []string{"task_type"}), - failedCount: mon.Factory().NewCounterVec(prometheus.CounterOpts{ - Subsystem: metricsSubsystem, - Name: "operations_failed_count", - Help: "Total count of failed operations", - }, []string{"task_type"}), - successfulCount: mon.Factory().NewCounterVec(prometheus.CounterOpts{ - Subsystem: metricsSubsystem, - Name: "operations_successful_count", - Help: "Total count of successful operations", - }, []string{"task_type"}), } for _, opt := range options { opt(op) @@ -165,7 +146,7 @@ func (o *OperationProcessorImpl) processOperation(op types.Operation) { ) return } - o.runsCount.WithLabelValues(op.GetType().String()).Inc() + o.mon.IncHandlerRunsCount(op.GetContainerID(), op.GetType().String()) o.runningOperations[op.GetID()] = true o.workersWaitGroup.Add(1) go func() { @@ -183,14 +164,14 @@ func (o *OperationProcessorImpl) processOperation(op types.Operation) { zap.String("operation", types.OperationToString(op)), zap.Error(err), ) - o.failedCount.WithLabelValues(op.GetType().String()).Inc() + o.mon.IncFailedHandlerRunsCount(op.GetContainerID(), op.GetType().String()) } else { xlog.Debug( ctx, "operation handler finished successfully", zap.String("operation", types.OperationToString(op)), ) - o.successfulCount.WithLabelValues(op.GetType().String()).Inc() + o.mon.IncSuccessfulHandlerRunsCount(op.GetContainerID(), op.GetType().String()) } o.results <- op.GetID() }() diff --git a/internal/server/services/backup/backup_service.go b/internal/server/services/backup/backup_service.go index f91ecc83..f60fd289 100644 --- a/internal/server/services/backup/backup_service.go +++ b/internal/server/services/backup/backup_service.go @@ -10,6 +10,7 @@ import ( "ydbcp/internal/connectors/client" "ydbcp/internal/connectors/db" "ydbcp/internal/connectors/db/yql/queries" + "ydbcp/internal/metrics" "ydbcp/internal/server" "ydbcp/internal/server/grpcinfo" "ydbcp/internal/types" @@ -33,15 +34,22 @@ type BackupService struct { allowedEndpointDomains []string allowInsecureEndpoint bool clock clockwork.Clock + mon metrics.MetricsRegistry +} + +func (s *BackupService) IncApiCallsCounter(methodName string, code codes.Code) { + s.mon.IncApiCallsCounter("BackupService", methodName, code.String()) } func (s *BackupService) GetBackup(ctx context.Context, request *pb.GetBackupRequest) (*pb.Backup, error) { + const methodName string = "GetBackup" ctx = grpcinfo.WithGRPCInfo(ctx) - xlog.Debug(ctx, "GetBackup", zap.String("request", request.String())) + xlog.Debug(ctx, methodName, zap.String("request", request.String())) ctx = xlog.With(ctx, zap.String("BackupID", request.Id)) backupID, err := types.ParseObjectID(request.GetId()) if err != nil { xlog.Error(ctx, "failed to parse BackupID", zap.Error(err)) + s.IncApiCallsCounter(methodName, codes.InvalidArgument) return nil, status.Error(codes.InvalidArgument, "failed to parse BackupID") } ctx = xlog.With(ctx, zap.String("BackupID", backupID)) @@ -58,10 +66,12 @@ func (s *BackupService) GetBackup(ctx context.Context, request *pb.GetBackupRequ ) if err != nil { xlog.Error(ctx, "can't select backups", zap.Error(err)) + s.IncApiCallsCounter(methodName, codes.Internal) return nil, status.Error(codes.Internal, "can't select backups") } if len(backups) == 0 { xlog.Error(ctx, "backup not found") + s.IncApiCallsCounter(methodName, codes.NotFound) return nil, status.Error(codes.NotFound, "backup not found") // TODO: Permission denied? } backup := backups[0] @@ -69,21 +79,25 @@ func (s *BackupService) GetBackup(ctx context.Context, request *pb.GetBackupRequ // TODO: Need to check access to backup resource by backupID subject, err := auth.CheckAuth(ctx, s.auth, auth.PermissionBackupGet, backup.ContainerID, "") if err != nil { + s.IncApiCallsCounter(methodName, status.Code(err)) return nil, err } ctx = xlog.With(ctx, zap.String("SubjectID", subject)) - xlog.Debug(ctx, "GetBackup", zap.String("backup", backup.String())) + xlog.Debug(ctx, methodName, zap.String("backup", backup.String())) + s.IncApiCallsCounter(methodName, codes.OK) return backups[0].Proto(), nil } func (s *BackupService) MakeBackup(ctx context.Context, req *pb.MakeBackupRequest) (*pb.Operation, error) { + const methodName string = "MakeBackup" ctx = grpcinfo.WithGRPCInfo(ctx) - xlog.Info(ctx, "MakeBackup", zap.String("request", req.String())) + xlog.Debug(ctx, methodName, zap.String("request", req.String())) ctx = xlog.With(ctx, zap.String("ContainerID", req.ContainerId)) subject, err := auth.CheckAuth(ctx, s.auth, auth.PermissionBackupCreate, req.ContainerId, "") if err != nil { + s.IncApiCallsCounter(methodName, status.Code(err)) return nil, err } ctx = xlog.With(ctx, zap.String("SubjectID", subject)) @@ -116,6 +130,7 @@ func (s *BackupService) MakeBackup(ctx context.Context, req *pb.MakeBackupReques _, err = backup_operations.OpenConnAndValidateSourcePaths(ctx, backup_operations.FromTBWROperation(tbwr), s.clientConn) if err != nil { + s.IncApiCallsCounter(methodName, status.Code(err)) return nil, err } @@ -123,22 +138,26 @@ func (s *BackupService) MakeBackup(ctx context.Context, req *pb.MakeBackupReques ctx, queries.NewWriteTableQuery().WithCreateOperation(tbwr), ) if err != nil { - return nil, err + s.IncApiCallsCounter(methodName, codes.Internal) + return nil, status.Error(codes.Internal, err.Error()) } ctx = xlog.With(ctx, zap.String("BackupID", tbwr.BackupID)) ctx = xlog.With(ctx, zap.String("OperationID", tbwr.GetID())) - xlog.Debug(ctx, "MakeBackup was started successfully", zap.String("operation", types.OperationToString(tbwr))) + xlog.Debug(ctx, methodName, zap.String("operation", types.OperationToString(tbwr))) + s.IncApiCallsCounter(methodName, codes.OK) return tbwr.Proto(), nil } func (s *BackupService) DeleteBackup(ctx context.Context, req *pb.DeleteBackupRequest) (*pb.Operation, error) { + const methodName string = "DeleteBackup" ctx = grpcinfo.WithGRPCInfo(ctx) - xlog.Info(ctx, "DeleteBackup", zap.String("request", req.String())) + xlog.Debug(ctx, methodName, zap.String("request", req.String())) ctx = xlog.With(ctx, zap.String("BackupID", req.BackupId)) backupID, err := types.ParseObjectID(req.BackupId) if err != nil { xlog.Error(ctx, "failed to parse BackupID", zap.Error(err)) + s.IncApiCallsCounter(methodName, codes.InvalidArgument) return nil, status.Error(codes.InvalidArgument, "failed to parse BackupID") } ctx = xlog.With(ctx, zap.String("BackupID", backupID)) @@ -157,11 +176,13 @@ func (s *BackupService) DeleteBackup(ctx context.Context, req *pb.DeleteBackupRe if err != nil { xlog.Error(ctx, "can't select backups", zap.Error(err)) + s.IncApiCallsCounter(methodName, codes.Internal) return nil, status.Error(codes.Internal, "can't select backups") } if len(backups) == 0 { xlog.Error(ctx, "backup not found") + s.IncApiCallsCounter(methodName, codes.NotFound) return nil, status.Error(codes.NotFound, "backup not found") // TODO: Permission Denied? } @@ -170,12 +191,14 @@ func (s *BackupService) DeleteBackup(ctx context.Context, req *pb.DeleteBackupRe subject, err := auth.CheckAuth(ctx, s.auth, auth.PermissionBackupCreate, backup.ContainerID, "") if err != nil { + s.IncApiCallsCounter(methodName, status.Code(err)) return nil, err } ctx = xlog.With(ctx, zap.String("SubjectID", subject)) if !backup.CanBeDeleted() { xlog.Error(ctx, "backup can't be deleted", zap.String("BackupStatus", backup.Status)) + s.IncApiCallsCounter(methodName, codes.FailedPrecondition) return nil, status.Errorf(codes.FailedPrecondition, "backup can't be deleted, status %s", backup.Status) } @@ -207,22 +230,26 @@ func (s *BackupService) DeleteBackup(ctx context.Context, req *pb.DeleteBackupRe ) if err != nil { xlog.Error(ctx, "can't create operation", zap.Error(err)) + s.IncApiCallsCounter(methodName, codes.Internal) return nil, status.Error(codes.Internal, "can't create operation") } ctx = xlog.With(ctx, zap.String("OperationID", op.GetID())) - xlog.Debug(ctx, "DeleteBackup was started successfully", zap.String("operation", types.OperationToString(op))) + xlog.Debug(ctx, methodName, zap.String("operation", types.OperationToString(op))) + s.IncApiCallsCounter(methodName, codes.OK) return op.Proto(), nil } func (s *BackupService) MakeRestore(ctx context.Context, req *pb.MakeRestoreRequest) (*pb.Operation, error) { + const methodName string = "MakeRestore" ctx = grpcinfo.WithGRPCInfo(ctx) - xlog.Info(ctx, "MakeRestore", zap.String("request", req.String())) + xlog.Debug(ctx, methodName, zap.String("request", req.String())) ctx = xlog.With(ctx, zap.String("BackupID", req.BackupId)) backupID, err := types.ParseObjectID(req.BackupId) if err != nil { xlog.Error(ctx, "failed to parse BackupID", zap.Error(err)) + s.IncApiCallsCounter(methodName, codes.InvalidArgument) return nil, status.Error(codes.InvalidArgument, "failed to parse BackupID") } ctx = xlog.With(ctx, zap.String("BackupID", backupID)) @@ -240,10 +267,12 @@ func (s *BackupService) MakeRestore(ctx context.Context, req *pb.MakeRestoreRequ ) if err != nil { xlog.Error(ctx, "can't select backups", zap.Error(err)) + s.IncApiCallsCounter(methodName, codes.Internal) return nil, status.Error(codes.Internal, "can't select backups") } if len(backups) == 0 { xlog.Error(ctx, "backup not found") + s.IncApiCallsCounter(methodName, codes.NotFound) return nil, status.Error(codes.NotFound, "backup not found") // TODO: Permission denied? } backup := backups[0] @@ -253,6 +282,7 @@ func (s *BackupService) MakeRestore(ctx context.Context, req *pb.MakeRestoreRequ ctx, s.auth, auth.PermissionBackupRestore, backup.ContainerID, "", ) // TODO: check access to backup as resource if err != nil { + s.IncApiCallsCounter(methodName, status.Code(err)) return nil, err } ctx = xlog.With(ctx, zap.String("SubjectID", subject)) @@ -263,6 +293,7 @@ func (s *BackupService) MakeRestore(ctx context.Context, req *pb.MakeRestoreRequ "endpoint of database is invalid or not allowed", zap.String("DatabaseEndpoint", req.DatabaseEndpoint), ) + s.IncApiCallsCounter(methodName, codes.InvalidArgument) return nil, status.Errorf( codes.InvalidArgument, "endpoint of database is invalid or not allowed, endpoint %s", req.DatabaseEndpoint, ) @@ -270,6 +301,7 @@ func (s *BackupService) MakeRestore(ctx context.Context, req *pb.MakeRestoreRequ if backup.Status != types.BackupStateAvailable { xlog.Error(ctx, "backup is not available", zap.String("BackupStatus", backup.Status)) + s.IncApiCallsCounter(methodName, codes.FailedPrecondition) return nil, status.Errorf(codes.FailedPrecondition, "backup is not available, status %s", backup.Status) } @@ -282,6 +314,7 @@ func (s *BackupService) MakeRestore(ctx context.Context, req *pb.MakeRestoreRequ client, err := s.clientConn.Open(ctx, dsn) if err != nil { xlog.Error(ctx, "can't open client connection", zap.Error(err)) + s.IncApiCallsCounter(methodName, codes.Unknown) return nil, status.Errorf(codes.Unknown, "can't open client connection, dsn %s", dsn) } defer func() { @@ -293,11 +326,13 @@ func (s *BackupService) MakeRestore(ctx context.Context, req *pb.MakeRestoreRequ accessKey, err := s.s3.AccessKey() if err != nil { xlog.Error(ctx, "can't get S3AccessKey", zap.Error(err)) + s.IncApiCallsCounter(methodName, codes.Internal) return nil, status.Error(codes.Internal, "can't get S3AccessKey") } secretKey, err := s.s3.SecretKey() if err != nil { xlog.Error(ctx, "can't get S3SecretKey", zap.Error(err)) + s.IncApiCallsCounter(methodName, codes.Internal) return nil, status.Error(codes.Internal, "can't get S3SecretKey") } @@ -310,6 +345,7 @@ func (s *BackupService) MakeRestore(ctx context.Context, req *pb.MakeRestoreRequ fullPath, ok := backup_operations.SafePathJoin(backup.S3PathPrefix, p) if !ok { xlog.Error(ctx, "incorrect source path", zap.String("path", p)) + s.IncApiCallsCounter(methodName, codes.InvalidArgument) return nil, status.Errorf(codes.InvalidArgument, "incorrect source path %s", p) } sourcePaths = append(sourcePaths, fullPath) @@ -333,6 +369,7 @@ func (s *BackupService) MakeRestore(ctx context.Context, req *pb.MakeRestoreRequ clientOperationID, err := s.clientConn.ImportFromS3(ctx, client, s3Settings) if err != nil { xlog.Error(ctx, "can't start import operation", zap.Error(err)) + s.IncApiCallsCounter(methodName, codes.Unknown) return nil, status.Errorf(codes.Unknown, "can't start import operation, dsn %s", dsn) } ctx = xlog.With(ctx, zap.String("ClientOperationID", clientOperationID)) @@ -360,23 +397,27 @@ func (s *BackupService) MakeRestore(ctx context.Context, req *pb.MakeRestoreRequ operationID, err := s.driver.CreateOperation(ctx, op) if err != nil { xlog.Error(ctx, "can't create operation", zap.String("operation", types.OperationToString(op)), zap.Error(err)) + s.IncApiCallsCounter(methodName, codes.Internal) return nil, status.Error(codes.Internal, "can't create operation") } ctx = xlog.With(ctx, zap.String("OperationID", operationID)) - xlog.Info(ctx, "RestoreBackup operation created") - op.ID = operationID + + xlog.Debug(ctx, methodName, zap.String("operation", types.OperationToString(op))) + s.IncApiCallsCounter(methodName, codes.OK) return op.Proto(), nil } func (s *BackupService) ListBackups(ctx context.Context, request *pb.ListBackupsRequest) ( *pb.ListBackupsResponse, error, ) { + const methodName string = "ListBackups" ctx = grpcinfo.WithGRPCInfo(ctx) - xlog.Debug(ctx, "ListBackups", zap.String("request", request.String())) + xlog.Debug(ctx, methodName, zap.String("request", request.String())) ctx = xlog.With(ctx, zap.String("ContainerID", request.ContainerId)) subject, err := auth.CheckAuth(ctx, s.auth, auth.PermissionBackupList, request.ContainerId, "") if err != nil { + s.IncApiCallsCounter(methodName, status.Code(err)) return nil, err } ctx = xlog.With(ctx, zap.String("SubjectID", subject)) @@ -421,10 +462,12 @@ func (s *BackupService) ListBackups(ctx context.Context, request *pb.ListBackups } pageSpec, err := queries.NewPageSpec(request.GetPageSize(), request.GetPageToken()) if err != nil { + s.IncApiCallsCounter(methodName, status.Code(err)) return nil, err } orderSpec, err := queries.NewOrderSpec(request.GetOrder()) if err != nil { + s.IncApiCallsCounter(methodName, status.Code(err)) return nil, err } @@ -438,6 +481,7 @@ func (s *BackupService) ListBackups(ctx context.Context, request *pb.ListBackups ) if err != nil { xlog.Error(ctx, "error getting backups", zap.Error(err)) + s.IncApiCallsCounter(methodName, codes.Internal) return nil, status.Error(codes.Internal, "error getting backups") } pbBackups := make([]*pb.Backup, 0, len(backups)) @@ -450,7 +494,8 @@ func (s *BackupService) ListBackups(ctx context.Context, request *pb.ListBackups if uint64(len(pbBackups)) == pageSpec.Limit { res.NextPageToken = strconv.FormatUint(pageSpec.Offset+pageSpec.Limit, 10) } - xlog.Debug(ctx, "ListBackups success") + xlog.Debug(ctx, methodName, zap.String("response", res.String())) + s.IncApiCallsCounter(methodName, codes.OK) return res, nil } @@ -465,6 +510,7 @@ func NewBackupService( auth ap.AuthProvider, allowedEndpointDomains []string, allowInsecureEndpoint bool, + mon metrics.MetricsRegistry, ) *BackupService { return &BackupService{ driver: driver, @@ -474,5 +520,6 @@ func NewBackupService( allowedEndpointDomains: allowedEndpointDomains, allowInsecureEndpoint: allowInsecureEndpoint, clock: clockwork.NewRealClock(), + mon: mon, } } diff --git a/internal/server/services/backup_schedule/backup_schedule_service.go b/internal/server/services/backup_schedule/backup_schedule_service.go index c57c2345..33c43ef3 100644 --- a/internal/server/services/backup_schedule/backup_schedule_service.go +++ b/internal/server/services/backup_schedule/backup_schedule_service.go @@ -11,6 +11,7 @@ import ( "ydbcp/internal/connectors/client" "ydbcp/internal/connectors/db" "ydbcp/internal/connectors/db/yql/queries" + "ydbcp/internal/metrics" "ydbcp/internal/server" "ydbcp/internal/server/grpcinfo" "ydbcp/internal/types" @@ -31,6 +32,11 @@ type BackupScheduleService struct { clientConn client.ClientConnector auth ap.AuthProvider clock clockwork.Clock + mon metrics.MetricsRegistry +} + +func (s *BackupScheduleService) IncApiCallsCounter(methodName string, code codes.Code) { + s.mon.IncApiCallsCounter("BackupScheduleService", methodName, code.String()) } func (s *BackupScheduleService) CheckClientDbAccess( @@ -54,11 +60,13 @@ func (s *BackupScheduleService) CheckClientDbAccess( func (s *BackupScheduleService) CreateBackupSchedule( ctx context.Context, request *pb.CreateBackupScheduleRequest, ) (*pb.BackupSchedule, error) { + const methodName string = "CreateBackupSchedule" ctx = grpcinfo.WithGRPCInfo(ctx) - xlog.Info(ctx, "CreateBackupSchedule", zap.String("request", request.String())) + xlog.Debug(ctx, methodName, zap.String("request", request.String())) ctx = xlog.With(ctx, zap.String("ContainerID", request.ContainerId)) subject, err := auth.CheckAuth(ctx, s.auth, auth.PermissionBackupCreate, request.ContainerId, "") if err != nil { + s.IncApiCallsCounter(methodName, status.Code(err)) return nil, err } ctx = xlog.With(ctx, zap.String("SubjectID", subject)) @@ -66,16 +74,19 @@ func (s *BackupScheduleService) CreateBackupSchedule( Endpoint: request.Endpoint, DatabaseName: request.DatabaseName, }); err != nil { + s.IncApiCallsCounter(methodName, status.Code(err)) return nil, err } if request.ScheduleSettings == nil { xlog.Error( ctx, "no backup schedule settings for CreateBackupSchedule", zap.String("request", request.String()), ) + s.IncApiCallsCounter(methodName, codes.FailedPrecondition) return nil, status.Error(codes.FailedPrecondition, "no backup schedule settings for CreateBackupSchedule") } if request.ScheduleSettings.RecoveryPointObjective != nil && (request.ScheduleSettings.RecoveryPointObjective.Seconds == 0) { + s.IncApiCallsCounter(methodName, codes.FailedPrecondition) return nil, status.Error(codes.FailedPrecondition, "recovery point objective should be greater than 0") } var scheduleName *string @@ -106,6 +117,7 @@ func (s *BackupScheduleService) CreateBackupSchedule( err = schedule.UpdateNextLaunch(s.clock.Now()) if err != nil { + s.IncApiCallsCounter(methodName, codes.FailedPrecondition) return nil, status.Error(codes.FailedPrecondition, err.Error()) } @@ -115,21 +127,24 @@ func (s *BackupScheduleService) CreateBackupSchedule( ctx, "can't create backup schedule", zap.String("backup schedule", schedule.Proto(s.clock).String()), zap.Error(err), ) + s.IncApiCallsCounter(methodName, codes.Internal) return nil, status.Error(codes.Internal, "can't create backup schedule") } - xlog.Info(ctx, "backup schedule created", zap.String("BackupScheduleID", schedule.ID)) + xlog.Debug(ctx, methodName, zap.Stringer("schedule", &schedule)) + s.IncApiCallsCounter(methodName, codes.OK) return schedule.Proto(s.clock), nil } func (s *BackupScheduleService) UpdateBackupSchedule( ctx context.Context, request *pb.UpdateBackupScheduleRequest, ) (*pb.BackupSchedule, error) { + const methodName string = "UpdateBackupSchedule" ctx = grpcinfo.WithGRPCInfo(ctx) scheduleID := request.GetId() ctx = xlog.With(ctx, zap.String("BackupScheduleID", scheduleID)) - xlog.Debug(ctx, "UpdateBackupSchedule", zap.Stringer("request", request)) + xlog.Debug(ctx, methodName, zap.Stringer("request", request)) schedules, err := s.driver.SelectBackupSchedulesWithRPOInfo( ctx, queries.NewReadTableQuery( @@ -142,10 +157,12 @@ func (s *BackupScheduleService) UpdateBackupSchedule( if err != nil { xlog.Error(ctx, "error getting backup schedule", zap.Error(err)) + s.IncApiCallsCounter(methodName, codes.Internal) return nil, status.Error(codes.Internal, "error getting backup schedule") } if len(schedules) == 0 { xlog.Error(ctx, "backup schedule not found") + s.IncApiCallsCounter(methodName, codes.NotFound) return nil, status.Error(codes.NotFound, "backup schedule not found") } @@ -154,6 +171,7 @@ func (s *BackupScheduleService) UpdateBackupSchedule( // TODO: Need to check access to backup schedule not by container id? subject, err := auth.CheckAuth(ctx, s.auth, auth.PermissionBackupCreate, schedule.ContainerID, "") if err != nil { + s.IncApiCallsCounter(methodName, status.Code(err)) return nil, err } ctx = xlog.With(ctx, zap.String("SubjectID", subject)) @@ -161,11 +179,13 @@ func (s *BackupScheduleService) UpdateBackupSchedule( Endpoint: schedule.DatabaseEndpoint, DatabaseName: schedule.DatabaseName, }); err != nil { + s.IncApiCallsCounter(methodName, status.Code(err)) return nil, err } if schedule.Status == types.BackupScheduleStateDeleted { xlog.Error(ctx, "backup schedule was deleted") + s.IncApiCallsCounter(methodName, codes.FailedPrecondition) return nil, status.Error(codes.FailedPrecondition, "backup schedule was deleted") } @@ -180,17 +200,20 @@ func (s *BackupScheduleService) UpdateBackupSchedule( if request.ScheduleSettings.SchedulePattern != nil { _, err = types.ParseCronExpr(request.ScheduleSettings.SchedulePattern.Crontab) if err != nil { + s.IncApiCallsCounter(methodName, codes.FailedPrecondition) return nil, status.Error(codes.FailedPrecondition, "failed to parse crontab") } } if request.ScheduleSettings.RecoveryPointObjective != nil && request.ScheduleSettings.RecoveryPointObjective.Seconds == 0 { + s.IncApiCallsCounter(methodName, codes.FailedPrecondition) return nil, status.Error(codes.FailedPrecondition, "recovery point objective should be greater than 0") } schedule.ScheduleSettings = request.ScheduleSettings err = schedule.UpdateNextLaunch(s.clock.Now()) if err != nil { + s.IncApiCallsCounter(methodName, codes.FailedPrecondition) return nil, status.Error(codes.FailedPrecondition, "failed to update next launch time") } } @@ -206,22 +229,25 @@ func (s *BackupScheduleService) UpdateBackupSchedule( ctx, "can't update backup schedule", zap.String("backup schedule", schedule.Proto(s.clock).String()), zap.Error(err), ) + s.IncApiCallsCounter(methodName, codes.Internal) return nil, status.Error(codes.Internal, "can't update backup schedule") } - xlog.Info(ctx, "UpdateBackupSchedule was completed successfully", zap.Stringer("schedule", schedule)) + xlog.Debug(ctx, methodName, zap.Stringer("schedule", schedule)) + s.IncApiCallsCounter(methodName, codes.OK) return schedule.Proto(s.clock), nil } func (s *BackupScheduleService) GetBackupSchedule( ctx context.Context, request *pb.GetBackupScheduleRequest, ) (*pb.BackupSchedule, error) { + const methodName string = "GetBackupSchedule" ctx = grpcinfo.WithGRPCInfo(ctx) scheduleID := request.GetId() ctx = xlog.With(ctx, zap.String("BackupScheduleID", scheduleID)) - xlog.Debug(ctx, "GetBackupSchedule", zap.Stringer("request", request)) + xlog.Debug(ctx, methodName, zap.Stringer("request", request)) schedules, err := s.driver.SelectBackupSchedulesWithRPOInfo( ctx, queries.NewReadTableQuery( @@ -234,10 +260,12 @@ func (s *BackupScheduleService) GetBackupSchedule( if err != nil { xlog.Error(ctx, "error getting backup schedule", zap.Error(err)) + s.IncApiCallsCounter(methodName, codes.Internal) return nil, status.Error(codes.Internal, "error getting backup schedule") } if len(schedules) == 0 { xlog.Error(ctx, "backup schedule not found") + s.IncApiCallsCounter(methodName, codes.NotFound) return nil, status.Error(codes.NotFound, "backup schedule not found") // TODO: Permission denied? } @@ -246,23 +274,27 @@ func (s *BackupScheduleService) GetBackupSchedule( // TODO: Need to check access to backup schedule not by container id? subject, err := auth.CheckAuth(ctx, s.auth, auth.PermissionBackupGet, schedule.ContainerID, "") if err != nil { + s.IncApiCallsCounter(methodName, status.Code(err)) return nil, err } ctx = xlog.With(ctx, zap.String("SubjectID", subject)) - xlog.Debug(ctx, "GetBackupSchedule", zap.Stringer("schedule", schedule)) + xlog.Debug(ctx, methodName, zap.Stringer("schedule", schedule)) + s.IncApiCallsCounter(methodName, codes.OK) return schedule.Proto(s.clock), nil } func (s *BackupScheduleService) ListBackupSchedules( ctx context.Context, request *pb.ListBackupSchedulesRequest, ) (*pb.ListBackupSchedulesResponse, error) { + const methodName string = "ListBackupSchedules" ctx = grpcinfo.WithGRPCInfo(ctx) ctx = xlog.With(ctx, zap.String("ContainerID", request.ContainerId)) - xlog.Debug(ctx, "ListBackupSchedules", zap.String("request", request.String())) + xlog.Debug(ctx, methodName, zap.String("request", request.String())) subject, err := auth.CheckAuth(ctx, s.auth, auth.PermissionBackupList, request.ContainerId, "") if err != nil { + s.IncApiCallsCounter(methodName, status.Code(err)) return nil, err } ctx = xlog.With(ctx, zap.String("SubjectID", subject)) @@ -293,6 +325,7 @@ func (s *BackupScheduleService) ListBackupSchedules( pageSpec, err := queries.NewPageSpec(request.GetPageSize(), request.GetPageToken()) if err != nil { + s.IncApiCallsCounter(methodName, status.Code(err)) return nil, err } @@ -311,6 +344,7 @@ func (s *BackupScheduleService) ListBackupSchedules( ) if err != nil { xlog.Error(ctx, "error getting backup schedules", zap.Error(err)) + s.IncApiCallsCounter(methodName, codes.Internal) return nil, status.Error(codes.Internal, "error getting backup schedules") } pbSchedules := make([]*pb.BackupSchedule, 0, len(schedules)) @@ -321,19 +355,21 @@ func (s *BackupScheduleService) ListBackupSchedules( if uint64(len(pbSchedules)) == pageSpec.Limit { res.NextPageToken = strconv.FormatUint(pageSpec.Offset+pageSpec.Limit, 10) } - xlog.Debug(ctx, "ListBackupSchedules success") + xlog.Debug(ctx, methodName, zap.Stringer("response", res)) + s.IncApiCallsCounter(methodName, codes.OK) return res, nil } func (s *BackupScheduleService) ToggleBackupSchedule( ctx context.Context, request *pb.ToggleBackupScheduleRequest, ) (*pb.BackupSchedule, error) { + const methodName string = "ToggleBackupSchedule" ctx = grpcinfo.WithGRPCInfo(ctx) scheduleID := request.GetId() ctx = xlog.With(ctx, zap.String("BackupScheduleID", scheduleID)) - xlog.Debug(ctx, "ToggleBackupSchedule", zap.Stringer("request", request)) + xlog.Debug(ctx, methodName, zap.Stringer("request", request)) schedules, err := s.driver.SelectBackupSchedulesWithRPOInfo( ctx, queries.NewReadTableQuery( @@ -346,10 +382,12 @@ func (s *BackupScheduleService) ToggleBackupSchedule( if err != nil { xlog.Error(ctx, "error getting backup schedule", zap.Error(err)) + s.IncApiCallsCounter(methodName, codes.Internal) return nil, status.Error(codes.Internal, "error getting backup schedule") } if len(schedules) == 0 { xlog.Error(ctx, "backup schedule not found") + s.IncApiCallsCounter(methodName, codes.NotFound) return nil, status.Error(codes.NotFound, "backup schedule not found") } @@ -357,6 +395,7 @@ func (s *BackupScheduleService) ToggleBackupSchedule( ctx = xlog.With(ctx, zap.String("ContainerID", schedule.ContainerID)) subject, err := auth.CheckAuth(ctx, s.auth, auth.PermissionBackupCreate, schedule.ContainerID, "") if err != nil { + s.IncApiCallsCounter(methodName, status.Code(err)) return nil, err } ctx = xlog.With(ctx, zap.String("SubjectID", subject)) @@ -364,11 +403,13 @@ func (s *BackupScheduleService) ToggleBackupSchedule( Endpoint: schedule.DatabaseEndpoint, DatabaseName: schedule.DatabaseName, }); err != nil { + s.IncApiCallsCounter(methodName, status.Code(err)) return nil, err } if schedule.Status == types.BackupScheduleStateDeleted { xlog.Error(ctx, "backup schedule was deleted") + s.IncApiCallsCounter(methodName, codes.FailedPrecondition) return nil, status.Error(codes.FailedPrecondition, "backup schedule was deleted") } @@ -381,6 +422,7 @@ func (s *BackupScheduleService) ToggleBackupSchedule( if schedule.ScheduleSettings != nil { err = schedule.UpdateNextLaunch(s.clock.Now()) if err != nil { + s.IncApiCallsCounter(methodName, codes.Internal) return nil, status.Error(codes.Internal, "failed to update next launch time") } } @@ -391,22 +433,25 @@ func (s *BackupScheduleService) ToggleBackupSchedule( ctx, "can't update backup schedule", zap.String("backup schedule", schedule.Proto(s.clock).String()), zap.Error(err), ) + s.IncApiCallsCounter(methodName, codes.Internal) return nil, status.Error(codes.Internal, "can't update backup schedule") } - xlog.Info(ctx, "ToggleBackupSchedule was completed successfully", zap.Stringer("schedule", schedule)) + xlog.Debug(ctx, methodName, zap.Stringer("schedule", schedule)) + s.IncApiCallsCounter(methodName, codes.OK) return schedule.Proto(s.clock), nil } func (s *BackupScheduleService) DeleteBackupSchedule( ctx context.Context, request *pb.DeleteBackupScheduleRequest, ) (*pb.BackupSchedule, error) { + const methodName string = "DeleteBackupSchedule" ctx = grpcinfo.WithGRPCInfo(ctx) scheduleID := request.GetId() ctx = xlog.With(ctx, zap.String("BackupScheduleID", scheduleID)) - xlog.Debug(ctx, "DeleteBackupSchedule", zap.Stringer("request", request)) + xlog.Debug(ctx, methodName, zap.Stringer("request", request)) schedules, err := s.driver.SelectBackupSchedulesWithRPOInfo( ctx, queries.NewReadTableQuery( @@ -419,10 +464,12 @@ func (s *BackupScheduleService) DeleteBackupSchedule( if err != nil { xlog.Error(ctx, "error getting backup schedule", zap.Error(err)) + s.IncApiCallsCounter(methodName, codes.Internal) return nil, status.Error(codes.Internal, "error getting backup schedule") } if len(schedules) == 0 { xlog.Error(ctx, "backup schedule not found") + s.IncApiCallsCounter(methodName, codes.NotFound) return nil, status.Error(codes.NotFound, "backup schedule not found") } @@ -431,12 +478,14 @@ func (s *BackupScheduleService) DeleteBackupSchedule( // TODO: Need to check access to backup schedule not by container id? subject, err := auth.CheckAuth(ctx, s.auth, auth.PermissionBackupCreate, schedule.ContainerID, "") if err != nil { + s.IncApiCallsCounter(methodName, status.Code(err)) return nil, err } ctx = xlog.With(ctx, zap.String("SubjectID", subject)) if schedule.Status == types.BackupScheduleStateDeleted { xlog.Error(ctx, "backup schedule already deleted") + s.IncApiCallsCounter(methodName, codes.FailedPrecondition) return nil, status.Error(codes.FailedPrecondition, "backup schedule already deleted") } @@ -447,10 +496,12 @@ func (s *BackupScheduleService) DeleteBackupSchedule( ctx, "can't delete backup schedule", zap.String("backup schedule", schedule.Proto(s.clock).String()), zap.Error(err), ) + s.IncApiCallsCounter(methodName, codes.Internal) return nil, status.Error(codes.Internal, "can't delete backup schedule") } - xlog.Info(ctx, "DeleteBackupSchedule was completed successfully", zap.Stringer("schedule", schedule)) + xlog.Debug(ctx, methodName, zap.Stringer("schedule", schedule)) + s.IncApiCallsCounter(methodName, codes.OK) return schedule.Proto(s.clock), nil } @@ -462,11 +513,13 @@ func NewBackupScheduleService( driver db.DBConnector, clientConn client.ClientConnector, auth ap.AuthProvider, + mon metrics.MetricsRegistry, ) *BackupScheduleService { return &BackupScheduleService{ driver: driver, clientConn: clientConn, auth: auth, clock: clockwork.NewRealClock(), + mon: mon, } } diff --git a/internal/server/services/operation/operation_service.go b/internal/server/services/operation/operation_service.go index 59a07b9b..ba291348 100644 --- a/internal/server/services/operation/operation_service.go +++ b/internal/server/services/operation/operation_service.go @@ -7,6 +7,7 @@ import ( "ydbcp/internal/auth" "ydbcp/internal/connectors/db" "ydbcp/internal/connectors/db/yql/queries" + "ydbcp/internal/metrics" "ydbcp/internal/server" "ydbcp/internal/server/grpcinfo" "ydbcp/internal/types" @@ -24,17 +25,24 @@ type OperationService struct { pb.UnimplementedOperationServiceServer driver db.DBConnector auth ap.AuthProvider + mon metrics.MetricsRegistry +} + +func (s *OperationService) IncApiCallsCounter(methodName string, code codes.Code) { + s.mon.IncApiCallsCounter("OperationService", methodName, code.String()) } func (s *OperationService) ListOperations( ctx context.Context, request *pb.ListOperationsRequest, ) (*pb.ListOperationsResponse, error) { + const methodName string = "ListOperations" ctx = grpcinfo.WithGRPCInfo(ctx) - xlog.Debug(ctx, "ListOperations", zap.String("request", request.String())) + xlog.Debug(ctx, methodName, zap.String("request", request.String())) ctx = xlog.With(ctx, zap.String("ContainerID", request.ContainerId)) subject, err := auth.CheckAuth(ctx, s.auth, auth.PermissionBackupList, request.ContainerId, "") if err != nil { + s.IncApiCallsCounter(methodName, status.Code(err)) return nil, err } ctx = xlog.With(ctx, zap.String("SubjectID", subject)) @@ -77,6 +85,7 @@ func (s *OperationService) ListOperations( pageSpec, err := queries.NewPageSpec(request.GetPageSize(), request.GetPageToken()) if err != nil { + s.IncApiCallsCounter(methodName, status.Code(err)) return nil, err } @@ -95,6 +104,7 @@ func (s *OperationService) ListOperations( ) if err != nil { xlog.Error(ctx, "error getting operations", zap.Error(err)) + s.IncApiCallsCounter(methodName, codes.Internal) return nil, status.Error(codes.Internal, "error getting operations") } pbOperations := make([]*pb.Operation, 0, len(operations)) @@ -105,7 +115,8 @@ func (s *OperationService) ListOperations( if uint64(len(pbOperations)) == pageSpec.Limit { res.NextPageToken = strconv.FormatUint(pageSpec.Offset+pageSpec.Limit, 10) } - xlog.Debug(ctx, "success ListOperations") + xlog.Debug(ctx, methodName, zap.Stringer("response", res)) + s.IncApiCallsCounter(methodName, codes.OK) return res, nil } @@ -113,8 +124,9 @@ func (s *OperationService) CancelOperation( ctx context.Context, request *pb.CancelOperationRequest, ) (*pb.Operation, error) { + const methodName string = "CancelOperation" ctx = grpcinfo.WithGRPCInfo(ctx) - xlog.Debug(ctx, "CancelOperation", zap.String("request", request.String())) + xlog.Debug(ctx, methodName, zap.String("request", request.String())) ctx = xlog.With(ctx, zap.String("OperationID", request.OperationId)) operations, err := s.driver.SelectOperations( @@ -131,11 +143,13 @@ func (s *OperationService) CancelOperation( if err != nil { xlog.Error(ctx, "error getting operation", zap.Error(err)) + s.IncApiCallsCounter(methodName, codes.Internal) return nil, status.Error(codes.Internal, "error getting operation") } if len(operations) == 0 { xlog.Error(ctx, "operation not found") + s.IncApiCallsCounter(methodName, codes.NotFound) return nil, status.Error(codes.NotFound, "operation not found") } @@ -154,20 +168,24 @@ func (s *OperationService) CancelOperation( permission = auth.PermissionBackupRestore } else if operation.GetType() == types.OperationTypeDB { xlog.Error(ctx, "can't cancel DeleteBackup operation") + s.IncApiCallsCounter(methodName, codes.FailedPrecondition) return nil, status.Errorf(codes.FailedPrecondition, "can't cancel DeleteBackup operation: %s", types.OperationToString(operation)) } else { xlog.Error(ctx, "unknown operation type") + s.IncApiCallsCounter(methodName, codes.Internal) return nil, status.Errorf(codes.Internal, "unknown operation type: %s", operation.GetType().String()) } subject, err := auth.CheckAuth(ctx, s.auth, permission, operation.GetContainerID(), "") if err != nil { + s.IncApiCallsCounter(methodName, status.Code(err)) return nil, err } ctx = xlog.With(ctx, zap.String("SubjectID", subject)) if operation.GetState() != types.OperationStatePending && operation.GetState() != types.OperationStateRunning { xlog.Error(ctx, "can't cancel operation with state", zap.String("OperationState", operation.GetState().String())) + s.IncApiCallsCounter(methodName, codes.FailedPrecondition) return nil, status.Errorf(codes.FailedPrecondition, "can't cancel operation with state: %s", operation.GetState().String()) } @@ -177,22 +195,26 @@ func (s *OperationService) CancelOperation( err = s.driver.UpdateOperation(ctx, operation) if err != nil { xlog.Error(ctx, "error updating operation", zap.Error(err)) + s.IncApiCallsCounter(methodName, codes.Internal) return nil, status.Error(codes.Internal, "error updating operation") } xlog.Debug( - ctx, "CancelOperation was started", + ctx, methodName, zap.String("operation", types.OperationToString(operation)), ) + s.IncApiCallsCounter(methodName, codes.OK) return operation.Proto(), nil } func (s *OperationService) GetOperation(ctx context.Context, request *pb.GetOperationRequest) (*pb.Operation, error) { + const methodName string = "GetOperation" ctx = grpcinfo.WithGRPCInfo(ctx) - xlog.Debug(ctx, "GetOperation", zap.String("request", request.String())) + xlog.Debug(ctx, methodName, zap.String("request", request.String())) operationID, err := types.ParseObjectID(request.GetId()) if err != nil { xlog.Error(ctx, "failed to parse OperationID", zap.String("OperationID", request.GetId()), zap.Error(err)) + s.IncApiCallsCounter(methodName, codes.Internal) return nil, status.Error(codes.Internal, "failed to parse ObjectID") } ctx = xlog.With(ctx, zap.String("OperationID", operationID)) @@ -210,11 +232,13 @@ func (s *OperationService) GetOperation(ctx context.Context, request *pb.GetOper ) if err != nil { xlog.Error(ctx, "can't select operations", zap.Error(err)) + s.IncApiCallsCounter(methodName, codes.Internal) return nil, status.Error(codes.Internal, "can't select operations") } if len(operations) == 0 { xlog.Error(ctx, "operation not found") + s.IncApiCallsCounter(methodName, codes.NotFound) return nil, status.Error(codes.NotFound, "operation not found") // TODO: permission denied? } operation := operations[0] @@ -222,11 +246,13 @@ func (s *OperationService) GetOperation(ctx context.Context, request *pb.GetOper // TODO: Need to check access to operation resource by operationID subject, err := auth.CheckAuth(ctx, s.auth, auth.PermissionBackupGet, operation.GetContainerID(), "") if err != nil { + s.IncApiCallsCounter(methodName, status.Code(err)) return nil, err } ctx = xlog.With(ctx, zap.String("SubjectID", subject)) - xlog.Debug(ctx, "GetOperation", zap.String("operation", types.OperationToString(operations[0]))) + xlog.Debug(ctx, methodName, zap.String("operation", types.OperationToString(operations[0]))) + s.IncApiCallsCounter(methodName, codes.OK) return operations[0].Proto(), nil } @@ -234,9 +260,14 @@ func (s *OperationService) Register(server server.Server) { pb.RegisterOperationServiceServer(server.GRPCServer(), s) } -func NewOperationService(driver db.DBConnector, auth ap.AuthProvider) *OperationService { +func NewOperationService( + driver db.DBConnector, + auth ap.AuthProvider, + mon metrics.MetricsRegistry, +) *OperationService { return &OperationService{ driver: driver, auth: auth, + mon: mon, } }