From be20dc26ade2fa0212dc91126cf42087a84a07c9 Mon Sep 17 00:00:00 2001 From: Dhawal Sanghvi <43755122+dhawal1248@users.noreply.github.com> Date: Thu, 23 May 2024 16:51:11 +0530 Subject: [PATCH 1/2] feat: sre 456 ut move high cardinality histogram metrics to summaries cp (#3409) * chore(user-transformer): move some high cardinality metrics to summaries * chore(user-transformer): move some high cardinality metrics to summaries * chore: add flag to disable summary metric collection * chore: minor fix * chore: minor fix --- src/legacy/router.js | 3 + src/services/userTransform.ts | 12 ++- src/util/customTransformer-v1.js | 1 + src/util/customTransformer.js | 1 + src/util/customTransforrmationsStore-v1.js | 3 + src/util/customTransforrmationsStore.js | 1 + src/util/openfaas/index.js | 1 + src/util/prometheus.js | 97 +++++++++++++++++++++- src/util/stats.js | 24 +++++- src/util/statsd.js | 5 ++ 10 files changed, 141 insertions(+), 7 deletions(-) diff --git a/src/legacy/router.js b/src/legacy/router.js index 9dd83b5988..afc8c1a797 100644 --- a/src/legacy/router.js +++ b/src/legacy/router.js @@ -649,6 +649,9 @@ if (startDestTransformer) { stats.timing('user_transform_request_latency', startTime, { processSessions, }); + stats.timingSummary('user_transform_request_latency_summary', startTime, { + processSessions, + }); stats.increment('user_transform_requests', { processSessions }); stats.histogram('user_transform_output_events', transformedEvents.length, { processSessions, diff --git a/src/services/userTransform.ts b/src/services/userTransform.ts index 18c47ddc83..62980a935a 100644 --- a/src/services/userTransform.ts +++ b/src/services/userTransform.ts @@ -173,7 +173,17 @@ export class UserTransformService { ...getTransformationMetadata(eventsToProcess[0]?.metadata), }); - stats.histogram('user_transform_batch_size', requestSize, { + stats.timing('user_transform_batch_size', requestSize, { + ...metaTags, + ...getTransformationMetadata(eventsToProcess[0]?.metadata), + }); + + stats.timingSummary('user_transform_request_latency_summary', userFuncStartTime, { + ...metaTags, + ...getTransformationMetadata(eventsToProcess[0]?.metadata), + }); + + stats.timingSummary('user_transform_batch_size_summary', requestSize, { ...metaTags, ...getTransformationMetadata(eventsToProcess[0]?.metadata), }); diff --git a/src/util/customTransformer-v1.js b/src/util/customTransformer-v1.js index 7e854a3714..e9877a614d 100644 --- a/src/util/customTransformer-v1.js +++ b/src/util/customTransformer-v1.js @@ -93,6 +93,7 @@ async function userTransformHandlerV1( }; stats.counter('user_transform_function_input_events', events.length, tags); stats.timing('user_transform_function_latency', invokeTime, tags); + stats.timingSummary('user_transform_function_latency_summary', invokeTime, tags); } return { transformedEvents, logs }; diff --git a/src/util/customTransformer.js b/src/util/customTransformer.js index a87c12dd6e..37364ef5d0 100644 --- a/src/util/customTransformer.js +++ b/src/util/customTransformer.js @@ -260,6 +260,7 @@ async function runUserTransform( stats.counter('user_transform_function_input_events', events.length, tags); stats.timing('user_transform_function_latency', invokeTime, tags); + stats.timingSummary('user_transform_function_latency_summary', invokeTime, tags); } return { diff --git a/src/util/customTransforrmationsStore-v1.js b/src/util/customTransforrmationsStore-v1.js index 3263049b6f..6e2d799f3a 100644 --- a/src/util/customTransforrmationsStore-v1.js +++ b/src/util/customTransforrmationsStore-v1.js @@ -31,6 +31,7 @@ async function getTransformationCodeV1(versionId) { responseStatusHandler(response.status, 'Transformation', versionId, url); stats.increment('get_transformation_code', { success: 'true', ...tags }); stats.timing('get_transformation_code_time', startTime, tags); + stats.timingSummary('get_transformation_code_time_summary', startTime, tags); const myJson = await response.json(); transformationCache[versionId] = myJson; return myJson; @@ -56,6 +57,7 @@ async function getLibraryCodeV1(versionId) { responseStatusHandler(response.status, 'Transformation Library', versionId, url); stats.increment('get_libraries_code', { success: 'true', ...tags }); stats.timing('get_libraries_code_time', startTime, tags); + stats.timingSummary('get_libraries_code_time_summary', startTime, tags); const myJson = await response.json(); libraryCache[versionId] = myJson; return myJson; @@ -83,6 +85,7 @@ async function getRudderLibByImportName(importName) { responseStatusHandler(response.status, 'Rudder Library', importName, url); stats.increment('get_libraries_code', { success: 'true', ...tags }); stats.timing('get_libraries_code_time', startTime, tags); + stats.timingSummary('get_libraries_code_time_summary', startTime, tags); const myJson = await response.json(); rudderLibraryCache[importName] = myJson; return myJson; diff --git a/src/util/customTransforrmationsStore.js b/src/util/customTransforrmationsStore.js index 08d417c07c..2c5a7b446d 100644 --- a/src/util/customTransforrmationsStore.js +++ b/src/util/customTransforrmationsStore.js @@ -24,6 +24,7 @@ async function getTransformationCode(versionId) { responseStatusHandler(response.status, 'Transformation', versionId, url); stats.increment('get_transformation_code', { versionId, success: 'true' }); stats.timing('get_transformation_code_time', startTime, { versionId }); + stats.timingSummary('get_transformation_code_time_summary', startTime, { versionId }); const myJson = await response.json(); myCache.set(versionId, myJson); return myJson; diff --git a/src/util/openfaas/index.js b/src/util/openfaas/index.js index 7a1fce3cfa..3cf3525e6f 100644 --- a/src/util/openfaas/index.js +++ b/src/util/openfaas/index.js @@ -305,6 +305,7 @@ const executeFaasFunction = async ( stats.counter('user_transform_function_input_events', events.length, tags); stats.timing('user_transform_function_latency', startTime, tags); + stats.timingSummary('user_transform_function_latency_summary', startTime, tags); } }; diff --git a/src/util/prometheus.js b/src/util/prometheus.js index 882dff9e75..a618d35068 100644 --- a/src/util/prometheus.js +++ b/src/util/prometheus.js @@ -11,7 +11,7 @@ function appendPrefix(name) { } class Prometheus { - constructor() { + constructor(enableSummaryMetrics = true) { this.prometheusRegistry = new prometheusClient.Registry(); this.prometheusRegistry.setDefaultLabels(defaultLabels); prometheusClient.collectDefaultMetrics({ @@ -21,7 +21,7 @@ class Prometheus { prometheusClient.AggregatorRegistry.setRegistries(this.prometheusRegistry); this.aggregatorRegistry = new prometheusClient.AggregatorRegistry(); - this.createMetrics(); + this.createMetrics(enableSummaryMetrics); } async metricsController(ctx) { @@ -56,11 +56,22 @@ class Prometheus { return gauge; } - newSummaryStat(name, help, labelNames) { + newSummaryStat( + name, + help, + labelNames, + percentiles = [0.5, 0.9, 0.99], + maxAgeSeconds = 300, + ageBuckets = 5, + ) { + // we enable a 5 minute sliding window and calculate the 50th, 90th, and 99th percentiles by default const summary = new prometheusClient.Summary({ name, help, labelNames, + percentiles, + maxAgeSeconds, + ageBuckets, }); this.prometheusRegistry.registerMetric(summary); return summary; @@ -117,6 +128,21 @@ class Prometheus { } } + timingSummary(name, start, tags = {}) { + try { + let metric = this.prometheusRegistry.getSingleMetric(appendPrefix(name)); + if (!metric) { + logger.warn( + `Prometheus: summary metric ${name} not found in the registry. Creating a new one`, + ); + metric = this.newSummaryStat(name, name, Object.keys(tags)); + } + metric.observe(tags, (new Date() - start) / 1000); + } catch (e) { + logger.error(`Prometheus: Summary metric ${name} failed with error ${e}`); + } + } + histogram(name, value, tags = {}) { try { let metric = this.prometheusRegistry.getSingleMetric(appendPrefix(name)); @@ -166,7 +192,7 @@ class Prometheus { } } - createMetrics() { + createMetrics(enableSummaryMetrics) { const metrics = [ // Counters { @@ -698,6 +724,18 @@ class Prometheus { 'k8_namespace', ], }, + { + name: 'user_transform_request_latency_summary', + help: 'user_transform_request_latency_summary', + type: 'summary', + labelNames: [ + 'workspaceId', + 'transformationId', + 'sourceType', + 'destinationType', + 'k8_namespace', + ], + }, { name: 'user_transform_batch_size', help: 'user_transform_batch_size', @@ -714,6 +752,18 @@ class Prometheus { 524288000, ], // 1KB, 100KB, 0.5MB, 1MB, 10MB, 20MB, 50MB, 100MB, 200MB, 500MB }, + { + name: 'user_transform_batch_size_summary', + help: 'user_transform_batch_size_summary', + type: 'summary', + labelNames: [ + 'workspaceId', + 'transformationId', + 'sourceType', + 'destinationType', + 'k8_namespace', + ], + }, { name: 'source_transform_request_latency', help: 'source_transform_request_latency', @@ -770,12 +820,24 @@ class Prometheus { type: 'histogram', labelNames: ['versionId', 'version'], }, + { + name: 'get_transformation_code_time_summary', + help: 'get_transformation_code_time_summary', + type: 'summary', + labelNames: ['versionId', 'version'], + }, { name: 'get_libraries_code_time', help: 'get_libraries_code_time', type: 'histogram', labelNames: ['libraryVersionId', 'versionId', 'type', 'version'], }, + { + name: 'get_libraries_code_time_summary', + help: 'get_libraries_code_time_summary', + type: 'summary', + labelNames: ['libraryVersionId', 'versionId', 'type', 'version'], + }, { name: 'isolate_cpu_time', help: 'isolate_cpu_time', @@ -1027,6 +1089,22 @@ class Prometheus { 'workspaceId', ], }, + { + name: 'user_transform_function_latency_summary', + help: 'user_transform_function_latency_summary', + type: 'summary', + labelNames: [ + 'identifier', + 'testMode', + 'sourceType', + 'destinationType', + 'k8_namespace', + 'errored', + 'statusCode', + 'transformationId', + 'workspaceId', + ], + }, ]; metrics.forEach((metric) => { @@ -1042,6 +1120,17 @@ class Prometheus { metric.labelNames, metric.buckets, ); + } else if (metric.type === 'summary') { + if (enableSummaryMetrics) { + this.newSummaryStat( + appendPrefix(metric.name), + metric.help, + metric.labelNames, + metric.percentiles, + metric.maxAge, + metric.ageBuckets, + ); + } } else { logger.error( `Prometheus: Metric creation failed. Name: ${metric.name}. Invalid type: ${metric.type}`, diff --git a/src/util/stats.js b/src/util/stats.js index 9a32fd1de3..0aa13fc85c 100644 --- a/src/util/stats.js +++ b/src/util/stats.js @@ -4,6 +4,8 @@ const logger = require('../logger'); const enableStats = process.env.ENABLE_STATS !== 'false'; const statsClientType = process.env.STATS_CLIENT || 'statsd'; +// summary metrics are enabled by default. To disable set ENABLE_SUMMARY_METRICS='false'. +const enableSummaryMetrics = process.env.ENABLE_SUMMARY_METRICS !== 'false'; let statsClient; function init() { @@ -19,7 +21,7 @@ function init() { case 'prometheus': logger.info('setting up prometheus client'); - statsClient = new prometheus.Prometheus(); + statsClient = new prometheus.Prometheus(enableSummaryMetrics); break; default: @@ -38,6 +40,15 @@ const timing = (name, start, tags = {}) => { statsClient.timing(name, start, tags); }; +// timingSummary is used to record observations for a summary metric +const timingSummary = (name, start, tags = {}) => { + if (!enableStats || !statsClient || !enableSummaryMetrics) { + return; + } + + statsClient.timingSummary(name, start, tags); +}; + const increment = (name, tags = {}) => { if (!enableStats || !statsClient) { return; @@ -88,4 +99,13 @@ async function metricsController(ctx) { init(); -module.exports = { init, timing, increment, counter, gauge, histogram, metricsController }; +module.exports = { + init, + timing, + timingSummary, + increment, + counter, + gauge, + histogram, + metricsController, +}; diff --git a/src/util/statsd.js b/src/util/statsd.js index a32a6f6f30..7613de7975 100644 --- a/src/util/statsd.js +++ b/src/util/statsd.js @@ -21,6 +21,11 @@ class Statsd { this.statsdClient.timing(name, start, tags); } + // timingSummary is just a wrapper around timing for statsd.For prometheus, we will have to implement a different function. + timingSummary(name, start, tags = {}) { + this.statsdClient.timing(name, start, tags); + } + increment(name, tags = {}) { this.statsdClient.increment(name, 1, tags); } From 9280aca17b2277bb1b2d717238b9f6b48e637f68 Mon Sep 17 00:00:00 2001 From: GitHub Actions Date: Thu, 23 May 2024 12:12:17 +0000 Subject: [PATCH 2/2] chore(release): 1.67.0 --- CHANGELOG.md | 7 +++++++ package-lock.json | 4 ++-- package.json | 2 +- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4258fdfcc5..9c04835cd4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,13 @@ All notable changes to this project will be documented in this file. See [standard-version](https://github.com/conventional-changelog/standard-version) for commit guidelines. +## [1.67.0](https://github.com/rudderlabs/rudder-transformer/compare/v1.66.1...v1.67.0) (2024-05-23) + + +### Features + +* sre 456 ut move high cardinality histogram metrics to summaries cp ([#3409](https://github.com/rudderlabs/rudder-transformer/issues/3409)) ([be20dc2](https://github.com/rudderlabs/rudder-transformer/commit/be20dc26ade2fa0212dc91126cf42087a84a07c9)) + ### [1.66.1](https://github.com/rudderlabs/rudder-transformer/compare/v1.66.0...v1.66.1) (2024-05-20) diff --git a/package-lock.json b/package-lock.json index f51f3ccd8e..bfeb00963a 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "rudder-transformer", - "version": "1.66.1", + "version": "1.67.0", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "rudder-transformer", - "version": "1.66.1", + "version": "1.67.0", "license": "ISC", "dependencies": { "@amplitude/ua-parser-js": "0.7.24", diff --git a/package.json b/package.json index 50a276ce42..7fa3a7330c 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "rudder-transformer", - "version": "1.66.1", + "version": "1.67.0", "description": "", "homepage": "https://github.com/rudderlabs/rudder-transformer#readme", "bugs": {