From 8226093178549ed2f2e9fac73d934a9143cfed71 Mon Sep 17 00:00:00 2001 From: Dhawal Sanghvi Date: Thu, 16 May 2024 17:47:54 +0530 Subject: [PATCH 1/4] chore(user-transformer): move some high cardinality metrics to summaries --- src/legacy/router.js | 3 + src/middleware.js | 2 +- src/services/userTransform.ts | 12 ++- src/util/customTransformer-v1.js | 1 + src/util/customTransformer.js | 2 +- src/util/customTransforrmationsStore-v1.js | 3 + src/util/customTransforrmationsStore.js | 1 + src/util/openfaas/index.js | 2 +- src/util/prometheus.js | 89 +++++++++++++++++++++- src/util/stats.js | 20 ++++- src/util/statsd.js | 5 ++ 11 files changed, 134 insertions(+), 6 deletions(-) diff --git a/src/legacy/router.js b/src/legacy/router.js index 9dd83b5988..afc8c1a797 100644 --- a/src/legacy/router.js +++ b/src/legacy/router.js @@ -649,6 +649,9 @@ if (startDestTransformer) { stats.timing('user_transform_request_latency', startTime, { processSessions, }); + stats.timingSummary('user_transform_request_latency_summary', startTime, { + processSessions, + }); stats.increment('user_transform_requests', { processSessions }); stats.histogram('user_transform_output_events', transformedEvents.length, { processSessions, diff --git a/src/middleware.js b/src/middleware.js index 543b3af8d1..541461a1c2 100644 --- a/src/middleware.js +++ b/src/middleware.js @@ -29,7 +29,7 @@ function durationMiddleware() { route: ctx.request.url, destType: getDestTypeFromContext(ctx), }; - stats.timing('http_request_duration', startTime, labels); + stats.timing('http_request_duration_summary', startTime, labels); }; } diff --git a/src/services/userTransform.ts b/src/services/userTransform.ts index 18c47ddc83..62980a935a 100644 --- a/src/services/userTransform.ts +++ b/src/services/userTransform.ts @@ -173,7 +173,17 @@ export class UserTransformService { ...getTransformationMetadata(eventsToProcess[0]?.metadata), }); - stats.histogram('user_transform_batch_size', requestSize, { + stats.timing('user_transform_batch_size', requestSize, { + ...metaTags, + ...getTransformationMetadata(eventsToProcess[0]?.metadata), + }); + + stats.timingSummary('user_transform_request_latency_summary', userFuncStartTime, { + ...metaTags, + ...getTransformationMetadata(eventsToProcess[0]?.metadata), + }); + + stats.timingSummary('user_transform_batch_size_summary', requestSize, { ...metaTags, ...getTransformationMetadata(eventsToProcess[0]?.metadata), }); diff --git a/src/util/customTransformer-v1.js b/src/util/customTransformer-v1.js index 7e854a3714..e9877a614d 100644 --- a/src/util/customTransformer-v1.js +++ b/src/util/customTransformer-v1.js @@ -93,6 +93,7 @@ async function userTransformHandlerV1( }; stats.counter('user_transform_function_input_events', events.length, tags); stats.timing('user_transform_function_latency', invokeTime, tags); + stats.timingSummary('user_transform_function_latency_summary', invokeTime, tags); } return { transformedEvents, logs }; diff --git a/src/util/customTransformer.js b/src/util/customTransformer.js index a87c12dd6e..5ae9df95ba 100644 --- a/src/util/customTransformer.js +++ b/src/util/customTransformer.js @@ -259,7 +259,7 @@ async function runUserTransform( }; stats.counter('user_transform_function_input_events', events.length, tags); - stats.timing('user_transform_function_latency', invokeTime, tags); + stats.timing('user_transform_function_latency_summary', invokeTime, tags); } return { diff --git a/src/util/customTransforrmationsStore-v1.js b/src/util/customTransforrmationsStore-v1.js index 3263049b6f..6e2d799f3a 100644 --- a/src/util/customTransforrmationsStore-v1.js +++ b/src/util/customTransforrmationsStore-v1.js @@ -31,6 +31,7 @@ async function getTransformationCodeV1(versionId) { responseStatusHandler(response.status, 'Transformation', versionId, url); stats.increment('get_transformation_code', { success: 'true', ...tags }); stats.timing('get_transformation_code_time', startTime, tags); + stats.timingSummary('get_transformation_code_time_summary', startTime, tags); const myJson = await response.json(); transformationCache[versionId] = myJson; return myJson; @@ -56,6 +57,7 @@ async function getLibraryCodeV1(versionId) { responseStatusHandler(response.status, 'Transformation Library', versionId, url); stats.increment('get_libraries_code', { success: 'true', ...tags }); stats.timing('get_libraries_code_time', startTime, tags); + stats.timingSummary('get_libraries_code_time_summary', startTime, tags); const myJson = await response.json(); libraryCache[versionId] = myJson; return myJson; @@ -83,6 +85,7 @@ async function getRudderLibByImportName(importName) { responseStatusHandler(response.status, 'Rudder Library', importName, url); stats.increment('get_libraries_code', { success: 'true', ...tags }); stats.timing('get_libraries_code_time', startTime, tags); + stats.timingSummary('get_libraries_code_time_summary', startTime, tags); const myJson = await response.json(); rudderLibraryCache[importName] = myJson; return myJson; diff --git a/src/util/customTransforrmationsStore.js b/src/util/customTransforrmationsStore.js index 08d417c07c..2c5a7b446d 100644 --- a/src/util/customTransforrmationsStore.js +++ b/src/util/customTransforrmationsStore.js @@ -24,6 +24,7 @@ async function getTransformationCode(versionId) { responseStatusHandler(response.status, 'Transformation', versionId, url); stats.increment('get_transformation_code', { versionId, success: 'true' }); stats.timing('get_transformation_code_time', startTime, { versionId }); + stats.timingSummary('get_transformation_code_time_summary', startTime, { versionId }); const myJson = await response.json(); myCache.set(versionId, myJson); return myJson; diff --git a/src/util/openfaas/index.js b/src/util/openfaas/index.js index 7a1fce3cfa..26b57b68be 100644 --- a/src/util/openfaas/index.js +++ b/src/util/openfaas/index.js @@ -304,7 +304,7 @@ const executeFaasFunction = async ( }; stats.counter('user_transform_function_input_events', events.length, tags); - stats.timing('user_transform_function_latency', startTime, tags); + stats.timing('user_transform_function_latency_summary', startTime, tags); } }; diff --git a/src/util/prometheus.js b/src/util/prometheus.js index 882dff9e75..9eda9b21d5 100644 --- a/src/util/prometheus.js +++ b/src/util/prometheus.js @@ -56,11 +56,22 @@ class Prometheus { return gauge; } - newSummaryStat(name, help, labelNames) { + newSummaryStat( + name, + help, + labelNames, + percentiles = [0.5, 0.9, 0.99], + maxAgeSeconds = 300, + ageBuckets = 5, + ) { + // we enable a 5 minute sliding window and calculate the 50th, 90th, and 99th percentiles by default const summary = new prometheusClient.Summary({ name, help, labelNames, + percentiles, + maxAgeSeconds, + ageBuckets, }); this.prometheusRegistry.registerMetric(summary); return summary; @@ -117,6 +128,21 @@ class Prometheus { } } + timingSummary(name, start, tags = {}) { + try { + let metric = this.prometheusRegistry.getSingleMetric(appendPrefix(name)); + if (!metric) { + logger.warn( + `Prometheus: summary metric ${name} not found in the registry. Creating a new one`, + ); + metric = this.newSummaryStat(name, name, Object.keys(tags)); + } + metric.observe(tags, (new Date() - start) / 1000); + } catch (e) { + logger.error(`Prometheus: Summary metric ${name} failed with error ${e}`); + } + } + histogram(name, value, tags = {}) { try { let metric = this.prometheusRegistry.getSingleMetric(appendPrefix(name)); @@ -698,6 +724,18 @@ class Prometheus { 'k8_namespace', ], }, + { + name: 'user_transform_request_latency_summary', + help: 'user_transform_request_latency_summary', + type: 'summary', + labelNames: [ + 'workspaceId', + 'transformationId', + 'sourceType', + 'destinationType', + 'k8_namespace', + ], + }, { name: 'user_transform_batch_size', help: 'user_transform_batch_size', @@ -714,6 +752,18 @@ class Prometheus { 524288000, ], // 1KB, 100KB, 0.5MB, 1MB, 10MB, 20MB, 50MB, 100MB, 200MB, 500MB }, + { + name: 'user_transform_batch_size_summary', + help: 'user_transform_batch_size_summary', + type: 'summary', + labelNames: [ + 'workspaceId', + 'transformationId', + 'sourceType', + 'destinationType', + 'k8_namespace', + ], + }, { name: 'source_transform_request_latency', help: 'source_transform_request_latency', @@ -770,12 +820,24 @@ class Prometheus { type: 'histogram', labelNames: ['versionId', 'version'], }, + { + name: 'get_transformation_code_time_summary', + help: 'get_transformation_code_time_summary', + type: 'summary', + labelNames: ['versionId', 'version'], + }, { name: 'get_libraries_code_time', help: 'get_libraries_code_time', type: 'histogram', labelNames: ['libraryVersionId', 'versionId', 'type', 'version'], }, + { + name: 'get_libraries_code_time_summary', + help: 'get_libraries_code_time_summary', + type: 'summary', + labelNames: ['libraryVersionId', 'versionId', 'type', 'version'], + }, { name: 'isolate_cpu_time', help: 'isolate_cpu_time', @@ -1027,6 +1089,22 @@ class Prometheus { 'workspaceId', ], }, + { + name: 'user_transform_function_latency_summary', + help: 'user_transform_function_latency_summary', + type: 'summary', + labelNames: [ + 'identifier', + 'testMode', + 'sourceType', + 'destinationType', + 'k8_namespace', + 'errored', + 'statusCode', + 'transformationId', + 'workspaceId', + ], + }, ]; metrics.forEach((metric) => { @@ -1042,6 +1120,15 @@ class Prometheus { metric.labelNames, metric.buckets, ); + } else if (metric.type === 'summary') { + this.newSummaryStat( + appendPrefix(metric.name), + metric.help, + metric.labelNames, + metric.percentiles, + metric.maxAge, + metric.ageBuckets, + ); } else { logger.error( `Prometheus: Metric creation failed. Name: ${metric.name}. Invalid type: ${metric.type}`, diff --git a/src/util/stats.js b/src/util/stats.js index 9a32fd1de3..0853554306 100644 --- a/src/util/stats.js +++ b/src/util/stats.js @@ -38,6 +38,15 @@ const timing = (name, start, tags = {}) => { statsClient.timing(name, start, tags); }; +// timingSummary is used to record observations for a summary metric +const timingSummary = (name, start, tags = {}) => { + if (!enableStats || !statsClient) { + return; + } + + statsClient.timingSummary(name, start, tags); +}; + const increment = (name, tags = {}) => { if (!enableStats || !statsClient) { return; @@ -88,4 +97,13 @@ async function metricsController(ctx) { init(); -module.exports = { init, timing, increment, counter, gauge, histogram, metricsController }; +module.exports = { + init, + timing, + timingSummary, + increment, + counter, + gauge, + histogram, + metricsController, +}; diff --git a/src/util/statsd.js b/src/util/statsd.js index a32a6f6f30..7613de7975 100644 --- a/src/util/statsd.js +++ b/src/util/statsd.js @@ -21,6 +21,11 @@ class Statsd { this.statsdClient.timing(name, start, tags); } + // timingSummary is just a wrapper around timing for statsd.For prometheus, we will have to implement a different function. + timingSummary(name, start, tags = {}) { + this.statsdClient.timing(name, start, tags); + } + increment(name, tags = {}) { this.statsdClient.increment(name, 1, tags); } From 397df55e51bee09d4382e73e49d2840571fa491a Mon Sep 17 00:00:00 2001 From: Dhawal Sanghvi Date: Thu, 16 May 2024 18:06:58 +0530 Subject: [PATCH 2/4] chore(user-transformer): move some high cardinality metrics to summaries --- src/middleware.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/middleware.js b/src/middleware.js index 541461a1c2..543b3af8d1 100644 --- a/src/middleware.js +++ b/src/middleware.js @@ -29,7 +29,7 @@ function durationMiddleware() { route: ctx.request.url, destType: getDestTypeFromContext(ctx), }; - stats.timing('http_request_duration_summary', startTime, labels); + stats.timing('http_request_duration', startTime, labels); }; } From 5774bf6caeec1ea24511adb51fe1c5c94b5c6d8b Mon Sep 17 00:00:00 2001 From: Dhawal Sanghvi Date: Tue, 21 May 2024 17:25:13 +0530 Subject: [PATCH 3/4] chore: add flag to disable summary metric collection --- src/util/prometheus.js | 24 +++++++++++++----------- src/util/stats.js | 6 ++++-- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/src/util/prometheus.js b/src/util/prometheus.js index 9eda9b21d5..a618d35068 100644 --- a/src/util/prometheus.js +++ b/src/util/prometheus.js @@ -11,7 +11,7 @@ function appendPrefix(name) { } class Prometheus { - constructor() { + constructor(enableSummaryMetrics = true) { this.prometheusRegistry = new prometheusClient.Registry(); this.prometheusRegistry.setDefaultLabels(defaultLabels); prometheusClient.collectDefaultMetrics({ @@ -21,7 +21,7 @@ class Prometheus { prometheusClient.AggregatorRegistry.setRegistries(this.prometheusRegistry); this.aggregatorRegistry = new prometheusClient.AggregatorRegistry(); - this.createMetrics(); + this.createMetrics(enableSummaryMetrics); } async metricsController(ctx) { @@ -192,7 +192,7 @@ class Prometheus { } } - createMetrics() { + createMetrics(enableSummaryMetrics) { const metrics = [ // Counters { @@ -1121,14 +1121,16 @@ class Prometheus { metric.buckets, ); } else if (metric.type === 'summary') { - this.newSummaryStat( - appendPrefix(metric.name), - metric.help, - metric.labelNames, - metric.percentiles, - metric.maxAge, - metric.ageBuckets, - ); + if (enableSummaryMetrics) { + this.newSummaryStat( + appendPrefix(metric.name), + metric.help, + metric.labelNames, + metric.percentiles, + metric.maxAge, + metric.ageBuckets, + ); + } } else { logger.error( `Prometheus: Metric creation failed. Name: ${metric.name}. Invalid type: ${metric.type}`, diff --git a/src/util/stats.js b/src/util/stats.js index 0853554306..0aa13fc85c 100644 --- a/src/util/stats.js +++ b/src/util/stats.js @@ -4,6 +4,8 @@ const logger = require('../logger'); const enableStats = process.env.ENABLE_STATS !== 'false'; const statsClientType = process.env.STATS_CLIENT || 'statsd'; +// summary metrics are enabled by default. To disable set ENABLE_SUMMARY_METRICS='false'. +const enableSummaryMetrics = process.env.ENABLE_SUMMARY_METRICS !== 'false'; let statsClient; function init() { @@ -19,7 +21,7 @@ function init() { case 'prometheus': logger.info('setting up prometheus client'); - statsClient = new prometheus.Prometheus(); + statsClient = new prometheus.Prometheus(enableSummaryMetrics); break; default: @@ -40,7 +42,7 @@ const timing = (name, start, tags = {}) => { // timingSummary is used to record observations for a summary metric const timingSummary = (name, start, tags = {}) => { - if (!enableStats || !statsClient) { + if (!enableStats || !statsClient || !enableSummaryMetrics) { return; } From 15c60f31ad10a40191cbd83eddbd4450450188a5 Mon Sep 17 00:00:00 2001 From: Dhawal Sanghvi Date: Thu, 23 May 2024 16:19:55 +0530 Subject: [PATCH 4/4] chore: minor fix --- src/util/customTransformer.js | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/util/customTransformer.js b/src/util/customTransformer.js index 5ae9df95ba..37364ef5d0 100644 --- a/src/util/customTransformer.js +++ b/src/util/customTransformer.js @@ -259,7 +259,8 @@ async function runUserTransform( }; stats.counter('user_transform_function_input_events', events.length, tags); - stats.timing('user_transform_function_latency_summary', invokeTime, tags); + stats.timing('user_transform_function_latency', invokeTime, tags); + stats.timingSummary('user_transform_function_latency_summary', invokeTime, tags); } return {