From fd062a168f7c5f2e9ebe25ce39c0903b041afe1c Mon Sep 17 00:00:00 2001 From: Tom van der Woerdt Date: Thu, 3 Sep 2020 17:08:56 +0200 Subject: [PATCH] Export client request latencies as histograms This adds support for histogram-style metrics instead of using summaries. It means we can sum on a cluster level and present the user's experienced latency instead of looking at it on a per-node level. The current version limits the range of histogram buckets between 0.1ms and 60s, to avoid exporting huge amounts of buckets that are likely empty. Further patches could limit this further, for example by going for a 1.44x increment instead of the 1.2x increment, or by specifying the ranges in the configuration. Even with the limits in place, this exports 76 metrics to 3 metric families per histogram. The original summaries-based code only exports 8 metrics (in 3 families), though in theory those are no longer needed and could be disabled with a flag. --- .../exporter/CassandraMetricsUtilities.java | 15 +++++ .../exporter/CollectorFunctions.java | 55 +++++++++++++++++++ .../cassandra/exporter/FactoriesSupplier.java | 11 ++++ .../cassandra/exporter/SamplingCounting.java | 2 + 4 files changed, 83 insertions(+) diff --git a/common/src/main/java/com/zegelin/cassandra/exporter/CassandraMetricsUtilities.java b/common/src/main/java/com/zegelin/cassandra/exporter/CassandraMetricsUtilities.java index 2dca131..fe1bd95 100644 --- a/common/src/main/java/com/zegelin/cassandra/exporter/CassandraMetricsUtilities.java +++ b/common/src/main/java/com/zegelin/cassandra/exporter/CassandraMetricsUtilities.java @@ -87,6 +87,11 @@ public Iterable getIntervals() { new Interval(Interval.Quantile.P_99_9, (float) timer.get999thPercentile() * durationFactor) ); } + + @Override + public long[] getValues() { + return timer.values(); + } }; } @@ -108,6 +113,11 @@ public Iterable getIntervals() { new Interval(Interval.Quantile.P_99_9, (float) histogram.get999thPercentile()) ); } + + @Override + public long[] getValues() { + return histogram.values(); + } }; } @@ -125,6 +135,11 @@ public Iterable getIntervals() { return Interval.asIntervals(Interval.Quantile.STANDARD_PERCENTILES, q -> (float) snapshot.getValue(q.value)); } + + @Override + public long[] getValues() { + return metric.getSnapshot().getValues(); + } }; } diff --git a/common/src/main/java/com/zegelin/cassandra/exporter/CollectorFunctions.java b/common/src/main/java/com/zegelin/cassandra/exporter/CollectorFunctions.java index 6455bee..3044311 100644 --- a/common/src/main/java/com/zegelin/cassandra/exporter/CollectorFunctions.java +++ b/common/src/main/java/com/zegelin/cassandra/exporter/CollectorFunctions.java @@ -10,6 +10,7 @@ import org.apache.cassandra.metrics.CassandraMetricsRegistry.JmxMeterMBean; import org.apache.cassandra.utils.EstimatedHistogram; +import java.util.ArrayList; import java.util.stream.Stream; public final class CollectorFunctions { @@ -176,4 +177,58 @@ protected static CollectorFunction samplingAndCountingAsSummar public static CollectorFunction samplingAndCountingAsSummary() { return samplingAndCountingAsSummary(FloatFloatFunction.identity()); } + + /** + * Collect a {@link SamplingCounting} as a Prometheus histogram. + */ + private static float[] newBucketOffsets(int size, final FloatFloatFunction bucketScaleFunction) { + long[] rawOffsets = EstimatedHistogram.newOffsets(size, false); + float[] adjustedOffsets = new float[size]; + for (int i = 0; i < size; i++) { + adjustedOffsets[i] = bucketScaleFunction.apply(rawOffsets[i]); + } + return adjustedOffsets; + } + + protected static CollectorFunction samplingAndCountingAsHistogram(final FloatFloatFunction bucketScaleFunction) { + // Set some limits on the range so we don't export all 170 buckets + float bucketMin = 0.0001f; // 0.1ms + float bucketMax = 60.0f; // 60sec + + // Avoid recomputing the buckets frequently. Cassandra uses ~170 buckets + float[] cachedBuckets = newBucketOffsets(200, bucketScaleFunction); + + return group -> { + final Stream histogramStream = group.labeledObjects().entrySet().stream() + .map(e -> { + long[] values = e.getValue().getValues(); + float[] buckets = values.length <= cachedBuckets.length + ? cachedBuckets + : newBucketOffsets(values.length, bucketScaleFunction); + + float sum = 0; + long count = 0; + ArrayList intervals = new ArrayList<>(); + assert values[values.length-1] == 0; + + for (int i = 0; i < values.length; i++) { + if (values[i] != 0) { + sum += buckets[i] * values[i]; + count += values[i]; + } + if (buckets[i] >= bucketMin && buckets[i] <= bucketMax) { + intervals.add(new Interval(new Interval.Quantile(buckets[i]), count)); + } + } + + return new HistogramMetricFamily.Histogram(e.getKey(), sum, count, intervals); + }); + + return Stream.of(new HistogramMetricFamily(group.name(), group.help(), histogramStream)); + }; + } + + public static CollectorFunction samplingAndCountingAsHistogram() { + return samplingAndCountingAsHistogram(FloatFloatFunction.identity()); + } } diff --git a/common/src/main/java/com/zegelin/cassandra/exporter/FactoriesSupplier.java b/common/src/main/java/com/zegelin/cassandra/exporter/FactoriesSupplier.java index 2d88503..ef2aec1 100644 --- a/common/src/main/java/com/zegelin/cassandra/exporter/FactoriesSupplier.java +++ b/common/src/main/java/com/zegelin/cassandra/exporter/FactoriesSupplier.java @@ -510,6 +510,15 @@ private static FactoryBuilder.CollectorConstructor histogramAsSummaryCollectorCo }; } + private static FactoryBuilder.CollectorConstructor histogramAsHistogramCollectorConstructor() { + return (name, help, labels, mBean) -> { + final NamedObject samplingCountingNamedObject = CassandraMetricsUtilities.jmxHistogramAsSamplingCounting(mBean); + + return new FunctionalMetricFamilyCollector<>(name, help, ImmutableMap.of(labels, samplingCountingNamedObject), + samplingAndCountingAsHistogram(MetricValueConversionFunctions::nanosecondsToSeconds)); + }; + } + private static FactoryBuilder.CollectorConstructor functionalCollectorConstructor(final FunctionalMetricFamilyCollector.CollectorFunction function) { return (final String name, final String help, final Labels labels, final NamedObject mBean) -> new FunctionalMetricFamilyCollector<>(name, help, ImmutableMap.of(labels, mBean.cast()), function); @@ -592,6 +601,8 @@ public List get() { builder.add(clientRequestMetricFactory(LatencyMetricGroupSummaryCollector::collectorForMBean, "Latency", "latency_seconds", "Request latency.")); builder.add(clientRequestMetricFactory(LatencyMetricGroupSummaryCollector::collectorForMBean, "TotalLatency", "latency_seconds", "Total request duration.")); + + builder.add(clientRequestMetricFactory(histogramAsHistogramCollectorConstructor(), "Latency", "latency_hist_seconds", "Request latency.")); } diff --git a/common/src/main/java/com/zegelin/cassandra/exporter/SamplingCounting.java b/common/src/main/java/com/zegelin/cassandra/exporter/SamplingCounting.java index 89c886d..cee55e0 100644 --- a/common/src/main/java/com/zegelin/cassandra/exporter/SamplingCounting.java +++ b/common/src/main/java/com/zegelin/cassandra/exporter/SamplingCounting.java @@ -13,4 +13,6 @@ public interface SamplingCounting { long getCount(); Iterable getIntervals(); + + long[] getValues(); }