diff --git a/.chloggen/fix_cpu-utilization-divide-by-cores.yaml b/.chloggen/fix_cpu-utilization-divide-by-cores.yaml new file mode 100755 index 000000000000..77983b00d2d0 --- /dev/null +++ b/.chloggen/fix_cpu-utilization-divide-by-cores.yaml @@ -0,0 +1,18 @@ +# Use this changelog template to create an entry for release notes. + +# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix' +change_type: bug_fix + +# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver) +component: hostmetricsreceiver + +# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`). +note: Adds the receiver.hostmetrics.normalizeProcessCPUUtilization feature gate to optionally normalize process.cpu.utilization values. + +subtext: > + When enabled, the receiver.hostmetrics.normalizeProcessCPUUtilization feature gate will cause process.cpu.utilization + values to be divided by the number of logical cores on the system. This is necessary to produce a value on the interval of + [0-1], as the description of the process.cpu.utilization metric says. + +# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists. +issues: [31368] diff --git a/receiver/hostmetricsreceiver/README.md b/receiver/hostmetricsreceiver/README.md index d522eae54ac1..837fbbfd7945 100644 --- a/receiver/hostmetricsreceiver/README.md +++ b/receiver/hostmetricsreceiver/README.md @@ -192,3 +192,18 @@ Currently, the hostmetrics receiver does not set any Resource attributes on the export OTEL_RESOURCE_ATTRIBUTES="service.name=,service.namespace=,service.instance.id=" ``` +## Feature Gates + +See the [Collector feature gates](https://github.com/open-telemetry/opentelemetry-collector/blob/main/featuregate/README.md#collector-feature-gates) for an overview of feature gates in the collector. 
+ +### `receiver.hostmetrics.normalizeProcessCPUUtilization` + +When enabled, normalizes the `process.cpu.utilization` metric onto the interval [0-1] by dividing the value by the number of logical processors. With this feature gate disabled, the value of the `process.cpu.utilization` metric may exceed 1. + +For example, if you have 4 logical cores on your system, and a process is occupying 2 logical cores for an entire scrape interval, with this feature gate disabled, a `process.cpu.utilization` metric will be emitted with a value of 2. If this feature gate is enabled in the same scenario, the value of the emitted metric will be 0.5. + +The schedule for this feature gate is: +- Introduced in v0.97.0 (March 2024) as `alpha` - disabled by default. +- Moved to `beta` in v0.99.0 (April 2024) - enabled by default. +- Moved to `stable` in v0.101.0 (May 2024) - cannot be disabled. +- Removed three releases after `stable`. diff --git a/receiver/hostmetricsreceiver/go.mod b/receiver/hostmetricsreceiver/go.mod index 7080a26567c2..929f126e73f2 100644 --- a/receiver/hostmetricsreceiver/go.mod +++ b/receiver/hostmetricsreceiver/go.mod @@ -15,6 +15,7 @@ require ( go.opentelemetry.io/collector/component v0.96.1-0.20240306115632-b2693620eff6 go.opentelemetry.io/collector/confmap v0.96.1-0.20240306115632-b2693620eff6 go.opentelemetry.io/collector/consumer v0.96.1-0.20240306115632-b2693620eff6 + go.opentelemetry.io/collector/featuregate v1.3.1-0.20240306115632-b2693620eff6 go.opentelemetry.io/collector/otelcol v0.96.1-0.20240306115632-b2693620eff6 go.opentelemetry.io/collector/pdata v1.3.1-0.20240306115632-b2693620eff6 go.opentelemetry.io/collector/receiver v0.96.1-0.20240306115632-b2693620eff6 @@ -101,7 +102,6 @@ require ( go.opentelemetry.io/collector/connector v0.96.1-0.20240306115632-b2693620eff6 // indirect go.opentelemetry.io/collector/exporter v0.96.1-0.20240306115632-b2693620eff6 // indirect go.opentelemetry.io/collector/extension v0.96.1-0.20240306115632-b2693620eff6 // 
indirect - go.opentelemetry.io/collector/featuregate v1.3.1-0.20240306115632-b2693620eff6 // indirect go.opentelemetry.io/collector/processor v0.96.1-0.20240306115632-b2693620eff6 // indirect go.opentelemetry.io/collector/service v0.96.1-0.20240306115632-b2693620eff6 // indirect go.opentelemetry.io/contrib/config v0.4.0 // indirect diff --git a/receiver/hostmetricsreceiver/internal/scraper/processscraper/process_scraper.go b/receiver/hostmetricsreceiver/internal/scraper/processscraper/process_scraper.go index 01d897fe6458..88571bc8c604 100644 --- a/receiver/hostmetricsreceiver/internal/scraper/processscraper/process_scraper.go +++ b/receiver/hostmetricsreceiver/internal/scraper/processscraper/process_scraper.go @@ -11,6 +11,7 @@ import ( "time" "github.com/shirou/gopsutil/v3/common" + "github.com/shirou/gopsutil/v3/cpu" "github.com/shirou/gopsutil/v3/process" "go.opentelemetry.io/collector/component" "go.opentelemetry.io/collector/pdata/pcommon" @@ -48,6 +49,7 @@ type scraper struct { excludeFS filterset.FilterSet scrapeProcessDelay time.Duration ucals map[int32]*ucal.CPUUtilizationCalculator + logicalCores int // for mocking getProcessCreateTime func(p processHandle, ctx context.Context) (int64, error) @@ -84,6 +86,13 @@ func newProcessScraper(settings receiver.CreateSettings, cfg *Config) (*scraper, } } + logicalCores, err := cpu.Counts(true) + if err != nil { + return nil, fmt.Errorf("error getting number of logical cores: %w", err) + } + + scraper.logicalCores = logicalCores + return scraper, nil } @@ -284,7 +293,7 @@ func (s *scraper) scrapeAndAppendCPUTimeMetric(ctx context.Context, now pcommon. 
s.ucals[pid] = &ucal.CPUUtilizationCalculator{} } - err = s.ucals[pid].CalculateAndRecord(now, times, s.recordCPUUtilization) + err = s.ucals[pid].CalculateAndRecord(now, s.logicalCores, times, s.recordCPUUtilization) return err } diff --git a/receiver/hostmetricsreceiver/internal/scraper/processscraper/ucal/cpu_utilization_calculator.go b/receiver/hostmetricsreceiver/internal/scraper/processscraper/ucal/cpu_utilization_calculator.go index 7418b7122f1d..32ffd5c11630 100644 --- a/receiver/hostmetricsreceiver/internal/scraper/processscraper/ucal/cpu_utilization_calculator.go +++ b/receiver/hostmetricsreceiver/internal/scraper/processscraper/ucal/cpu_utilization_calculator.go @@ -7,9 +7,18 @@ import ( "time" "github.com/shirou/gopsutil/v3/cpu" + "go.opentelemetry.io/collector/featuregate" "go.opentelemetry.io/collector/pdata/pcommon" ) +var normalizeProcessCPUUtilizationFeatureGate = featuregate.GlobalRegistry().MustRegister( + "receiver.hostmetrics.normalizeProcessCPUUtilization", + featuregate.StageAlpha, + featuregate.WithRegisterDescription("When enabled, normalizes the process.cpu.utilization metric onto the interval [0-1] by dividing the value by the number of logical processors."), + featuregate.WithRegisterFromVersion("v0.97.0"), + featuregate.WithRegisterReferenceURL("https://github.com/open-telemetry/opentelemetry-collector-contrib/issues/31368"), +) + // CPUUtilization stores the utilization percents [0-1] for the different cpu states type CPUUtilization struct { User float64 @@ -27,9 +36,9 @@ type CPUUtilizationCalculator struct { // CalculateAndRecord calculates the cpu utilization for the different cpu states comparing previously // stored []cpu.TimesStat and time.Time and current []cpu.TimesStat and current time.Time // If no previous data is stored it will return empty slice of CPUUtilization and no error -func (c *CPUUtilizationCalculator) CalculateAndRecord(now pcommon.Timestamp, currentCPUStats *cpu.TimesStat, recorder func(pcommon.Timestamp, 
CPUUtilization)) error { +func (c *CPUUtilizationCalculator) CalculateAndRecord(now pcommon.Timestamp, logicalCores int, currentCPUStats *cpu.TimesStat, recorder func(pcommon.Timestamp, CPUUtilization)) error { if c.previousCPUStats != nil { - recorder(now, cpuUtilization(c.previousCPUStats, c.previousReadTime, currentCPUStats, now)) + recorder(now, cpuUtilization(logicalCores, c.previousCPUStats, c.previousReadTime, currentCPUStats, now)) } c.previousCPUStats = currentCPUStats c.previousReadTime = now @@ -38,14 +47,26 @@ func (c *CPUUtilizationCalculator) CalculateAndRecord(now pcommon.Timestamp, cur } // cpuUtilization calculates the difference between 2 cpu.TimesStat using spent time between them -func cpuUtilization(startStats *cpu.TimesStat, startTime pcommon.Timestamp, endStats *cpu.TimesStat, endTime pcommon.Timestamp) CPUUtilization { +func cpuUtilization(logicalCores int, startStats *cpu.TimesStat, startTime pcommon.Timestamp, endStats *cpu.TimesStat, endTime pcommon.Timestamp) CPUUtilization { elapsedTime := time.Duration(endTime - startTime).Seconds() if elapsedTime <= 0 { return CPUUtilization{} } + + userUtilization := (endStats.User - startStats.User) / elapsedTime + systemUtilization := (endStats.System - startStats.System) / elapsedTime + ioWaitUtilization := (endStats.Iowait - startStats.Iowait) / elapsedTime + + if normalizeProcessCPUUtilizationFeatureGate.IsEnabled() && logicalCores > 0 { + // Normalize onto the [0-1] interval by dividing by the number of logical cores + userUtilization /= float64(logicalCores) + systemUtilization /= float64(logicalCores) + ioWaitUtilization /= float64(logicalCores) + } + return CPUUtilization{ - User: (endStats.User - startStats.User) / elapsedTime, - System: (endStats.System - startStats.System) / elapsedTime, - Iowait: (endStats.Iowait - startStats.Iowait) / elapsedTime, + User: userUtilization, + System: systemUtilization, + Iowait: ioWaitUtilization, } } diff --git 
a/receiver/hostmetricsreceiver/internal/scraper/processscraper/ucal/cpu_utilization_calculator_test.go b/receiver/hostmetricsreceiver/internal/scraper/processscraper/ucal/cpu_utilization_calculator_test.go index 89644284001e..6a226ba62d1a 100644 --- a/receiver/hostmetricsreceiver/internal/scraper/processscraper/ucal/cpu_utilization_calculator_test.go +++ b/receiver/hostmetricsreceiver/internal/scraper/processscraper/ucal/cpu_utilization_calculator_test.go @@ -9,6 +9,8 @@ import ( "github.com/shirou/gopsutil/v3/cpu" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "go.opentelemetry.io/collector/featuregate" "go.opentelemetry.io/collector/pdata/pcommon" ) @@ -24,14 +26,17 @@ func TestCpuUtilizationCalculator_Calculate(t *testing.T) { t.Parallel() testCases := []struct { name string + logicalCores int currentReadTime pcommon.Timestamp currentCPUStat *cpu.TimesStat previousReadTime pcommon.Timestamp previousCPUStat *cpu.TimesStat expectedUtilization *CPUUtilization + normalize bool }{ { - name: "no previous times", + name: "no previous times", + logicalCores: 1, currentCPUStat: &cpu.TimesStat{ User: 8260.4, }, @@ -39,6 +44,7 @@ func TestCpuUtilizationCalculator_Calculate(t *testing.T) { }, { name: "no delta time should return utilization=0", + logicalCores: 1, previousReadTime: 1640097430772858000, currentReadTime: 1640097430772858000, previousCPUStat: &cpu.TimesStat{ @@ -51,6 +57,71 @@ func TestCpuUtilizationCalculator_Calculate(t *testing.T) { }, { name: "one second time delta", + logicalCores: 1, + previousReadTime: 1640097430772858000, + currentReadTime: 1640097431772858000, + previousCPUStat: &cpu.TimesStat{ + User: 8258.4, + System: 6193.3, + Iowait: 34.201, + }, + currentCPUStat: &cpu.TimesStat{ + User: 8258.5, + System: 6193.6, + Iowait: 34.202, + }, + expectedUtilization: &CPUUtilization{ + User: 0.1, + System: 0.3, + Iowait: 0.001, + }, + }, + { + name: "one second time delta, 2 logical cores, normalized", + logicalCores: 2, + 
previousReadTime: 1640097430772858000, + currentReadTime: 1640097431772858000, + previousCPUStat: &cpu.TimesStat{ + User: 8258.4, + System: 6193.3, + Iowait: 34.201, + }, + currentCPUStat: &cpu.TimesStat{ + User: 8258.5, + System: 6193.6, + Iowait: 34.202, + }, + expectedUtilization: &CPUUtilization{ + User: 0.05, + System: 0.15, + Iowait: 0.0005, + }, + normalize: true, + }, + { + name: "one second time delta, 2 logical cores, not normalized", + logicalCores: 2, + previousReadTime: 1640097430772858000, + currentReadTime: 1640097431772858000, + previousCPUStat: &cpu.TimesStat{ + User: 8258.4, + System: 6193.3, + Iowait: 34.201, + }, + currentCPUStat: &cpu.TimesStat{ + User: 8258.5, + System: 6193.6, + Iowait: 34.202, + }, + expectedUtilization: &CPUUtilization{ + User: 0.1, + System: 0.3, + Iowait: 0.001, + }, + }, + { + name: "0 logical cores", + logicalCores: 0, previousReadTime: 1640097430772858000, currentReadTime: 1640097431772858000, previousCPUStat: &cpu.TimesStat{ @@ -73,13 +144,13 @@ func TestCpuUtilizationCalculator_Calculate(t *testing.T) { for _, test := range testCases { test := test t.Run(test.name, func(t *testing.T) { - t.Parallel() + setNormalizeProcessCPUUtilizationFeatureGate(t, test.normalize) recorder := inMemoryRecorder{} calculator := CPUUtilizationCalculator{ previousReadTime: test.previousReadTime, previousCPUStats: test.previousCPUStat, } - err := calculator.CalculateAndRecord(test.currentReadTime, test.currentCPUStat, recorder.record) + err := calculator.CalculateAndRecord(test.currentReadTime, test.logicalCores, test.currentCPUStat, recorder.record) assert.NoError(t, err) assert.InDelta(t, test.expectedUtilization.System, recorder.cpuUtilization.System, 0.00001) assert.InDelta(t, test.expectedUtilization.User, recorder.cpuUtilization.User, 0.00001) @@ -108,9 +179,26 @@ func Test_cpuUtilization(t *testing.T) { Iowait: 0.024, } - actualUtilization := cpuUtilization(startStat, startTime, endStat, halfSecondLater) + actualUtilization := 
cpuUtilization(1, startStat, startTime, endStat, halfSecondLater) assert.InDelta(t, expectedUtilization.User, actualUtilization.User, 0.00001) assert.InDelta(t, expectedUtilization.System, actualUtilization.System, 0.00001) assert.InDelta(t, expectedUtilization.Iowait, actualUtilization.Iowait, 0.00001) } + +func setNormalizeProcessCPUUtilizationFeatureGate(t *testing.T, val bool) { + wasEnabled := normalizeProcessCPUUtilizationFeatureGate.IsEnabled() + err := featuregate.GlobalRegistry().Set( + normalizeProcessCPUUtilizationFeatureGate.ID(), + val, + ) + require.NoError(t, err) + + t.Cleanup(func() { + err := featuregate.GlobalRegistry().Set( + normalizeProcessCPUUtilizationFeatureGate.ID(), + wasEnabled, + ) + require.NoError(t, err) + }) +}