From cd27c4a3e09b32444da0221dccd9246eda9ed1b7 Mon Sep 17 00:00:00 2001 From: LandonTClipp <11232769+LandonTClipp@users.noreply.github.com> Date: Thu, 19 Sep 2024 14:46:21 -0500 Subject: [PATCH 1/2] feat(inputs.nvidia-smi): Add test_on_startup option There are some cases where the nvidia-smi plugin might be found in PATH and executable, but upon running it might always return a non-zero exit code. For various reasons, in the environment I work in, this might be expected. It's thus disruptive for system logs to be polluted with infinite error messages. It's preferable in this situation to check if nvidia-smi returns a good result on plugin startup, and if not, allow the error to be bubbled up and handled according to startup_error_behavior. --- plugins/inputs/nvidia_smi/README.md | 6 ++++ plugins/inputs/nvidia_smi/nvidia_smi.go | 26 +++++++++----- plugins/inputs/nvidia_smi/nvidia_smi_test.go | 38 ++++++++++++++++++++ plugins/inputs/nvidia_smi/sample.conf | 6 ++++ 4 files changed, 67 insertions(+), 9 deletions(-) diff --git a/plugins/inputs/nvidia_smi/README.md b/plugins/inputs/nvidia_smi/README.md index c493ff357edb2..cb07033a680b6 100644 --- a/plugins/inputs/nvidia_smi/README.md +++ b/plugins/inputs/nvidia_smi/README.md @@ -37,6 +37,12 @@ using the `startup_error_behavior` setting. Available values are: ## Optional: timeout for GPU polling # timeout = "5s" + + ## Optional: Attempt to run nvidia-smi once on startup. If nvidia-smi returns a non-zero + ## exit code, the plugin will return an error. This is particularly useful + ## if used in conjunction with `startup_error_behavior` to allow the plugin to be + ## disabled if nvidia-smi cannot run successfully. + # probe_on_startup = false ``` ### Linux diff --git a/plugins/inputs/nvidia_smi/nvidia_smi.go b/plugins/inputs/nvidia_smi/nvidia_smi.go index 695b8c6f601ee..375ca8455987b 100644 --- a/plugins/inputs/nvidia_smi/nvidia_smi.go +++ b/plugins/inputs/nvidia_smi/nvidia_smi.go @@ -27,12 +27,14 @@ var sampleConfig string // NvidiaSMI holds the methods for this plugin type NvidiaSMI struct { - BinPath string `toml:"bin_path"` - Timeout config.Duration `toml:"timeout"` - Log telegraf.Logger `toml:"-"` - - ignorePlugin bool - once sync.Once + BinPath string `toml:"bin_path"` + Timeout config.Duration `toml:"timeout"` + ProbeOnStartup bool `toml:"probe_on_startup"` + Log telegraf.Logger `toml:"-"` + + ignorePlugin bool + once sync.Once + nvidiaSMIArgs []string } func (*NvidiaSMI) SampleConfig() string { @@ -47,6 +49,11 @@ func (smi *NvidiaSMI) Start(telegraf.Accumulator) error { } smi.BinPath = binPath } + if smi.ProbeOnStartup { + if _, err := internal.CombinedOutputTimeout(exec.Command(smi.BinPath, smi.nvidiaSMIArgs...), time.Duration(smi.Timeout)); err != nil { + return &internal.StartupError{Err: err} + } + } return nil } @@ -60,7 +67,7 @@ func (smi *NvidiaSMI) Gather(acc telegraf.Accumulator) error { } // Construct and execute metrics query - data, err := internal.CombinedOutputTimeout(exec.Command(smi.BinPath, "-q", "-x"), time.Duration(smi.Timeout)) + data, err := internal.CombinedOutputTimeout(exec.Command(smi.BinPath, smi.nvidiaSMIArgs...), time.Duration(smi.Timeout)) if err != nil { return fmt.Errorf("calling %q failed: %w", smi.BinPath, err) } @@ -119,8 +126,9 @@ func (smi *NvidiaSMI) parse(acc telegraf.Accumulator, data []byte) error { func init() { inputs.Add("nvidia_smi", func() telegraf.Input { return &NvidiaSMI{ - BinPath: "/usr/bin/nvidia-smi", - Timeout: config.Duration(5 * time.Second), + BinPath: "/usr/bin/nvidia-smi", + Timeout: config.Duration(5 * time.Second), + nvidiaSMIArgs: []string{"-q", "-x"}, } }) } diff --git a/plugins/inputs/nvidia_smi/nvidia_smi_test.go b/plugins/inputs/nvidia_smi/nvidia_smi_test.go index 23c57e5f6a3b6..3061b368a365d 100644 --- a/plugins/inputs/nvidia_smi/nvidia_smi_test.go +++ b/plugins/inputs/nvidia_smi/nvidia_smi_test.go @@ -8,12 +8,50 @@ import ( "time" "github.com/influxdata/telegraf" + "github.com/influxdata/telegraf/config" "github.com/influxdata/telegraf/internal" "github.com/influxdata/telegraf/models" "github.com/influxdata/telegraf/testutil" "github.com/stretchr/testify/require" ) +func TestOnStartupError(t *testing.T) { + tests := []struct { + ProbeOnStartup bool + }{ + { + ProbeOnStartup: true, + }, + { + ProbeOnStartup: false, + }, + } + for _, tt := range tests { + plugin := &NvidiaSMI{ + BinPath: "/bin/bash", + ProbeOnStartup: tt.ProbeOnStartup, + Timeout: config.Duration(time.Second), + Log: &testutil.Logger{}, + nvidiaSMIArgs: []string{"-c", "exit 9"}, + } + model := models.NewRunningInput(plugin, &models.InputConfig{ + Name: "nvidia_smi", + }) + require.NoError(t, model.Init()) + + var acc testutil.Accumulator + var ferr *internal.FatalError + err := model.Start(&acc) + + if tt.ProbeOnStartup { + require.False(t, errors.As(err, &ferr)) + require.ErrorIs(t, model.Gather(&acc), internal.ErrNotConnected) + } else { + require.NoError(t, err) + } + } +} + func TestErrorBehaviorDefault(t *testing.T) { // make sure we can't find nvidia-smi in $PATH somewhere os.Unsetenv("PATH") diff --git a/plugins/inputs/nvidia_smi/sample.conf b/plugins/inputs/nvidia_smi/sample.conf index 8879b3923a2cc..0582ac3cb8bf7 100644 --- a/plugins/inputs/nvidia_smi/sample.conf +++ b/plugins/inputs/nvidia_smi/sample.conf @@ -7,3 +7,9 @@ ## Optional: timeout for GPU polling # timeout = "5s" + + ## Optional: Attempt to run nvidia-smi once on startup. If nvidia-smi returns a non-zero + ## exit code, the plugin will return an error. This is particularly useful + ## if used in conjunction with `startup_error_behavior` to allow the plugin to be + ## disabled if nvidia-smi cannot run successfully. + # probe_on_startup = false From c053088a103a861429e70016914923692bc30291 Mon Sep 17 00:00:00 2001 From: LandonTClipp <11232769+LandonTClipp@users.noreply.github.com> Date: Wed, 2 Oct 2024 10:53:22 -0500 Subject: [PATCH 2/2] Fix test on windows --- plugins/inputs/nvidia_smi/nvidia_smi_test.go | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/plugins/inputs/nvidia_smi/nvidia_smi_test.go b/plugins/inputs/nvidia_smi/nvidia_smi_test.go index 3061b368a365d..b30673666f3fe 100644 --- a/plugins/inputs/nvidia_smi/nvidia_smi_test.go +++ b/plugins/inputs/nvidia_smi/nvidia_smi_test.go @@ -4,6 +4,7 @@ import ( "errors" "os" "path/filepath" + "runtime" "testing" "time" @@ -16,6 +17,16 @@ import ( ) func TestOnStartupError(t *testing.T) { + var binPath string + var nvidiaSMIArgs []string + if runtime.GOOS == "windows" { + binPath = `C:\Windows\System32\WindowsPowerShell\v1.0\powershell.exe` + nvidiaSMIArgs = []string{"-Command", "exit 1"} + } else { + binPath = "/bin/bash" + nvidiaSMIArgs = []string{"-c", "exit 1"} + } + tests := []struct { ProbeOnStartup bool }{ @@ -28,11 +39,11 @@ func TestOnStartupError(t *testing.T) { } for _, tt := range tests { plugin := &NvidiaSMI{ - BinPath: "/bin/bash", + BinPath: binPath, ProbeOnStartup: tt.ProbeOnStartup, Timeout: config.Duration(time.Second), Log: &testutil.Logger{}, - nvidiaSMIArgs: []string{"-c", "exit 9"}, + nvidiaSMIArgs: nvidiaSMIArgs, } model := models.NewRunningInput(plugin, &models.InputConfig{ Name: "nvidia_smi",