From b6e59aac594f95589dbcdb1a00638e15c76d217a Mon Sep 17 00:00:00 2001 From: Landon Clipp <11232769+LandonTClipp@users.noreply.github.com> Date: Thu, 3 Oct 2024 11:51:54 -0500 Subject: [PATCH] feat(inputs.nvidia-smi): Add `probe_on_startup` option (#15916) --- plugins/inputs/nvidia_smi/README.md | 6 +++ plugins/inputs/nvidia_smi/nvidia_smi.go | 26 +++++++---- plugins/inputs/nvidia_smi/nvidia_smi_test.go | 49 ++++++++++++++++++++ plugins/inputs/nvidia_smi/sample.conf | 6 +++ 4 files changed, 78 insertions(+), 9 deletions(-) diff --git a/plugins/inputs/nvidia_smi/README.md b/plugins/inputs/nvidia_smi/README.md index c493ff357edb2..cb07033a680b6 100644 --- a/plugins/inputs/nvidia_smi/README.md +++ b/plugins/inputs/nvidia_smi/README.md @@ -37,6 +37,12 @@ using the `startup_error_behavior` setting. Available values are: ## Optional: timeout for GPU polling # timeout = "5s" + + ## Optional: Attempt to run nvidia-smi once on startup. If nvidia-smi returns a non-zero + ## exit code, the plugin will return an error. This is particularly useful + ## if used in conjunction with `startup_error_behavior` to allow the plugin to be + ## disabled if nvidia-smi cannot run successfully. + # probe_on_startup = false ``` ### Linux diff --git a/plugins/inputs/nvidia_smi/nvidia_smi.go b/plugins/inputs/nvidia_smi/nvidia_smi.go index 695b8c6f601ee..375ca8455987b 100644 --- a/plugins/inputs/nvidia_smi/nvidia_smi.go +++ b/plugins/inputs/nvidia_smi/nvidia_smi.go @@ -27,12 +27,14 @@ var sampleConfig string // NvidiaSMI holds the methods for this plugin type NvidiaSMI struct { - BinPath string `toml:"bin_path"` - Timeout config.Duration `toml:"timeout"` - Log telegraf.Logger `toml:"-"` - - ignorePlugin bool - once sync.Once + BinPath string `toml:"bin_path"` + Timeout config.Duration `toml:"timeout"` + ProbeOnStartup bool `toml:"probe_on_startup"` + Log telegraf.Logger `toml:"-"` + + ignorePlugin bool + once sync.Once + nvidiaSMIArgs []string } func (*NvidiaSMI) SampleConfig() string { @@ -47,6 +49,11 @@ func (smi *NvidiaSMI) Start(telegraf.Accumulator) error { } smi.BinPath = binPath } + if smi.ProbeOnStartup { + if _, err := internal.CombinedOutputTimeout(exec.Command(smi.BinPath, smi.nvidiaSMIArgs...), time.Duration(smi.Timeout)); err != nil { + return &internal.StartupError{Err: err} + } + } return nil } @@ -60,7 +67,7 @@ func (smi *NvidiaSMI) Gather(acc telegraf.Accumulator) error { } // Construct and execute metrics query - data, err := internal.CombinedOutputTimeout(exec.Command(smi.BinPath, "-q", "-x"), time.Duration(smi.Timeout)) + data, err := internal.CombinedOutputTimeout(exec.Command(smi.BinPath, smi.nvidiaSMIArgs...), time.Duration(smi.Timeout)) if err != nil { return fmt.Errorf("calling %q failed: %w", smi.BinPath, err) } @@ -119,8 +126,9 @@ func (smi *NvidiaSMI) parse(acc telegraf.Accumulator, data []byte) error { func init() { inputs.Add("nvidia_smi", func() telegraf.Input { return &NvidiaSMI{ - BinPath: "/usr/bin/nvidia-smi", - Timeout: config.Duration(5 * time.Second), + BinPath: "/usr/bin/nvidia-smi", + Timeout: config.Duration(5 * time.Second), + nvidiaSMIArgs: []string{"-q", "-x"}, } }) } diff --git a/plugins/inputs/nvidia_smi/nvidia_smi_test.go b/plugins/inputs/nvidia_smi/nvidia_smi_test.go index 23c57e5f6a3b6..b30673666f3fe 100644 --- a/plugins/inputs/nvidia_smi/nvidia_smi_test.go +++ b/plugins/inputs/nvidia_smi/nvidia_smi_test.go @@ -4,16 +4,65 @@ import ( "errors" "os" "path/filepath" + "runtime" "testing" "time" "github.com/influxdata/telegraf" + "github.com/influxdata/telegraf/config" "github.com/influxdata/telegraf/internal" "github.com/influxdata/telegraf/models" "github.com/influxdata/telegraf/testutil" "github.com/stretchr/testify/require" ) +func TestOnStartupError(t *testing.T) { + var binPath string + var nvidiaSMIArgs []string + if runtime.GOOS == "windows" { + binPath = `C:\Windows\System32\WindowsPowerShell\v1.0\powershell.exe` + nvidiaSMIArgs = []string{"-Command", "exit 1"} + } else { + binPath = "/bin/bash" + nvidiaSMIArgs = []string{"-c", "exit 1"} + } + + tests := []struct { + ProbeOnStartup bool + }{ + { + ProbeOnStartup: true, + }, + { + ProbeOnStartup: false, + }, + } + for _, tt := range tests { + plugin := &NvidiaSMI{ + BinPath: binPath, + ProbeOnStartup: tt.ProbeOnStartup, + Timeout: config.Duration(time.Second), + Log: &testutil.Logger{}, + nvidiaSMIArgs: nvidiaSMIArgs, + } + model := models.NewRunningInput(plugin, &models.InputConfig{ + Name: "nvidia_smi", + }) + require.NoError(t, model.Init()) + + var acc testutil.Accumulator + var ferr *internal.FatalError + err := model.Start(&acc) + + if tt.ProbeOnStartup { + require.False(t, errors.As(err, &ferr)) + require.ErrorIs(t, model.Gather(&acc), internal.ErrNotConnected) + } else { + require.NoError(t, err) + } + } +} + func TestErrorBehaviorDefault(t *testing.T) { // make sure we can't find nvidia-smi in $PATH somewhere os.Unsetenv("PATH") diff --git a/plugins/inputs/nvidia_smi/sample.conf b/plugins/inputs/nvidia_smi/sample.conf index 8879b3923a2cc..0582ac3cb8bf7 100644 --- a/plugins/inputs/nvidia_smi/sample.conf +++ b/plugins/inputs/nvidia_smi/sample.conf @@ -7,3 +7,9 @@ ## Optional: timeout for GPU polling # timeout = "5s" + + ## Optional: Attempt to run nvidia-smi once on startup. If nvidia-smi returns a non-zero + ## exit code, the plugin will return an error. This is particularly useful + ## if used in conjunction with `startup_error_behavior` to allow the plugin to be + ## disabled if nvidia-smi cannot run successfully. + # probe_on_startup = false