Skip to content

Commit

Permalink
feat(inputs.nvidia-smi): Add test_on_startup option
Browse files Browse the repository at this point in the history
  • Loading branch information
LandonTClipp committed Sep 19, 2024
1 parent 640eda0 commit 1eec743
Show file tree
Hide file tree
Showing 3 changed files with 55 additions and 9 deletions.
27 changes: 18 additions & 9 deletions plugins/inputs/nvidia_smi/nvidia_smi.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,14 @@ var sampleConfig string

// NvidiaSMI holds the configuration and runtime state of the nvidia_smi
// input plugin.
type NvidiaSMI struct {
BinPath string `toml:"bin_path"`
Timeout config.Duration `toml:"timeout"`
Log telegraf.Logger `toml:"-"`

ignorePlugin bool
once sync.Once
// BinPath is the executable run by Start and Gather. Start may overwrite
// it with a path it resolved itself.
BinPath string `toml:"bin_path"`
// Timeout is the time limit handed to internal.CombinedOutputTimeout for
// every invocation of the binary.
Timeout config.Duration `toml:"timeout"`
// TestOnStartup makes Start run the binary once and return an
// internal.StartupError if that run fails.
TestOnStartup bool `toml:"test_on_startup"`
// Log is excluded from the TOML configuration via the "-" tag.
Log telegraf.Logger `toml:"-"`

// NOTE(review): ignorePlugin and once are not used in the code visible
// here; confirm their purpose against the rest of the file.
ignorePlugin bool
once sync.Once
// nvidiaSMIArgs are the arguments passed to BinPath. The registered
// factory sets them to "-q", "-x"; tests substitute their own.
nvidiaSMIArgs []string
}

func (*NvidiaSMI) SampleConfig() string {
Expand All @@ -47,6 +49,11 @@ func (smi *NvidiaSMI) Start(telegraf.Accumulator) error {
}
smi.BinPath = binPath
}
if smi.TestOnStartup {
if _, err := internal.CombinedOutputTimeout(exec.Command(smi.BinPath, smi.nvidiaSMIArgs...), time.Duration(smi.Timeout)); err != nil {
return &internal.StartupError{Err: err}
}
}

return nil
}
Expand All @@ -60,7 +67,7 @@ func (smi *NvidiaSMI) Gather(acc telegraf.Accumulator) error {
}

// Construct and execute metrics query
data, err := internal.CombinedOutputTimeout(exec.Command(smi.BinPath, "-q", "-x"), time.Duration(smi.Timeout))
data, err := internal.CombinedOutputTimeout(exec.Command(smi.BinPath, smi.nvidiaSMIArgs...), time.Duration(smi.Timeout))
if err != nil {
return fmt.Errorf("calling %q failed: %w", smi.BinPath, err)
}
Expand Down Expand Up @@ -119,8 +126,10 @@ func (smi *NvidiaSMI) parse(acc telegraf.Accumulator, data []byte) error {
// init registers the "nvidia_smi" input with a factory that returns a plugin
// instance pre-populated with its default settings.
func init() {
inputs.Add("nvidia_smi", func() telegraf.Input {
return &NvidiaSMI{
BinPath: "/usr/bin/nvidia-smi",
Timeout: config.Duration(5 * time.Second),
BinPath: "/usr/bin/nvidia-smi",
Timeout: config.Duration(5 * time.Second),
// The startup check is opt-in.
TestOnStartup: false,
// Arguments used for every invocation of the binary in Start and Gather.
nvidiaSMIArgs: []string{"-q", "-x"},
}
})
}
31 changes: 31 additions & 0 deletions plugins/inputs/nvidia_smi/nvidia_smi_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,37 @@ import (
"github.com/stretchr/testify/require"
)

// TestOnStartupError verifies that Start reports a failing command only when
// TestOnStartup is enabled, and succeeds without running it otherwise.
func TestOnStartupError(t *testing.T) {
	// The stand-in command relies on bash living at this exact path; skip
	// instead of failing for an unrelated reason on hosts that lack it.
	const shell = "/bin/bash"
	if _, err := os.Stat(shell); err != nil {
		t.Skipf("skipping: %s is not available: %v", shell, err)
	}

	tests := []struct {
		name          string
		testOnStartup bool
	}{
		{name: "enabled", testOnStartup: true},
		{name: "disabled", testOnStartup: false},
	}
	for _, tt := range tests {
		// Subtests run synchronously, so capturing tt in the closure is safe.
		t.Run(tt.name, func(t *testing.T) {
			// NOTE(review): Timeout is left at its zero value; confirm that
			// internal.CombinedOutputTimeout does not treat zero as an
			// immediate timeout, otherwise the error asserted below may stem
			// from the timeout rather than from exit code 9.
			plugin := &NvidiaSMI{
				BinPath:       shell,
				TestOnStartup: tt.testOnStartup,
				Log:           &testutil.Logger{},
				nvidiaSMIArgs: []string{"-c", "exit 9"},
			}
			var acc testutil.Accumulator
			err := plugin.Start(&acc)
			if tt.testOnStartup {
				require.Error(t, err)
				return
			}
			require.NoError(t, err)
		})
	}
}

func TestErrorBehaviorDefault(t *testing.T) {
// make sure we can't find nvidia-smi in $PATH somewhere
os.Unsetenv("PATH")
Expand Down
6 changes: 6 additions & 0 deletions plugins/inputs/nvidia_smi/sample.conf
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,9 @@

## Optional: timeout for GPU polling
# timeout = "5s"

## Optional: Attempt to run nvidia-smi once on startup. If nvidia-smi returns a
## non-zero exit code, the plugin will return an error. This is particularly
## useful in conjunction with `startup_error_behavior`, allowing the plugin to
## be disabled if nvidia-smi cannot run successfully.
# test_on_startup = false

0 comments on commit 1eec743

Please sign in to comment.