Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(inputs.nvidia-smi): Add probe_on_startup option #15916

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions plugins/inputs/nvidia_smi/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,12 @@ using the `startup_error_behavior` setting. Available values are:

## Optional: timeout for GPU polling
# timeout = "5s"

## Optional: Attempt to run nvidia-smi once on startup. If nvidia-smi returns a non-zero
## exit code, the plugin will return an error. This is particularly useful
## if used in conjunction with `startup_error_behavior` to allow the plugin to be
## disabled if nvidia-smi cannot run successfully.
# probe_on_startup = false
```

### Linux
Expand Down
26 changes: 17 additions & 9 deletions plugins/inputs/nvidia_smi/nvidia_smi.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,14 @@ var sampleConfig string

// NvidiaSMI holds the methods for this plugin
type NvidiaSMI struct {
BinPath string `toml:"bin_path"`
Timeout config.Duration `toml:"timeout"`
Log telegraf.Logger `toml:"-"`

ignorePlugin bool
once sync.Once
BinPath string `toml:"bin_path"`
Timeout config.Duration `toml:"timeout"`
ProbeOnStartup bool `toml:"probe_on_startup"`
Log telegraf.Logger `toml:"-"`

ignorePlugin bool
once sync.Once
nvidiaSMIArgs []string
}

func (*NvidiaSMI) SampleConfig() string {
Expand All @@ -47,6 +49,11 @@ func (smi *NvidiaSMI) Start(telegraf.Accumulator) error {
}
smi.BinPath = binPath
}
if smi.ProbeOnStartup {
if _, err := internal.CombinedOutputTimeout(exec.Command(smi.BinPath, smi.nvidiaSMIArgs...), time.Duration(smi.Timeout)); err != nil {
return &internal.StartupError{Err: err}
}
}

return nil
}
Expand All @@ -60,7 +67,7 @@ func (smi *NvidiaSMI) Gather(acc telegraf.Accumulator) error {
}

// Construct and execute metrics query
data, err := internal.CombinedOutputTimeout(exec.Command(smi.BinPath, "-q", "-x"), time.Duration(smi.Timeout))
data, err := internal.CombinedOutputTimeout(exec.Command(smi.BinPath, smi.nvidiaSMIArgs...), time.Duration(smi.Timeout))
if err != nil {
return fmt.Errorf("calling %q failed: %w", smi.BinPath, err)
}
Expand Down Expand Up @@ -119,8 +126,9 @@ func (smi *NvidiaSMI) parse(acc telegraf.Accumulator, data []byte) error {
func init() {
inputs.Add("nvidia_smi", func() telegraf.Input {
return &NvidiaSMI{
BinPath: "/usr/bin/nvidia-smi",
Timeout: config.Duration(5 * time.Second),
BinPath: "/usr/bin/nvidia-smi",
Timeout: config.Duration(5 * time.Second),
nvidiaSMIArgs: []string{"-q", "-x"},
srebhan marked this conversation as resolved.
Show resolved Hide resolved
}
})
}
49 changes: 49 additions & 0 deletions plugins/inputs/nvidia_smi/nvidia_smi_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,65 @@ import (
"errors"
"os"
"path/filepath"
"runtime"
"testing"
"time"

"github.com/influxdata/telegraf"
"github.com/influxdata/telegraf/config"
"github.com/influxdata/telegraf/internal"
"github.com/influxdata/telegraf/models"
"github.com/influxdata/telegraf/testutil"
"github.com/stretchr/testify/require"
)

// TestOnStartupError points the plugin at a stand-in command that always
// exits with status 1 and checks the startup behavior both with and without
// the probe_on_startup option.
//
// With probing enabled, the error returned by Start must not be an
// internal.FatalError and a following Gather must report
// internal.ErrNotConnected. With probing disabled, Start must succeed.
func TestOnStartupError(t *testing.T) {
	// Stand-in for nvidia-smi: a shell invocation that exits non-zero.
	executable := "/bin/bash"
	failingArgs := []string{"-c", "exit 1"}
	if runtime.GOOS == "windows" {
		executable = `C:\Windows\System32\WindowsPowerShell\v1.0\powershell.exe`
		failingArgs = []string{"-Command", "exit 1"}
	}

	// Exercise the probing case first, then the non-probing case.
	for _, probe := range []bool{true, false} {
		plugin := &NvidiaSMI{
			BinPath:        executable,
			ProbeOnStartup: probe,
			Timeout:        config.Duration(time.Second),
			Log:            &testutil.Logger{},
			nvidiaSMIArgs:  failingArgs,
		}
		input := models.NewRunningInput(plugin, &models.InputConfig{
			Name: "nvidia_smi",
		})
		require.NoError(t, input.Init())

		var acc testutil.Accumulator
		startErr := input.Start(&acc)

		// Without the probe the failing command is never run during startup.
		if !probe {
			require.NoError(t, startErr)
			continue
		}

		// With the probe the startup error must not be fatal, and gathering
		// must report the input as not connected.
		var fatal *internal.FatalError
		require.False(t, errors.As(startErr, &fatal))
		require.ErrorIs(t, input.Gather(&acc), internal.ErrNotConnected)
	}
}

func TestErrorBehaviorDefault(t *testing.T) {
// make sure we can't find nvidia-smi in $PATH somewhere
os.Unsetenv("PATH")
Expand Down
6 changes: 6 additions & 0 deletions plugins/inputs/nvidia_smi/sample.conf
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,9 @@

## Optional: timeout for GPU polling
# timeout = "5s"

## Optional: Attempt to run nvidia-smi once on startup. If nvidia-smi returns a non-zero
## exit code, the plugin will return an error. This is particularly useful
## if used in conjunction with `startup_error_behavior` to allow the plugin to be
## disabled if nvidia-smi cannot run successfully.
# probe_on_startup = false
Loading