diff --git a/device_test.go b/device_test.go index 749a101..ea38dae 100644 --- a/device_test.go +++ b/device_test.go @@ -6,7 +6,7 @@ package nvidia import ( "testing" - hclog "github.com/hashicorp/go-hclog" + "github.com/hashicorp/go-hclog" "github.com/hashicorp/nomad-device-nvidia/nvml" "github.com/hashicorp/nomad/plugins/device" "github.com/shoenig/test/must" diff --git a/nvml/client.go b/nvml/client.go index b325b12..ed23cc2 100644 --- a/nvml/client.go +++ b/nvml/client.go @@ -4,7 +4,9 @@ package nvml import ( + "cmp" "fmt" + "slices" ) // DeviceData represents common fields for Nvidia device @@ -95,8 +97,7 @@ func (c *nvmlClient) GetFingerprintData() (*FingerprintData, error) { */ // Assumed that this method is called with receiver retrieved from - // NewNvmlClient - // because this method handles initialization of NVML library + // NewNvmlClient because this method handles initialization of NVML library driverVersion, err := c.driver.SystemDriverVersion() if err != nil { @@ -108,15 +109,20 @@ func (c *nvmlClient) GetFingerprintData() (*FingerprintData, error) { return nil, fmt.Errorf("nvidia nvml ListDeviceUUIDs() error: %v\n", err) } - allNvidiaGPUResources := make([]*FingerprintDeviceData, len(deviceUUIDs)) + allNvidiaGPUResources := make([]*FingerprintDeviceData, 0, len(deviceUUIDs)) - for i, element := range deviceUUIDs { - deviceInfo, err := c.driver.DeviceInfoByUUID(element) + for uuid, mode := range deviceUUIDs { + // do not care about physical parents of MIGs + if mode == parent { + continue + } + + deviceInfo, err := c.driver.DeviceInfoByUUID(uuid) if err != nil { return nil, fmt.Errorf("nvidia nvml DeviceInfoByUUID() error: %v\n", err) } - allNvidiaGPUResources[i] = &FingerprintDeviceData{ + allNvidiaGPUResources = append(allNvidiaGPUResources, &FingerprintDeviceData{ DeviceData: &DeviceData{ DeviceName: deviceInfo.Name, UUID: deviceInfo.UUID, @@ -130,8 +136,13 @@ func (c *nvmlClient) GetFingerprintData() (*FingerprintData, error) { DisplayState: 
deviceInfo.DisplayState, PersistenceMode: deviceInfo.PersistenceMode, PCIBusID: deviceInfo.PCIBusID, - } + }) + + slices.SortFunc(allNvidiaGPUResources, func(a, b *FingerprintDeviceData) int { + return cmp.Compare(a.DeviceData.UUID, b.DeviceData.UUID) + }) } + return &FingerprintData{ Devices: allNvidiaGPUResources, DriverVersion: driverVersion, @@ -156,23 +167,32 @@ func (c *nvmlClient) GetStatsData() ([]*StatsData, error) { */ // Assumed that this method is called with receiver retrieved from - // NewNvmlClient - // because this method handles initialization of NVML library + // NewNvmlClient because this method handles initialization of NVML library deviceUUIDs, err := c.driver.ListDeviceUUIDs() if err != nil { return nil, fmt.Errorf("nvidia nvml ListDeviceUUIDs() error: %v\n", err) } - allNvidiaGPUStats := make([]*StatsData, len(deviceUUIDs)) + allNvidiaGPUStats := make([]*StatsData, 0, len(deviceUUIDs)) - for i, element := range deviceUUIDs { - deviceInfo, deviceStatus, err := c.driver.DeviceInfoAndStatusByUUID(element) + for uuid, mode := range deviceUUIDs { + + // A30/A100 MIG devices have no stats. + // + // https://docs.nvidia.com/datacenter/tesla/mig-user-guide/#telemetry + // + // Is this fixed on H100 or later? Maybe? 
+ if mode == mig || mode == parent { + continue + } + + deviceInfo, deviceStatus, err := c.driver.DeviceInfoAndStatusByUUID(uuid) if err != nil { return nil, fmt.Errorf("nvidia nvml DeviceInfoAndStatusByUUID() error: %v\n", err) } - allNvidiaGPUStats[i] = &StatsData{ + allNvidiaGPUStats = append(allNvidiaGPUStats, &StatsData{ DeviceData: &DeviceData{ DeviceName: deviceInfo.Name, UUID: deviceInfo.UUID, @@ -191,7 +211,11 @@ func (c *nvmlClient) GetStatsData() ([]*StatsData, error) { ECCErrorsL1Cache: deviceStatus.ECCErrorsL1Cache, ECCErrorsL2Cache: deviceStatus.ECCErrorsL2Cache, ECCErrorsDevice: deviceStatus.ECCErrorsDevice, - } + }) + + slices.SortFunc(allNvidiaGPUStats, func(a, b *StatsData) int { + return cmp.Compare(a.DeviceData.UUID, b.DeviceData.UUID) + }) } return allNvidiaGPUStats, nil } diff --git a/nvml/client_test.go b/nvml/client_test.go index 458522c..57417d5 100644 --- a/nvml/client_test.go +++ b/nvml/client_test.go @@ -11,6 +11,8 @@ import ( "github.com/shoenig/test/must" ) +var _ NvmlDriver = (*MockNVMLDriver)(nil) + type MockNVMLDriver struct { systemDriverCallSuccessful bool listDeviceUUIDsSuccessful bool @@ -19,6 +21,7 @@ type MockNVMLDriver struct { driverVersion string devices []*DeviceInfo deviceStatus []*DeviceStatus + modes []mode } func (m *MockNVMLDriver) Initialize() error { @@ -36,15 +39,15 @@ func (m *MockNVMLDriver) SystemDriverVersion() (string, error) { return m.driverVersion, nil } -func (m *MockNVMLDriver) ListDeviceUUIDs() ([]string, error) { +func (m *MockNVMLDriver) ListDeviceUUIDs() (map[string]mode, error) { if !m.listDeviceUUIDsSuccessful { return nil, errors.New("failed to get device length") } - allNvidiaGPUUUIDs := make([]string, len(m.devices)) + allNvidiaGPUUUIDs := make(map[string]mode) for i, device := range m.devices { - allNvidiaGPUUUIDs[i] = device.UUID + allNvidiaGPUUUIDs[device.UUID] = m.modes[i] } return allNvidiaGPUUUIDs, nil @@ -113,6 +116,7 @@ func TestGetFingerprintDataFromNVML(t *testing.T) { 
systemDriverCallSuccessful: true, listDeviceUUIDsSuccessful: true, deviceInfoByUUIDCallSuccessful: false, + modes: []mode{normal, normal}, devices: []*DeviceInfo{ { UUID: "UUID1", @@ -180,6 +184,7 @@ func TestGetFingerprintDataFromNVML(t *testing.T) { listDeviceUUIDsSuccessful: true, deviceInfoByUUIDCallSuccessful: true, driverVersion: "driverVersion", + modes: []mode{normal, normal}, devices: []*DeviceInfo{ { UUID: "UUID1", @@ -209,16 +214,134 @@ func TestGetFingerprintDataFromNVML(t *testing.T) { }, }, }, + { + Name: "successful migs", + ExpectedError: false, + ExpectedResult: &FingerprintData{ + DriverVersion: "driverVersion", + Devices: []*FingerprintDeviceData{ + { + DeviceData: &DeviceData{ + DeviceName: pointer.Of("ModelName"), + UUID: "UUID1", + MemoryMiB: pointer.Of(uint64(16)), + PowerW: pointer.Of(uint(100)), + BAR1MiB: pointer.Of(uint64(100)), + }, + PCIBusID: "busId1", + PCIBandwidthMBPerS: pointer.Of(uint(100)), + CoresClockMHz: pointer.Of(uint(100)), + MemoryClockMHz: pointer.Of(uint(100)), + DisplayState: "Enabled", + PersistenceMode: "Enabled", + }, + { + DeviceData: &DeviceData{ + DeviceName: pointer.Of("ModelName"), + UUID: "UUID2", + MemoryMiB: pointer.Of(uint64(8)), + PowerW: pointer.Of(uint(200)), + BAR1MiB: pointer.Of(uint64(200)), + }, + PCIBusID: "busId2", + PCIBandwidthMBPerS: pointer.Of(uint(200)), + CoresClockMHz: pointer.Of(uint(200)), + MemoryClockMHz: pointer.Of(uint(200)), + DisplayState: "Enabled", + PersistenceMode: "Enabled", + }, + { + DeviceData: &DeviceData{ + DeviceName: pointer.Of("ModelName"), + UUID: "UUID4", + MemoryMiB: pointer.Of(uint64(8)), + PowerW: pointer.Of(uint(200)), + BAR1MiB: pointer.Of(uint64(200)), + }, + PCIBusID: "busId3", + PCIBandwidthMBPerS: pointer.Of(uint(200)), + CoresClockMHz: pointer.Of(uint(200)), + MemoryClockMHz: pointer.Of(uint(200)), + DisplayState: "Enabled", + PersistenceMode: "Enabled", + }, + }, + }, + DriverConfiguration: &MockNVMLDriver{ + systemDriverCallSuccessful: true, + 
listDeviceUUIDsSuccessful: true, + deviceInfoByUUIDCallSuccessful: true, + driverVersion: "driverVersion", + modes: []mode{normal, normal, parent, mig}, + devices: []*DeviceInfo{ + { + UUID: "UUID1", + Name: pointer.Of("ModelName"), + MemoryMiB: pointer.Of(uint64(16)), + PCIBusID: "busId1", + PowerW: pointer.Of(uint(100)), + BAR1MiB: pointer.Of(uint64(100)), + PCIBandwidthMBPerS: pointer.Of(uint(100)), + CoresClockMHz: pointer.Of(uint(100)), + MemoryClockMHz: pointer.Of(uint(100)), + DisplayState: "Enabled", + PersistenceMode: "Enabled", + }, + { + UUID: "UUID2", + Name: pointer.Of("ModelName"), + MemoryMiB: pointer.Of(uint64(8)), + PCIBusID: "busId2", + PowerW: pointer.Of(uint(200)), + BAR1MiB: pointer.Of(uint64(200)), + PCIBandwidthMBPerS: pointer.Of(uint(200)), + CoresClockMHz: pointer.Of(uint(200)), + MemoryClockMHz: pointer.Of(uint(200)), + DisplayState: "Enabled", + PersistenceMode: "Enabled", + }, + { + UUID: "UUID3", + Name: pointer.Of("ModelName"), + MemoryMiB: pointer.Of(uint64(8)), + PCIBusID: "busId3", + PowerW: pointer.Of(uint(200)), + BAR1MiB: pointer.Of(uint64(200)), + PCIBandwidthMBPerS: pointer.Of(uint(200)), + CoresClockMHz: pointer.Of(uint(200)), + MemoryClockMHz: pointer.Of(uint(200)), + DisplayState: "Enabled", + PersistenceMode: "Enabled", + }, + { + UUID: "UUID4", + Name: pointer.Of("ModelName"), + MemoryMiB: pointer.Of(uint64(8)), + PCIBusID: "busId3", + PowerW: pointer.Of(uint(200)), + BAR1MiB: pointer.Of(uint64(200)), + PCIBandwidthMBPerS: pointer.Of(uint(200)), + CoresClockMHz: pointer.Of(uint(200)), + MemoryClockMHz: pointer.Of(uint(200)), + DisplayState: "Enabled", + PersistenceMode: "Enabled", + }, + }, + }, + }, } { - cli := nvmlClient{driver: testCase.DriverConfiguration} - fingerprintData, err := cli.GetFingerprintData() - if testCase.ExpectedError { - must.Error(t, err) - } - if !testCase.ExpectedError && err != nil { - must.NoError(t, err) - } - must.Eq(t, testCase.ExpectedResult, fingerprintData) + + t.Run(testCase.Name, func(t 
*testing.T) { + cli := nvmlClient{driver: testCase.DriverConfiguration} + fingerprintData, err := cli.GetFingerprintData() + if testCase.ExpectedError { + must.Error(t, err) + } + if !testCase.ExpectedError && err != nil { + must.NoError(t, err) + } + must.Eq(t, testCase.ExpectedResult, fingerprintData) + }) } } @@ -248,6 +371,7 @@ func TestGetStatsDataFromNVML(t *testing.T) { systemDriverCallSuccessful: true, listDeviceUUIDsSuccessful: true, deviceInfoAndStatusByUUIDCallSuccessful: false, + modes: []mode{normal, normal}, devices: []*DeviceInfo{ { UUID: "UUID1", @@ -350,6 +474,7 @@ func TestGetStatsDataFromNVML(t *testing.T) { listDeviceUUIDsSuccessful: true, deviceInfoByUUIDCallSuccessful: true, deviceInfoAndStatusByUUIDCallSuccessful: true, + modes: []mode{normal, normal}, devices: []*DeviceInfo{ { UUID: "UUID1", @@ -403,6 +528,133 @@ func TestGetStatsDataFromNVML(t *testing.T) { }, }, }, + { + Name: "successful migs", + // stats not available on migs + ExpectedError: false, + ExpectedResult: []*StatsData{ + { + DeviceData: &DeviceData{ + DeviceName: pointer.Of("ModelName"), + UUID: "UUID1", + MemoryMiB: pointer.Of(uint64(16)), + PowerW: pointer.Of(uint(100)), + BAR1MiB: pointer.Of(uint64(100)), + }, + TemperatureC: pointer.Of(uint(1)), + GPUUtilization: pointer.Of(uint(1)), + MemoryUtilization: pointer.Of(uint(1)), + EncoderUtilization: pointer.Of(uint(1)), + DecoderUtilization: pointer.Of(uint(1)), + UsedMemoryMiB: pointer.Of(uint64(1)), + ECCErrorsL1Cache: pointer.Of(uint64(1)), + ECCErrorsL2Cache: pointer.Of(uint64(1)), + ECCErrorsDevice: pointer.Of(uint64(1)), + PowerUsageW: pointer.Of(uint(1)), + BAR1UsedMiB: pointer.Of(uint64(1)), + }, + { + DeviceData: &DeviceData{ + DeviceName: pointer.Of("ModelName"), + UUID: "UUID2", + MemoryMiB: pointer.Of(uint64(8)), + PowerW: pointer.Of(uint(200)), + BAR1MiB: pointer.Of(uint64(200)), + }, + TemperatureC: pointer.Of(uint(2)), + GPUUtilization: pointer.Of(uint(2)), + MemoryUtilization: pointer.Of(uint(2)), + 
EncoderUtilization: pointer.Of(uint(2)), + DecoderUtilization: pointer.Of(uint(2)), + UsedMemoryMiB: pointer.Of(uint64(2)), + ECCErrorsL1Cache: pointer.Of(uint64(2)), + ECCErrorsL2Cache: pointer.Of(uint64(2)), + ECCErrorsDevice: pointer.Of(uint64(2)), + PowerUsageW: pointer.Of(uint(2)), + BAR1UsedMiB: pointer.Of(uint64(2)), + }, + }, + DriverConfiguration: &MockNVMLDriver{ + listDeviceUUIDsSuccessful: true, + deviceInfoByUUIDCallSuccessful: true, + deviceInfoAndStatusByUUIDCallSuccessful: true, + modes: []mode{normal, normal, parent, mig}, + devices: []*DeviceInfo{ + { + UUID: "UUID1", + Name: pointer.Of("ModelName"), + MemoryMiB: pointer.Of(uint64(16)), + PCIBusID: "busId1", + PowerW: pointer.Of(uint(100)), + BAR1MiB: pointer.Of(uint64(100)), + PCIBandwidthMBPerS: pointer.Of(uint(100)), + CoresClockMHz: pointer.Of(uint(100)), + MemoryClockMHz: pointer.Of(uint(100)), + }, + { + UUID: "UUID2", + Name: pointer.Of("ModelName"), + MemoryMiB: pointer.Of(uint64(8)), + PCIBusID: "busId2", + PowerW: pointer.Of(uint(200)), + BAR1MiB: pointer.Of(uint64(200)), + PCIBandwidthMBPerS: pointer.Of(uint(200)), + CoresClockMHz: pointer.Of(uint(200)), + MemoryClockMHz: pointer.Of(uint(200)), + }, + { // parent, no stats + UUID: "UUID3", + Name: pointer.Of("ModelName"), + MemoryMiB: pointer.Of(uint64(8)), + PCIBusID: "busId3", + PowerW: pointer.Of(uint(200)), + BAR1MiB: pointer.Of(uint64(200)), + PCIBandwidthMBPerS: pointer.Of(uint(200)), + CoresClockMHz: pointer.Of(uint(200)), + MemoryClockMHz: pointer.Of(uint(200)), + }, + { // mig, no stats + UUID: "UUID4", + Name: pointer.Of("ModelName"), + MemoryMiB: pointer.Of(uint64(8)), + PCIBusID: "busId3", + PowerW: pointer.Of(uint(200)), + BAR1MiB: pointer.Of(uint64(200)), + PCIBandwidthMBPerS: pointer.Of(uint(200)), + CoresClockMHz: pointer.Of(uint(200)), + MemoryClockMHz: pointer.Of(uint(200)), + }, + }, + deviceStatus: []*DeviceStatus{ + { + TemperatureC: pointer.Of(uint(1)), + GPUUtilization: pointer.Of(uint(1)), + MemoryUtilization: 
pointer.Of(uint(1)), + EncoderUtilization: pointer.Of(uint(1)), + DecoderUtilization: pointer.Of(uint(1)), + UsedMemoryMiB: pointer.Of(uint64(1)), + ECCErrorsL1Cache: pointer.Of(uint64(1)), + ECCErrorsL2Cache: pointer.Of(uint64(1)), + ECCErrorsDevice: pointer.Of(uint64(1)), + PowerUsageW: pointer.Of(uint(1)), + BAR1UsedMiB: pointer.Of(uint64(1)), + }, + { + TemperatureC: pointer.Of(uint(2)), + GPUUtilization: pointer.Of(uint(2)), + MemoryUtilization: pointer.Of(uint(2)), + EncoderUtilization: pointer.Of(uint(2)), + DecoderUtilization: pointer.Of(uint(2)), + UsedMemoryMiB: pointer.Of(uint64(2)), + ECCErrorsL1Cache: pointer.Of(uint64(2)), + ECCErrorsL2Cache: pointer.Of(uint64(2)), + ECCErrorsDevice: pointer.Of(uint64(2)), + PowerUsageW: pointer.Of(uint(2)), + BAR1UsedMiB: pointer.Of(uint64(2)), + }, + }, + }, + }, } { cli := nvmlClient{driver: testCase.DriverConfiguration} statsData, err := cli.GetStatsData() diff --git a/nvml/driver_linux.go b/nvml/driver_linux.go index 91b5d97..2a650f3 100644 --- a/nvml/driver_linux.go +++ b/nvml/driver_linux.go @@ -40,15 +40,17 @@ func (n *nvmlDriver) SystemDriverVersion() (string, error) { return version, nil } -// List all compute device UUIDs in the system, includes MIG devices -// but excludes their "parent". -func (n *nvmlDriver) ListDeviceUUIDs() ([]string, error) { +// List all compute device UUIDs in the system. +// Includes all instances, including normal GPUs, MIGs, and their physical parents. +// Each UUID is associated with a mode indicating which type it is. 
+func (n *nvmlDriver) ListDeviceUUIDs() (map[string]mode, error) { count, code := nvml.DeviceGetCount() if code != nvml.SUCCESS { return nil, decode("failed to get device count", code) } - var uuids []string + uuids := make(map[string]mode) + for i := 0; i < int(count); i++ { device, code := nvml.DeviceGetHandleByIndex(int(i)) if code != nvml.SUCCESS { @@ -66,7 +68,7 @@ func (n *nvmlDriver) ListDeviceUUIDs() ([]string, error) { return nil, decode("failed to get device %d uuid", code) } - uuids = append(uuids, uuid) + uuids[uuid] = normal continue } if code != nvml.SUCCESS { @@ -78,6 +80,11 @@ func (n *nvmlDriver) ListDeviceUUIDs() ([]string, error) { return nil, decode("failed to get device MIG device count", code) } + uuid, code := nvml.DeviceGetUUID(device) + if code == nvml.SUCCESS { + uuids[uuid] = parent + } + for j := 0; j < int(migCount); j++ { migDevice, code := nvml.DeviceGetMigDeviceHandleByIndex(device, int(j)) if code == nvml.ERROR_NOT_FOUND || code == nvml.ERROR_INVALID_ARGUMENT { @@ -91,7 +98,7 @@ func (n *nvmlDriver) ListDeviceUUIDs() ([]string, error) { if code != nvml.SUCCESS { return nil, decode(fmt.Sprintf("failed to get mig device uuid %d", j), code) } - uuids = append(uuids, uuid) + uuids[uuid] = mig } } @@ -110,7 +117,7 @@ func (n *nvmlDriver) DeviceInfoByUUID(uuid string) (*DeviceInfo, error) { return nil, decode("failed to get device name", code) } - memory, code := nvml.DeviceGetMemoryInfo_v2(device) + memory, code := nvml.DeviceGetMemoryInfo(device) if code != nvml.SUCCESS { return nil, decode("failed to get device memory info", code) } @@ -238,7 +245,7 @@ func (n *nvmlDriver) DeviceInfoAndStatusByUUID(uuid string) (*DeviceInfo, *Devic return nil, nil, decode("failed to get device info", code) } - mem, code := nvml.DeviceGetMemoryInfo_v2(device) + mem, code := nvml.DeviceGetMemoryInfo(device) if code != nvml.SUCCESS { return nil, nil, decode("failed to get device memory utilization", code) } diff --git a/nvml/shared.go b/nvml/shared.go 
index 2d675f1..17596a2 100644 --- a/nvml/shared.go +++ b/nvml/shared.go @@ -10,6 +10,14 @@ var ( UnavailableLib = errors.New("could not load NVML library") ) +type mode int + +const ( + normal mode = iota + parent + mig +) + // nvmlDriver implements NvmlDriver // Users are required to call Initialize method before using any other methods type nvmlDriver struct{} @@ -19,7 +27,7 @@ type NvmlDriver interface { Initialize() error Shutdown() error SystemDriverVersion() (string, error) - ListDeviceUUIDs() ([]string, error) + ListDeviceUUIDs() (map[string]mode, error) DeviceInfoByUUID(string) (*DeviceInfo, error) DeviceInfoAndStatusByUUID(string) (*DeviceInfo, *DeviceStatus, error) }