diff --git a/cmd/inspect/display.go b/cmd/inspect/display.go index a0478c1b..c03f3a1b 100644 --- a/cmd/inspect/display.go +++ b/cmd/inspect/display.go @@ -144,7 +144,7 @@ func displaySummary(nodeInfos []*NodeInfo) { if hasPendingGPU { buffer.WriteString("PENDING(Allocated)\t") } - buffer.WriteString("GPU Memory(MiB)\n") + buffer.WriteString(fmt.Sprintf("GPU Memory(%s)\n", memoryUnit)) // fmt.Fprintf(w, "NAME\tIPADDRESS\tROLE\tGPU(Allocated/Total)\tPENDING(Allocated)\n") fmt.Fprintf(w, buffer.String()) diff --git a/cmd/inspect/nodeinfo.go b/cmd/inspect/nodeinfo.go index d4728a00..d65e67fb 100644 --- a/cmd/inspect/nodeinfo.go +++ b/cmd/inspect/nodeinfo.go @@ -46,6 +46,7 @@ func buildAllNodeInfos(allPods []v1.Pod, nodes []v1.Node) ([]*NodeInfo, error) { nodeInfos := buildNodeInfoWithPods(allPods, nodes) for _, info := range nodeInfos { if info.gpuTotalMemory > 0 { + setUnit(info.gpuTotalMemory, info.gpuCount) err := info.buildDeviceInfo() if err != nil { log.Warningf("Failed due to %v", err) @@ -71,7 +72,7 @@ func (n *NodeInfo) acquirePluginPod() v1.Pod { } func getTotalGPUMemory(node v1.Node) int { - val, ok := node.Status.Capacity[resourceName] + val, ok := node.Status.Allocatable[resourceName] if !ok { return 0 @@ -81,7 +82,7 @@ func getTotalGPUMemory(node v1.Node) int { } func getGPUCountInNode(node v1.Node) int { - val, ok := node.Status.Capacity[countName] + val, ok := node.Status.Allocatable[countName] if !ok { return int(0) @@ -221,3 +222,25 @@ func isGPUSharingNode(node v1.Node) bool { return ok } + +var ( + memoryUnit = "" +) + +func setUnit(gpuMemory, gpuCount int) { /* setUnit picks the unit shown in the summary header, once: it returns without changes when memoryUnit is already set or gpuCount is 0; otherwise per-device memory (gpuMemory / gpuCount, integer division) above 100 selects "MiB" and anything else selects "GiB". NOTE(review): the 100 cutoff looks like a heuristic for telling MiB-sized values from GiB-sized ones; confirm. */ + if memoryUnit != "" { + return + } + + if gpuCount == 0 { + return + } + + gpuMemoryByDev := gpuMemory / gpuCount + + if gpuMemoryByDev > 100 { + memoryUnit = "MiB" + } else { + memoryUnit = "GiB" + } +} diff --git a/cmd/nvidia/main.go b/cmd/nvidia/main.go index a43adff3..05747a7d 100644 --- a/cmd/nvidia/main.go +++ b/cmd/nvidia/main.go @@ -10,15 +10,28 @@ import ( var ( mps = 
flag.Bool("mps", false, "Enable or Disable MPS") healthCheck = flag.Bool("health-check", false, "Enable or disable Health check") + memoryUnit = flag.String("memory-unit", "GiB", "Set memoryUnit of the GPU Memory, support 'GiB' and 'MiB'") ) func main() { flag.Parse() log.V(1).Infoln("Start gpushare device plugin") - - ngm := nvidia.NewSharedGPUManager(*mps, *healthCheck) + ngm := nvidia.NewSharedGPUManager(*mps, *healthCheck, translatememoryUnits(*memoryUnit)) err := ngm.Run() if err != nil { log.Fatalf("Failed due to %v", err) } } + +func translatememoryUnits(value string) nvidia.MemoryUnit { /* translatememoryUnits validates the memory-unit flag value: nvidia.MiBPrefix and nvidia.GiBPrefix are returned as given; any other value logs a warning and falls back to nvidia.GiBPrefix. The local memoryUnit shadows the package-level flag variable of the same name. */ + memoryUnit := nvidia.MemoryUnit(value) + switch memoryUnit { + case nvidia.MiBPrefix: + case nvidia.GiBPrefix: + default: + log.Warningf("Unsupported memory unit: %s, use memoryUnit GiB as default", value) + memoryUnit = nvidia.GiBPrefix + } + + return memoryUnit +} diff --git a/demo/binpack-1/binpack-1.yaml b/demo/binpack-1/binpack-1.yaml index a51d86f0..6216e2e5 100644 --- a/demo/binpack-1/binpack-1.yaml +++ b/demo/binpack-1/binpack-1.yaml @@ -39,5 +39,5 @@ spec: image: cheyang/gpu-player:v2 resources: limits: - # MiB - aliyun.com/gpu-mem: 8076 \ No newline at end of file + # GiB + aliyun.com/gpu-mem: 2 \ No newline at end of file diff --git a/demo/binpack-1/job.yaml b/demo/binpack-1/job.yaml index 4470bb75..a157c598 100644 --- a/demo/binpack-1/job.yaml +++ b/demo/binpack-1/job.yaml @@ -11,7 +11,7 @@ spec: image: alpine:3.6 resources: limits: - # MiB - aliyun.com/gpu-mem: 8076 + # GiB + aliyun.com/gpu-mem: 2 command: ["sleep","30s"] restartPolicy: Never \ No newline at end of file diff --git a/device-plugin-ds.yaml b/device-plugin-ds.yaml index 043d499d..1c065131 100644 --- a/device-plugin-ds.yaml +++ b/device-plugin-ds.yaml @@ -18,13 +18,14 @@ spec: nodeSelector: gpushare: "true" containers: - - image: registry.cn-hangzhou.aliyuncs.com/acs/k8s-gpushare-plugin:v2-1.11-35eccab + - image: registry.cn-hangzhou.aliyuncs.com/acs/k8s-gpushare-plugin:v2-1.11-aff8a23 name: 
gpushare # Make this pod as Guaranteed pod which will never be evicted because of node's resource consumption. command: - gpushare-device-plugin-v2 - -logtostderr - --v=5 + - --memory-unit=GiB resources: limits: memory: "300Mi" diff --git a/pkg/gpu/nvidia/const.go b/pkg/gpu/nvidia/const.go index c3e4cd32..97d9afdb 100644 --- a/pkg/gpu/nvidia/const.go +++ b/pkg/gpu/nvidia/const.go @@ -4,6 +4,9 @@ import ( pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1beta1" ) +// MemoryUnit names the unit GPU memory is expressed in; only GiB and MiB are supported. +type MemoryUnit string + const ( resourceName = "aliyun.com/gpu-mem" resourceCount = "aliyun.com/gpu-count" @@ -26,4 +29,7 @@ const ( EnvAssignedFlag = "ALIYUN_COM_GPU_MEM_ASSIGNED" EnvResourceAssumeTime = "ALIYUN_COM_GPU_MEM_ASSUME_TIME" EnvResourceAssignTime = "ALIYUN_COM_GPU_MEM_ASSIGN_TIME" + + GiBPrefix = MemoryUnit("GiB") + MiBPrefix = MemoryUnit("MiB") ) diff --git a/pkg/gpu/nvidia/gpumanager.go b/pkg/gpu/nvidia/gpumanager.go index b462f6b1..98774f48 100644 --- a/pkg/gpu/nvidia/gpumanager.go +++ b/pkg/gpu/nvidia/gpumanager.go @@ -16,7 +16,8 @@ type sharedGPUManager struct { healthCheck bool } -func NewSharedGPUManager(enableMPS, healthCheck bool) *sharedGPUManager { +func NewSharedGPUManager(enableMPS, healthCheck bool, bp MemoryUnit) *sharedGPUManager { + metric = bp return &sharedGPUManager{ enableMPS: enableMPS, healthCheck: healthCheck, diff --git a/pkg/gpu/nvidia/nvidia.go b/pkg/gpu/nvidia/nvidia.go index c27c4d6c..a50b15b4 100644 --- a/pkg/gpu/nvidia/nvidia.go +++ b/pkg/gpu/nvidia/nvidia.go @@ -12,7 +12,10 @@ import ( pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1beta1" ) -var gpuMemory uint +var ( + gpuMemory uint + metric MemoryUnit +) func check(err error) { if err != nil { @@ -28,6 +31,15 @@ func extractRealDeviceID(fakeDeviceID string) string { return strings.Split(fakeDeviceID, "-_-")[0] } +func setGPUMemory(raw uint) { /* setGPUMemory stores raw in gpuMemory, first dividing it by 1024 (integer division, remainder dropped) when metric is GiBPrefix, then logs the stored value. NOTE(review): presumably raw is a MiB count; confirm against the caller. */ + v := raw + if metric == GiBPrefix { + v = raw / 1024 + } + gpuMemory = v + 
log.Infof("set gpu memory: %d", gpuMemory) +} + func getGPUMemory() uint { return gpuMemory } @@ -56,7 +68,7 @@ func getDevices() ([]*pluginapi.Device, map[string]uint) { // var KiB uint64 = 1024 log.Infof("# device Memory: %d", uint(*d.Memory)) if getGPUMemory() == uint(0) { - gpuMemory = uint(*d.Memory) + setGPUMemory(uint(*d.Memory)) } for j := uint(0); j < getGPUMemory(); j++ { fakeID := generateFakeDeviceID(d.UUID, j)