Skip to content

Commit

Permalink
use GiB as default unit instead of MiB (#3)
Browse files Browse the repository at this point in the history
* use GiB as default metrics

* fix build

* set gpu metric

* inspect gpushare with different metrics

* Update docker images

* change to units

* compute gpu memory unit by using single gpu device

* Fix build

* Fix build

* Update docker image

* fix docker images

* fix unitMemory type

* Update docker image
  • Loading branch information
cheyang authored and wsxiaozhang committed Mar 1, 2019
1 parent 1d6a6e8 commit 8adf448
Show file tree
Hide file tree
Showing 9 changed files with 69 additions and 13 deletions.
2 changes: 1 addition & 1 deletion cmd/inspect/display.go
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ func displaySummary(nodeInfos []*NodeInfo) {
if hasPendingGPU {
buffer.WriteString("PENDING(Allocated)\t")
}
buffer.WriteString("GPU Memory(MiB)\n")
buffer.WriteString(fmt.Sprintf("GPU Memory(%s)\n", memoryUnit))

// fmt.Fprintf(w, "NAME\tIPADDRESS\tROLE\tGPU(Allocated/Total)\tPENDING(Allocated)\n")
fmt.Fprintf(w, buffer.String())
Expand Down
27 changes: 25 additions & 2 deletions cmd/inspect/nodeinfo.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ func buildAllNodeInfos(allPods []v1.Pod, nodes []v1.Node) ([]*NodeInfo, error) {
nodeInfos := buildNodeInfoWithPods(allPods, nodes)
for _, info := range nodeInfos {
if info.gpuTotalMemory > 0 {
setUnit(info.gpuTotalMemory, info.gpuCount)
err := info.buildDeviceInfo()
if err != nil {
log.Warningf("Failed due to %v", err)
Expand All @@ -71,7 +72,7 @@ func (n *NodeInfo) acquirePluginPod() v1.Pod {
}

func getTotalGPUMemory(node v1.Node) int {
val, ok := node.Status.Capacity[resourceName]
val, ok := node.Status.Allocatable[resourceName]

if !ok {
return 0
Expand All @@ -81,7 +82,7 @@ func getTotalGPUMemory(node v1.Node) int {
}

func getGPUCountInNode(node v1.Node) int {
val, ok := node.Status.Capacity[countName]
val, ok := node.Status.Allocatable[countName]

if !ok {
return int(0)
Expand Down Expand Up @@ -221,3 +222,25 @@ func isGPUSharingNode(node v1.Node) bool {

return ok
}

// memoryUnit is the unit label shown for GPU memory values. It stays empty
// until setUnit picks a unit.
var memoryUnit = ""

// setUnit chooses the display unit for GPU memory from one node's totals and
// stores it in memoryUnit.
//
// gpuMemory is the node's total GPU memory and gpuCount its number of GPUs.
// The call is a no-op when a unit has already been chosen or when gpuCount
// is zero; only the first usable node decides the unit.
func setUnit(gpuMemory, gpuCount int) {
	// Per-device amounts above this value are labelled MiB, anything else GiB.
	// NOTE(review): presumably chosen because no single device reports more
	// than 100 GiB — confirm against supported hardware.
	const gibCeiling = 100

	if memoryUnit != "" || gpuCount == 0 {
		return
	}

	// Integer division: the per-device amount is truncated.
	if gpuMemory/gpuCount > gibCeiling {
		memoryUnit = "MiB"
		return
	}
	memoryUnit = "GiB"
}
17 changes: 15 additions & 2 deletions cmd/nvidia/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,28 @@ import (
// Command-line flags of the gpushare device plugin.
var (
	// mps is the value of the -mps flag (default false).
	mps = flag.Bool("mps", false, "Enable or Disable MPS")
	// healthCheck is the value of the -health-check flag (default false).
	healthCheck = flag.Bool("health-check", false, "Enable or disable Health check")
	// memoryUnit is the value of the -memory-unit flag (default "GiB"); the
	// help text names 'GiB' and 'MiB' as the supported values.
	memoryUnit = flag.String("memory-unit", "GiB", "Set memoryUnit of the GPU Memory, support 'GiB' and 'MiB'")
)

// main parses the command-line flags, logs a startup message, builds the
// shared GPU manager from the flag values and runs it. An error returned by
// Run is reported through log.Fatalf.
func main() {
flag.Parse()
log.V(1).Infoln("Start gpushare device plugin")

// NOTE(review): the two assignments below look like the removed and the
// added version of the same statement in this diff view; the second one
// additionally passes the validated -memory-unit value — confirm which
// one the file actually contains.
ngm := nvidia.NewSharedGPUManager(*mps, *healthCheck)
ngm := nvidia.NewSharedGPUManager(*mps, *healthCheck, translatememoryUnits(*memoryUnit))
err := ngm.Run()
if err != nil {
log.Fatalf("Failed due to %v", err)
}
}

// translatememoryUnits converts the raw flag string into an
// nvidia.MemoryUnit.
//
// The value is returned unchanged when it equals nvidia.MiBPrefix or
// nvidia.GiBPrefix. Any other value is logged as a warning and replaced by
// nvidia.GiBPrefix.
func translatememoryUnits(value string) nvidia.MemoryUnit {
	unit := nvidia.MemoryUnit(value)
	if unit == nvidia.MiBPrefix || unit == nvidia.GiBPrefix {
		return unit
	}

	// Unknown unit: warn and fall back to the default.
	log.Warningf("Unsupported memory unit: %s, use memoryUnit Gi as default", value)
	return nvidia.GiBPrefix
}
4 changes: 2 additions & 2 deletions demo/binpack-1/binpack-1.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -39,5 +39,5 @@ spec:
image: cheyang/gpu-player:v2
resources:
limits:
# MiB
aliyun.com/gpu-mem: 8076
# GiB
aliyun.com/gpu-mem: 2
4 changes: 2 additions & 2 deletions demo/binpack-1/job.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ spec:
image: alpine:3.6
resources:
limits:
# MiB
aliyun.com/gpu-mem: 8076
# GiB
aliyun.com/gpu-mem: 2
command: ["sleep","30s"]
restartPolicy: Never
3 changes: 2 additions & 1 deletion device-plugin-ds.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,14 @@ spec:
nodeSelector:
gpushare: "true"
containers:
- image: registry.cn-hangzhou.aliyuncs.com/acs/k8s-gpushare-plugin:v2-1.11-35eccab
- image: registry.cn-hangzhou.aliyuncs.com/acs/k8s-gpushare-plugin:v2-1.11-aff8a23
name: gpushare
# Make this pod as Guaranteed pod which will never be evicted because of node's resource consumption.
command:
- gpushare-device-plugin-v2
- -logtostderr
- --v=5
- --memory-unit=GiB
resources:
limits:
memory: "300Mi"
Expand Down
6 changes: 6 additions & 0 deletions pkg/gpu/nvidia/const.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ import (
pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1beta1"
)

// MemoryUnit names the unit used for GPU memory amounts. Only the values
// "GiB" and "MiB" are currently supported.
type MemoryUnit string

const (
resourceName = "aliyun.com/gpu-mem"
resourceCount = "aliyun.com/gpu-count"
Expand All @@ -26,4 +29,7 @@ const (
EnvAssignedFlag = "ALIYUN_COM_GPU_MEM_ASSIGNED"
EnvResourceAssumeTime = "ALIYUN_COM_GPU_MEM_ASSUME_TIME"
EnvResourceAssignTime = "ALIYUN_COM_GPU_MEM_ASSIGN_TIME"

GiBPrefix = MemoryUnit("GiB")
MiBPrefix = MemoryUnit("MiB")
)
3 changes: 2 additions & 1 deletion pkg/gpu/nvidia/gpumanager.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@ type sharedGPUManager struct {
healthCheck bool
}

func NewSharedGPUManager(enableMPS, healthCheck bool) *sharedGPUManager {
func NewSharedGPUManager(enableMPS, healthCheck bool, bp MemoryUnit) *sharedGPUManager {
metric = bp
return &sharedGPUManager{
enableMPS: enableMPS,
healthCheck: healthCheck,
Expand Down
16 changes: 14 additions & 2 deletions pkg/gpu/nvidia/nvidia.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,10 @@ import (
pluginapi "k8s.io/kubernetes/pkg/kubelet/apis/deviceplugin/v1beta1"
)

var gpuMemory uint
var (
gpuMemory uint
metric MemoryUnit
)

func check(err error) {
if err != nil {
Expand All @@ -28,6 +31,15 @@ func extractRealDeviceID(fakeDeviceID string) string {
return strings.Split(fakeDeviceID, "-_-")[0]
}

// setGPUMemory stores the memory amount of a GPU device in the package-level
// gpuMemory variable and logs the stored value.
//
// When metric is GiBPrefix the raw value is divided by 1024 using integer
// division, so any remainder is dropped; otherwise raw is stored as-is.
// NOTE(review): raw is presumably expressed in MiB — confirm against the
// value the caller passes in.
func setGPUMemory(raw uint) {
	switch metric {
	case GiBPrefix:
		gpuMemory = raw / 1024
	default:
		gpuMemory = raw
	}
	log.Infof("set gpu memory: %d", gpuMemory)
}

// getGPUMemory returns the current value of the package-level gpuMemory
// variable. It is 0 until a value has been stored.
func getGPUMemory() uint {
return gpuMemory
}
Expand Down Expand Up @@ -56,7 +68,7 @@ func getDevices() ([]*pluginapi.Device, map[string]uint) {
// var KiB uint64 = 1024
log.Infof("# device Memory: %d", uint(*d.Memory))
if getGPUMemory() == uint(0) {
gpuMemory = uint(*d.Memory)
setGPUMemory(uint(*d.Memory))
}
for j := uint(0); j < getGPUMemory(); j++ {
fakeID := generateFakeDeviceID(d.UUID, j)
Expand Down

0 comments on commit 8adf448

Please sign in to comment.