From d610e2288fff00bd2fbd44658410a5c6ce47431e Mon Sep 17 00:00:00 2001 From: Rohit Kumar Date: Mon, 16 May 2022 16:35:33 +0530 Subject: [PATCH] add README --- GNUmakefile | 6 +-- README.md | 133 +++++++++++++++++++++---------------------------- device.go | 28 ++++------- fingerprint.go | 2 +- stats.go | 2 +- 5 files changed, 73 insertions(+), 98 deletions(-) diff --git a/GNUmakefile b/GNUmakefile index 4db7000..a7a183d 100644 --- a/GNUmakefile +++ b/GNUmakefile @@ -1,14 +1,14 @@ -PLUGIN_BINARY=skeleton-device +PLUGIN_BINARY=nomad-nvidia-vgpu-plugin export GO111MODULE=on default: build .PHONY: clean clean: ## Remove build artifacts - rm -rf skeleton-device launcher + rm -rf nomad-nvidia-vgpu-plugin launcher build: - go build -o ${PLUGIN_BINARY} . + go build -o ${PLUGIN_BINARY} ./cmd/main.go .PHONY: eval eval: deps build diff --git a/README.md b/README.md index 98a2092..0203048 100644 --- a/README.md +++ b/README.md @@ -1,82 +1,65 @@ -Nomad Skeleton Device Plugin +Nomad Nvidia Virtual Device Plugin ================== -Skeleton project for [Nomad device plugins](https://www.nomadproject.io/docs/internals/plugins/devices.html). - -This project is intended for bootstrapping development of a new device plugin. - -- Website: https://www.nomadproject.io -- Mailing list: [Google Groups](http://groups.google.com/group/nomad-tool) - -Requirements ------------- - -- [Nomad](https://www.nomadproject.io/downloads.html) 0.9+ -- [Go](https://golang.org/doc/install) 1.11 or later (to build the plugin) - -Building the Skeleton Plugin ---------------------- -[Generate](https://github.com/hashicorp/nomad-skeleton-device-plugin/generate) -a new repository in your account from this template by clicking the `Use this -template` button above. - -Clone the repository somewhere in your computer. 
This project uses -[Go modules](https://blog.golang.org/using-go-modules) so you will need to set -the environment variable `GO111MODULE=on` or work outside your `GOPATH` if it -is set to `auto` or not declared. - -```sh -$ git clone git@github.com:/git -``` - -Enter the plugin directory and update the paths in `go.mod` and `main.go` to -match your repository path. - -```diff -// go.mod - -- module github.com/hashicorp/nomad-skeleton-device-plugin -+ module github.com// -... +This repo contains a device plugin for [Nomad](https://www.nomadproject.io/) to support exposing a number of virtual GPUs for each physical GPU present on the machine. This enables running workloads which don't consume the whole GPU. + +Installation requirements +----------------------- + +This plugin needs the following dependencies to function: + +* [Nomad](https://www.nomadproject.io/downloads.html) 0.9+ +* GNU/Linux x86_64 with kernel version > 3.10 +* NVIDIA GPU with Architecture > Fermi (2.1) +* NVIDIA drivers >= 340.29 with binary nvidia-smi +* Docker v19.03+ + +Copy the plugin binary to the [plugins directory](https://www.nomadproject.io/docs/configuration/index.html#plugin_dir) and [configure the plugin](https://www.nomadproject.io/docs/configuration/plugin.html) in the client config. Also, see the requirements for the official [nvidia-plugin](https://www.nomadproject.io/plugins/devices/nvidia#installation-requirements). + +```hcl +plugin "nvidia-vgpu" { + config { + ignored_gpu_ids = ["uuid1", "uuid2"] + fingerprint_period = "5s" + vgpus = 16 + } +} ``` -```diff -// main.go - -package main - -import ( - log "github.com/hashicorp/go-hclog" - "github.com/hashicorp/nomad/plugins" - -- "github.com/hashicorp/nomad-skeleton-device-plugin/device" -+ "github.com///device" -) -... -``` - -Build the skeleton plugin. 
- -```sh -$ make build -``` - -Running the Plugin in Development ---------------------- - -You can test this plugin (and your own device plugins) in development using the -[plugin launcher](https://github.com/hashicorp/nomad/tree/master/plugins/shared/cmd/launcher). The makefile provides -a target for this: - -```sh -$ make eval +Usage +-------------- + +Then use the [device stanza](https://www.nomadproject.io/docs/job-specification/device.html) in the job file to schedule with device support. + +```hcl +job "gpu-test" { + datacenters = ["dc1"] + type = "batch" + + group "smi" { + task "smi" { + driver = "docker" + + config { + image = "nvidia/cuda:11.0-base" + command = "nvidia-smi" + } + + resources { + device "nvidia-vgpu/gpu" { + count = 1 + + # Add an affinity for a particular model + affinity { + attribute = "${device.model}" + value = "Tesla K80" + weight = 50 + } + } + } + } + } +} ``` -Deploying Device Plugins in Nomad ----------------------- -Copy the plugin binary to the -[plugins directory](https://www.nomadproject.io/docs/configuration/index.html#plugin_dir) and -[configure the plugin](https://www.nomadproject.io/docs/configuration/plugin.html) in the client config. Then use the -[device stanza](https://www.nomadproject.io/docs/job-specification/device.html) in the job file to schedule with -device support. (Note, the skeleton plugin is not intended for use in Nomad.) 
diff --git a/device.go b/device.go index 2674775..c0226ef 100644 --- a/device.go +++ b/device.go @@ -9,12 +9,11 @@ import ( "time" log "github.com/hashicorp/go-hclog" + "github.com/hashicorp/nomad/devices/gpu/nvidia" "github.com/hashicorp/nomad/helper/pluginutils/loader" "github.com/hashicorp/nomad/plugins/base" "github.com/hashicorp/nomad/plugins/device" "github.com/hashicorp/nomad/plugins/shared/hclspec" - - "github.com/hashicorp/nomad/devices/gpu/nvidia" ) const ( @@ -31,9 +30,6 @@ const ( // along with "type" and "model", this can be used when requesting devices: // https://www.nomadproject.io/docs/job-specification/device.html#name vendor = "letmutx" - - // deviceType is the "type" of device being returned - deviceType = device.DeviceTypeGPU ) var ( @@ -69,8 +65,8 @@ var ( hclspec.NewAttr("fingerprint_period", "string", false), hclspec.NewLiteral("\"1m\""), ), - "vgpu_multiplier": hclspec.NewDefault( - hclspec.NewAttr("vgpu_mulitplier", "number", true), + "vgpus": hclspec.NewDefault( + hclspec.NewAttr("vgpus", "number", false), hclspec.NewLiteral("1"), ), }) @@ -78,19 +74,17 @@ var ( // Config contains configuration information for the plugin. type Config struct { - VgpuMultiplier int `codec:"vgpu_multiplier"` + Vgpus int `codec:"vgpus"` } // NvidiaVgpuDevice contains a skeleton for most of the implementation of a // device plugin. type NvidiaVgpuDevice struct { *nvidia.NvidiaDevice - vgpuMultiplier int + vgpus int devices map[string]struct{} deviceLock sync.RWMutex - - log log.Logger } // NewPlugin returns a device plugin, used primarily by the main wrapper @@ -98,11 +92,9 @@ type NvidiaVgpuDevice struct { // Plugin configuration isn't available yet, so there will typically be // a limit to the initialization that can be performed at this point. 
func NewPlugin(ctx context.Context, log log.Logger) *NvidiaVgpuDevice { - device := nvidia.NewNvidiaDevice(ctx, log) return &NvidiaVgpuDevice{ - NvidiaDevice: device, + NvidiaDevice: nvidia.NewNvidiaDevice(ctx, log), devices: map[string]struct{}{}, - log: log.Named(pluginName), } } @@ -131,8 +123,8 @@ func (d *NvidiaVgpuDevice) SetConfig(c *base.Config) (err error) { return err } - if config.VgpuMultiplier <= 0 { - return fmt.Errorf("invalid value for vgpu_multiplier %q: %v", config.VgpuMultiplier, errors.New("must be >= 1")) + if config.Vgpus <= 0 { + return fmt.Errorf("invalid value for vgpus %d: %v", config.Vgpus, errors.New("must be >= 1")) } if err = d.NvidiaDevice.SetConfig(c); err != nil { @@ -148,11 +140,11 @@ func (d *NvidiaVgpuDevice) SetConfig(c *base.Config) (err error) { func (d *NvidiaVgpuDevice) Fingerprint(ctx context.Context) (<-chan *device.FingerprintResponse, error) { // Fingerprint returns a channel. The recommended way of organizing a plugin // is to pass that into a long-running goroutine and return the channel immediately. - outCh := make(chan *device.FingerprintResponse) nvOut, err := d.NvidiaDevice.Fingerprint(ctx) if err != nil { return nil, err } + outCh := make(chan *device.FingerprintResponse) go d.doFingerprint(ctx, nvOut, outCh) return outCh, nil } @@ -163,11 +155,11 @@ func (d *NvidiaVgpuDevice) Stats(ctx context.Context, interval time.Duration) (< // Similar to Fingerprint, Stats returns a channel. The recommended way of // organizing a plugin is to pass that into a long-running goroutine and // return the channel immediately. 
- outCh := make(chan *device.StatsResponse) nvOut, err := d.NvidiaDevice.Stats(ctx, interval) if err != nil { return nil, err } + outCh := make(chan *device.StatsResponse) go d.doStats(ctx, nvOut, outCh) return outCh, nil } diff --git a/fingerprint.go b/fingerprint.go index 8c6aabb..27b15c9 100644 --- a/fingerprint.go +++ b/fingerprint.go @@ -39,7 +39,7 @@ func (d *NvidiaVgpuDevice) nvDeviceToVirtDevices(ctx context.Context, nvFpr *dev } for _, nvDevice := range nvDeviceGroup.Devices { - for i := 0; i < d.vgpuMultiplier; i++ { + for i := 0; i < d.vgpus; i++ { dev := &device.Device{ ID: fmt.Sprintf("%s-%d", nvDevice.ID, i), Healthy: nvDevice.Healthy, diff --git a/stats.go b/stats.go index a06ebc9..c8b4e51 100644 --- a/stats.go +++ b/stats.go @@ -37,7 +37,7 @@ func (d *NvidiaVgpuDevice) nvStatsToVirtstats(nvStats *device.StatsResponse) *de instanceStats := map[string]*device.DeviceStats{} for dev, stats := range group.InstanceStats { - for i := 0; i < d.vgpuMultiplier; i++ { + for i := 0; i < d.vgpus; i++ { dev := fmt.Sprintf("%s-%d", dev, i) instanceStats[dev] = stats }