Skip to content

Commit

Permalink
add README
Browse files Browse the repository at this point in the history
  • Loading branch information
letmutx committed May 16, 2022
1 parent 224361a commit d610e22
Show file tree
Hide file tree
Showing 5 changed files with 73 additions and 98 deletions.
6 changes: 3 additions & 3 deletions GNUmakefile
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
PLUGIN_BINARY=skeleton-device
PLUGIN_BINARY=nomad-nvidia-vgpu-plugin
export GO111MODULE=on

default: build

.PHONY: clean
clean: ## Remove build artifacts
rm -rf skeleton-device launcher
rm -rf nomad-nvidia-vgpu-plugin launcher

build:
go build -o ${PLUGIN_BINARY} .
go build -o ${PLUGIN_BINARY} ./cmd/main.go

.PHONY: eval
eval: deps build
Expand Down
133 changes: 58 additions & 75 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,82 +1,65 @@
Nomad Skeleton Device Plugin
Nomad Nvidia Virtual Device Plugin
==================

Skeleton project for [Nomad device plugins](https://www.nomadproject.io/docs/internals/plugins/devices.html).

This project is intended for bootstrapping development of a new device plugin.

- Website: https://www.nomadproject.io
- Mailing list: [Google Groups](http://groups.google.com/group/nomad-tool)

Requirements
------------

- [Nomad](https://www.nomadproject.io/downloads.html) 0.9+
- [Go](https://golang.org/doc/install) 1.11 or later (to build the plugin)

Building the Skeleton Plugin
---------------------
[Generate](https://github.com/hashicorp/nomad-skeleton-device-plugin/generate)
a new repository in your account from this template by clicking the `Use this
template` button above.

Clone the repository somewhere in your computer. This project uses
[Go modules](https://blog.golang.org/using-go-modules) so you will need to set
the environment variable `GO111MODULE=on` or work outside your `GOPATH` if it
is set to `auto` or not declared.

```sh
$ git clone git@github.com:<ORG>/<REPO>.git
```

Enter the plugin directory and update the paths in `go.mod` and `main.go` to
match your repository path.

```diff
// go.mod

- module github.com/hashicorp/nomad-skeleton-device-plugin
+ module github.com/<ORG>/<REPO>
...
This repo contains a device plugin for [Nomad](https://www.nomadproject.io/) to support exposing a number of virtual GPUs for each physical GPU present on the machine. This enables running workloads which don't consume the whole GPU.

Installation requirements
-----------------------

This plugin needs the following dependencies to function:

* [Nomad](https://www.nomadproject.io/downloads.html) 0.9+
* GNU/Linux x86_64 with kernel version > 3.10
* NVIDIA GPU with Architecture > Fermi (2.1)
* NVIDIA drivers >= 340.29 with binary nvidia-smi
* Docker v19.03+

Copy the plugin binary to the [plugins directory](https://www.nomadproject.io/docs/configuration/index.html#plugin_dir) and [configure the plugin](https://www.nomadproject.io/docs/configuration/plugin.html) in the client config. Also, see the requirements for the official [nvidia-plugin](https://www.nomadproject.io/plugins/devices/nvidia#installation-requirements).

```hcl
plugin "nvidia-vgpu" {
config {
ignored_gpu_ids = ["uuid1", "uuid2"]
fingerprint_period = "5s"
vgpus = 16
}
}
```

```diff
// main.go

package main

import (
log "github.com/hashicorp/go-hclog"
"github.com/hashicorp/nomad/plugins"

- "github.com/hashicorp/nomad-skeleton-device-plugin/device"
+ "github.com/<ORG>/<REPO>/device"
)
...
```

Build the skeleton plugin.

```sh
$ make build
```

Running the Plugin in Development
---------------------

You can test this plugin (and your own device plugins) in development using the
[plugin launcher](https://github.com/hashicorp/nomad/tree/master/plugins/shared/cmd/launcher). The makefile provides
a target for this:

```sh
$ make eval
Usage
--------------

Use the [device stanza](https://www.nomadproject.io/docs/job-specification/device.html) in the job file to request one or more virtual GPUs for a task.

```hcl
job "gpu-test" {
datacenters = ["dc1"]
type = "batch"
group "smi" {
task "smi" {
driver = "docker"
config {
image = "nvidia/cuda:11.0-base"
command = "nvidia-smi"
}
resources {
device "nvidia-vgpu/gpu" {
count = 1
# Add an affinity for a particular model
affinity {
attribute = "${device.model}"
value = "Tesla K80"
weight = 50
}
}
}
}
}
}
```

Deploying Device Plugins in Nomad
----------------------

Copy the plugin binary to the
[plugins directory](https://www.nomadproject.io/docs/configuration/index.html#plugin_dir) and
[configure the plugin](https://www.nomadproject.io/docs/configuration/plugin.html) in the client config. Then use the
[device stanza](https://www.nomadproject.io/docs/job-specification/device.html) in the job file to schedule with
device support. (Note, the skeleton plugin is not intended for use in Nomad.)
28 changes: 10 additions & 18 deletions device.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,11 @@ import (
"time"

log "github.com/hashicorp/go-hclog"
"github.com/hashicorp/nomad/devices/gpu/nvidia"
"github.com/hashicorp/nomad/helper/pluginutils/loader"
"github.com/hashicorp/nomad/plugins/base"
"github.com/hashicorp/nomad/plugins/device"
"github.com/hashicorp/nomad/plugins/shared/hclspec"

"github.com/hashicorp/nomad/devices/gpu/nvidia"
)

const (
Expand All @@ -31,9 +30,6 @@ const (
// along with "type" and "model", this can be used when requesting devices:
// https://www.nomadproject.io/docs/job-specification/device.html#name
vendor = "letmutx"

// deviceType is the "type" of device being returned
deviceType = device.DeviceTypeGPU
)

var (
Expand Down Expand Up @@ -69,40 +65,36 @@ var (
hclspec.NewAttr("fingerprint_period", "string", false),
hclspec.NewLiteral("\"1m\""),
),
"vgpu_multiplier": hclspec.NewDefault(
hclspec.NewAttr("vgpu_mulitplier", "number", true),
"vgpus": hclspec.NewDefault(
hclspec.NewAttr("vgpus", "number", true),
hclspec.NewLiteral("1"),
),
})
)

// Config contains configuration information for the plugin.
type Config struct {
VgpuMultiplier int `codec:"vgpu_multiplier"`
Vgpus int `codec:"vgpus"`
}

// NvidiaVgpuDevice wraps the upstream Nvidia device plugin and advertises
// each physical GPU as a configurable number of virtual GPU devices.
type NvidiaVgpuDevice struct {
*nvidia.NvidiaDevice
vgpuMultiplier int
vgpus int

devices map[string]struct{}
deviceLock sync.RWMutex

log log.Logger
}

// NewPlugin returns a device plugin, used primarily by the main wrapper
//
// Plugin configuration isn't available yet, so there will typically be
// a limit to the initialization that can be performed at this point.
func NewPlugin(ctx context.Context, log log.Logger) *NvidiaVgpuDevice {
device := nvidia.NewNvidiaDevice(ctx, log)
return &NvidiaVgpuDevice{
NvidiaDevice: device,
NvidiaDevice: nvidia.NewNvidiaDevice(ctx, log),
devices: map[string]struct{}{},
log: log.Named(pluginName),
}
}

Expand Down Expand Up @@ -131,8 +123,8 @@ func (d *NvidiaVgpuDevice) SetConfig(c *base.Config) (err error) {
return err
}

if config.VgpuMultiplier <= 0 {
return fmt.Errorf("invalid value for vgpu_multiplier %q: %v", config.VgpuMultiplier, errors.New("must be >= 1"))
if config.Vgpus <= 0 {
return fmt.Errorf("invalid value for vgpus %q: %v", config.Vgpus, errors.New("must be >= 1"))
}

if err = d.NvidiaDevice.SetConfig(c); err != nil {
Expand All @@ -148,11 +140,11 @@ func (d *NvidiaVgpuDevice) SetConfig(c *base.Config) (err error) {
func (d *NvidiaVgpuDevice) Fingerprint(ctx context.Context) (<-chan *device.FingerprintResponse, error) {
// Fingerprint returns a channel. The recommended way of organizing a plugin
// is to pass that into a long-running goroutine and return the channel immediately.
outCh := make(chan *device.FingerprintResponse)
nvOut, err := d.NvidiaDevice.Fingerprint(ctx)
if err != nil {
return nil, err
}
outCh := make(chan *device.FingerprintResponse)
go d.doFingerprint(ctx, nvOut, outCh)
return outCh, nil
}
Expand All @@ -163,11 +155,11 @@ func (d *NvidiaVgpuDevice) Stats(ctx context.Context, interval time.Duration) (<
// Similar to Fingerprint, Stats returns a channel. The recommended way of
// organizing a plugin is to pass that into a long-running goroutine and
// return the channel immediately.
outCh := make(chan *device.StatsResponse)
nvOut, err := d.NvidiaDevice.Stats(ctx, interval)
if err != nil {
return nil, err
}
outCh := make(chan *device.StatsResponse)
go d.doStats(ctx, nvOut, outCh)
return outCh, nil
}
Expand Down
2 changes: 1 addition & 1 deletion fingerprint.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ func (d *NvidiaVgpuDevice) nvDeviceToVirtDevices(ctx context.Context, nvFpr *dev
}

for _, nvDevice := range nvDeviceGroup.Devices {
for i := 0; i < d.vgpuMultiplier; i++ {
for i := 0; i < d.vgpus; i++ {
dev := &device.Device{
ID: fmt.Sprintf("%s-%d", nvDevice.ID, i),
Healthy: nvDevice.Healthy,
Expand Down
2 changes: 1 addition & 1 deletion stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ func (d *NvidiaVgpuDevice) nvStatsToVirtstats(nvStats *device.StatsResponse) *de

instanceStats := map[string]*device.DeviceStats{}
for dev, stats := range group.InstanceStats {
for i := 0; i < d.vgpuMultiplier; i++ {
for i := 0; i < d.vgpus; i++ {
dev := fmt.Sprintf("%s-%d", dev, i)
instanceStats[dev] = stats
}
Expand Down

0 comments on commit d610e22

Please sign in to comment.