From 90dd9994d6977f3b546b0cc47da01a9c65a92196 Mon Sep 17 00:00:00 2001 From: Vanessasaurus <814322+vsoch@users.noreply.github.com> Date: Thu, 2 May 2024 16:54:16 -0600 Subject: [PATCH] feat: add nextgen jobspec (#10) * feat: add nextgen jobspec Signed-off-by: vsoch --- Makefile | 11 +- README.md | 9 +- examples/nextgen/v1/example1/example.go | 52 +++++++++ examples/nextgen/v1/example1/jobspec.yaml | 70 +++++++++++ pkg/nextgen/v1/convert.go | 68 +++++++++++ pkg/nextgen/v1/jobspec.go | 114 ++++++++++++++++++ pkg/nextgen/v1/schema.go | 8 ++ pkg/nextgen/v1/schema.json | 136 ++++++++++++++++++++++ pkg/nextgen/v1/types.go | 50 ++++++++ 9 files changed, 515 insertions(+), 3 deletions(-) create mode 100644 examples/nextgen/v1/example1/example.go create mode 100644 examples/nextgen/v1/example1/jobspec.yaml create mode 100644 pkg/nextgen/v1/convert.go create mode 100644 pkg/nextgen/v1/jobspec.go create mode 100644 pkg/nextgen/v1/schema.go create mode 100644 pkg/nextgen/v1/schema.json create mode 100644 pkg/nextgen/v1/types.go diff --git a/Makefile b/Makefile index e7b12d2..ad0cdea 100644 --- a/Makefile +++ b/Makefile @@ -3,13 +3,14 @@ COMMONENVVAR=GOOS=$(shell uname -s | tr A-Z a-z) RELEASE_VERSION?=v$(shell date +%Y%m%d)-$(shell git describe --tags --match "v*") .PHONY: all -all: example1 example2 example3 example4 example5 example6 createnew exp1 exp2 +all: example1 example2 example3 example4 example5 example6 createnew exp1 exp2 ng1 .PHONY: build build: go mod tidy mkdir -p ./examples/v1/bin mkdir -p ./examples/experimental/bin + mkdir -p ./examples/nextgen/v1/bin # Build examples .PHONY: createnew @@ -48,8 +49,13 @@ exp1: build exp2: build $(COMMONENVVAR) $(BUILDENVVAR) go build -ldflags '-w' -o ./examples/experimental/bin/example2 examples/experimental/example2/example.go +.PHONY: ng1 +ng1: build + $(COMMONENVVAR) $(BUILDENVVAR) go build -ldflags '-w' -o ./examples/nextgen/v1/bin/example1 examples/nextgen/v1/example1/example.go + + .PHONY: test -test: all +test: build all ./examples/v1/bin/example1 ./examples/v1/bin/example2 ./examples/v1/bin/example3 @@ -59,6 +65,7 @@ test: all ./examples/v1/bin/createnew ./examples/experimental/bin/example1 ./examples/experimental/bin/example2 + ./examples/nextgen/v1/bin/example1 .PHONY: clean clean: diff --git a/README.md b/README.md index d95c0ed..756a8cc 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,13 @@ # Flux Jobspec (Go) -This is a simple library that provides go structures for the Flux Framework [Jobspec](https://flux-framework.readthedocs.io/projects/flux-rfc/en/latest/spec_25.html) for use in other projects. +This is a simple library that provides go structures for: + + - the Flux Framework [Jobspec](https://flux-framework.readthedocs.io/projects/flux-rfc/en/latest/spec_25.html) (package [jobspec](pkg/jobspec)) + - the [Next Generation Jobspec](https://compspec.github.io/jobspec) (package [nextgen](pkg/nextgen/)) + +Note for nextgen, since Go is more strict with typing, we accept a parsed JobSpec, meaning that all resources have been defined in the top level named section, +and are referenced by name in tasks. We will start assuming that a request for the resource groups should be satisfied within the same cluster, and each is a separate +match request. ## Usage diff --git a/examples/nextgen/v1/example1/example.go b/examples/nextgen/v1/example1/example.go new file mode 100644 index 0000000..fc2b862 --- /dev/null +++ b/examples/nextgen/v1/example1/example.go @@ -0,0 +1,52 @@ +package main + +import ( + "flag" + "fmt" + "os" + + v1 "github.com/compspec/jobspec-go/pkg/nextgen/v1" +) + +func main() { + fmt.Println("This example reads, parses, and validates a Jobspec") + + // Assumes running from the root + fileName := flag.String("json", "examples/nextgen/v1/example1/jobspec.yaml", "yaml file") + flag.Parse() + + yamlFile := *fileName + if yamlFile == "" { + flag.Usage() + os.Exit(0) + } + js, err := v1.LoadJobspecYaml(yamlFile) + if err != nil { + fmt.Printf("error reading %s:%s\n", yamlFile, err) + os.Exit(1) + } + + // Validate the jobspec + valid, err := js.Validate() + if !valid || err != nil { + fmt.Printf("schema is not valid:%s\n", err) + os.Exit(1) + } else { + fmt.Println("schema is valid") + } + + out, err := js.JobspecToYaml() + if err != nil { + fmt.Printf("error marshalling %s:%s\n", yamlFile, err) + os.Exit(1) + } + fmt.Println(string(out)) + + // One example of json + out, err = js.JobspecToJson() + if err != nil { + fmt.Printf("error marshalling %s:%s\n", yamlFile, err) + os.Exit(1) + } + fmt.Println(string(out)) +} diff --git a/examples/nextgen/v1/example1/jobspec.yaml b/examples/nextgen/v1/example1/jobspec.yaml new file mode 100644 index 0000000..97239ac --- /dev/null +++ b/examples/nextgen/v1/example1/jobspec.yaml @@ -0,0 +1,70 @@ +version: 1 + +# Resources can be used with tasks or task groups +# They are named so can be referenced in multiple places, +# and used with AND and OR +resources: + mini-mummi: + count: 8 + type: node + with: + - type: gpu + count: 2 + - type: cores + count: 4 + + mummi-gpu: + count: 2 + type: node + with: + - type: gpu + count: 1 + + mummi-cpu: + count: 2 + type: node + with: + - type: cores + count: 4 + + +tasks: +- name: task-1 + resources: common + command: + - bash + - -c + - "echo Starting task 1; sleep 3; echo Finishing task 1" + + # flux batch... +- group: group-1 + +# A group is a "flux batch" +groups: + - name: mini-mummi + resources: mini-mummi + tasks: + + # flux batch + - group: train + + # flux submit to train job + - name: test + replicas: 20 + resources: mummi-cpu + command: + - bash + - -c + - echo Running machine learning test + + # flux batch from mini-mummi group + - name: train + resources: "mummi-gpu|mummi-cpu" + tasks: + + # If a task doesn't have resources, it inherits parent group (uses all) + - name: train + command: + - bash + - -c + - echo running machine learning train \ No newline at end of file diff --git a/pkg/nextgen/v1/convert.go b/pkg/nextgen/v1/convert.go new file mode 100644 index 0000000..6e98fe0 --- /dev/null +++ b/pkg/nextgen/v1/convert.go @@ -0,0 +1,68 @@ +package v1 + +import ( + "fmt" + "strings" +) + +// NewSimpleJobSpec generates a simple jobspec for nodes, command, tasks, and (optionally) a name +func NewSimpleJobspec(name, command string, nodes, tasks int32) (*Jobspec, error) { + + // If no name provided for the slot, use the first + // work of the command + if name == "" { + parts := strings.Split(command, " ") + name = strings.ToLower(parts[0]) + } + if nodes < 1 { + return nil, fmt.Errorf("nodes for the job must be >= 1") + } + if command == "" { + return nil, fmt.Errorf("a command must be provided") + } + + // The node resource is what we are asking for + nodeResource := Resource{ + Type: "node", + Count: nodes, + } + + // The slot is where we are doing an assessment for scheduling + slot := Resource{ + Type: "slot", + Count: int32(1), + Label: name, + } + + // If tasks are defined, this is total tasks across the nodes + // We add to the slot + if tasks != 0 { + taskResource := Resource{ + Type: "core", + Count: tasks, + } + slot.With = []Resource{taskResource} + } + + // And then the entire resource spec is added to the top level node resource + nodeResource.With = []Resource{slot} + + // Resource name matches resources to named set + resourceName := "task-resources" + + // Tasks reference the slot and command + // Note: if we need better split can use "github.com/google/shlex" + cmd := strings.Split(command, " ") + taskResource := Task{ + Command: cmd, + Replicas: 1, + Resources: resourceName, + } + tasklist := []Task{taskResource} + + return &Jobspec{ + Version: jobspecVersion, + Tasks: tasklist, + Resources: Resources{resourceName: nodeResource}, + }, nil +} diff --git a/pkg/nextgen/v1/jobspec.go b/pkg/nextgen/v1/jobspec.go new file mode 100644 index 0000000..80f8185 --- /dev/null +++ b/pkg/nextgen/v1/jobspec.go @@ -0,0 +1,114 @@ +package v1 + +import ( + "encoding/json" + "fmt" + "os" + + "sigs.k8s.io/yaml" + + "github.com/compspec/jobspec-go/pkg/schema" +) + +// LoadJobspecYaml loads a jobspec from a yaml file path +func LoadJobspecYaml(yamlFile string) (*Jobspec, error) { + js := Jobspec{} + file, err := os.ReadFile(yamlFile) + if err != nil { + return &js, err + } + + err = yaml.Unmarshal([]byte(file), &js) + if err != nil { + return &js, err + } + return &js, nil +} + +// JobspectoYaml convets back to yaml (as string) +func (js *Jobspec) JobspecToYaml() (string, error) { + out, err := yaml.Marshal(js) + if err != nil { + return "", err + } + return string(out), nil +} + +// GetResources to get resource groups across the jobspec +// this is intended for graph scheduling +func (js *Jobspec) GetResources(data []byte) ([]Resource, error) { + + // We assume every discovered resource is a unique satisfy + resources := []Resource{} + + // Make sure all task and group resources are known + for _, task := range js.Tasks { + r, err := js.getResources(task) + if err != nil { + return resources, err + } + resources = append(resources, r) + } + for _, group := range js.Groups { + r, err := js.getResources(group) + if err != nil { + return resources, err + } + resources = append(resources, r) + } + return resources, nil +} + +// getResources unwraps resources. If there is a named string, we assume +// in reference to a named resource group. We will need a strategy to combine +// these intelligently when we ask for a match - right now just assuming +// separate groups +func (js *Jobspec) getResources(resources interface{}) (Resource, error) { + resource := Resource{} + switch v := resources.(type) { + case string: + resourceKey := resources.(string) + spec, ok := js.Resources[resourceKey] + if !ok { + return resource, fmt.Errorf("task is missing resource") + } + return spec, nil + case Resource: + return resources.(Resource), nil + default: + return resource, fmt.Errorf("type %s is unknown", v) + } +} + +// JobspectoJson convets back to json string +func (js *Jobspec) JobspecToJson() (string, error) { + out, err := json.MarshalIndent(js, "", " ") + if err != nil { + return "", err + } + return string(out), nil +} + +// Validate converts to bytes and validate with jsonschema +func (js *Jobspec) Validate() (bool, error) { + + // Get back into bytes form + out, err := yaml.Marshal(js) + if err != nil { + return false, err + } + // Validate the jobspec + return schema.Validate(out, schema.SchemaUrl, Schema) + +} + +// Helper function to get a job name, derived from the command +func (js *Jobspec) GetJobName() string { + + // Generic name to fall back tp + name := "app" + if js.Name != "" { + name = js.Name + } + return name +} diff --git a/pkg/nextgen/v1/schema.go b/pkg/nextgen/v1/schema.go new file mode 100644 index 0000000..a737d57 --- /dev/null +++ b/pkg/nextgen/v1/schema.go @@ -0,0 +1,8 @@ +package v1 + +import ( + _ "embed" +) + +//go:embed schema.json +var Schema string diff --git a/pkg/nextgen/v1/schema.json b/pkg/nextgen/v1/schema.json new file mode 100644 index 0000000..9e11452 --- /dev/null +++ b/pkg/nextgen/v1/schema.json @@ -0,0 +1,136 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "$id": "http://github.com/flux-framework/rfc/tree/master/data/spec_24/schema.json", + "title": "jobspec-01", + "description": "JobSpec the Next Generation", + "type": "object", + "required": ["version"], + "properties": { + "name": {"type": "string"}, + "version": { + "description": "the jobspec version", + "type": "integer", + "enum": [1] + }, + "requires": {"$ref": "#/definitions/requires"}, + "resources": { + "type": "object", + "patternProperties": { + "^([a-z]|[|]|&|[0-9]+)+$": {"$ref": "#/definitions/resources"} + } + }, + "attributes": {"$ref": "#/definitions/attributes"}, + "groups": {"type": "array", "items": {"$ref": "#/definitions/group"}}, + "tasks": {"$ref": "#/definitions/tasks"}, + "additionalProperties": false + }, + "definitions": { + "attributes": { + "description": "system, parameter, and user attributes", + "type": "object", + "properties": { + "duration": {"type": "number", "minimum": 0}, + "cwd": {"type": "string"}, + "environment": {"type": "object"} + } + }, + "requires": { + "description": "compatibility requirements", + "type": "object" + }, + "resources": { + "description": "requested resources", + "type": "object", + "required": ["type", "count"], + "properties": { + "type": {"type": "string"}, + "count": {"type": "integer", "minimum": 1}, + "unit": {"type": "string"}, + "with": {"$ref": "#/definitions/with"} + }, + "additionalProperties": false + }, + "with": { + "type": "array", + "minItems": 1, + "items": {"$ref": "#/definitions/with"} + }, + "steps": { + "type": "array", + "items": { + "type": "object", + "properties": { + "name": { + "type": "string", + "enum": ["stage"] + } + }, + "required": ["name"] + } + }, + "tasks": { + "description": "tasks configuration", + "type": "array", + "items": { + "type": "object", + "properties": { + "requires": {"$ref": "#/definitions/requires"}, + "resources": { + "oneOf": [ + {"$ref": "#/definitions/resources"}, + {"type": "string"} + ] + }, + "attributes": {"$ref": "#/definitions/attributes"}, + "group": {"type": "string"}, + "name": {"type": "string"}, + "depends_on": {"type": "array", "items": {"type": "string"}}, + "replicas": {"type": "number", "minimum": 1, "default": 1}, + "command": { + "type": ["string", "array"], + "minItems": 1, + "items": {"type": "string"} + }, + "steps": {"$ref": "#/definitions/steps"} + } + } + }, + "group": { + "description": "group of tasks (batch)", + "type": "object", + "properties": { + "name": {"type": "string"}, + "requires": {"$ref": "#/definitions/requires"}, + "resources": { + "oneOf": [ + {"$ref": "#/definitions/resources"}, + {"type": "string"} + ] + }, + "attributes": {"$ref": "#/definitions/attributes"}, + "depends_on": {"type": "array", "items": {"type": "string"}}, + "tasks": {"$ref": "#/definitions/tasks"} + }, + "additionalProperties": false + }, + "slot_vertex": { + "description": "special slot resource type - label assigns to task slot", + "type": "object", + "required": ["type", "count", "with", "label"], + "properties": { + "type": {"enum": ["slot"]}, + "count": {"type": "integer", "minimum": 1}, + "unit": {"type": "string"}, + "label": {"type": "string"}, + "exclusive": {"type": "boolean"}, + "with": { + "type": "array", + "minItems": 1, + "maxItems": 2, + "items": {"oneOf": [{"$ref": "#/definitions/intranode_resource_vertex"}]} + } + }, + "additionalProperties": false + } + } +} diff --git a/pkg/nextgen/v1/types.go b/pkg/nextgen/v1/types.go new file mode 100644 index 0000000..98d46ac --- /dev/null +++ b/pkg/nextgen/v1/types.go @@ -0,0 +1,50 @@ +package v1 + +var ( + jobspecVersion = 1 +) + +type Jobspec struct { + Version int `json:"version" yaml:"version"` + Name string `json:"name,omitempty" yaml:"name,omitempty"` + Resources Resources `json:"resources,omitempty" yaml:"resources,omitempty"` + Tasks Tasks `json:"tasks,omitempty" yaml:"tasks,omitempty"` + Groups Groups `json:"groups,omitempty" yaml:"groups,omitempty"` + Requires map[string]string `json:"requires,omitempty" yaml:"requires,omitempty"` +} + +type Environment map[string]string +type Resources map[string]Resource +type Tasks []Task +type Groups []Group + +type Task struct { + Group string `json:"group,omitempty" yaml:"group,omitempty"` + Name string `json:"name,omitempty" yaml:"name,omitempty"` + Replicas int `json:"replicas,omitempty" yaml:"replicas,omitempty"` + Resources string `json:"resources,omitempty" yaml:"resources,omitempty"` + Command []string `json:"command,omitempty" yaml:"command,omitempty"` + Attributes Attributes `json:"attributes,omitempty" yaml:"attributes,omitempty"` +} + +type Group struct { + Name string `json:"name,omitempty" yaml:"name,omitempty"` + Resources string `json:"resources,omitempty" yaml:"resources,omitempty"` + Tasks Tasks `json:"tasks,omitempty" yaml:"tasks,omitempty"` + Attributes Attributes `json:"attributes,omitempty" yaml:"attributes,omitempty"` +} + +type Resource struct { + Type string `yaml:"type,omitempty" json:"type,omitempty"` + Unit string `yaml:"unit,omitempty" json:"unit,omitempty"` + Count int32 `yaml:"count,omitempty" json:"count,omitempty"` + With []Resource `yaml:"with,omitempty" json:"with,omitempty"` + Label string `yaml:"label,omitempty" json:"label,omitempty"` + Exclusive bool `yaml:"exclusive,omitempty" json:"exclusive,omitempty"` +} + +type Attributes struct { + Duration string `yaml:"duration,omitempty" json:"duration,omitempty"` + Cwd string `yaml:"cwd,omitempty" json:"cwd,omitempty"` + Environment Environment `yaml:"environment,omitempty" json:"environment,omitempty"` +}