From 2f6bd58411899547f9c72c8ee20f4ed40a62680f Mon Sep 17 00:00:00 2001 From: Sendu Bala Date: Wed, 8 Nov 2017 13:34:17 +0000 Subject: [PATCH 01/33] Minimal bsub emulation for cellranger compat. Not working OS changes. --- cmd/lsf.go | 194 ++++++++++++++++++++++++++++++++++++++ cmd/root.go | 14 ++- internal/utils.go | 42 +++++++++ jobqueue/client.go | 59 +++++++++++- jobqueue/job.go | 19 +++- jobqueue/scheduler/lsf.go | 21 +++-- jobqueue/server.go | 9 ++ jobqueue/serverCLI.go | 4 +- main.go | 18 +++- 9 files changed, 368 insertions(+), 12 deletions(-) create mode 100644 cmd/lsf.go diff --git a/cmd/lsf.go b/cmd/lsf.go new file mode 100644 index 000000000..762236fb7 --- /dev/null +++ b/cmd/lsf.go @@ -0,0 +1,194 @@ +// Copyright © 2017 Genome Research Limited +// Author: Sendu Bala . +// +// This file is part of wr. +// +// wr is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// wr is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with wr. If not, see . + +package cmd + +import ( + "fmt" + // "github.com/VertebrateResequencing/wr/internal" + "bufio" + "github.com/VertebrateResequencing/wr/jobqueue" + jqs "github.com/VertebrateResequencing/wr/jobqueue/scheduler" + "github.com/spf13/cobra" + "io/ioutil" + "os" + "path/filepath" + "regexp" + "strconv" + "strings" + "time" +) + +// options for this cmd + +// lsfCmd represents the lsf command. +var lsfCmd = &cobra.Command{ + Use: "lsf", + Short: "LSF emulation", + Long: `LSF emulation. + +Many existing pipelines and workflows may be written with the LSF scheduler in +mind, either hard-coded to use it exclusively, or supporting a number of +schedulers including LSF but not, say, OpenStack. + +wr's LSF emulation lets you submit jobs to wr as if it was LSF, providing +compatibility with old pipelines. If the manager has been started in LSF mode, +this could result in greater efficiency compared to directly using the real bsub +command. If you've done a cloud deployment, this allows pipelines that know +nothing about the cloud to distribute their workload in that cloud environment. + +NB: currently the emulation is extremely limited, supporting only the +interactive "console" mode where you run bsub without any arguments, and it only +supports single flags per #BSUB line, and it only pays attention to -J, -n and +-M flags. (This is sufficient for compatibility with 10x Genomic's cellranger +software.) + +The best way to use this LSF emulation is not to call this command yourself +directly, but to use 'wr add --bsubs [other opts]' to add the command that you +expect will call 'bsub'. In cloud deployments, your --cloud_* and --mounts +options will be applied to any job added via bsub emulation, that is it +effectively emulates all the work being done on an LSF farm with shared disk.`, +} + +// bsub sub-command emulates bsub. 
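+//
+// An illustrative console session (the job name, resource values and command
+// shown here are hypothetical; as noted in lsfCmd above, only single -J, -n
+// and -M flags are recognised, one per #BSUB line):
+//
+//   bsub> #BSUB -J myjob
+//   bsub> #BSUB -n 2
+//   bsub> #BSUB -M 4000
+//   bsub> some_exe --input data.txt
+//   <ctrl-d>
+//
+// On EOF the collected command is added to the wr queue and a bsub-style
+// "Job <ID> is submitted" line is printed.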
+var lsfBsubCmd = &cobra.Command{ + Use: "bsub", + Short: "Add jobs using bsub syntax", + Long: `Add jobs to the queue using bsub syntax.`, + Run: func(cmd *cobra.Command, args []string) { + wd, err := os.Getwd() + if err != nil { + die(err.Error()) + } + + // *** use os.Executable() to find the directory of the bsub symlink, + // and inside read the .bsub_config file to get the cloud_* and mount + // options that should be included on job + + job := &jobqueue.Job{ + BsubMode: true, + RepGroup: "bsub", + Cwd: wd, + CwdMatters: true, + Requirements: &jqs.Requirements{Cores: 1, RAM: 1000, Time: 1 * time.Hour}, + } + + r := regexp.MustCompile(`^#BSUB\s+-(\w)\s+(.+)$`) + // rMem := regexp.MustCompile(`mem[>=](\d+)`) + + fmt.Printf("bsub> ") + scanner := bufio.NewScanner(os.Stdin) + for scanner.Scan() { + line := strings.TrimSpace(scanner.Text()) + + if strings.HasPrefix(line, "#") { + matches := r.FindStringSubmatch(line) + if matches != nil { + // *** this does not support the (valid) inclusion of + // multiple options per line + switch matches[1] { + case "J": + job.RepGroup = matches[2] + case "n": + if n, err := strconv.Atoi(matches[2]); err == nil { + job.Requirements.Cores = n + } + case "M": + if n, err := strconv.Atoi(matches[2]); err == nil { + job.Requirements.RAM = n + job.Override = 2 + } + } + } + } else { + if job.Cmd == "" { + job.Cmd = line + } else { + job.Cmd += "; " + line + } + } + + fmt.Printf("bsub> ") + } + + if scanner.Err() != nil { + die(scanner.Err().Error()) + } + + if job.Cmd == "" { + fmt.Println("No command is specified. Job not submitted.") + os.Exit(255) + } + + if job.ReqGroup == "" { + parts := strings.Split(job.Cmd, " ") + job.ReqGroup = filepath.Base(parts[0]) + } + + // connect to the server + jq, err := jobqueue.Connect(addr, "cmds", 10*time.Second) + if err != nil { + die(err.Error()) + } + defer jq.Disconnect() + + // add the job to the queue + inserts, _, err := jq.Add([]*jobqueue.Job{job}, os.Environ(), true) + if err != nil { + die(err.Error()) + } + + if inserts != 1 { + fmt.Println("Duplicate command specified. Job not submitted.") + os.Exit(255) + } + + j, err := jq.GetByEssence(&jobqueue.JobEssence{Cmd: job.Cmd, Cwd: job.Cwd}, false, false) + if err != nil { + die(err.Error()) + } + + fmt.Printf("Job <%d> is submitted to default queue .\n", j.BsubID) + }, +} + +// bjobs sub-command emulates bjobs. +var lsfBjobsCmd = &cobra.Command{ + Use: "bjobs", + Short: "See jobs in bjobs format", + Long: `See jobs that have been added using the lsf bsub command, using bjobs +syntax and being formatted the way bjobs display this information. + +NB: Not yet implemented.`, + Run: func(cmd *cobra.Command, args []string) { + fmt.Println("bjobs not yet implemented") + os.Exit(-1) + }, +} + +func init() { + RootCmd.AddCommand(lsfCmd) + lsfCmd.AddCommand(lsfBsubCmd) + lsfCmd.AddCommand(lsfBjobsCmd) + + // flags specific to these sub-commands + // defaultConfig := internal.DefaultConfig() + // managerStartCmd.Flags().BoolVarP(&foreground, "foreground", "f", false, "do not daemonize") + // managerStartCmd.Flags().StringVarP(&scheduler, "scheduler", "s", defaultConfig.ManagerScheduler, "['local','lsf','openstack'] job scheduler") + // managerStartCmd.Flags().IntVarP(&osRAM, "cloud_ram", "r", defaultConfig.CloudRAM, "for cloud schedulers, ram (MB) needed by the OS image specified by --cloud_os") +} diff --git a/cmd/root.go b/cmd/root.go index 34fd0b7d2..e0661e80f 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -71,7 +71,19 @@ $ wr status`, // the rootCmd. 
func Execute() { if err := RootCmd.Execute(); err != nil { - os.Exit(-1) + die(err.Error()) + } +} + +func ExecuteLSF(cmd string) { + args := []string{"lsf", cmd} + command, _, err := RootCmd.Find(args) + if err != nil { + die(err.Error()) + } + RootCmd.SetArgs(args) + if err := command.Execute(); err != nil { + die(err.Error()) } } diff --git a/internal/utils.go b/internal/utils.go index 696420d47..c91a6d494 100644 --- a/internal/utils.go +++ b/internal/utils.go @@ -22,9 +22,11 @@ package internal import ( "io" + "io/ioutil" "os" "os/exec" "path/filepath" + "runtime" "sort" "strconv" "strings" @@ -189,3 +191,43 @@ func LogPanic(logger log15.Logger, desc string, die bool) { } } } + +// Which returns the full path to the executable with the given name that is +// found first in the set of $PATH directories, ignoring any path that is +// actually a symlink to ourselves. +func Which(exeName string) string { + self, _ := os.Executable() + self, _ = filepath.EvalSymlinks(self) + + for _, dir := range strings.Split(os.Getenv("PATH"), string(os.PathListSeparator)) { + stat, err := os.Stat(dir) + if err != nil || !stat.IsDir() { + continue + } + exes, err := ioutil.ReadDir(dir) + if err != nil { + continue + } + + for _, exe := range exes { + if exe.Name() != exeName { + continue + } + path := filepath.Join(dir, exe.Name()) + + // check that it's not a symlink to ourselves + path, err := filepath.EvalSymlinks(path) + if err != nil || path == self { + continue + } + + // check it's executable + stat, err := os.Stat(path) + if err == nil && (runtime.GOOS == "windows" || stat.Mode()&0111 != 0) { + return path + } + } + } + + return "" +} diff --git a/jobqueue/client.go b/jobqueue/client.go index c6c3915b0..9659cfb05 100644 --- a/jobqueue/client.go +++ b/jobqueue/client.go @@ -27,6 +27,7 @@ import ( "os" "os/exec" "os/signal" + "path/filepath" "runtime" "strings" "sync" @@ -61,6 +62,10 @@ const ( FailReasonKilled = "killed by user request" ) +// lsfEmulationDir is the name of the directory we store our LSF emulation +// symlinks in +const lsfEmulationDir = ".wr_lsf_emulation" + // these global variables are primarily exported for testing purposes; you // probably shouldn't change them (*** and they should probably be re-factored // as fields of a config struct...) 
@@ -393,6 +398,46 @@ func (c *Client) Execute(job *Job, shell string) error { stdout := &prefixSuffixSaver{N: 4096} stdoutWait := stdFilter(outReader, stdout) + var onCwd bool + var prependPath string + if job.BsubMode { + // create parent of job.Cwd so we can later mount at job.Cwd + parent := filepath.Dir(job.Cwd) + os.MkdirAll(parent, os.ModePerm) + if fi, err := os.Stat(parent); err != nil || !fi.Mode().IsDir() { + c.Bury(job, FailReasonCwd) + return fmt.Errorf("parent of working directory [%s] could not be created", parent) + } + + // create bsub and bjobs symlinks in a sister dir of job.Cwd + prependPath = filepath.Join(parent, lsfEmulationDir) + os.MkdirAll(prependPath, os.ModePerm) + if fi, err := os.Stat(prependPath); err != nil || !fi.Mode().IsDir() { + c.Bury(job, FailReasonCwd) + return fmt.Errorf("sister of working directory [%s] could not be created", prependPath) + } + + wr, err := os.Executable() + if err != nil { + c.Bury(job, FailReasonAbnormal) + return fmt.Errorf("could not get path to wr: %s", err) + } + bsub := filepath.Join(prependPath, "bsub") + bjobs := filepath.Join(prependPath, "bjobs") + err = os.Symlink(wr, bsub) + if err != nil { + c.Bury(job, FailReasonAbnormal) + return fmt.Errorf("could not create bsub symlink: %s", err) + } + err = os.Symlink(wr, bjobs) + if err != nil { + c.Bury(job, FailReasonAbnormal) + return fmt.Errorf("could not create bjobs symlink: %s", err) + } + + onCwd = job.CwdMatters + } + // we'll run the command from the desired directory, which must exist or // it will fail if fi, errf := os.Stat(job.Cwd); errf != nil || !fi.Mode().IsDir() { @@ -422,7 +467,7 @@ func (c *Client) Execute(job *Job, shell string) error { } // we'll mount any configured remote file systems - err = job.Mount() + err = job.Mount(onCwd) if err != nil { if strings.Contains(err.Error(), "fusermount exited with code 256") { // *** not sure what causes this, but perhaps trying again after a @@ -477,6 +522,18 @@ func (c *Client) Execute(job *Job, shell string) error { env = envOverride(env, []string{"HOME=" + actualCwd}) } } + if prependPath != "" { + // alter env PATH to have prependPath come first + override := []string{"PATH=" + prependPath} + for _, envvar := range env { + pair := strings.Split(envvar, "=") + if pair[0] == "PATH" { + override[0] += ":" + pair[1] + break + } + } + env = envOverride(env, override) + } cmd.Env = env // intercept certain signals (under LSF and SGE, SIGUSR2 may mean out-of- diff --git a/jobqueue/job.go b/jobqueue/job.go index 62f05bd30..97afffaf1 100644 --- a/jobqueue/job.go +++ b/jobqueue/job.go @@ -166,6 +166,10 @@ type Job struct { // ActualCwd. MountConfigs MountConfigs + // BsubMode true when Add()ing a job will result in the job being assigned + // a BsubID. + BsubMode bool + // The remaining properties are used to record information about what // happened when Cmd was executed, or otherwise provide its current state. // It is meaningless to set these yourself. @@ -228,6 +232,11 @@ type Job struct { // when retrieving jobs with a limit, this tells you how many jobs were // excluded. Similar int + // name of the queue the Job was added to. + Queue string + // unique (for this manager session) id of the job submission, present if + // BsubMode was true when the job was added. + BsubID uint64 // we add this internally to match up runners we spawn via the scheduler to // the Jobs they're allowed to ReserveFiltered(). 
@@ -385,7 +394,12 @@ func (j *Job) TriggerBehaviours(success bool) error { // otherwise the actual working directory is used as the mount point (and the // parent of that used for unspecified CacheBase). Relative CacheDir options // are treated relative to the CacheBase. -func (j *Job) Mount() error { +// +// If the optional onCwd argument is supplied true, and ActualCwd is not +// defined, then instead of mounting at j.Cwd/mnt, it tries to mount at j.Cwd +// itself. (This will fail if j.Cwd is not empty or already mounted by another +// process.) +func (j *Job) Mount(onCwd ...bool) error { cwd := j.Cwd defaultMount := filepath.Join(j.Cwd, "mnt") defaultCacheBase := cwd @@ -393,6 +407,9 @@ func (j *Job) Mount() error { cwd = j.ActualCwd defaultMount = cwd defaultCacheBase = filepath.Dir(cwd) + } else if len(onCwd) == 1 && onCwd[0] { + defaultMount = j.Cwd + defaultCacheBase = filepath.Dir(j.Cwd) } for _, mc := range j.MountConfigs { diff --git a/jobqueue/scheduler/lsf.go b/jobqueue/scheduler/lsf.go index e2922efbe..848406568 100644 --- a/jobqueue/scheduler/lsf.go +++ b/jobqueue/scheduler/lsf.go @@ -46,6 +46,9 @@ type lsf struct { queues map[string]map[string]int sortedqs map[int][]string sortedqKeys []int + bsubExe string + bjobsExe string + bkillExe string log15.Logger } @@ -65,6 +68,12 @@ func (s *lsf) initialize(config interface{}, logger log15.Logger) error { s.config = config.(*ConfigLSF) s.Logger = logger.New("scheduler", "lsf") + // find the real paths to the main LSF exes, since thanks to wr's LSF + // compatibility mode, might not be the first in $PATH + s.bsubExe = internal.Which("bsub") + s.bjobsExe = internal.Which("bjobs") + s.bkillExe = internal.Which("bkill") + // set up what should be global vars, but we don't really want these taking // up space if the user never uses LSF s.months = map[string]int{ @@ -436,10 +445,10 @@ func (s *lsf) schedule(cmd string, req *Requirements, count int) error { bsubArgs = append(bsubArgs, "-J", name, "-o", "/dev/null", "-e", "/dev/null", cmd) // submit to the queue - bsubcmd := exec.Command("bsub", bsubArgs...) // #nosec + bsubcmd := exec.Command(s.bsubExe, bsubArgs...) // #nosec bsubout, err := bsubcmd.Output() if err != nil { - return Error{"lsf", "schedule", fmt.Sprintf("failed to run bsub %s: %s", bsubArgs, err)} + return Error{"lsf", "schedule", fmt.Sprintf("failed to run %s %s: %s", s.bsubExe, bsubArgs, err)} } // unfortunately, a job can be successfully submitted to the queue but not @@ -460,7 +469,7 @@ func (s *lsf) schedule(cmd string, req *Requirements, count int) error { for { select { case <-ticker.C: - bjcmd := exec.Command("bjobs", "-w", matches[1]) // #nosec + bjcmd := exec.Command(s.bjobsExe, "-w", matches[1]) // #nosec bjout, errf := bjcmd.CombinedOutput() if errf != nil { continue @@ -585,7 +594,7 @@ func (s *lsf) checkCmd(cmd string, max int) (count int, err error) { err = s.parseBjobs(jobPrefix, cb) if len(toKill) > 1 { - killcmd := exec.Command("bkill", toKill...) // #nosec + killcmd := exec.Command(s.bkillExe, toKill...) // #nosec errk := killcmd.Run() if errk != nil { s.Warn("checkCmd bkill failed", "err", errk) @@ -608,7 +617,7 @@ type bjobsCB func(matches []string) // `^(\d+)\s+\S+\s+(\S+)\s+\S+\s+\S+\s+\S+\s+(jobPrefix\S+)` to your // callback for each bjobs output line. 
func (s *lsf) parseBjobs(jobPrefix string, callback bjobsCB) error { - bjcmd := exec.Command(s.config.Shell, "-c", "bjobs -w") // #nosec + bjcmd := exec.Command(s.config.Shell, "-c", s.bjobsExe+" -w") // #nosec bjout, err := bjcmd.StdoutPipe() if err != nil { return Error{"lsf", "parseBjobs", fmt.Sprintf("failed to create pipe for [bjobs -w]: %s", err)} @@ -664,7 +673,7 @@ func (s *lsf) cleanup() { s.Error("cleaup parse bjobs failed", "err", err) } if len(toKill) > 1 { - killcmd := exec.Command("bkill", toKill...) // #nosec + killcmd := exec.Command(s.bkillExe, toKill...) // #nosec err = killcmd.Run() if err != nil { s.Warn("cleanup bkill failed", "err", err) diff --git a/jobqueue/server.go b/jobqueue/server.go index 8c5f8b822..ad78646d9 100644 --- a/jobqueue/server.go +++ b/jobqueue/server.go @@ -29,6 +29,7 @@ import ( "os" "os/signal" "sync" + "sync/atomic" "syscall" "time" @@ -80,6 +81,10 @@ var ( ServerLogClientErrors = true ) +// BsubID is used to give added jobs a unique (atomically incremented) id when +// pretending to be bsub. +var BsubID uint64 = 0 + // Error records an error and the operation and item that caused it. type Error struct { Op string // name of the method @@ -1078,6 +1083,10 @@ func (s *Server) createJobs(inputJobs []*Job, envkey string, ignoreComplete bool if s.rc != "" { job.schedulerGroup = job.Requirements.Stringify() } + if job.BsubMode { + atomic.AddUint64(&BsubID, 1) + job.BsubID = atomic.LoadUint64(&BsubID) + } job.Unlock() // in cloud deployments we may bring up a server running an operating diff --git a/jobqueue/serverCLI.go b/jobqueue/serverCLI.go index 8b6e13386..c9a25e888 100644 --- a/jobqueue/serverCLI.go +++ b/jobqueue/serverCLI.go @@ -660,9 +660,11 @@ func (s *Server) itemToJob(item *queue.Item, getStd bool, getEnv bool) *Job { Dependencies: sjob.Dependencies, Behaviours: sjob.Behaviours, MountConfigs: sjob.MountConfigs, + BsubMode: sjob.BsubMode, + BsubID: sjob.BsubID, } - if !sjob.StartTime.IsZero() && state == JobStateReserved { + if state == JobStateReserved && !sjob.StartTime.IsZero() { job.State = JobStateRunning } sjob.RUnlock() diff --git a/main.go b/main.go index 76fb72779..20bce3c79 100644 --- a/main.go +++ b/main.go @@ -98,8 +98,22 @@ options. */ package main -import "github.com/VertebrateResequencing/wr/cmd" +import ( + "github.com/VertebrateResequencing/wr/cmd" + "os" + "path/filepath" +) func main() { - cmd.Execute() + // handle our executable being a symlink named bsub, in which case call + // `wr lsf bsub`; likewise for bjobs + switch filepath.Base(os.Args[0]) { + case "bsub": + cmd.ExecuteLSF("bsub") + case "bjobs": + cmd.ExecuteLSF("bjobs") + default: + // otherwise we call our root command, which handles everything else + cmd.Execute() + } } From 6ef9121f99c5cdfb5fc809ccaab593026ed6a653 Mon Sep 17 00:00:00 2001 From: Sendu Bala Date: Wed, 15 Nov 2017 13:00:37 +0000 Subject: [PATCH 02/33] Fix display of reserved and lost jobs. 
--- jobqueue/static.go | 296 ++++++++++++++++++++++----------------------- static/status.html | 6 +- 2 files changed, 153 insertions(+), 149 deletions(-) diff --git a/jobqueue/static.go b/jobqueue/static.go index c48c2c2f0..67b9037f7 100644 --- a/jobqueue/static.go +++ b/jobqueue/static.go @@ -527,7 +527,7 @@ qlA6M7YbpeGZauyoAGxs2Hqu8lPTsSaRMESTdh33YVvUHAGkNV0Au8qOtYb67gP3WoxhA++aEQAaqP4d "/css/wr-0.0.1.css": { local: "static/css/wr-0.0.1.css", size: 1896, - modtime: 1508405984, + modtime: 1509105722, compressed: ` H4sIAAAJbogA/4RVW3OiTBO+z6+gauur2l3XgAckmqsRFPGMJiq5G2GEkYHhMIiwtf/9G5W8b7LZ7DuU ZdHT/fTT3c8M94RCByXCzzuBrz20fTehWejUcQBd1BOyhHx1IIO967vo4sPjHqao0/6xkoi+0Ig3AGAK @@ -3611,7 +3611,7 @@ w9Vny76PsLQ+wiKIuN3aAu90BLP/DQAA//+BYYzv7UQAAA== "/js/wr-0.0.1.js": { local: "static/js/wr-0.0.1.js", size: 4572, - modtime: 1508405584, + modtime: 1509105722, compressed: ` H4sIAAAJbogA/5RXe2/bNhD/35+CCNZaahzZedTJnKZDk64PYGmBPgYMQf+gJcYiTIkCScXJiuyz746k JMqPYDMCRyZ/9/rd8Y4aj8mC37GSUPhTij4QeUtuhaRGE5NTQ2iWkboiRpJ/DieTEVHM1KrUhJJUKsV0 @@ -3647,153 +3647,153 @@ EQAA "/status.html": { local: "static/status.html", - size: 63693, - modtime: 1509012280, + size: 63911, + modtime: 1510750312, compressed: ` -H4sIAAAJbogA/+x9f3fcthHg//kU8F6bXcW7Kzlprj1ZUp8tOY0vdq2z0/T69PRa7hLapcUlNyS4a13q -734z+MGfIAlwuZLShi+xJBIYzAwGg8EAmDl5cvHu/Md/XL4iS7byz744wR/Ed4LF6YAGg7MvCDwnS+q4 -4lf+54oyh8yXThRTdjpI2M3kT4PcZ+Yxn579/T35wByWxCeH4kVaICv5ZDIhH/9PQqM7chNGZONEXpjE -JGGe77G7MXEClwSUutQlszsyC0MWs8hZTz/GZDLJtRjPI2/NSBzNTweHH+PDjz8jzMnX06+nf5iuvAAq -DM5ODkWxOkReKvAcl3VEYxoAAV4YcDxidud7waLYMOfEkrH1hP6ceJvTwf+d/O3F5DxcraHizKcDMg8D -BnBOB69fnVJ3QQfl2oGzoqeDjUe36zBiuQpbz2XLU5duvDmd8D/GxAs85jn+JJ47Pj19lgcGyN2SiPqn -A8SUxktKAdoyojfAk3kcH6bsm3wz/Wb6R84XeD9o4KOuigkrfwjC+W2YMM5JugFyyBJ4WOVfucFbWRHa -+8P0yK490XcsJCvnlpJZwlgYxLzr2BIajsk2jG7J15OtA6JE2ZbSgKj2eLGUWgMcBVeeAVe+NsbyQ7ii -JLwhYRKRcBuQBQ1o5PhkSf01jchNEsxR2lpkextNjoA1z2qabJeDFEDW+SeH2Qg/mYXunfg1A+p6G+K5 -p4PA2YCE+k4c899nTkTEj4lLb5zEh5aiECQTP3oLPnhy8pWCkhBQ1B0PmFAqUy4nm0ActWUFn9ZOUKow -i6BbB3lNhIU0bR1CYyU0i69Kf1YZE/MGBm2UlcrTKAojqOU6zJnMvAA+wIihznx5THIlWtgDqiACCcZ/ -Jy5obpQl4BQoizperfMtMvqJHZPf4RsUqHUX/hSYUiB05rhAxIbWkZn73jeVucrQ7dQn/F8Y/1EA+qCm -lrYmF73mOvh84IQ0FuHgURnchsS7OSaXUQjTxIqcnpLBoDDwGyEkCj03ZIy6BdayMPSZtz4mvxA+8R6T -4esb1IExgf8+JjFwkTC6gunGgYkXRDWgoHg2MONCgTihY1F4RePYWVCy9XyfLELicMUJZVhM/ZvpkHwe -nK28xZKBNiUuMOjkMGlHnxN/CNSb0Jrn1JP7YdWPSxoBzQ7MHGADiBaTGCcuzhQhq1Pymgm+BCEnHwaq -i1NPlAQkZACCfAxnMRQLNjRmqAlBUBnMTEHi+D7w8IbchQnxvVvg9oziaCBLjzHRDiX/+gGBe+xfch4T -3Ib2g5D4IRf+JHYAuf54rhnRhc+VMYHzRMuA+CvYNsdSNVc0Dn7kMxjq5JNZ1Azq9UUtoNcXFmAu68Fc -moPZbQi/CWEM8ilizmrRuQCZmbIQf4wOUsza+1oIDGF3a5iGxR/ptDRjAYH/lf5cJ74/iXAIF0bF3Pfm -tzAjRGAPTQHNGy9aXcD4FuptcPaaDWOwMLggi3EvmjFgmcnA33HQqxo0mIcJmNIRdWt5LMua93tNA8T5 -Nfaj1DE9dl+DDqn5tItpISeoGsMi/frrNyvmS+omgCF5jdOz1ax5jiI6OiBn5JnxlHkFggIKKqK4IG0W -7u+wpF7Crx/vtFRDzNt4Ya4J3htw540jmDM6sFQAu/QgYq6Qq8WMA01xAeMHRktP2ttEb8mxYqS4XC9e -gVn6VgznwdmF+Ltda92fMuILbemwOSbPjo5+/zwleUtByeI/k3gFFuJ6snKihVa55EGJQsfkiDgJC5/X -qaLlt5UKz0EduahU4HeYqmGOWq19CuZnYYEMqy7gZVUuvODGx+4AeWWOn42Gw+W37dowR10eMgpxES6X -5iNTTRmFiwg6f1AkFcY5dP/quBFOHawJOi7yf0xiFnlrHM24EqLFb0qzS9eG+gafCnRy9HApIeUgpdml -vnN3OcdB/JQMf89NeSslXoREXcE/8yWQXgeUoWbqQL7oby3XosR5kUfQTWsauDRgPXWVhNZ7Z0m4+e6S -r35lHQY0hZ17C6w8t59BxSH13EscZtZD2D8gmo++f7r3RhL00xdJgGO4794QULP+kC8efX8Uv4nlSec+ -8sO4H9WGgHruIQSZdY+f8488wj7asR9mSdSP4gJAXu/GgACa9YX4+9564X6MdkT2q6++4t7bO8qIhzby -CmbQEqV5eYjCLRE2Z4sJn24B+ZNP8eTbOtv9JoxWBXlJZisPeiKiPyc0ZrCE+0sUJmtDK9kL1gmbLFpq 
-8Fr5dUCu2gSWDaGy3Fm4WKBwSwe5fJvuasECAlfZwml+OniFXjACUD20QrwbD/5iIXH8OCQxpdyjLbaz -cPvTgQURrEpWTuDGBBoFbbf12BJKOSwHYTo4y/4wWjRzYuTCE6U6XYMhqznyMGILY3Tj+AlFlrfyupFz -sKQ1GcNyZVz24amNU4G4EAMYf/nGFv7deukBBST9bbIGG30y96K5n/Oim7nyWpjZOAaRl10HYVk3VAel -VHFxGDHc3VCDIB4dTH0aLEBK6nTdybLksKnXSNUhrt1/1eCA70Zqn37kj6MD0O8RZUkUEH/quYBdhD/+ -TJ6RYzJ5Rj4ftCz0W30GTV5IK2eBYFKbw6BuesjNCEaOBFP/QUpLuw/BzHVQBrir+6AOXqe1KeGeLYcf -DNJYD07kOROuk1ZecDo4KrxxPp0OQEwabYyqp2FMlDNt7USgTafxMtyCSHPFdSHW+WPiMBYhmGHWXhBu -hwWAJmYKZ1ZuHHfzV3Aoejns7KpIMTPeem63FnmxX49o6LwbLeIhqzQKSAFsNyHp5inhcOrEZAcnSYrf -oxMVdJjsW06qfpVGGXmPxRvkIweui2x08c1wGHq56OiWSbF6DBKx7/4veXKae1/4UZr6X4Hr1PudvEEc -Sk3/d3UEpZg9BgnQOCP2LRUV31GjWOD5lgaZyIB1EYoO3icOQi8ROzieUsQeRCbup98rvqrGfn/JfUUN -PZ+B69LznfxdHIi+7zu6ulK0HrDf97Z8oIyW+rtpbZCW7rg4gPr9Lg4QYGFxQMsuvUbMHka7J/M5/L7v -oawOApgP53NZo0EGikC7SIGC0J8YKIiZHKg3DyII3R3e+NTXkzxM/VUuZY7nx+0OeK23RRxiq3eSFI7n -xDEXhsK5NxAGvGRB8aTmUK7Kh+Tf/y68lUuw4VhVxhVNoSa30LPv68gDVO6KRYTNlhUSKrFQRqjyUvs4 -u2e15LArVFOCYrgp0/E8X1q/zRunObe14uqtyZtWwVA2EW5odOOH28mnY+4nNESSg4lXju+fnXh17sHz -rfvSiXN+6NpiqYTNQz8EnQIK7i7nJvTwV96YGX1mepgXzemct3j6rX2kFKr3w0kOKuXmiuNRe0hPoNmd -O7wxSw7tcwZMj2eSW3oHk4juaK4eiA3BLjt7wfB6C4sBSWZT0632gQKFveC6xlJpKr+2lL36tKZzPHX6 -/sXbHqhT4ADadDV7/epcHFB9TIT+6K1oj5QiODyMm0T8cuLe6M1pm/diQ5e6F158a2/k2HBOVGBnaZME -27Rjn4Dh1unwAjWZifWXl+Zs5C3YsdJULQnkLWXtHEyoPnQFh7N/efoOzLz31InDYM+ClGuzan1ZtZ3h -wM4uI7rht/yRjiSyHNwpYjYSUahYoOhJHxTJzsC77g9Ak04SMxGxEUcObn/j0lrQX33yUIXtXVtiO7BG -dLt0m36u8RiC2x/veekSp7BFlGe76UUQYck0UYmdfWDuu4TZc03Ut+BMoWJhECMCnQZuCi5/miHngqm7 -joIOEmh2ip9G/NI8rCgFHkMwJr702XMs8uWCPTe96ldLZRd9UACgblH2wSikLAgDipTdP0l2I0nWsNYF -3cfBqyh62HEACDyKcQB4PO5xsCuj/rPHQSfkOs26l9S5tV/GitqaSRfBdVzGcph2XOpCMFiceCezJ3ol -tMLF0kdG8KvA7Y1cDusxE/t3x/eZta9C1NbQq8B19lVwyPsn+/zybz1SLaE9dqK/D+MOVq+W4u/l+YNH -SCF5fdkjkSK6yv0sh3h7F7gYsggUlFHQ0QoUPLvobAbW8O3Clm8c3GOd9L2+JoRLcST91+rbeKK8G19+ -SUapd22AQSejDUapyu9cDtS5teJbfnbpYP+d9l9nuOwwl+t8pqKjOroX92UbpNj25kjtm8w33oYqUkUE -lfsn9jdjotzsb8aEHQW/GRNmGP9mTKTPTsZENuvI463ipbXbq6Ol0M0R2skJ+sg8lr9e8blQN1z3LyBp -U49YRlIc/8tlgp/VnHv0fsQibe1xS0aK5n+jcJgVtD7qFWysD99YcrpDXwNWu3XxPo4B7W24n2/v4ZzF -95j/4HyJp7X7MtwQpIT4azbgXtKlg4eionvQtVlbj1jTZkj+h+jZzuelb4Al/J4fdaIb71OHGzYfvJXn -O3bW/9O6M+kSWHbwU0S7V/F8Oq2ws8XLbueYeBSh2AElQ9WJLjKqoSN/RosTcsBTwETZUb4bcZRvf+ve -bhUqzkUVEsNOc+wnuvh7ugo3lIcVGZyJP8xCEvXME3HP//Fw5JJiHpoHZEgWEOPxMAWs6YcVErWp8gg4 -gqH4RUD+B2GFvedeXmL6EXOifAxnxFmvYYKKeT6IMSYtEelS5mHiuzw/TEJ5SLhc4hmea4bEyXxJeLaV -gDLM0IWxcKTufY55UjB4HLYA0Jw5E+lTbryAjjGhCs/BEtENxvAX6Vd4LJ2YU4Z3s1YO8+a8znZJAw5M -ZXUBgDChUneqLlUZZX9IGbC//AyDs3PxB7kwzq5R6N/dBUL5Dq2vyGUMELHx8rRbmm3mDDZUOHgqvpvG -scJJ3lk1QIpFfJqEH/boPODFvrYbzW3N9RzI0+HR+MgqdB3N9edyfD9e7Jj8Uml+48WYsfFYwnuL5X4S -78aVwq7n+OHiHC9CDznESbwaVouJjHV4WRoxwJ++M6N+oY3veRnymXyu1sfLklgr4HmUhrlaL+HLj6BK -fRixw7EEL75fyIvgGnhiMaGH+B3/1gazAPIzX3NXOk1mLMwCcR5iUtEBz9lSQ4IuSmIh8gcOjtEB34WT -w0evnF5ElGfWihP5y9YJ+NRQsw4Q+OQyYSxpfVyBQs6MNISpDF5K89FPB7UBqFSkUQlmoB8hOdpp+y0Y -Hjl16bi5dU9N+1jgPL/s4asenG4xxSqdO0lMa5G/KdwqEuj/uR7/RhVQ2L0yILFDO+0fy9J1aiVd9y4q -xIFWc3m3qjxpJlln3tTy4RYt0vr+ExbTiIlseWiFgZHniHCMKqEdEjpfAdkxC9fQyXSeYIK758S5QZcG -toDG2tYBoQV+eb6y9WIURXQWCjPkoPbWe7cujrgF0E4cL+f4GLM47UE51Da05PiQ8QWRnpCbmSvBlRhG -VsDQZIXB04GQaoJXgaGZii3q9JaA1anNVhdythC1xSwpUV8G02rlsRecrsL2LYsSegA/ZDgn0cfTubP2 -mON7/4/y9FRvKAMmiJg3GHx6ODCIk7xnxG/AVLHE/Fkr3lZaV/UgDIgH7UI7TuzOAqNVhQrJzamRCaik -6QiLMyeY04Z1utaO1Y3iqikbMzdM2CGNov7MWYBpa8v6izGRVi1zbcxa1ZaJTauqynznvPK7hGEI989G -dmaVfa4MXBUL7Htgnruw550Nw4bplvIdEYcOhkYrARps6pcB7uIndMV05mG28d8bG+n6vvgIaPfBQrre 
-gYezbDuvLw4CyD1zMNty64F/gO4O/APQvTFOobk/xr0KNl4UBpg+hPyEAfKgmT54CB+NedhoEOpaqbMF -dZkQZGr4GltZu3hJs8nXJ9PsOE2W0zwUv8rdUo+jj7/q6BQ2+JfzcH33nHx99Ox/jvHfP5K/0AAXHe9p -TJ1oviRvvBWuS6daqx0zYWAD2dsSQdn7at98dDaOeFvC7zachmu0keIpGCE0+tsaGAmK+JSbus/rKT88 -BJGnWxBg6vPtS7BaMEOI8oYnxa1ZlcOCu3yT+Ceo+hargkGoGUtORDDlO2Kx9OLqrWeeD975OfEiaE4m -ejnltMzwOgsOiBdR5NyNDmrqijpg9gDiVhVnjisysls2qJID29VSS/JyrdoKMiajCqgJ9YbD5qLSBd9a -7t2Lmu9bkFQMciUkJzIrhXwI6Ja0kA9F+ZiA0t98e1QtVcc1XG6/VHmnoXIqfSPP1QlcWjXrXgklS74i -3tfVxkfmZZEZ+19f4FLHc/V39j9raP5sRZ/MT1ugbhUvGslTUlglDjM+82TPJgSmhadv4wVSCe32T6bK -EAYU6lFKo34el0bH0cEUlB6YZ6NfSCpDx2WZ+nxQnS4lWBU2tGfAItZo30BlNKmewfLYpT3DlEFSe+8u -kTJmb2KwB9gqS8UehGEPUGX8/D2Iwz54EPruP3nqJgB81CQz/8TouwkYwFCuqqVq5iSlla6Goo1rMTdL -UG6mUusUqXdDRiVIRWyujeaYAoCM5OsaPax9q32JxheHBYTp8IQBfM09hJWPSmtqPwvdp/8kNZj2I9dD -2i9Sm1zrzAfFaEHIGTlq4ilSvEow7aDvcXPh2dERORRMqI+4AwbwlsJc6Pj8oMn/+hM/brIJPZc4ZJYs -iBfA6itkMYucdRqAvQncDBdf26UHKwF5zCQGrBAOblPwIw2TFV7Tg4JNcG7QD0ojvjWQMNxNoJ+8GAbU -nI4J3fBTKWGyWCL+AR5laQImOIgRiJEtjTzkvHCBf2sazUEQPuDf0ehqlGPuVw0ydTAmLUVzEtZWOJW3 -1oKZ9LUVVbLYVi6TzIPrMUhGjYAqvoGljsFmMsa95y+ikWAoLBYbAOjYiUr1eiTBXh1d21TPzXkZiGcW -INKpLav+tU11MYNllb+xqKwmqqz2Hyxqq/koq/1tXW29PsWn3vjWqmtc4tbrGanta0p8NpwnzddN6lre -KbnSTCSFQm/C8JYvMH+pmykr+Szt1r7eIsA9XX0DlRegqWLKCGCEunJLZ3EIOrCaUwcnha0XuOF2+nc6 -+8ALwQrmlGCH4ym/5vVhzm8wXSfxcjT4R5hEZBaFW3hL3BBW+JjjN07WayCfpG3EA91KiFA/pk3tbdVC -OQU0Gmzj48PDAcyHfjjn4QmmSxB7dPDBu8Fx4QtHAt4eCsT/udXikWtuGgYhqIDcqrLWkMnXilEK//eH -d3+dYuqoYOHd3IFQytDNx2QwT6KIHwb+fFA3otrQmsPgLq52WxGr9tZ5GARUVIfZGkVl5QQOHplcOngI -AChHHfKkjkv4HB5iJmeYO8VZ03UIUzUeamHRHT8SSidAM8i3F4ujF/O0zel0aqFNMtJXmqV+40L9I94p -OCW8Q9ZgVdARnaKD9KC2Bo4LrDUFPrzbBpcRSEHE7kbD76JwxX1Ew4OmFgVj5DnYIFnN0MfDjy3MxW2p -xprRArDF5q+GSlsMrxtr8HlTerkaCyJhEXdSDJ46vv900EYFxyfznxVUd3P8RTmcUwO/qCrLnI0WrQzV -oZIq6StNG1fR4rpmBVBC0qrh9sL4DD1c2kcL/ZKrUno/zhtNM/tx5lQa2odzp9rIXpw9lWb24PyptLEX -Z1ClFZmRa9/NpHl89k9Ona/LdjzsBKXBf2UuyTvVr/dJmcvfrpyUici6g8hlM9sFD75BUwYgrWtDIAZO -M03DbU608lNj5OUfNe109q+VH24ApEAtXG1VzPhiLIPV6nUrP/WrxvxjVCjvpStRlzro8u+LvrnsS94t -l3tb8Mhl73POuOxl5u0otSk0b/l9qiprHXflx9iRp2OSvWOv/Bg4+mxgVX2CZcefDbROPkINnFafoQ2w -knvR1IdYfsx9itoRUPHS1YyHhnL1TkTtWGkoVes61I2jRszTUdVQKj/GWl2QOrbv5JIsP1aio4YWvzgp -2salLw4ROzggcvy0vxI74jBYpt/Bet0LmOWYxYiQY+KGeJqbuHQeUTx8hdATcV7Gaqjh+eLn0k0UUXHl -1IvVRYsl9ddW8AS/YjxJ5AWwAIchG+MAzoa02XwuweFVWjBJV6hK6hwWugfF5pbecedhZqeOSxbnOGc7 -jlMrcJzZc+PMMhvnbaxx0VoynHHxwcNJI8TOA9SOnsOPE/In+PH0qc1cgk/BlEBar7zra34pQTmMPSuL -Qj1VmDl4djkfzEwMu5L7Z+DJfy4De7T5tJZn8wZCFUTThkL56b7BYE1f0bclvLWKXgP4Nb6witNs6tNg -wZZkQp71gDRqS3mFEfQtbgT4vOlxepeO4CYICSO3YZM1B22VgOWGE4Nwmoq4BmBGiTuleMdLnrI0yDGO -3tgQA1+P4ScCcXz4iYzlk2wAk0WqmU2AlVaWZl1S2QOy6tnmsdPqH76JwtUYiG0sGG89Nl+OhPM5c3Yb -qaG5Az2fOTKNRiAipV+zmY3gGUyft+1FFWqp87MrcqmhvAf0pMu0G2rSNt8HWsrJ2hExtSDYA2rCMdsN -L7EE2QNSypPbDS217OkNsR20RnY2i28+l7dsyjtUBxhCLFf+qlzgWg/hxzBVMm0Arko1rsmZ2ik7xwuV -ZooK1LfcTucrjSELh4RFThB76Eobp7MYfA0W7YsPXHQALOko4LMb3wHlkwwfl8SZ8/uesIQE69EIv/ZJ -yJJRkxKj2gWs1P0mjZyemrukxGLGkgxzF9m72Uc6Z1M0gZupOFBWkA3ypgT05QltLmG8i1mY3nPjzozo -LhM8PmBg7TDF42OogPNI2k31WjQtJ/tOiNpM+hokrab9bghaTf86FO0MgE5IWhgCGgxtTIFO6FmZBBoE -7YyCTihmW7bGbcizJE+szpLknxKVmZvWnE5z/0QHFSL3yh+MIal3+wH50TLxdDcuazchuQuH/Jk8I8dE -c8kt/yDD0YI24TMugQO6lQY3/hgdkEkXm0hBObOwF3h7sqKBA8d4QseHuzZWFL3ycc6OjUGOA7BMI2+j -jFNTcNyGfQ4G7ND3CcigsJPDgJIFHhOMcD9rjDauKcCVE91ir6ZmN4aBpHjdPY+xKTQeSpJH2kKKvYDg -jWqz/T3svyfEZlFjM4YbTcGas7u6x2wUt9rnetryXp3eiLuqwL4mT61XHJx2G9HvhFc3tPpzXHNdcGSi -BXbQvU3q1UCrstBENFgIBfmBhuIavKtHIndkVHv61vDkrf352XQopde70RUhDsrqbpIbehlQz+GJa36c 
-msdjoi7GJ3MKhx1MfQJQy4mYN0/83Gnf58RxXa5aGQZB41gazWKCP3JQFAIymyoFQIkfaVZJApEyFQGY -R5HDAMBmSUvQ4xHIfWSjCjibb1W7qrMVIqZjGoHM6MIJ5K0CkRDTvG4QbiuhCDI4hoAE6vlki7sf8sJH -7m2lTHpKRiNAmBs9nOgDcogHAUwUET4m+1r4aOMbiH0OaL7+yH3+yc3SJUjWE1apPnBW3neJKXsdMOw2 -vxuD8UEpcHD/5410IdWQLzxMdluvun3mXFuddpzx0WJ45V3bi656DK0aXnRsJXM7TpH4FNW6EEQcc3ub -pF5fGt0N8dgwJtTjUSwdrn5mjiujZYzBAsc9SX7oDNRpG6yspgiQ6cVcN+EFrtZpT+if1/FLxzXzBJYj -gxhz1NhJqYlaotC8ABz31G9v40XHjuMRQBIf/pZXk3j/yY3qNnAwdYtTU3x9BUuuKNs2yIIINUJBJSFg -4BE1HtGxtXwuAk8hFkrb/KfTSmkcFanmyNOnnumSPEY4CgBoIcNtCU/FWhFygX1n7MaGym+cmHFVJ2df -+WebcOUgcFN3VDR7jepmHYUhpcx3+R7OUyNmY4m3cb+mUXHMb0xhLx7ne9Tw5D0PUcr7T9XO3pjCSEWg -fPGgIiGGAIVQ6KEpgWkHZTi/pSOQK+Nc+KLOk5z2S/WtQfQiXLnMmUolwpdDkbgJi1lH1NGgSjU5C/CC -77PAX6mpBopo9crnK4Y6mZyHQRz6dOqHi9FAgsLFCbRJxGW+gYqjodAA20cLyvBm71CErhuOiUL5uAyf -3/nVDzjgFN6lxRNPdxQ4hu5wpA+0hTzRLq/njtOr1Uvd3GB4Q7zcK3yBGsv42TD33NxQvKXMA+fxI7S1 -UTtEtA4+N7T1KGaSU6voC7ERmO9VVbn52jnA4KX46jWtM872JnW3y6uvNAjJLb9eUVLbiB2Res9Ngf4Q -EluGXZGR6/s+0eGGJPaZ8O3ivQ8vmPuJC1KX7h52wvYNXv3oD1W+T9iRcS/5Fl6PyMg9wY7onMu9th4R -SrfvLFHKoOmQGYtL9K3ho9IVXpOBkpa29Cp0itmYf6TPgWeRTL0OWkya4dS4NHTRKtVTP8cX+Ta6sovu -UtsdquemnlvnPuXnvFSyq0r4zabe4EEVwjVBwWlaFqVISMD11JU50RIsVFelKWiorvy7Fy2Fhe9u596o -ISvXQfomdLTx7movzkkrM7+hmoFJpS705m2qHAljkTbtWAqUNqJKNytV2kOwQMfQvrnkADW6rBjov+IZ -FpkW6hRhMXK/aYDbLGa/cQ0YOB9YYSJC+26MHvuWWD95DHmlptg5KWYjAHyFpesGoI55I55ZZE8deaFO -3+s5VMw5YNeNMvS/XfBlkdk+DQqV9ktbj4jGsNg0rd/E4yJh+2Zxmg6ghlPFlASWbJbpAbrwOcuuYMNq -0aDidQqjkd1FCvfK7yx5gJ72UvoCO26rZALW3M6lVbfgtWxudIXMzkA0qo8SfXvlNQ02epJLqQ7smKyy -DVgz+VWwseGubIdLMlRtYmqJnr0wFX3eoXgt80iKtD6x9HjoQEmDAOsVN9L1TKrmk7TrmWquSNO5tpi7 -sc4NKkqVHYU1vkHBHcPCt/TOsGSUWlZGxWNhcRmVFckFLQpjfkTD4llGRMMK/BZGpazxchUGzY/hi1Kv -5ofeWPbmWHZU41AsiIf8ayR+NA3LYjWZsUs2Z1wNRIOrgB/onXml1ImJNZUxbl6dSw2vK5Z5xhWVVAil -xeVph8qYn9O8eiZiHMB36Z/mIESqN063t/Lw8M5T8szCSZLP3ZaXN8f36+SLh0LhC6icaq1dsTcASpGo -W6g1uivSRVy9vLfsgZTc6DXy2AJELRDrRLKluhIaLYRUvFqAfJdTVc1i1gCoPlxo2976Q/bhDzgN1eig -TsSae0BESCAqRkHi9eBC3M1pZulosnEyGTuYaoyiWiOoXi1ladRtLNDqJCpmzmGEkIbpL+0KNu+PGAo8 -pJv9XKaFNQXSZuK2sUClt++JDwhumP1mzQmsxTXOA7EClsiPiRPZtt5DMOMS2n5M3LiUeZIfRjAwP/Uj -Yobagr5fZvyA+Sz64ALm8B6qn5Yc4Eio/dz7pf8CUOiVfgnXlgXnolpKPb/Hj8jtjw1GvhGBlsx27qij -nV7M0863craS560lWZvpRoxsIj2TCYwXv7y+OCZZmrdau017rjOtZ7QKN+GezBdN8XSRPBdV40UWBauZ -40Zx7fFF400rCTteAJfg32MijyyacEdiJE85tjKmaGryw3ebldydruSXLFa9DafOeu3fvfT4nBCPoOaY -/G40/B8iWcDwoJgvpZgh9OQQ86mefXHCk52effH/AwAA//9BfgWuzfgAAA== +H4sIAAAJbogA/+x9f3cbtxHg//kUMK8NqZik5KS59mRJfbbkNL7Ytc5O0+vT02uXXIhca7nL7GJJ61J/ +95sZYH//ApZLSWmzL7GkXWAwMxgMBgNg5uTJxbvzH/9x+Yotxco9++IEfzDX8hanA+4Nzr5g8JwsuWXL +X+nPFRcWmy+tIOTidBCJm8mfBpnPwhEuP/v7e/ZBWCIKTw7li6RAWvLJZMI+/p+IB3fsxg/YxgocPwpZ +JBzXEXdjZnk28zi3uc1md2zm+yIUgbWefgzZZJJpMZwHzlqwMJifDg4/hocff0aYk6+nX0//MF05HlQY +nJ0cymJ1iLyMwRMu64CH3AMCHN8jPEJx5zreIt8wcWIpxHrCf46czeng/07+9mJy7q/WUHHm8gGb+54A +OKeD169Oub3gg2Jtz1rx08HG4du1H4hMha1ji+WpzTfOnE/ojzFzPEc4ljsJ55bLT59lgQFytyzg7ukA +MeXhknOAtgz4DfBkHoaHCfsm30y/mf6R+ALvBw18rKqiw8ofPH9+60eCOMk3QA5bAg/L/Cs2eKsqQnt/ +mB6ZtSf7TvhsZd1yNouE8L2Quk4soeGQbf3gln092VogSlxsOfdY3B4VS6jVwFFy5Rlw5WttLD/4K878 +G+ZHAfO3HltwjweWy5bcXfOA3UTeHKWtRba3weQIWPOspsl2OUgApJ1/cpiO8JOZb9/JX1OgtrNhjn06 +8KwNSKhrhSH9PrMCJn9MbH5jRS60FPggmfjRWdDgychXAkpBQFG3HGBCoUyxnGoCcawsK/m0trxChVkA +3TrIaiIsVNHWITRWQDP/qvBnmTEhNTBoo6xQngeBH0At2xLWZOZ48AFGDLfmy2OWKdHCHlAFAUgw/jux +QXOjLAGnQFnU8WqdbVHwT+KY/Q7foECtu/Anx5QcoTPLBiI2vI7MzPe+qcxUhm7nLqN/YfwHHuiDmlqV +NUn0muvg84EIaSxC4FEZ3PrMuTlml4EP08SKnZ6ywSA38BshRDF6ti8Et3OsFb7vCmd9zH5hNPEes+Hr 
+G9SBIYP/PkYhcJEJvoLpxoKJF0TV46B4NjDjQoEw4mNZeMXD0FpwtnVcly18ZpHihDIi5O7NdMg+D85W +zmIpQJsyGxh0chi1o0/EHwL1OrRmOfXkflj145IHQLMFMwfYALLFKMSJi5giZXXKXgvJF88n8mGg2jj1 +BJHHfAEg2Ed/FkIxb8NDgZoQBFXAzORFlusCD2/YnR8x17kFbs84jga2dISQ7XD2rx8QuCP+peYxyW1o +3/OZ65PwR6EFyPXH84oRnftcGhM4T7QMiL+CbXOsVHNJ4+BHmsFQJ5/MgmZQry9qAb2+MABzWQ/mUh/M +bkP4jQ9jkKaIuahF5wJkZip8/DE6SDBr72spMEzcrWEaln8k09JMeAz+j/XnOnLdSYBDODcq5q4zv4UZ +IQB7aApo3jjB6gLGt1Rvg7PXYhiChUGCLMe9bEaDZToDf8dBH9fg3tyPwJQOuF3LY1VWv99rGmDWr7Ef +lY7psfsadEjNp11MCzVB1RgWyddfv1kxX3I7AgzZa5yejWbNcxTR0QE7Y8+0p8wrEBRQUAHHBWmzcH+H +Jasl/PrxTks1xLwNF/qa4L0Gd95YkjmjA0MFsEsPIuYxcrWYEdAEFzB+YLT0pL119JYaK1qKy3bCFZil +b+VwHpxdyL/btdb9KSNaaCuHzTF7dnT0++cJyVsOShb/mYQrsBDXk5UVLCqVSxaULHTMjpgVCf95nSpa +fluq8BzUkY1KBX6HqRrmqNXa5WB+5hbIsOoCXpblwvFuXOwOkFdhueloOFx+264NM9RlIaMQ5+GSNB/p +asrAXwTQ+YM8qTDOoftXx41w6mBN0HGR/WMSisBZ42jGlRDPf4s1u3JtxN/gU45OQg+XEkoOEppt7lp3 +l3McxE/Z8Pdkyhsp8Twkbkv+6S+BqnVAEWqqDtSL/tZyLUqcijyCblpzz+ae6KmrFLTeO0vBzXaXevUr +6zCgye/cW2Dl2f0MKoLUcy8RzLSHsH9ANB99/3Tvjcjrpy8iD8dw370hoab9oV48+v7If5PLk8595Pph +P6oNAfXcQwgy7R434x95hH20Yz/MoqAfxQWAnN6NAQk07Qv59731wv0Y7YjsV199Rd7bOy6YgzbyCmbQ +AqVZeQj8LZM2Z4sJn2wBuZNP4eTbOtv9xg9WOXmJZisHeiLgP0c8FLCE+0vgR2tNK9nx1pGYLFpqUK3s +OiBTbQLLBj+23IW/WKBwKwe5epvsasECAlfZ0ml+OniFXjAGUB20QpwbB/4SPrPc0Gch5+TRlttZuP1p +wYIIViUry7NDBo2Ctts6YgmlLJGBMB2cpX9oLZqJGLXwRKlO1mDIakIeRmxujG4sN+LI8lZeN3IOlrQ6 +Y1itjIs+vHjjVCIuxQDGX7axhXu3XjpAAUt+m6zBRp/MnWDuZrzoeq68FmY2jkHkZddBWNQN5UGpVFzo +BwJ3N+JBEI4Opi73FiAldbruZFlw2NRrpPIQr9x/rcAB343iffqROw4OQL8HXESBx9ypYwN2Af74M3vG +jtnkGft80LLQb/UZNHkhjZwFkkltDoO66SEzI2g5EnT9Bwkt7T4EPddBEeCu7oM6eJ3Wpow8WxYdDKqw +HqzAsSakk1aOdzo4yr2xPp0OQEwabYyyp2HMYmfa2gpAm07Dpb8FkSbFdSHX+WNmCREgmGHanudvhzmA +OmYKMSszjrv5KwhKtRx2dlUkmGlvPbdbi1Ts1yMaVd6NFvFQVRoFJAe2m5B085QQnDox2cFJkuD36EQF +HSb7lpOyX6VRRt5j8Qb5yIDrIhtdfDMEo1ouOrplEqweg0Tsu/8Lnpzm3pd+lKb+j8F16v1O3iCCUtP/ +XR1BCWaPQQIqnBH7loqS76hRLPB8S4NMpMC6CEUH7xOBqJaIHRxPCWIPIhP30+8lX1Vjv78kX1FDz6fg +uvR8J38XAanu+46urgStB+z3vS0fuOCF/m5aGySlOy4OoH6/iwMEmFsc8KJLrxGzh9Hu0XwOv+97KMcH +AfSH87mq0SADeaBdpCCG0J8YxBBTOYjfPIggdHd441NfT/Ew8VfZXFiOG7Y74Cu9LfIQW72TJHc8JwxJ +GHLn3kAY8JIFx5OaQ7UqH7J//zv3Vi3BhuO4Mq5ocjXJQk+/rwMHULnLF5E2W1pIqsRcGanKC+3j7J7W +UsMuVy0WFM1NmY7n+ZL6bd64inNbK1JvTd60EoaqCX/DgxvX304+HZOfUBNJAhOuLNc9O3Hq3IPnW/ul +FWb80LXFEgmb+64POgUU3F3GTejgr9SYHn16epiKZnTOWzz91j5SctX74SSBSri5IjxqD+lJNLtzhxoz +5NA+Z8DkeCa75XcwiVQdza0GYkKwLc5eCLzeIkJAUpjUtMt9EIPCXrBtbanUlV9Tyl59WvM5njp9/+Jt +D9TF4ADadDV7/epcHlB9TIT+6Kx4j5QiODyMGwV0OXFv9Ga0zXu5ocvtCye8NTdyTDgnK4izpEmGbZqx +T8Kw63R4jprUxPrLS302UgtmrNRVSxJ5Q1k7BxOqD11BcPYvT9+BmfeeW6Hv7VmQMm2WrS+jtlMcxNll +wDd0yx/piALDwZ0gZiIRuYo5ip70QZHqDLzr/gA0VUliKiIm4kjg9jcujQX91ScHVdjetSW2A2tEu0u3 +Vc81jkBw++M9lS5wCltEeTabXiQRhkyTlcTZB2G/i4Q512R9A87kKuYGMSLQaeAm4LKnGTIumLrrKOgg +gWan+GlEl+ZhRSnxGIIx8aUrnmORLxfiue5Vv1oqu+iDHID4FmUfjELKPN/jSNn9k2Q2klQNY13QfRy8 +CoKHHQeAwKMYB4DH4x4HuzLqP3scdEKu06x7ya1b82WsrF0x6SK4jstYgmnGpS4Eg8WJdzJ7oldBy10s +fWQEv/Ls3sglWI+Z2L9briuMfRWydgW9MbjOvgqCvH+yzy//1iPVCtpjJ/p7P+xg9VZS/L06f/AIKWSv +L3skUkZXuZ/lELV3gYshg0BBKQUdrUDJs4vOZmAN3y5M+UbgHuuk7/Q1IVzKI+m/Vt/Gk9i78eWXbJR4 +1wYYdDLYYJSq7M7lID63ln9LZ5cO9t9p/3WGyw5zeZXPVHZUR/fivmyDBNveHKl9k/nG2fCYVBlB5f6J +/c2YKDb7mzFhRsFvxoQexr8ZE8mzkzGRzjrqeKt8aez26mgpdHOEdnKCPjKP5a9XfC7iG677F5CkqUcs +IwmO/+UyQWc15w6/H7FIWnvckpGg+d8oHHoFjY96eRvjwzeGnO7Q14DVbl28j2NAexvu59t7OGfxPeY/ +OF/iae2+DDcEqSD+mg24l3xp4aGo4B50bdrWI9a0KZL/IXq283npG2AJ3fPjVnDjfOpww+aDs3Jcy8z6 +f1p3Jl0BSw9+ymj3cTyfTivsdPGy2zkmiiIUWqBkeHyii41q6Mie0SJCDigFTJAe5buRR/n2t+7tVqHk 
+XIxDYphpjv1EF3/PV/6GU1iRwZn8Qy8kUc88kff8Hw9HLjnmoXlAhqQBMR4PU8CaflghiTdVHgFHMBS/ +DMj/IKww99yrS0w/Yk6Uj/6MWes1TFAh5YMYY9ISmS5l7keuTflhIk4h4TKJZyjXDAuj+ZJRthWPC8zQ +hbFwlO59jnlSMHgctgDQrLmQ6VNuHI+PMaEK5WAJ+AZj+Mv0KxRLJyTK8G7WyhLOnOpsl9wjYHFWFwAI +Eyq3p/GlKq3sDwkD9pefYXB2Lv9gF9rZNXL9u7tAxL5D4ytyKQNkbLws7YZmmz6DNRUOnorvpnGMcFJ3 +VjWQEgFNk/DDHJ0HvNjXdqO5rbmeA3laFI2PrXzbqrj+XIzvR8WO2S+l5jdOiBkbjxW8t1juJ/luXCps +O5brL87xIvSQIE7C1bBcTGasw8vSiAH+dK0Zd3NtfE9l2Gf2uVwfL0tiLY/yKA0ztV7Clx9BlbowYodj +BV5+v1AXwSvgycVENcTv6FsbzBzIz7TmLnWayliYBuI8xKSiA8rZUkNCVZTEXOQPHByjA9qFU8OnWjm9 +CDhl1goj9cvW8mhqqFkHSHwymTCWvD6uQC5nRhLCVAUv5dnop4PaAFRxpFEFZlA9QjK08/ZbMBQ5dWnZ +mXVPTftY4Dy77KFVD063mGKVz60o5LXI3+RuFUn0/1yPf6MKyO1eaZDYoZ32j0XpOjWSrnsXFWZBq5m8 +W2WeNJNcZd7U8uEWLdL6/pMW00jIbHlohYGRZ8lwjHFCOyR0vgKyQ+GvoZP5PMIEd8+ZdYMuDWwBjbWt +BUIL/HLc2NYLURTRWSjNkIPaW+/dujggC6CdOCpnuRizOOlBNdQ2vOD4UPEFkR6fzMyV5EoII8sTaLLC +4OlASDnBq8RQT8XmdXpLwOrEZqsLOZuL2qKXlKgvg2m1csQLoiu3fSuCiB/ADxXOSfbxdG6tHWG5zv/j +lJ7qDRfABBnzBoNPDwcacZL3jPgNmCqGmD9rxdtI68Y9CAPiQbvQjBO7s0BrVRGH5CZqVAIqZTrC4szy +5rxhnV5px1aN4rIpGwrbj8QhD4L+zFmAaWrLuosxU1atsE3M2rgtHZs2rqrynVPld5HAEO6ftezMMvts +FbgqlNj3wDx7Yc47E4YNky3lOyYPHQy1VgLc29QvA+zFT+iK6czDdOO/Nzby9X3xEdDug4V8vQMPZ+l2 +Xl8cBJB75mC65dYD/wDdHfgHoHtjXIzm/hj3yts4ge9h+hD2EwbIg2b64CF81OZho0FY1UqdLViVCUGl +hq+xlSsXL0k2+fpkmh2nyWKah/xXtVvqEPr4axWd0gb/cu6v756zr4+e/c8x/vtH9hfu4aLjPQ+5FcyX +7I2zwnXptNJqx0wY2ED6tkBQ+r7cNx+tjSXfFvC79af+Gm2kcApGCA/+tgZGgiI+JVP3eT3lh4cg8nwL +Asxd2r4EqwUzhMTe8Ci/NRvnsCCXbxT+BFXfYlUwCCvGkhUwTPmOWCydsHzrmfLBWz9HTgDNqUQvp0TL +DK+z4IB4EQTW3eigpq6sA2YPIG5UcWbZMiO7YYNxcmCzWvGSvFirtoKKyRgH1IR6w2FzUeWCby337kXN +9y1IKga5kpIT6JVCPnh8y1rIh6I0JqD0N98elUvVcQ2X2y/jvNNQOZG+kWNXCVxSNe1eBSVNviLf19XG +R+VlURn7X1/gUsexq+/sf66g+bMRfSo/bY66VbhoJC+WwjJxmPGZkj3rEJgUnr4NF0gltNs/mXGGMKCw +GqUk6udxYXQcHUxB6YF5NvqFJTJ0XJSpzwfl6VKBjcOG9gxYxhrtG6iKJtUzWIpd2jNMFSS19+6SKWP2 +JgZ7gB1nqdiDMOwBqoqfvwdx2AcPfNf+J6VuAsBHTTLzT4y+G4EBDOXKWqpmToq10tVQtnEt52YFyk5V +ap0idW7YqAApj8211hyTA5CSfF2jhyvfVr5E44tgAWFVeMIAviYPYeljrDUrP0vdV/1JabDKj6SHKr8o +bXJdZT7EjJaEnLGjJp4ixasI0w66DpkLz46O2KFkQn3EHTCAtxzmQsulgyb/60903GTjOzaz2CxaMMeD +1ZcvQhFY6yQAexO4GS6+tksHVgLqmEkIWCEc3KagIw2TFV7Tg4JNcG7QD8oD2hqIBO4m8E9OCANqzseM +b+hUih8tloi/h0dZmoBJDmIEYmRLIw+JFzbwb82DOQjCB/w7GF2NMsz9qkGmDsaspWhGwtoKJ/LWWjCV +vraisSy2lUsl8+B6DJJRI6Ax38BSx2AzKePe04tgJBkKi8UGAFXsRKV6PVJgr46uTapn5rwUxDMDEMnU +llb/2qS6nMHSyt8YVI4nqrT2Hwxqx/NRWvvbutrV+hSfeuO7Ul3jErdezyhtX1Pis+Y8qb9uiq/lnbKr +iokkV+iN79/SAvOXupmylM/SbO3rLDzc061uoPQCNFXIBQOMUFdu+Sz0QQeWc+rgpLB1PNvfTv/OZx+o +EKxgThl2OJ7ya14fZvwG03UULkeDf/hRwGaBv4W3zPZhhY85fsNovQbyWdJGOKhaCTHuhrypvW28UE4A +jQbb8PjwcADzoevPKTzBdAlijw4+eDc4zn0hJODtoUT8n9tKPDLNTX3PBxWQWVXWGjLZWiFK4f/+8O6v +U0wd5S2cmzsQShW6+ZgN5lEQ0GHgzwd1I6oNrTkM7vxqtxWxcm+d+57HZXWYrVFUVpZn4ZHJpYWHAIBy +1CFP6riEz+EhZnKGuVOeNV37MFXjoRYR3NGRUD4BmkG+nVAevZgnbU6nUwNtkpK+qljqNy7UP+KdglNG +HbIGq4KP+BQdpAe1NXBcYK0p8OHd1rsMQAoCcTcafhf4K/IRDQ+aWpSMUedgvWg1Qx8PHVuYy9tSjTWD +BWCLzV8NY20xvG6sQfOm8nI1FkTCAnJSDJ5arvt00EYF4ZP6z3Kquzn+ohrOiYGfV5VFzgaLVoZWoZIo +6auKNq6CxXXNCqCApFHD7YXxGTq4tA8W1UuuUun9OG8qmtmPM6fU0D6cO+VG9uLsKTWzB+dPqY29OINK +raiMXPtuJsnjs39y6nxdpuNhJygN/it9Sd6pfr1PSl/+duWkSkTWHUQmm9kueNAGTRGAsq41gWg4zSoa +bnOiFZ8aIy/7xNNOZ/9a8SEDIAFq4GorY0aLsRRWq9et+NSvGrOPVqGsl65AXeKgy77P++bSL1m3XOZt +ziOXvs8449KXqbej0KbUvMX3iaqsddwVH21HXhWTzB17xUfD0WcCq+wTLDr+TKB18hFWwGn1GZoAK7gX +dX2IxUffp1g5Akpeuprx0FCu3olYOVYaStW6DqvGUSPmyahqKJUdY60uyCq27+SSLD5GohMPLbo4KdvG +pS8OETM4IHJ02j8WO2YJWKbfwXrd8YThmMWIkGNm+3iam9l8HnA8fIXQI3lexmio4fni58pNFHB55dQJ 
+44sWS+6ujeBJfoV4ksjxYAEOQzbEAZwOab35XIHDq7Rgkq5QldQ5LKoeFJtbfkfOw9ROHRcsznHGdhwn +VuA4tefGqWU2ztpY47y1pDnj4oOHk0aInQOoHT2HHyfsT/Dj6VOTuQSfnCmBtF4519d0KSF2GDtGFkX8 +lGFm4JnlfNAzMcxK7p+BJ/+5DOzR5qu0PJs3EMogmjYUik/3DQZj+vK+LemtjenVgF/jCys5zaYu9xZi +ySbsWQ9Io7ZUVxhB3+JGgEtNj5O7dAw3QZgf2A2brBloqwgsN5wYpNNUxjUAM0reKcU7XuqUpUaOcfTG ++hj4egw/EYjlwk9kLE2yHkwWiWbWAVZYWep1SWkPyKhnm8dOq3/4JvBXYyC2sWC4dcR8OZLO59TZraWG +5hb0fOrI1BqBiFT1mk1vBM9g+rxtLxqjljg/uyKXGMp7QE+5TLuhpmzzfaAVO1k7IhYvCPaAmnTMdsNL +LkH2gFTsye2GVrzs6Q2xHbRGejaLNp+LWzbFHaoDDCGWKX9VLHBdDeFHP1EybQCuCjWu2Vm8U3aOFyr1 +FBWob7WdTiuNofCHTASWFzroShsnsxh89Rbtiw9cdAAs5Sig2Y12QGmSoXHJrDnd94QlJFiPWvi1T0KG +jJoUGNUuYIXu12nk9FTfJSUXM4Zk6LvI3s0+8rmYogncTMVBbAWZIK9LQF+e0OYS2ruYuek9M+70iO4y +weMDBtYOUzw+mgo4i6TZVF+JpuFk3wlRk0m/Akmjab8bgkbTfxWKZgZAJyQNDIEKDE1MgU7oGZkEFQia +GQWdUEy3bLXbUGdJnhidJck+BSpTN60+nfr+iQ4qRO2VPxhDEu/2A/KjZeLpblzWbkKSC4f9mT1jx6zi +klv2QYajBa3DZ1wCe3yrDG78MTpgky42UQzlzMBeoPZURQ0HjvaEjg+5NlYcvfJhxo4NQY49sEwDZxMb +p7rgyIZ9Dgbs0HUZyKC0k32PswUeEwxwP2uMNq4uwJUV3GKvJmY3hoHkeN09i7EuNAolSZG2kGLHY3ij +Wm9/D/vvCTNZ1JiM4UZTsObsbtWjN4pb7fNq2rJend6IuyrBvmZPjVccRLuJ6HfCqxta/TmuSRcc6WiB +HXRvk3rV0KrC1xEN4UNBOtCQX4N39UhkjoxWnr7VPHlrfn42GUrJ9W50RciDslU3yTW9DKjn8MQ1Haem +eEzcxvhkVu6wg65PAGpZgXDmkZs57fucWbZNqlVgEDTCUmsu3KrUfgmr4lx/utOgrKVGVS6is65WAZro +THTcMrImDiFMYegwgrBe1hN0mXhqI1qrArJgxheWp24UyGSYerqATAl/WwpDkMLRBCRZmE20uPsBL3zU +vlbSxU/ZaAQIk8FDRB+wQzwEoKOE8NHZ08KnMraB3OOA5uuP22efzAxdgGQ8WRXqA2fVXZeQi9eewG5z +uzEYH5QCC/d+3ij3UQ350rtktu1atcecaavTbjM+lRheOdfmohs/mhYNFR0byVy/RvI9DbX+xlM7/fkJ +Sw4zJHNv0+/rS61bL44Yhow7FJ/TIuU6s2wVB2QMawvcbaXjdKDn22ClNWXoTyckzYtX01ondNnlr8OX +lq3n4yzGPNHmqLb7tSIeS4zmBeC4p357Gy46dhzFNolc+FtduqL+U1vwbeDAKJHnwWjlCIvJIN0QScMj +NUJBFShh4OE7ilXZWj4TWygX5aVtdq/SuUmEGKXE2dOnjq6zIUQ4MQDQsZobLk4cRUbKBfadtoMeKr+x +QkGKXCk89WebcGUgkBE/yhv0WnXTjsJgWfr7lw/ng5K2hsJbu1+TeD/6d8GwF4+zPap5p4CCr1L/xbXT +N7owEhEoXqkoSYgmQCkU1dBigWkHpTm/JSOQlHEmMFPnSa7yS/mtRlwmXJPNRZwkhRZ6gbzji/lU4kNP +pWpqFqCC79OQZon5AYpo9cqlpUydTM59L/RdPnX9xWigQOGqCdpk8priII4QEqMBll0lKM07y0MZlG84 +ZjHKx0X4dJu5esABp/CWMJ7luuPAMXT0I32gLdRZfXXxeJxcGl9WzQ2ad9+LvUJL71BFBoe55+aG4/1r +CglIh4Nr45HIOCQ0N7T1KObIi/0DF3KLM9urceXmC/UAg0rRsjqpM053XavuzZdfVSCkNjN7RSneIO2I +1HsyBfpDSG6GdkVGOR76RIcMSewz6bXGGy2ON3cjG6Qu2RfthO0bvNTSH6q0A9qRcS9pc7JHZNRuZ0d0 +ztUuYo8IJRuThiil0KqQGcvwAK2BsZIVXpOBkpQ29Jl0ikaZfZRHhfJjJj6VSkya4dQ4bKricMZP/Ryf +59voyixuTW13xD03dew6xzCdYIvTeJUCizb1BoWL8NcMBadpWZQgoQDXU1fkREsY1KoqTeFQq8q/e9FS +WLpLdu6NGrIyHVTdRBVt1F3txYm0IvMbqmmYVPFV5axNlSFhLBPCHSuBqowV081KVfYQLNAxaHEm7UGN +LsunMCj5vWUOiTpFmM9JoBu6N81GoF0DBs4HkZuI0L4b41ZCSxSjLIZUqSkqUILZCABfYem6AVjFvBHl +TNlTR17E9wqqOZTPpmDWjSqpgVlYaZmzPwl3lfRLW4/IxrDYNKnfxOM8YftmcZLooIZT+WQLhmxWiQ+6 +8DnNG2HCatlgzOsERiO78xTuld9pWoRq2guJGcy4HadJMOZ2JmG8Aa9Vc6MrZHYKolF9FOjbK6+5t6km +uZDEwYzJcR4FYya/8jYm3FXtkCRD1SamFujZC1PR5+3L1ypDpkxYFCqPRxUoZRBgvfwRgWomlTNlmvVM +OQum7lybz0pZ5waVpYqOwhrfoOSOZuFbfqdZMkgsK63iobS4tMrKtIkGhTHzo2bxNNejZgW6X1Iqq71c +hUHzo/+i0KvZoTdWvTlWHdU4FHPiof4ayR9NwzJfTeUiU81pVwPRIBXwA7/Tr5Q4MbFmbIzrVyepobpy +maddMZYKqbRInnaojJlH9aunIkYAvkv+1Achk9gR3c7KwWNJT9kzAydJNitdVt4s162TLwryQguojGqt +XbE3AEqQqFuoNborkkVcvby37IEU3Og18tgCJF4g1olkS/VYaCohJOLVAuS7jKpqFrMGQPWBUNv21h+y +D3/AaahGB3UiVt8DIoMdcTkKIqcHF+JuTjNDR5OJk0nbwVRjFNUaQfVqKU0Qb2KBlidROXMOA4Q0TH5p +V7BZf8RQ4qHc7Ocq4a0ukDYTt40FceL+nviA4Ibpb8acwFqkcR6IFbBEfkycSLf1HoIZl9D2Y+LGpcoA +/TCCgZm3HxEz4i3o+2XGD5ipow8uYHbyYfzTkAOERLyfe7/0XwAKvdKv4Jqy4FxWS6inCAWI3P7YoOUb +kWipPO5WfLTTCSmhfitnSxnsWtLQ6W7EqCaSM5nAePnL64tjliawq7XbKs91JvW0VuE63FOZsDmeLlLn 
+omq8yLJgOSfeKKw9vqi9aaVghwvgEvx7zNSRRR3uKIzUKcdWxuRNTTp8t1mp3elS5sx81Vt/aq3X7t1L +h+aEcAQ1x+x3o+H/kGkQhgf5TDD53Kcnh5gp9uyLE0rjevbF/w8AAP//dONGGqf5AAA= `, }, diff --git a/static/status.html b/static/status.html index 99188a247..891332f9d 100644 --- a/static/status.html +++ b/static/status.html @@ -801,10 +801,10 @@
if (self.detailsOA && rg == self.detailsRepgroup) { // the user has clicked on a progress bar for // a particular repgroup; add to its details + var walltime = json['Walltime']; if (json['State'] == "running") { // have Walltime on running jobs auto- // increment - var walltime = json['Walltime']; var began = new Date(); var now = ko.observable(new Date()); json['LiveWalltime'] = ko.computed(function() { @@ -819,6 +819,10 @@
}; }, 1000); } + } else { + json['LiveWalltime'] = ko.computed(function() { + return walltime; + }); } self.detailsOA.push(json); } From 347bfc7e62fdf51d6eee5281da4752f06f2d7a7d Mon Sep 17 00:00:00 2001 From: Sendu Bala Date: Thu, 16 Nov 2017 14:09:18 +0000 Subject: [PATCH 03/33] bsub emulation now works in to OpenStack, with support for mounts. --- cmd/add.go | 17 ++++++++-- cmd/lsf.go | 34 +++++++++++++------ jobqueue/client.go | 76 +++++++++++++++++++++++++++++++++--------- jobqueue/job.go | 31 ++++++++++++++--- jobqueue/server.go | 2 +- jobqueue/serverREST.go | 10 ++++++ 6 files changed, 136 insertions(+), 34 deletions(-) diff --git a/cmd/add.go b/cmd/add.go index 7cb7e354b..baea0f75d 100644 --- a/cmd/add.go +++ b/cmd/add.go @@ -58,6 +58,7 @@ var cmdOsPrefix string var cmdOsUsername string var cmdPostCreationScript string var cmdOsRAM int +var cmdBsubMode bool // addCmd represents the add command var addCmd = &cobra.Command{ @@ -73,7 +74,7 @@ command as one of the name:value pairs. The possible options are: cmd cwd cwd_matters change_home on_failure on_success on_exit mounts req_grp memory time override cpus disk priority retries rep_grp dep_grps deps cmd_deps -cloud_os cloud_username cloud_ram cloud_script env +cloud_os cloud_username cloud_ram cloud_script env bsub_mode If any of these will be the same for all your commands, you can instead specify them as flags (which are treated as defaults in the case that they are @@ -235,13 +236,23 @@ base variables as they were at the moment in time you run 'wr add', so to set a certain environment variable for all commands, you could instead just set it prior to calling 'wr add'. In the remote case the command will use base variables as they were on the machine where the command is executed when that -machine was started.`, +machine was started. + +"bsub_mode" is a boolean that results in the job being assigned a unique (for +this manager session) job id, and turns on bsub emulation, which means that if +your Cmd calls bsub, it will instead result in a command being added to wr. 
The +new job will have this job's mount and cloud_* options.`, Run: func(combraCmd *cobra.Command, args []string) { // check the command line options if cmdFile == "" { die("--file is required") } + bsubMode := "" + if cmdBsubMode { + bsubMode = deployment + } + jd := &jobqueue.JobDefaults{ RepGrp: cmdRepGroup, ReqGrp: reqGroup, @@ -258,6 +269,7 @@ machine was started.`, CloudUser: cmdOsUsername, CloudScript: cmdPostCreationScript, CloudOSRam: cmdOsRAM, + BsubMode: bsubMode, } if jd.RepGrp == "" { @@ -477,6 +489,7 @@ func init() { addCmd.Flags().StringVar(&cmdPostCreationScript, "cloud_script", "", "in the cloud, path to a start-up script that will be run on the servers created to run these commands") addCmd.Flags().StringVar(&cmdEnv, "env", "", "comma-separated list of key=value environment variables to set before running the commands") addCmd.Flags().BoolVar(&cmdReRun, "rerun", false, "re-run any commands that you add that had been previously added and have since completed") + addCmd.Flags().BoolVar(&cmdBsubMode, "bsub", false, "enable bsub emulation mode") addCmd.Flags().IntVar(&timeoutint, "timeout", 120, "how long (seconds) to wait to get a reply from 'wr manager'") } diff --git a/cmd/lsf.go b/cmd/lsf.go index 762236fb7..f52f12619 100644 --- a/cmd/lsf.go +++ b/cmd/lsf.go @@ -19,13 +19,12 @@ package cmd import ( - "fmt" - // "github.com/VertebrateResequencing/wr/internal" "bufio" + "encoding/json" + "fmt" "github.com/VertebrateResequencing/wr/jobqueue" jqs "github.com/VertebrateResequencing/wr/jobqueue/scheduler" "github.com/spf13/cobra" - "io/ioutil" "os" "path/filepath" "regexp" @@ -68,24 +67,37 @@ effectively emulates all the work being done on an LSF farm with shared disk.`, // bsub sub-command emulates bsub. var lsfBsubCmd = &cobra.Command{ Use: "bsub", - Short: "Add jobs using bsub syntax", - Long: `Add jobs to the queue using bsub syntax.`, + Short: "Add a job using bsub syntax", + Long: `Add a job to the queue using bsub syntax.`, Run: func(cmd *cobra.Command, args []string) { wd, err := os.Getwd() if err != nil { die(err.Error()) } - // *** use os.Executable() to find the directory of the bsub symlink, - // and inside read the .bsub_config file to get the cloud_* and mount - // options that should be included on job - job := &jobqueue.Job{ - BsubMode: true, + BsubMode: deployment, RepGroup: "bsub", Cwd: wd, CwdMatters: true, Requirements: &jqs.Requirements{Cores: 1, RAM: 1000, Time: 1 * time.Hour}, + Retries: uint8(0), + } + + // since bsub calls can't communicate possibly necessary cloud_* and + // mount options for the job we're going to add, we read these from an + // environment variable that got created when a job was added to the + // queue with --bsub option; since this arrangement is in theory + // "optional", we ignore errors + if jsonStr := os.Getenv("WR_BSUB_CONFIG"); jsonStr != "" { + configJob := &jobqueue.Job{} + if err := json.Unmarshal([]byte(jsonStr), configJob); err == nil { + job.MountConfigs = configJob.MountConfigs + job.Requirements.Other = configJob.Requirements.Other + job.BsubMode = configJob.BsubMode + deployment = configJob.BsubMode + initConfig() + } } r := regexp.MustCompile(`^#BSUB\s+-(\w)\s+(.+)$`) @@ -158,7 +170,7 @@ var lsfBsubCmd = &cobra.Command{ os.Exit(255) } - j, err := jq.GetByEssence(&jobqueue.JobEssence{Cmd: job.Cmd, Cwd: job.Cwd}, false, false) + j, err := jq.GetByEssence(&jobqueue.JobEssence{Cmd: job.Cmd, Cwd: job.Cwd, MountConfigs: job.MountConfigs}, false, false) if err != nil { die(err.Error()) } diff --git a/jobqueue/client.go 
b/jobqueue/client.go index 9659cfb05..3a86a2d9e 100644 --- a/jobqueue/client.go +++ b/jobqueue/client.go @@ -22,6 +22,7 @@ package jobqueue import ( "bytes" + "encoding/json" "fmt" "io/ioutil" "os" @@ -400,38 +401,45 @@ func (c *Client) Execute(job *Job, shell string) error { var onCwd bool var prependPath string - if job.BsubMode { + if job.BsubMode != "" { // create parent of job.Cwd so we can later mount at job.Cwd parent := filepath.Dir(job.Cwd) os.MkdirAll(parent, os.ModePerm) if fi, err := os.Stat(parent); err != nil || !fi.Mode().IsDir() { - c.Bury(job, FailReasonCwd) + c.Bury(job, nil, FailReasonCwd) return fmt.Errorf("parent of working directory [%s] could not be created", parent) } + } + var mountCouldFail bool + host, err := os.Hostname() + if err != nil { + host = "localhost" + } + if job.BsubMode != "" { + parent := filepath.Dir(cmd.Dir) // create bsub and bjobs symlinks in a sister dir of job.Cwd prependPath = filepath.Join(parent, lsfEmulationDir) os.MkdirAll(prependPath, os.ModePerm) if fi, err := os.Stat(prependPath); err != nil || !fi.Mode().IsDir() { - c.Bury(job, FailReasonCwd) + c.Bury(job, nil, FailReasonCwd) return fmt.Errorf("sister of working directory [%s] could not be created", prependPath) } - wr, err := os.Executable() if err != nil { - c.Bury(job, FailReasonAbnormal) + c.Bury(job, nil, fmt.Sprintf("could not get path to wr: %s", err)) return fmt.Errorf("could not get path to wr: %s", err) } bsub := filepath.Join(prependPath, "bsub") bjobs := filepath.Join(prependPath, "bjobs") err = os.Symlink(wr, bsub) - if err != nil { - c.Bury(job, FailReasonAbnormal) + if err != nil && !os.IsExist(err) { + c.Bury(job, nil, fmt.Sprintf("could not create bsub symlink: %s", err)) return fmt.Errorf("could not create bsub symlink: %s", err) } err = os.Symlink(wr, bjobs) - if err != nil { - c.Bury(job, FailReasonAbnormal) + if err != nil && !os.IsExist(err) { + c.Bury(job, nil, fmt.Sprintf("could not create bjobs symlink: %s", err)) return fmt.Errorf("could not create bjobs symlink: %s", err) } @@ -441,12 +449,15 @@ func (c *Client) Execute(job *Job, shell string) error { // we'll run the command from the desired directory, which must exist or // it will fail if fi, errf := os.Stat(job.Cwd); errf != nil || !fi.Mode().IsDir() { - errb := c.Bury(job, nil, FailReasonCwd) - extra := "" - if errb != nil { - extra = fmt.Sprintf(" (and burying the job failed: %s)", errb) + os.MkdirAll(job.Cwd, os.ModePerm) + if _, errf = os.Stat(job.Cwd); errf != nil { + errb := c.Bury(job, nil, FailReasonCwd) + extra := "" + if errb != nil { + extra = fmt.Sprintf(" (and burying the job failed: %s)", errb) + } + return fmt.Errorf("working directory [%s] does not exist%s", job.Cwd, extra) } - return fmt.Errorf("working directory [%s] does not exist%s", job.Cwd, extra) } var actualCwd, tmpDir string if job.CwdMatters { @@ -466,9 +477,21 @@ func (c *Client) Execute(job *Job, shell string) error { job.ActualCwd = actualCwd } + // if we are a child job of another running on the same host, we expect + // mounting to fail since we're running in the same directory as our + // parent + if jsonStr := job.Getenv("WR_BSUB_CONFIG"); jsonStr != "" { + configJob := &Job{} + if err := json.Unmarshal([]byte(jsonStr), configJob); err == nil && configJob.Host == host { + mountCouldFail = true + // *** but the problem with this is, the parent job could finish + // while we're still running, and unmount!... 
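+			// (WR_BSUB_CONFIG is the JSON-encoded cut-down Job that the
+			// parent's Execute() exports further below: its MountConfigs,
+			// Requirements, BsubMode and Host. Comparing configJob.Host
+			// against our own hostname is what identifies a parent running
+			// on this same machine.)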
+ } + } + // we'll mount any configured remote file systems err = job.Mount(onCwd) - if err != nil { + if err != nil && !mountCouldFail { if strings.Contains(err.Error(), "fusermount exited with code 256") { // *** not sure what causes this, but perhaps trying again after a // few seconds will help? @@ -476,7 +499,7 @@ func (c *Client) Execute(job *Job, shell string) error { err = job.Mount() } if err != nil { - buryErr := fmt.Errorf("failed to mount remote file system(s): %s", err) + buryErr := fmt.Errorf("failed to mount remote file system(s): %s (%s)", err, os.Environ()) errb := c.Bury(job, nil, FailReasonMount, buryErr) if errb != nil { buryErr = fmt.Errorf("%s (and burying the job failed: %s)", buryErr.Error(), errb) @@ -533,6 +556,27 @@ func (c *Client) Execute(job *Job, shell string) error { } } env = envOverride(env, override) + + // add an environment variable of this job as JSON, so that any cloud_* + // or mount options can be copied to child jobs created via our bsub + // symlink. (It will also need to know our deployment, stored in + // BsubMode, and to know the host we're running on in case our children + // run on the same host as us and therefore any mounts are expected to + // fail) + simplified := &Job{ + MountConfigs: job.MountConfigs, + Requirements: job.Requirements, + BsubMode: job.BsubMode, + Host: host, + } + jobJSON, err := json.Marshal(simplified) + if err != nil { + c.Bury(job, nil, fmt.Sprintf("could not convert job to JSON: %s", err)) + return fmt.Errorf("could not convert job to JSON: %s", err) + } + env = envOverride(env, []string{ + "WR_BSUB_CONFIG=" + string(jobJSON), + }) } cmd.Env = env diff --git a/jobqueue/job.go b/jobqueue/job.go index 97afffaf1..d5e3502a4 100644 --- a/jobqueue/job.go +++ b/jobqueue/job.go @@ -166,9 +166,9 @@ type Job struct { // ActualCwd. MountConfigs MountConfigs - // BsubMode true when Add()ing a job will result in the job being assigned - // a BsubID. - BsubMode bool + // BsubMode set to either Production or Development when Add()ing a job will + // result in the job being assigned a BsubID. + BsubMode string // The remaining properties are used to record information about what // happened when Cmd was executed, or otherwise provide its current state. @@ -235,7 +235,7 @@ type Job struct { // name of the queue the Job was added to. Queue string // unique (for this manager session) id of the job submission, present if - // BsubMode was true when the job was added. + // BsubMode was set when the job was added. BsubID uint64 // we add this internally to match up runners we spawn via the scheduler to @@ -349,6 +349,23 @@ func (j *Job) EnvAddOverride(env []string) error { return err } +// Getenv is like os.Getenv(), but for the environment variables stored in the +// the job, including any overrides. Returns blank if Env() would have returned +// an error. +func (j *Job) Getenv(key string) (value string) { + env, err := j.Env() + if err != nil { + return + } + for _, envvar := range env { + pair := strings.Split(envvar, "=") + if pair[0] == key { + return pair[1] + } + } + return +} + // StdOut returns the decompressed job.StdOutC, which is the head and tail of // job.Cmd's STDOUT when it ran. If the Cmd hasn't run yet, or if it output // nothing to STDOUT, you will get an empty string. Note that StdOutC is only @@ -400,6 +417,9 @@ func (j *Job) TriggerBehaviours(success bool) error { // itself. (This will fail if j.Cwd is not empty or already mounted by another // process.) 
func (j *Job) Mount(onCwd ...bool) error { + // j.Lock() + // defer j.Unlock() + cwd := j.Cwd defaultMount := filepath.Join(j.Cwd, "mnt") defaultCacheBase := cwd @@ -542,6 +562,9 @@ func (j *Job) Mount(onCwd ...bool) error { // directories between the mount point(s) and Cwd if not CwdMatters and the // mount point was (within) ActualCwd. func (j *Job) Unmount(stopUploads ...bool) (logs string, err error) { + // j.Lock() + // defer j.Unlock() + var doNotUpload bool if len(stopUploads) == 1 { doNotUpload = stopUploads[0] diff --git a/jobqueue/server.go b/jobqueue/server.go index ad78646d9..8cf557234 100644 --- a/jobqueue/server.go +++ b/jobqueue/server.go @@ -1083,7 +1083,7 @@ func (s *Server) createJobs(inputJobs []*Job, envkey string, ignoreComplete bool if s.rc != "" { job.schedulerGroup = job.Requirements.Stringify() } - if job.BsubMode { + if job.BsubMode != "" { atomic.AddUint64(&BsubID, 1) job.BsubID = atomic.LoadUint64(&BsubID) } diff --git a/jobqueue/serverREST.go b/jobqueue/serverREST.go index 0214e9e98..dd6ad3295 100644 --- a/jobqueue/serverREST.go +++ b/jobqueue/serverREST.go @@ -76,6 +76,7 @@ type JobViaJSON struct { CloudUser string `json:"cloud_username"` CloudScript string `json:"cloud_script"` CloudOSRam *int `json:"cloud_ram"` + BsubMode string `jsob:"bsub_mode"` } // JobDefaults is supplied to JobViaJSON.Convert() to provide default values for @@ -113,6 +114,7 @@ type JobDefaults struct { // CloudOSRam is the number of Megabytes that CloudOS needs to run. Defaults // to 1000. CloudOSRam int + BsubMode string compressedEnv []byte osRAM string } @@ -183,6 +185,7 @@ func (jvj *JobViaJSON) Convert(jd *JobDefaults) (*Job, error) { var deps Dependencies var behaviours Behaviours var mounts MountConfigs + var bsubMode string if jvj.RepGrp == "" { repg = jd.RepGrp @@ -336,6 +339,11 @@ func (jvj *JobViaJSON) Convert(jd *JobDefaults) (*Job, error) { mounts = jd.MountConfigs } + bsubMode = jvj.BsubMode + if bsubMode == "" && jd.BsubMode != "" { + bsubMode = jd.BsubMode + } + // scheduler-specific options other := make(map[string]string) if jvj.CloudOS != "" { @@ -384,6 +392,7 @@ func (jvj *JobViaJSON) Convert(jd *JobDefaults) (*Job, error) { EnvOverride: envOverride, Behaviours: behaviours, MountConfigs: mounts, + BsubMode: bsubMode, }, nil } @@ -534,6 +543,7 @@ func restJobsAdd(r *http.Request, s *Server) ([]*Job, int, error) { CloudUser: r.Form.Get("cloud_username"), CloudScript: r.Form.Get("cloud_script"), CloudOSRam: urlStringToInt(r.Form.Get("cloud_ram")), + BsubMode: r.Form.Get("bsub_mode"), } if r.Form.Get("cwd_matters") == restFormTrue { jd.CwdMatters = true From 80ac51e838eed61cb0af36d1166c0cbb32a0d1b2 Mon Sep 17 00:00:00 2001 From: Sendu Bala Date: Wed, 21 Mar 2018 16:40:28 +0000 Subject: [PATCH 04/33] Update for changes to jobqueue.Connect(). --- cmd/lsf.go | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/cmd/lsf.go b/cmd/lsf.go index f52f12619..3c2fcd6a9 100644 --- a/cmd/lsf.go +++ b/cmd/lsf.go @@ -1,4 +1,4 @@ -// Copyright © 2017 Genome Research Limited +// Copyright © 2018 Genome Research Limited // Author: Sendu Bala . // // This file is part of wr. 
@@ -22,15 +22,16 @@ import ( "bufio" "encoding/json" "fmt" - "github.com/VertebrateResequencing/wr/jobqueue" - jqs "github.com/VertebrateResequencing/wr/jobqueue/scheduler" - "github.com/spf13/cobra" "os" "path/filepath" "regexp" "strconv" "strings" "time" + + "github.com/VertebrateResequencing/wr/jobqueue" + jqs "github.com/VertebrateResequencing/wr/jobqueue/scheduler" + "github.com/spf13/cobra" ) // options for this cmd @@ -153,7 +154,7 @@ var lsfBsubCmd = &cobra.Command{ } // connect to the server - jq, err := jobqueue.Connect(addr, "cmds", 10*time.Second) + jq, err := jobqueue.Connect(addr, 10*time.Second) if err != nil { die(err.Error()) } From c17b7ee6e18a60b29d34ca130dcbdbce44fdbfd4 Mon Sep 17 00:00:00 2001 From: Sendu Bala Date: Wed, 21 Mar 2018 16:41:14 +0000 Subject: [PATCH 05/33] Correct parent dir in bsub mode to be absolute; set LSF_* env vars. --- jobqueue/client.go | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/jobqueue/client.go b/jobqueue/client.go index 3a86a2d9e..a910614e0 100644 --- a/jobqueue/client.go +++ b/jobqueue/client.go @@ -416,7 +416,16 @@ func (c *Client) Execute(job *Job, shell string) error { host = "localhost" } if job.BsubMode != "" { - parent := filepath.Dir(cmd.Dir) + jobCwd := job.Cwd + if jobCwd == "" { + jobCwd = "." + } + absJobCwd, err := filepath.Abs(jobCwd) + if err != nil { + c.Bury(job, nil, FailReasonCwd) + return fmt.Errorf("failed to make cmd dir absolute: %s", err) + } + parent := filepath.Dir(absJobCwd) // create bsub and bjobs symlinks in a sister dir of job.Cwd prependPath = filepath.Join(parent, lsfEmulationDir) @@ -427,19 +436,19 @@ func (c *Client) Execute(job *Job, shell string) error { } wr, err := os.Executable() if err != nil { - c.Bury(job, nil, fmt.Sprintf("could not get path to wr: %s", err)) + c.Bury(job, nil, FailReasonCwd) return fmt.Errorf("could not get path to wr: %s", err) } bsub := filepath.Join(prependPath, "bsub") bjobs := filepath.Join(prependPath, "bjobs") err = os.Symlink(wr, bsub) if err != nil && !os.IsExist(err) { - c.Bury(job, nil, fmt.Sprintf("could not create bsub symlink: %s", err)) + c.Bury(job, nil, FailReasonCwd) return fmt.Errorf("could not create bsub symlink: %s", err) } err = os.Symlink(wr, bjobs) if err != nil && !os.IsExist(err) { - c.Bury(job, nil, fmt.Sprintf("could not create bjobs symlink: %s", err)) + c.Bury(job, nil, FailReasonCwd) return fmt.Errorf("could not create bjobs symlink: %s", err) } @@ -576,6 +585,10 @@ func (c *Client) Execute(job *Job, shell string) error { } env = envOverride(env, []string{ "WR_BSUB_CONFIG=" + string(jobJSON), + "LSF_SERVERDIR=/dev/null", + "LSF_LIBDIR=/dev/null", + "LSF_ENVDIR=/dev/null", + "LSF_BINDIR=" + prependPath, }) } cmd.Env = env From e6b165bdc0829a54c1211e90a3d94441044e61f2 Mon Sep 17 00:00:00 2001 From: Sendu Bala Date: Thu, 3 May 2018 09:27:37 +0100 Subject: [PATCH 06/33] Fix #35: memory and cpu usage now correct for cmds running docker. wr add gets new --monitor_docker arg, REST API takes monitor_docker arg for POST and status, REST and web interface show the monitor_docker option. Monitoring docker also kills the docker if cmd is killed. Commands are only killed for using over 90% physical memory, not just for going over expected memory. Moved random string generation code from scheduler to new internal.utils method RandomString(). 
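As a rough guide to the new helper, the following minimal sketch shows how the
internal.DockerStats function added in this patch could be called on its own.
It is only an illustration: the client setup mirrors what jobqueue/client.go
now does, and "mycontainer" is just a placeholder container name/ID.

    package main

    import (
        "fmt"
        "log"

        "github.com/VertebrateResequencing/wr/internal"
        docker "github.com/docker/docker/client"
    )

    func main() {
        // talk to the local docker daemon, configured via the usual DOCKER_* env vars
        cli, err := docker.NewClientWithOpts(docker.FromEnv)
        if err != nil {
            log.Fatal(err)
        }

        // "mycontainer" is a placeholder; DockerStats reports the container's
        // current RSS in MB and its total CPU usage in seconds
        memMB, cpuSec, err := internal.DockerStats(cli, "mycontainer")
        if err != nil {
            log.Fatal(err)
        }

        fmt.Printf("container is using %dMB RAM and has used %ds of CPU time\n", memMB, cpuSec)
    }

From the command line the same monitoring is requested with, for example,
'wr add --monitor_docker mycontainer', or via the REST API's monitor_docker
parameter.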
--- cmd/add.go | 20 ++- cmd/status.go | 10 +- glide.lock | 60 ++++++- glide.yaml | 4 + internal/docker.go | 50 ++++++ internal/utils.go | 28 +++ jobqueue/client.go | 164 ++++++++++++++++-- jobqueue/job.go | 15 ++ jobqueue/jobqueue_test.go | 155 +++++++++++++++-- jobqueue/scheduler/scheduler.go | 23 +-- jobqueue/serverCLI.go | 69 ++++---- jobqueue/serverREST.go | 88 +++++----- jobqueue/serverWebI.go | 24 +-- jobqueue/static.go | 298 ++++++++++++++++---------------- static/status.html | 6 + 15 files changed, 730 insertions(+), 284 deletions(-) create mode 100644 internal/docker.go diff --git a/cmd/add.go b/cmd/add.go index d55a9f86b..960ccc420 100644 --- a/cmd/add.go +++ b/cmd/add.go @@ -59,6 +59,7 @@ var cmdOsRAM int var cmdPostCreationScript string var cmdCloudConfigs string var cmdFlavor string +var cmdMonitorDocker string // addCmd represents the add command var addCmd = &cobra.Command{ @@ -74,8 +75,8 @@ command as one of the name:value pairs. The possible options are: cmd cwd cwd_matters change_home on_failure on_success on_exit mounts req_grp memory time override cpus disk priority retries rep_grp dep_grps deps cmd_deps -cloud_os cloud_username cloud_ram cloud_script cloud_config_files cloud_flavor -env +monitor_docker cloud_os cloud_username cloud_ram cloud_script cloud_config_files +cloud_flavor env If any of these will be the same for all your commands, you can instead specify them as flags (which are treated as defaults in the case that they are @@ -221,6 +222,19 @@ name:value pairs (if cwd doesn't matter for a cmd, provide it as an empty string). These are static dependencies; once resolved they do not get re- evaluated. +"monitor_docker" turns on monitoring of a docker container identified by the +given string, which could be the container's --name or path to its --cidfile. +This will add the container's peak RAM and total CPU usage to the reported RAM +and CPU usage of this job. If the special argument "?" is supplied, monitoring +will apply to the first new docker container that appears after the command +starts to run. NB: in ? mode, if multiple jobs that run docker containers start +running at the same time on the same machine, the reported stats could be wrong +for one or more of those jobs. Requires that docker is installed on the machine +where the job will run (and that the command uses docker to run a container). +NB: does not handle monitoring of multiple docker containers run by a single +command. A side effect of monitoring a container is that if you use wr to kill +the job for this command, wr will also kill the container. + The "cloud_*" related options let you override the defaults of your cloud deployment. 
For example, if you do 'wr cloud deploy --os "Ubuntu 16" --os_ram 2048 -u ubuntu -s ~/my_ubuntu_post_creation_script.sh', any commands you add @@ -301,6 +315,7 @@ func init() { addCmd.Flags().IntVarP(&cmdRet, "retries", "r", 3, "[0-255] number of automatic retries for failed commands") addCmd.Flags().StringVar(&cmdCmdDeps, "cmd_deps", "", "dependencies of your commands, in the form \"command1,cwd1,command2,cwd2...\"") addCmd.Flags().StringVarP(&cmdGroupDeps, "deps", "d", "", "dependencies of your commands, in the form \"dep_grp1,dep_grp2...\"") + addCmd.Flags().StringVar(&cmdMonitorDocker, "monitor_docker", "", "monitor resource usage of docker container with given --name or --cidfile path") addCmd.Flags().StringVar(&cmdOnFailure, "on_failure", "", "behaviours to carry out when cmds fails, in JSON format") addCmd.Flags().StringVar(&cmdOnSuccess, "on_success", "", "behaviours to carry out when cmds succeed, in JSON format") addCmd.Flags().StringVar(&cmdOnExit, "on_exit", `[{"cleanup":true}]`, "behaviours to carry out when cmds finish running, in JSON format") @@ -367,6 +382,7 @@ func parseCmdFile(jq *jobqueue.Client) ([]*jobqueue.Job, bool, bool) { Priority: cmdPri, Retries: cmdRet, Env: cmdEnv, + MonitorDocker: cmdMonitorDocker, CloudOS: cmdOsPrefix, CloudUser: cmdOsUsername, CloudScript: cmdPostCreationScript, diff --git a/cmd/status.go b/cmd/status.go index b25ebd454..5dcab2668 100644 --- a/cmd/status.go +++ b/cmd/status.go @@ -123,6 +123,14 @@ very many (tens of thousands+) commands.`, homeChanged = "Changed home: true\n" } } + var dockerMonitored string + if job.MonitorDocker != "" { + dockerID := job.MonitorDocker + if dockerID == "?" { + dockerID += " (first container started after cmd)" + } + dockerMonitored = fmt.Sprintf("Docker container monitoring turned on for: %s\n", dockerID) + } var behaviours string if len(job.Behaviours) > 0 { behaviours = fmt.Sprintf("Behaviours: %s\n", job.Behaviours) @@ -135,7 +143,7 @@ very many (tens of thousands+) commands.`, } other = fmt.Sprintf("Resource requirements: %s\n", strings.Join(others, ", ")) } - fmt.Printf("\n# %s\nCwd: %s\n%s%s%s%sId: %s; Requirements group: %s; Priority: %d; Attempts: %d\nExpected requirements: { memory: %dMB; time: %s; cpus: %d disk: %dGB }\n", job.Cmd, cwd, mounts, homeChanged, behaviours, other, job.RepGroup, job.ReqGroup, job.Priority, job.Attempts, job.Requirements.RAM, job.Requirements.Time, job.Requirements.Cores, job.Requirements.Disk) + fmt.Printf("\n# %s\nCwd: %s\n%s%s%s%s%sId: %s; Requirements group: %s; Priority: %d; Attempts: %d\nExpected requirements: { memory: %dMB; time: %s; cpus: %d disk: %dGB }\n", job.Cmd, cwd, mounts, homeChanged, dockerMonitored, behaviours, other, job.RepGroup, job.ReqGroup, job.Priority, job.Attempts, job.Requirements.RAM, job.Requirements.Time, job.Requirements.Cores, job.Requirements.Disk) switch job.State { case jobqueue.JobStateDelayed: diff --git a/glide.lock b/glide.lock index e94a2f27a..e32403c0a 100644 --- a/glide.lock +++ b/glide.lock @@ -1,5 +1,5 @@ -hash: e2203d54798631b547a000a0aaebf4722f85fed3f9074270d6182ab2c6390e4e -updated: 2018-04-26T17:01:38.374633759+01:00 +hash: b5179681947c4f7148802f6b580d9e0914b9d18a767f8d3067851dcb8634334e +updated: 2018-05-03T09:22:29.966469534+01:00 imports: - name: code.cloudfoundry.org/bytefmt version: b31f603f5e1e047fdb38584e1a922dcc5c4de5c8 @@ -11,6 +11,39 @@ imports: version: 48ea1b39c25fc1bab3506fbc712ecbaa842c4d2d - name: github.com/dgryski/go-farm version: 2de33835d10275975374b37b2dcfd22c9020a1f5 +- name: 
github.com/docker/distribution + version: 83389a148052d74ac602f5f1d62f86ff2f3c4aa5 + subpackages: + - digestset + - reference +- name: github.com/docker/docker + version: 51a9119f6b817bbae21805ec05787d462c9492cd + subpackages: + - api + - api/types + - api/types/blkiodev + - api/types/container + - api/types/events + - api/types/filters + - api/types/image + - api/types/mount + - api/types/network + - api/types/registry + - api/types/strslice + - api/types/swarm + - api/types/swarm/runtime + - api/types/time + - api/types/versions + - api/types/volume + - client +- name: github.com/docker/go-connections + version: 7395e3f8aa162843a74ed6d48e79627d9792ac55 + subpackages: + - nat + - sockets + - tlsconfig +- name: github.com/docker/go-units + version: 47565b4f722fb6ceae66b95f853feed578a4a51c - name: github.com/fanatic/go-infoblox version: ece4e23a370d0bbac46d17f386f73d0d124a8d97 - name: github.com/fatih/color @@ -18,7 +51,7 @@ imports: - name: github.com/go-ini/ini version: ace140f73450505f33e8b8418216792275ae82a7 - name: github.com/go-mangos/mangos - version: d99fcacf432eddb854ef80587abee023e23c5263 + version: ebfcfe7a6525c954fa08327634a7cd053e65f975 subpackages: - protocol/rep - protocol/req @@ -30,6 +63,10 @@ imports: - oleutil - name: github.com/go-stack/stack version: 259ab82a6cad3992b4e21ff5cac294ccb06474bc +- name: github.com/gogo/protobuf + version: 1ef32a8b9fc3f8ec940126907cedb5998f6318e4 + subpackages: + - proto - name: github.com/gophercloud/gophercloud version: 46e00f63eeda0660d8a47cbfd5592a4d5dbf47b8 repo: https://github.com/sb10/gophercloud.git @@ -52,7 +89,7 @@ imports: - openstack/utils - pagination - name: github.com/gorilla/websocket - version: eb925808374e5ca90c83401a40d711dc08c0c0f6 + version: 21ab95fa12b9bdd8fecf5fa3586aad941cc98785 - name: github.com/grafov/bcast version: e9affb593f6c871f9b4c3ee6a3c77d421fe953df - name: github.com/hanwen/go-fuse @@ -88,6 +125,8 @@ imports: version: efa589957cd060542a26d2dd7832fd6a6c6c3ade - name: github.com/mattn/go-isatty version: 6ca4dbf54d38eea1a992b3c722a76a5d1c4cb25c +- name: github.com/Microsoft/go-winio + version: ab35fc04b6365e8fcb18e6e9e41ea4a02b10b175 - name: github.com/minio/minio-go version: 3d2d02921f0510e9d1f66ef77a265b8dddd36992 subpackages: @@ -98,6 +137,13 @@ imports: - pkg/set - name: github.com/mitchellh/go-homedir version: b8bc1bf767474819792c23f32d8286a45736f1c6 +- name: github.com/opencontainers/go-digest + version: c9281466c8b2f606084ac71339773efd177436e7 +- name: github.com/opencontainers/image-spec + version: e562b04403929d582d449ae5386ff79dd7961a11 + subpackages: + - specs-go + - specs-go/v1 - name: github.com/pkg/errors version: 816c9085562cd7ee03e7f8188a1cfd942858cded - name: github.com/pkg/sftp @@ -113,7 +159,7 @@ imports: - name: github.com/sevlyar/go-daemon version: 01bb5caedcc4ad839213b4ff96c363f6fab40adb - name: github.com/shirou/gopsutil - version: cd915bdc31582b0a56405ede7fa2f4ab043f851b + version: 57f370e13068146efe1cb7129f79e5d51da8a242 subpackages: - cpu - host @@ -151,8 +197,12 @@ imports: - name: golang.org/x/net version: 61147c48b25b599e5b561d2e9c4f3e1ef489ca41 subpackages: + - context + - context/ctxhttp - idna + - internal/socks - lex/httplex + - proxy - publicsuffix - name: golang.org/x/sys version: 3b87a42e500a6dc65dae1a55d0b641295971163e diff --git a/glide.yaml b/glide.yaml index 76e2732ee..bddc5a8b2 100644 --- a/glide.yaml +++ b/glide.yaml @@ -64,6 +64,10 @@ import: version: ^1.6.0 - package: github.com/hashicorp/go-multierror - package: github.com/fanatic/go-infoblox +- package: 
github.com/docker/docker + subpackages: + - client + - api/types testImport: - package: github.com/smartystreets/goconvey version: master diff --git a/internal/docker.go b/internal/docker.go new file mode 100644 index 000000000..466588360 --- /dev/null +++ b/internal/docker.go @@ -0,0 +1,50 @@ +// Copyright © 2018 Genome Research Limited +// Author: Sendu Bala . +// +// This file is part of wr. +// +// wr is free software: you can redistribute it and/or modify +// it under the terms of the GNU Lesser General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// wr is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with wr. If not, see . + +package internal + +// this file has functions for dealing with docker + +import ( + "context" + "encoding/json" + + "github.com/docker/docker/api/types" + docker "github.com/docker/docker/client" +) + +// DockerStats asks docker for the current memory usage (RSS) and total CPU +// usage of the given container. +func DockerStats(dockerClient *docker.Client, containerID string) (memMB int, cpuSec int, err error) { + stats, err := dockerClient.ContainerStats(context.Background(), containerID, false) + if err != nil { + return 0, 0, err + } + + var ds *types.Stats + err = json.NewDecoder(stats.Body).Decode(&ds) + if err != nil { + return 0, 0, err + } + + memMB = int(ds.MemoryStats.Stats["rss"] / 1024 / 1024) // bytes to MB + cpuSec = int(ds.CPUStats.CPUUsage.TotalUsage / 1000000000) // nanoseconds to seconds + + err = stats.Body.Close() + return memMB, cpuSec, err +} diff --git a/internal/utils.go b/internal/utils.go index dbe7846ee..c280038b1 100644 --- a/internal/utils.go +++ b/internal/utils.go @@ -27,6 +27,7 @@ import ( "io" "io/ioutil" "log" + "math/rand" "net/url" "os" "os/exec" @@ -44,6 +45,14 @@ import ( const gb = uint64(1.07374182e9) // for byte to GB conversion +// for the RandomString implementation +const ( + randBytes = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" + randIdxBits = 6 // 6 bits to represent a rand index + randIdxMask = 1<= 0; { + if remain == 0 { + cache, remain = src.Int63(), randIdxMax + } + if idx := int(cache & randIdxMask); idx < len(randBytes) { + b[i] = randBytes[idx] + i-- + } + cache >>= randIdxBits + remain-- + } + return string(b) +} diff --git a/jobqueue/client.go b/jobqueue/client.go index e7fdde6c7..90cbf00df 100644 --- a/jobqueue/client.go +++ b/jobqueue/client.go @@ -22,6 +22,7 @@ package jobqueue import ( "bytes" + "context" "crypto/tls" "crypto/x509" "fmt" @@ -29,12 +30,16 @@ import ( "os" "os/exec" "os/signal" + "path/filepath" "runtime" "strings" "sync" "syscall" "time" + "github.com/VertebrateResequencing/wr/internal" + "github.com/docker/docker/api/types" + docker "github.com/docker/docker/client" "github.com/go-mangos/mangos" "github.com/go-mangos/mangos/protocol/req" "github.com/go-mangos/mangos/transport/tlstcp" @@ -53,6 +58,7 @@ const ( FailReasonExit = "command exited non-zero" FailReasonRAM = "command used too much RAM" FailReasonTime = "command used too much time" + FailReasonDocker = "could not interact with docker" FailReasonAbnormal = "command failed to complete normally" FailReasonLost = "lost contact with 
runner" FailReasonSignal = "runner received a signal to stop" @@ -68,6 +74,7 @@ const ( var ( ClientTouchInterval = 15 * time.Second ClientReleaseDelay = 30 * time.Second + ClientPercentMemoryKill = 90 RAMIncreaseMin float64 = 1000 RAMIncreaseMultLow = 2.0 RAMIncreaseMultHigh = 1.3 @@ -498,6 +505,42 @@ func (c *Client) Execute(job *Job, shell string) error { } cmd.Env = env + // if docker monitoring has been requested, try and get the docker client + // now and fail early if we can't + var dockerClient *docker.Client + existingDockerContainers := make(map[string]bool) + var monitorDocker, getFirstDockerContainer bool + if job.MonitorDocker != "" { + monitorDocker = true + dockerClient, err = docker.NewClientWithOpts(docker.FromEnv) + if err != nil { + buryErr := fmt.Errorf("failed to create docker client: %s", err) + errb := c.Bury(job, nil, FailReasonDocker, buryErr) + if errb != nil { + buryErr = fmt.Errorf("%s (and burying the job failed: %s)", buryErr.Error(), errb) + } + return buryErr + } + + // if we've been asked to monitor the first container that appears, note + // existing containers + if job.MonitorDocker == "?" { + getFirstDockerContainer = true + containers, errc := dockerClient.ContainerList(context.Background(), types.ContainerListOptions{}) + if errc != nil { + buryErr := fmt.Errorf("failed to get docker containers: %s", errc) + errb := c.Bury(job, nil, FailReasonDocker, buryErr) + if errb != nil { + buryErr = fmt.Errorf("%s (and burying the job failed: %s)", buryErr.Error(), errb) + } + return buryErr + } + for _, container := range containers { + existingDockerContainers[container.ID] = true + } + } + } + // intercept certain signals (under LSF and SGE, SIGUSR2 may mean out-of- // time, but there's no reliable way of knowing out-of-memory, so we will // just treat them all the same) @@ -546,6 +589,7 @@ func (c *Client) Execute(job *Job, shell string) error { // update peak mem used by command, touch job and check if we use too much // resources, every 15s. 
Also check for signals peakmem := 0 + dockerCPU := 0 ticker := time.NewTicker(ClientTouchInterval) //*** this should be less than the ServerItemTTR set when the server started, not a fixed value memTicker := time.NewTicker(1 * time.Second) // we need to check on memory usage frequently ranoutMem := false @@ -557,10 +601,29 @@ func (c *Client) Execute(job *Job, shell string) error { var stateMutex sync.Mutex stopChecking := make(chan bool, 1) go func() { + var dockerContainerID string + + killCmd := func() error { + errk := cmd.Process.Kill() + + if dockerContainerID != "" { + // kill the docker container as well + errd := dockerClient.ContainerKill(context.Background(), dockerContainerID, "SIGKILL") + if errk == nil { + errk = errd + } else { + errk = fmt.Errorf("%s, and killing the docker container failed: %s", errk.Error(), errd.Error()) + } + } + + return errk + } + + CHECKING: for { select { case <-sigs: - killErr = cmd.Process.Kill() + killErr = killCmd() stateMutex.Lock() signalled = true stateMutex.Unlock() @@ -572,7 +635,7 @@ func (c *Client) Execute(job *Job, shell string) error { if errc != nil { closeErr = errc } - return + break CHECKING case <-ticker.C: stateMutex.Lock() if !ranoutTime && time.Now().After(endT) { @@ -585,7 +648,7 @@ func (c *Client) Execute(job *Job, shell string) error { kc, errf := c.Touch(job) if kc { - killErr = cmd.Process.Kill() + killErr = killCmd() stateMutex.Lock() killCalled = true stateMutex.Unlock() @@ -597,7 +660,7 @@ func (c *Client) Execute(job *Job, shell string) error { if errc != nil { closeErr = errc } - return + break CHECKING } if errf != nil { // we may have lost contact with the manager; this is OK. We @@ -606,22 +669,92 @@ func (c *Client) Execute(job *Job, shell string) error { } case <-memTicker.C: mem, errf := currentMemory(job.Pid) + + var cpuS int + if monitorDocker { + if dockerContainerID == "" { + if getFirstDockerContainer { + // look for a new container + containers, errc := dockerClient.ContainerList(context.Background(), types.ContainerListOptions{}) + if errc == nil { + for _, container := range containers { + if _, exists := existingDockerContainers[container.ID]; !exists { + dockerContainerID = container.ID + break + } + } + } + } else { + // job.MonitorDocker might be a file path + cidPath := job.MonitorDocker + if !strings.HasPrefix(cidPath, "/") { + cidPath = filepath.Join(cmd.Dir, cidPath) + } + _, errs := os.Stat(cidPath) + if errs == nil { + b, errr := ioutil.ReadFile(cidPath) + if errr == nil { + dockerContainerID = strings.TrimSuffix(string(b), "\n") + } + } + + // or might be a name; check names of all new + // containers + if dockerContainerID == "" { + containers, errc := dockerClient.ContainerList(context.Background(), types.ContainerListOptions{}) + if errc == nil { + CONTAINERS: + for _, container := range containers { + if _, exists := existingDockerContainers[container.ID]; !exists { + for _, name := range container.Names { + name = strings.TrimPrefix(name, "/") + if name == job.MonitorDocker { + dockerContainerID = container.ID + break CONTAINERS + } + } + } + } + } + } + } + } + + if dockerContainerID != "" { + dockerMem, thisDockerCPU, errs := internal.DockerStats(dockerClient, dockerContainerID) + if errs == nil { + if dockerMem > mem { + mem = dockerMem + } + cpuS = thisDockerCPU + } + } + } + stateMutex.Lock() if errf == nil && mem > peakmem { peakmem = mem if peakmem > job.Requirements.RAM { - // we don't allow things to use too much memory, or we - // could screw up the machine we're running on - 
killErr = cmd.Process.Kill() - ranoutMem = true - stateMutex.Unlock() - return + maxRAM, errp := internal.ProcMeminfoMBs() + + // we don't allow cmds to use both more than exepected + // and more than 90% of phsical memory, or we could + // screw up the machine we're running on + if errp == nil && peakmem >= ((maxRAM/100)*ClientPercentMemoryKill) { + killErr = killCmd() + ranoutMem = true + stateMutex.Unlock() + break CHECKING + } } } + if cpuS > dockerCPU { + dockerCPU = cpuS + } stateMutex.Unlock() case <-stopChecking: - return + break CHECKING } } }() @@ -814,6 +947,13 @@ func (c *Client) Execute(job *Job, shell string) error { finalStdErr = append(finalStdErr, errsew.Error()...) } + // *** following is useful when debugging; need a better way to see these + // errors from runner clients... + // if myerr != nil { + // finalStdErr = append(finalStdErr, "\n\nExecution errors:\n"...) + // finalStdErr = append(finalStdErr, myerr.Error()...) + // } + finalStdOut := bytes.TrimSpace(stdout.Bytes()) if errsow != nil { finalStdOut = append(finalStdOut, "\n\nSTDOUT handling problems:\n"...) @@ -831,7 +971,7 @@ func (c *Client) Execute(job *Job, shell string) error { Cwd: actualCwd, Exitcode: exitcode, PeakRAM: peakmem, - CPUtime: cmd.ProcessState.SystemTime(), + CPUtime: cmd.ProcessState.SystemTime() + time.Duration(dockerCPU)*time.Second, Stdout: finalStdOut, Stderr: finalStdErr, Exited: true, diff --git a/jobqueue/job.go b/jobqueue/job.go index a447ba818..1615ef61b 100644 --- a/jobqueue/job.go +++ b/jobqueue/job.go @@ -168,6 +168,21 @@ type Job struct { // ActualCwd. MountConfigs MountConfigs + // MonitorDocker turns on monitoring of a docker container identified by its + // --name or path to its --cidfile, adding its peak RAM and CPU usage to the + // reported RAM and CPU usage of this job. + // + // If the special argument "?" is supplied, monitoring will apply to the + // first new docker container that appears after the Cmd starts to run. + // NB: if multiple jobs that run docker containers start running at the same + // time on the same machine, the reported stats could be wrong for one or + // more of those jobs. + // + // Requires that docker is installed on the machine where the job will run + // (and that the Cmd uses docker to run a container). NB: does not handle + // monitoring of multiple docker containers run by a single Cmd. + MonitorDocker string + // The remaining properties are used to record information about what // happened when Cmd was executed, or otherwise provide its current state. // It is meaningless to set these yourself. diff --git a/jobqueue/jobqueue_test.go b/jobqueue/jobqueue_test.go index f773c1a2d..cc23d4201 100644 --- a/jobqueue/jobqueue_test.go +++ b/jobqueue/jobqueue_test.go @@ -987,10 +987,10 @@ func TestJobqueue(t *testing.T) { // and permission problems on the exe? 
}) - Convey("If a job uses too much memory it is killed and we recommend more next time", func() { + Convey("If a job uses more memory than expected it is not killed, but we recommend more next time", func() { jobs = nil cmd := "perl -e '@a; for (1..3) { push(@a, q[a] x 50000000); sleep(1) }'" - jobs = append(jobs, &Job{Cmd: cmd, Cwd: "/tmp", ReqGroup: "fake_group", Requirements: standardReqs, Retries: uint8(3), RepGroup: "run_out_of_mem"}) + jobs = append(jobs, &Job{Cmd: cmd, Cwd: "/tmp", ReqGroup: "highmem", Requirements: standardReqs, Retries: uint8(0), RepGroup: "too_much_mem"}) RecMBRound = 1 inserts, already, err := jq.Add(jobs, envVars, true) So(err, ShouldBeNil) @@ -1001,20 +1001,73 @@ func TestJobqueue(t *testing.T) { So(err, ShouldBeNil) So(job.Cmd, ShouldEqual, cmd) So(job.State, ShouldEqual, JobStateReserved) + So(job.Requirements.RAM, ShouldEqual, 10) err = jq.Execute(job, config.RunnerExecShell) - So(err, ShouldNotBeNil) - jqerr, ok := err.(Error) - So(ok, ShouldBeTrue) - So(jqerr.Err, ShouldEqual, FailReasonRAM) - So(job.State, ShouldEqual, JobStateDelayed) + So(err, ShouldBeNil) + So(job.State, ShouldEqual, JobStateComplete) So(job.Exited, ShouldBeTrue) - So(job.Exitcode, ShouldEqual, -1) - So(job.FailReason, ShouldEqual, FailReasonRAM) - So(job.Requirements.RAM, ShouldEqual, 1200) - jq.Delete([]*JobEssence{{Cmd: cmd}}) + So(job.Exitcode, ShouldEqual, 0) + + cmd2 := "echo another high mem job" + jobs = append(jobs, &Job{Cmd: cmd2, Cwd: "/tmp", ReqGroup: "highmem", Requirements: standardReqs, Retries: uint8(0), RepGroup: "too_much_mem"}) + inserts, already, err = jq.Add(jobs, envVars, true) + So(err, ShouldBeNil) + So(inserts, ShouldEqual, 1) + So(already, ShouldEqual, 1) + + job, err = jq.Reserve(50 * time.Millisecond) + So(err, ShouldBeNil) + So(job.Cmd, ShouldEqual, cmd2) + So(job.State, ShouldEqual, JobStateReserved) + So(job.Requirements.RAM, ShouldBeGreaterThanOrEqualTo, 100) + jq.Release(job, &JobEndState{}, "") + + deleted, errd := jq.Delete([]*JobEssence{{Cmd: cmd}}) + So(errd, ShouldBeNil) + So(deleted, ShouldEqual, 0) + deleted, errd = jq.Delete([]*JobEssence{{Cmd: cmd2}}) + So(errd, ShouldBeNil) + So(deleted, ShouldEqual, 1) }) + maxRAM, errp := internal.ProcMeminfoMBs() + if errp == nil && maxRAM > 80000 { // authors high memory system + Convey("If a job uses close to all memory on machine it is killed and we recommend more next time", func() { + jobs = nil + cmd := "perl -e '@a; for (1..1000) { push(@a, q[a] x 80000000000) }'" + jobs = append(jobs, &Job{Cmd: cmd, Cwd: "/tmp", ReqGroup: "fake_group", Requirements: standardReqs, Retries: uint8(3), RepGroup: "run_out_of_mem"}) + RecMBRound = 1 + inserts, already, err := jq.Add(jobs, envVars, true) + So(err, ShouldBeNil) + So(inserts, ShouldEqual, 1) + So(already, ShouldEqual, 0) + + job, err := jq.Reserve(50 * time.Millisecond) + So(err, ShouldBeNil) + So(job.Cmd, ShouldEqual, cmd) + So(job.State, ShouldEqual, JobStateReserved) + + ClientPercentMemoryKill = 10 + err = jq.Execute(job, config.RunnerExecShell) + ClientPercentMemoryKill = 90 + So(err, ShouldNotBeNil) + jqerr, ok := err.(Error) + So(ok, ShouldBeTrue) + So(jqerr.Err, ShouldEqual, FailReasonRAM) + So(job.State, ShouldEqual, JobStateDelayed) + So(job.Exited, ShouldBeTrue) + So(job.Exitcode, ShouldEqual, -1) + So(job.FailReason, ShouldEqual, FailReasonRAM) + So(job.Requirements.RAM, ShouldBeGreaterThanOrEqualTo, 10000) + deleted, errd := jq.Delete([]*JobEssence{{Cmd: cmd}}) + So(errd, ShouldBeNil) + So(deleted, ShouldEqual, 1) + }) + } else { + 
SkipConvey("Skipping test that uses most of machine memory", func() {}) + } + RecMBRound = 100 // revert back to normal Convey("The stdout/err of jobs is only kept for failed jobs, and cwd&TMPDIR&HOME get set appropriately", func() { @@ -3334,6 +3387,15 @@ func TestJobqueueWithOpenStack(t *testing.T) { Logger: testLogger, } + dockerInstallScript := `sudo mkdir -p /etc/docker/ +sudo bash -c "echo '{ \"bip\": \"192.168.3.3/24\", \"dns\": [\"8.8.8.8\",\"8.8.4.4\"], \"mtu\": 1380 }' > /etc/docker/daemon.json" +sudo apt-get -y install --no-install-recommends apt-transport-https ca-certificates curl software-properties-common +curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add - +sudo add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" +sudo apt-get -yq update +sudo apt-get -y install docker-ce +sudo usermod -aG docker ` + osUser + Convey("You can connect with an OpenStack scheduler", t, func() { server, _, token, errs = Serve(osConfig) So(errs, ShouldBeNil) @@ -3343,6 +3405,77 @@ func TestJobqueueWithOpenStack(t *testing.T) { So(err, ShouldBeNil) defer jq.Disconnect() + Convey("You can run cmds that start docker containers and get correct memory and cpu usage", func() { + var jobs []*Job + other := make(map[string]string) + other["cloud_script"] = dockerInstallScript + + jobs = append(jobs, &Job{Cmd: "docker run sendu/usememory:v1", Cwd: "/tmp", ReqGroup: "docker", Requirements: &jqs.Requirements{RAM: 3, Time: 5 * time.Second, Cores: 1, Other: other}, Override: uint8(2), Retries: uint8(0), RepGroup: "first_docker", MonitorDocker: "?"}) + + dockerName := "jobqueue_test." + internal.RandomString() + jobs = append(jobs, &Job{Cmd: "docker run --name " + dockerName + " sendu/usememory:v1", Cwd: "/tmp", ReqGroup: "docker", Requirements: &jqs.Requirements{RAM: 3, Time: 5 * time.Second, Cores: 1, Other: other}, Override: uint8(2), Retries: uint8(0), RepGroup: "named_docker", MonitorDocker: dockerName}) + + dockerCidFile := "jobqueue_test.cidfile" + jobs = append(jobs, &Job{Cmd: "docker run --cidfile " + dockerCidFile + " sendu/usecpu:v1 && rm " + dockerCidFile, Cwd: "/tmp", ReqGroup: "docker2", Requirements: &jqs.Requirements{RAM: 1, Time: 5 * time.Second, Cores: 2, Other: other}, Override: uint8(2), Retries: uint8(0), RepGroup: "cidfile_docker", MonitorDocker: dockerCidFile}) + + inserts, already, err := jq.Add(jobs, envVars, true) + So(err, ShouldBeNil) + So(inserts, ShouldEqual, 3) + So(already, ShouldEqual, 0) + + // wait for the jobs to get run + done := make(chan bool, 1) + go func() { + limit := time.After(180 * time.Second) + ticker := time.NewTicker(1 * time.Second) + for { + select { + case <-ticker.C: + if !server.HasRunners() { + got, errg := jq.GetIncomplete(0, "", false, false) + if errg != nil { + fmt.Printf("GetIncomplete failed: %s\n", errg) + } + if len(got) == 0 { + ticker.Stop() + done <- true + return + } + } + continue + case <-limit: + ticker.Stop() + done <- false + return + } + } + }() + So(<-done, ShouldBeTrue) + + expectedRAM := 2000 + got, err := jq.GetByRepGroup("first_docker", 0, JobStateComplete, false, false) + So(err, ShouldBeNil) + So(len(got), ShouldEqual, 1) + So(got[0].PeakRAM, ShouldBeGreaterThanOrEqualTo, expectedRAM) + So(got[0].WallTime(), ShouldBeBetweenOrEqual, 5*time.Second, 15*time.Second) + So(got[0].CPUtime, ShouldBeLessThan, 4*time.Second) + + got, err = jq.GetByRepGroup("named_docker", 0, JobStateComplete, false, false) + So(err, ShouldBeNil) + So(len(got), ShouldEqual, 1) + 
So(got[0].PeakRAM, ShouldBeGreaterThanOrEqualTo, expectedRAM) + + got, err = jq.GetByRepGroup("cidfile_docker", 0, JobStateComplete, false, false) + So(err, ShouldBeNil) + So(len(got), ShouldEqual, 1) + So(got[0].PeakRAM, ShouldBeLessThan, 100) + So(got[0].WallTime(), ShouldBeBetweenOrEqual, 5*time.Second, 15*time.Second) + So(got[0].CPUtime, ShouldBeGreaterThan, 5*time.Second) + + // *** want to test that when we kill a running job, its docker + // is also immediately killed... + }) + Convey("You can run a cmd with a per-cmd set of config files", func() { // create a config file locally localConfigPath := filepath.Join(runnertmpdir, "test.config") diff --git a/jobqueue/scheduler/scheduler.go b/jobqueue/scheduler/scheduler.go index f0a4a09b7..6a64b474d 100644 --- a/jobqueue/scheduler/scheduler.go +++ b/jobqueue/scheduler/scheduler.go @@ -42,7 +42,6 @@ package scheduler import ( "crypto/md5" // #nosec - not used for cryptographic purposes here "fmt" - "math/rand" "sort" "sync" "time" @@ -54,11 +53,7 @@ import ( ) const ( - randBytes = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" - randIdxBits = 6 // 6 bits to represent a rand index - randIdxMask = 1<= 0; { - if remain == 0 { - cache, remain = src.Int63(), randIdxMax - } - if idx := int(cache & randIdxMask); idx < len(randBytes) { - b[i] = randBytes[idx] - i-- - } - cache >>= randIdxBits - remain-- - } - name += "_" + string(b) + name += "_" + internal.RandomString() } return name diff --git a/jobqueue/serverCLI.go b/jobqueue/serverCLI.go index f76dc82ac..f49ed22e8 100644 --- a/jobqueue/serverCLI.go +++ b/jobqueue/serverCLI.go @@ -463,8 +463,11 @@ func (s *Server) handleRequest(m *mangos.Message) error { removedJobs := false for _, jobkey := range keys { item, err := s.q.Get(jobkey) + if err != nil || item == nil { + continue + } iState := item.Stats().State - if err != nil || iState == queue.ItemStateRun { + if iState == queue.ItemStateRun { continue } @@ -478,7 +481,6 @@ func (s *Server) handleRequest(m *mangos.Message) error { } continue } - err = s.q.Remove(jobkey) if err == nil { deleted++ @@ -670,37 +672,38 @@ func (s *Server) itemToJob(item *queue.Item, getStd bool, getEnv bool) *Job { req := &scheduler.Requirements{} *req = *sjob.Requirements // copy reqs since server changes these, avoiding a race condition job := &Job{ - RepGroup: sjob.RepGroup, - ReqGroup: sjob.ReqGroup, - DepGroups: sjob.DepGroups, - Cmd: sjob.Cmd, - Cwd: sjob.Cwd, - CwdMatters: sjob.CwdMatters, - ChangeHome: sjob.ChangeHome, - ActualCwd: sjob.ActualCwd, - Requirements: req, - Priority: sjob.Priority, - Retries: sjob.Retries, - PeakRAM: sjob.PeakRAM, - Exited: sjob.Exited, - Exitcode: sjob.Exitcode, - FailReason: sjob.FailReason, - StartTime: sjob.StartTime, - EndTime: sjob.EndTime, - Pid: sjob.Pid, - Host: sjob.Host, - HostID: sjob.HostID, - HostIP: sjob.HostIP, - CPUtime: sjob.CPUtime, - State: state, - Attempts: sjob.Attempts, - UntilBuried: sjob.UntilBuried, - ReservedBy: sjob.ReservedBy, - EnvKey: sjob.EnvKey, - EnvOverride: sjob.EnvOverride, - Dependencies: sjob.Dependencies, - Behaviours: sjob.Behaviours, - MountConfigs: sjob.MountConfigs, + RepGroup: sjob.RepGroup, + ReqGroup: sjob.ReqGroup, + DepGroups: sjob.DepGroups, + Cmd: sjob.Cmd, + Cwd: sjob.Cwd, + CwdMatters: sjob.CwdMatters, + ChangeHome: sjob.ChangeHome, + ActualCwd: sjob.ActualCwd, + Requirements: req, + Priority: sjob.Priority, + Retries: sjob.Retries, + PeakRAM: sjob.PeakRAM, + Exited: sjob.Exited, + Exitcode: sjob.Exitcode, + FailReason: sjob.FailReason, + StartTime: 
sjob.StartTime, + EndTime: sjob.EndTime, + Pid: sjob.Pid, + Host: sjob.Host, + HostID: sjob.HostID, + HostIP: sjob.HostIP, + CPUtime: sjob.CPUtime, + State: state, + Attempts: sjob.Attempts, + UntilBuried: sjob.UntilBuried, + ReservedBy: sjob.ReservedBy, + EnvKey: sjob.EnvKey, + EnvOverride: sjob.EnvOverride, + Dependencies: sjob.Dependencies, + Behaviours: sjob.Behaviours, + MountConfigs: sjob.MountConfigs, + MonitorDocker: sjob.MonitorDocker, } if !sjob.StartTime.IsZero() && state == JobStateReserved { diff --git a/jobqueue/serverREST.go b/jobqueue/serverREST.go index 23d2223c0..c96f97ae1 100644 --- a/jobqueue/serverREST.go +++ b/jobqueue/serverREST.go @@ -75,6 +75,7 @@ type JobViaJSON struct { OnSuccess BehavioursViaJSON `json:"on_success"` OnExit BehavioursViaJSON `json:"on_exit"` Env []string `json:"env"` + MonitorDocker string `json:"monitor_docker"` CloudOS string `json:"cloud_os"` CloudUser string `json:"cloud_username"` CloudScript string `json:"cloud_script"` @@ -106,14 +107,15 @@ type JobDefaults struct { DepGroups []string Deps Dependencies // Env is a comma separated list of key=val pairs. - Env string - OnFailure Behaviours - OnSuccess Behaviours - OnExit Behaviours - MountConfigs MountConfigs - CloudOS string - CloudUser string - CloudFlavor string + Env string + OnFailure Behaviours + OnSuccess Behaviours + OnExit Behaviours + MountConfigs MountConfigs + MonitorDocker string + CloudOS string + CloudUser string + CloudFlavor string // CloudScript is the local path to a script. CloudScript string // CloudConfigFiles is the config files to copy in cloud.Server.CopyOver() format @@ -183,7 +185,7 @@ func (jd *JobDefaults) DefaultCloudOSRam() string { // properties of this JobViaJSON. The Job will not be in the queue until passed // to a method that adds jobs to the queue. 
func (jvj *JobViaJSON) Convert(jd *JobDefaults) (*Job, error) { - var cmd, cwd, rg, repg string + var cmd, cwd, rg, repg, monitorDocker string var mb, cpus, disk, override, priority, retries int var dur time.Duration var envOverride []byte @@ -344,6 +346,12 @@ func (jvj *JobViaJSON) Convert(jd *JobDefaults) (*Job, error) { mounts = jd.MountConfigs } + if jvj.MonitorDocker == "" { + monitorDocker = jd.MonitorDocker + } else { + monitorDocker = jvj.MonitorDocker + } + // scheduler-specific options other := make(map[string]string) if jvj.CloudOS != "" { @@ -393,21 +401,22 @@ func (jvj *JobViaJSON) Convert(jd *JobDefaults) (*Job, error) { } return &Job{ - RepGroup: repg, - Cmd: cmd, - Cwd: cwd, - CwdMatters: cwdMatters, - ChangeHome: changeHome, - ReqGroup: rg, - Requirements: &jqs.Requirements{RAM: mb, Time: dur, Cores: cpus, Disk: disk, Other: other}, - Override: uint8(override), - Priority: uint8(priority), - Retries: uint8(retries), - DepGroups: depGroups, - Dependencies: deps, - EnvOverride: envOverride, - Behaviours: behaviours, - MountConfigs: mounts, + RepGroup: repg, + Cmd: cmd, + Cwd: cwd, + CwdMatters: cwdMatters, + ChangeHome: changeHome, + ReqGroup: rg, + Requirements: &jqs.Requirements{RAM: mb, Time: dur, Cores: cpus, Disk: disk, Other: other}, + Override: uint8(override), + Priority: uint8(priority), + Retries: uint8(retries), + DepGroups: depGroups, + Dependencies: deps, + EnvOverride: envOverride, + Behaviours: behaviours, + MountConfigs: mounts, + MonitorDocker: monitorDocker, }, nil } @@ -581,21 +590,22 @@ func restJobsStatus(r *http.Request, s *Server) ([]*Job, int, error) { func restJobsAdd(r *http.Request, s *Server) ([]*Job, int, error) { // handle possible ?query parameters jd := &JobDefaults{ - Cwd: r.Form.Get("cwd"), - RepGrp: r.Form.Get("rep_grp"), - ReqGrp: r.Form.Get("req_grp"), - CPUs: urlStringToInt(r.Form.Get("cpus")), - Disk: urlStringToInt(r.Form.Get("disk")), - Override: urlStringToInt(r.Form.Get("override")), - Priority: urlStringToInt(r.Form.Get("priority")), - Retries: urlStringToInt(r.Form.Get("retries")), - DepGroups: urlStringToSlice(r.Form.Get("dep_grps")), - Env: r.Form.Get("env"), - CloudOS: r.Form.Get("cloud_os"), - CloudUser: r.Form.Get("cloud_username"), - CloudScript: r.Form.Get("cloud_script"), - CloudFlavor: r.Form.Get("cloud_flavor"), - CloudOSRam: urlStringToInt(r.Form.Get("cloud_ram")), + Cwd: r.Form.Get("cwd"), + RepGrp: r.Form.Get("rep_grp"), + ReqGrp: r.Form.Get("req_grp"), + CPUs: urlStringToInt(r.Form.Get("cpus")), + Disk: urlStringToInt(r.Form.Get("disk")), + Override: urlStringToInt(r.Form.Get("override")), + Priority: urlStringToInt(r.Form.Get("priority")), + Retries: urlStringToInt(r.Form.Get("retries")), + DepGroups: urlStringToSlice(r.Form.Get("dep_grps")), + Env: r.Form.Get("env"), + MonitorDocker: r.Form.Get("monitor_docker"), + CloudOS: r.Form.Get("cloud_os"), + CloudUser: r.Form.Get("cloud_username"), + CloudScript: r.Form.Get("cloud_script"), + CloudFlavor: r.Form.Get("cloud_flavor"), + CloudOSRam: urlStringToInt(r.Form.Get("cloud_ram")), } if r.Form.Get("cwd_matters") == restFormTrue { jd.CwdMatters = true diff --git a/jobqueue/serverWebI.go b/jobqueue/serverWebI.go index f883b00c2..8cf1e24cf 100644 --- a/jobqueue/serverWebI.go +++ b/jobqueue/serverWebI.go @@ -64,17 +64,18 @@ type jstatusReq struct { // jstatus is the job info we send to the status webpage (only real difference // to Job is that some of the values are converted to easy-to-display forms). 
type jstatus struct { - Key string - RepGroup string - DepGroups []string - Dependencies []string - Cmd string - State JobState - Cwd string - CwdBase string - HomeChanged bool - Behaviours string - Mounts string + Key string + RepGroup string + DepGroups []string + Dependencies []string + Cmd string + State JobState + Cwd string + CwdBase string + HomeChanged bool + Behaviours string + Mounts string + MonitorDocker string // ExpectedRAM is in Megabytes. ExpectedRAM int // ExpectedTime is in seconds. @@ -474,6 +475,7 @@ func jobToStatus(job *Job) jstatus { HomeChanged: job.ChangeHome, Behaviours: job.Behaviours.String(), Mounts: job.MountConfigs.String(), + MonitorDocker: job.MonitorDocker, ExpectedRAM: job.Requirements.RAM, ExpectedTime: job.Requirements.Time.Seconds(), RequestedDisk: job.Requirements.Disk, diff --git a/jobqueue/static.go b/jobqueue/static.go index 7005f118e..d87705ebf 100644 --- a/jobqueue/static.go +++ b/jobqueue/static.go @@ -3611,7 +3611,7 @@ w9Vny76PsLQ+wiKIuN3aAu90BLP/DQAA//+BYYzv7UQAAA== "/js/wr-0.0.1.js": { local: "static/js/wr-0.0.1.js", size: 4979, - modtime: 1523622099, + modtime: 1524833213, compressed: ` H4sIAAAJbogA/5RY/3PbthX/XX8FpqYWGSuUZLu2a8ft1W7T5m5Jd06zu52s3UEkLPJEEhwAWtYS7W/f ewBIgPrirbqcTAGf9/2D98CMRmSRPbGSUPgnBF0T/kgec06VJCqlitAkIXVFFCf/mYzHQyKYqkUpCSUx @@ -3650,155 +3650,155 @@ N19fhX2PieY/dpCFWhXwhcUBeBm629hfLKa5PJASRs/u9vRk1iKay1wzs2CSJezz/fs7XlS8hHoFTsjF "/status.html": { local: "static/status.html", - size: 63647, - modtime: 1523634258, + size: 64026, + modtime: 1525271187, compressed: ` -H4sIAAAJbogA/+x9f3fbNrLo//0UiN5uJTeS7LTb9/oc2z2JnW5zm2x8k2z37vHx2aVEWGJMkVr+kOLb -zXe/MwPwN0gCNBWrveVpY5sEBjODwWAwAGZOHl28OX//98sXbBmt3LMvTvAHcy1vcTrg3uDsCwbPyZJb -tviV/lzxyGLzpRWEPDodxNHN5LtB7nPkRC4/+9tb9i6yojg8ORQvvshKPJpM2If/jHlwx278gG2swPHj -kMWR4zrR3ZhZns08zm1us9kdm/l+FEaBtZ5+CNlkkmspnAfOOmJhMD8dHH4IDz/8C2FOvp5+Pf3TdOV4 -UGFwdnIoipUReJ6AJRzWAQ+5Bwg7vkfth9Gd63iLYoNE+TKK1hP+r9jZnA7+a/LXZ5Nzf7WGijOXD9jc -9yKAczp4+eKU2ws+KNf2rBU/HWwcvl37QZSrsHXsaHlq840z5xP6Y8wcz4kcy52Ec8vlp0/ywAC5WxZw -93SAmPJwyTlAWwb8BngxD8PDlG2Tb6bfTP8f8QPeDxr4p6rSxMKfPH9+68cRcZBvgAy2BN5V+VZu6FZW -hHb+ND3Sa0f0VeSzlXXL2SyOIt8LqauiJTQYsq0f3LKvJ1sLRIZHW849lrRDxVLqNHATXHgCXPi6Fbt3 -/ooz/4b5ccD8rccW3OOB5bIld9c8YDexN0epapHdbTA5AlY8KTWl398pgKyTTw6zkXsy8+27POq2s2GO -fTrwrA1IoWuFIf0+swImfkxsfmPFLrQS+CB9+NFZ0ADJyVAKSkJAcbYcYECpTLmcbALxU5YVPFpbXqnC -LICuHOS1CxZStHUIjZXQLL6Sf1YZEhLgQRtFpfI8CPwAatlWZE1mjgcfYFRwa748ZrkSLWyBYR6AtOK/ -Exu0MMoPcAgUQR2P1vkWI/4xOmZ/wDcoRGsTvqiJm1k2IL7hdaTlvvdNWa4ydDF3Gf0L4zvwYLzX1FLW -JDFrroPPOyKksQiBx0F/6zPn5phdBj6o/RU7PWWDQWGAN0KIE/RsP4q4XWBt5Ptu5KyP2S+MJs5jNnx5 -gzouZPDfhzgELrKIr2D6sGACBfH0OCiYDcycUCCM+VgUXvEwtBacbR3XZQufWaQYoUwUcvdmOmSfBmcr -Z7GMQFsyGxh0chi3o0/EHwL1OrTmOfXo87Dq/ZIHQLMFMwPM6aLFOMQJiZgiZHXKXkaCL55P5MPgtHFq -CWKP+RGAYB/8WQjFvA0PI9R6IKgRzDxebLku8PCG3fkxc51b4PaM42hgSyeKRDuc/fMnBO5E/5TzlOA2 -tO/5zPVJ+OPQAuT647liYBc+V8YEzgctA+IvYKscSzVc0TL4kWYq1L8ns6AZ1MuLWkAvLwzAXNaDudQH -c78h/MqHMUjTwjyqRecCZGYa+fhjdJBi1t7XQmBYdLeGKVf8kU5Fs8hj8H+iP9ex604CHMKFUTF3nfkt -zAIB2DtTQPPGCVYXML6FehucvYyGIVgSJMhi3ItmNFimM/DvOeiTGtyb+zGYxgG3a3ksy+r3e00DzPo1 -9qPUMT12X4MOqflkak7ISanGmEi//vpNifmS2zFgyF7ilGw0U56jWI4O2Bl7oj1NXoFwgFIKOC4qmwX6 -Byyplurr/Z2Kaoh5HS70R/9bDe68sgRzRgeGg/4+PYiYJ8jVYkZAU1zA4IHR0pPG1tFVcqxoKSvbCVdg -ir4Ww3lwdiH+btdUu1VAtHCWjpZj9uTo6I9PUzK3HJQp/jMJV2AJricrK1goFUoelCh0zI6YFUf+0zr1 -s/y2UuEpqCAbFQn8DlMyzEWrtcvBzCwsemF1BfyryoLj3bjYBSCjkeVmI+Bw+W27BsxRl4eMgluESxJ8 -pKsdA38RQIcPiqTC2IYuXx03wqmDNUFnRP6PSRgFzhpHMK54ePFbos2luyL5Bp8KdBJ6uGSQcpDSbHPX 
-uruc48B9zIZ/JJPdSHEXIXFb8E9/qaMe92WomQqQL/pbs7UobiqyB9205p7NvainrpLQeu8sCTffXfLV -r6zDgCa/c2+BZWf3M6gIUs+9RDCzHsL+AdHc+/7p3hux109fxB6O4b57Q0DN+kO+2Pv+KH4TS5LOfeT6 -YT+qDQH13EMIMuseN+cH2cM+umc/zOKgH8UFgJzejQEBNOsL8fdn64XdGeqI4FdffUWe2TseMQft4hXM -miXq8jIQ+Fsm7MwWsz3d0nEnH8PJt3X2+o0frAoyEs9WDnA/4P+KeRjBUu3PgR+vNS1jx1vH0WTRUoNq -5W3/XLUJLBX8xFqP/MUCBVo6v+XbdJcKFg24mhYO8dPBC/RwMYDqoOXh3DjwV+Qzyw19FnJO3mqxPYVb -mBYsgmAlsrI8O2TQKGi4rRMtoZQV5SBMB2fZH1qLYyJGLjBRktN1F7KakIdRWhiXG8uNObK8ldeNnIOl -q864lSvgsn8u2QAViAsxgDGXb2zh3q2XDlDA0t8ma7DLJ3MnmLs5D7mem66FmY3jDnnZZScUH9QH1YEo -VVnoBxHuViSCH44Opi73FiAZdTrtZFlyxhQ0T3UkK7dNFc3iu1GypT5yx8EBqO6AR3HgMXfq2IBQgD++ -Z0/YMZs8YZ8OWtbwre6AJqeikR9A8KXNF1Cn+XPKXstHoOsaSGlpdw/oeQXKAO/rGaiD12nZychRZdFZ -HYVhYAWONSHVs3K808FR4Y318XQAYtJoPlSdCGOW+MbWVgBKcxou/S2INOmnC7GEHzMrigIEM8za8/zt -sABQxwIhZuWGbjdXBEFRy2FnL0SKmfbucbshSMV+PaKhcly0iIes0iggBbDdhKSbE4Tg1InJPfwfKX57 -JyroC9m1nFRdJo0y8haLN8hHDlwX2ejidiEYarno6HFJsdoHidh1/5ecNM29L1wkTf2fgOvU+50cPQSl -pv+7+nhSzPZBAhR+hl1LRcUt1CgWeESlQSYyYF2EooNjiUCoJeIePqUUsQeRic/T7xU3VGO/Pyc3UEPP -Z+C69HwnVxYBUfd9Ry9WitYD9vvOlg884qX+blobpKU7Lg6gfr+LAwRYWBzwsueuEbOH0e7xfA6/73oo -J3v8+sP5XNZokIEi0C5SkEDoTwwSiJkcJG8eRBD0fNn13yWvUr+UzSPLccN2H7rSqyLOntU7QwqnasKQ -Or1wXA06He8+cDxUOZSr7yH7978Lb+VSazhOKuPKpVCTLPHs+zpwAJW7YhFhm2WFhOorlBEqu9Q+zuJZ -LTm8CtUSgdDcV+l4DC+t3+Z1Uxy3WpEaa/KaVTCUTfgbHty4/nby8Zj8gZpIEphwZbnu2YlT5wY839rP -rTDnVq4tlkrY3Hd90B2gyO5y7kAHf6XG9OjT07dUNKdbXuOhtfaRUqjeDycJVMrNFeFRe7ZOoNmdO9SY -IYd2OdOlpyrZLb+DyUJ1olYNxIRgOzp7FuFNlCgEJCOTmna1DxJQ2Au2rS2VuvJrStmLj2s+x8Oib5+9 -7oG6BBxAm65mL1+ci3Ol+0Toe2fFe6QUweEZ2jigO4M7ozenbd6K/VluXzjhrbkxY8I5USE6S5tk2KYZ -+wQMu06HF6jJTKk/P9dnI7VgxkpdtSSQN5S1czCh+tAVBGf38vQDmHlvuRX63o4FKddm1foyajvDITq7 -DPiGLtYjHXFgOLhTxEwkolCxQNGjPiiSnYHXzR+AJpUkZiJiIo4Ertu4NBbiFx8dVE8714TYDqzz7C5d -op5HnAjB7Y6vVLrEKWwRZdVs6hBEGDJNVIrO3kX2mzgy55qob8CZQsXCAEUEOg3KFFz+RELOjVJ3QwSd -HNDsFD+N6L46rBYFHkMwFL50o6dY5MtF9FT3xl0tlV3GegFAcpmxD0YhZZ7vcaTs85NkNpJkDWNd0H0c -vAiChx0HgMBejAPAY7/HwX0Z9dseB52Q6zTrXnLr1nyJKmorJl0E13GJSjDNuNSFYLAm8ZpkT/RKaIW7 -nntG8AvP7o1cgrXPxP7Nct3I2A8haivoTcB19kMQ5N2TfX751x6pltD2negf/bCD1auk+Ed5hmAPKWQv -L3skUgQ5+TzLIWrvAhdDBvF6Mgo6WoGCZxedzcAavl2Y8o3A7euk7/Q1IVyKY+X76Ld4lHguvvySjVKv -2ADjNAYbDASV33EcJOfKim/pbNHB7jvkf51Rco95WuXrFB3V0S24q3k/xbY3B2jfZL5yNjwhVQQs+fzE -/m4olJv93VAwo+B3Q0EP498NhVINhaGQzSjyaKl4aeyu6mgFdHNgdnJe7pmncT9F4yK5Obr7zk+b2uP+ -T3H8Dfc3nXucO/zzdHna2n73eormb6rjjU86eRvjsyeGzOnQPYDV/XrF9BSM8ag6336G4wM/YhT98yUe -MO7LZkGQEuK+2i7P+dLCMzrBZ1BXWVt7rKwyJH9TqirX5W8wvrg82/cZev0tD4Gbc07HCZ2AQunsswAQ -e34lfa8BttvR7RvgBl0t5FZw43zscKnnnbNyXMts0fO47ni8BJadQRUx8pNIQZ2cBtma7X7Hrig+UWjB -5MGTA2hsVENH/kgZEXJAiWGC7FThjThVuLulfLcKFX9pEoXDTH/sJib5W77yN5wimQzOxB96wY565okI -LbA/HLnkmKnmARmSxeDYH6bAYuRhhSTZJ9oDjmAAfxHG/0FYYb4ZIe9TvcdMKh/8GbPWa5igQsoiMcZU -JyLJytyPXZuyysScgs3l0tVQhhoWxvMloxwtHo8wbxeG35G69ylmV8GwdNgCQLPmkUi6cuN4fIxpWChz -S8A3mAVAJG2h8D0hUYbXxFZW5MypznbJPQKW5IIBgDChcnua3O/SyhmRMmB3WR0GZ+fiD3ahnZOj0L/3 -F4jEZWp8Wy9jgIi6l6fd0GzTZ7CmwsED+t00jhFO8vqsBlJRQNMk/DBH5wHvGLZdom5rroewoBYF/WMr -37YUt6/LYQSp2DH7pdLkxgkxV+OxhPcay/0s3o0rhW3Hcv3FOd7DHhLESbgaVouJPHZ4VxsxwJ+uNeNu -oY0fqQz7xD5V6+NdTazlUcalYa7Wc/jyHtSnC6N0OJbgxfcLeQ9dAU8sINQQf6BvbTALID+R/6TSUTKH -YRbW8xDThw4o00sNCapgjIUAIzggRge0mSiHjFohPQs45eAKY/nL1vJoOqix/QU+ufwZS14fvqCQaSMN -iCpDofJ8LNVBbZyrJG6pBDNQj4oc7bz9og7FYV1adm6tU9M+FjjPL3VopYNTLCZV5XMrDnkt8jeFS00C -/e/r8W8c9oWNOg0SO7TT/rEsXadG0vXZRYVZ0GouQ1eVJ80kq0yaWj7cohVa33/CShpFIq8eWl5g2Fki -6mOS+g4Jna+A7DDy19DJfB5jKrynzLpBNwa2gAba1gKhBX45bmLfhSiK6PgVpsdB7aX7bl0c0KzfThyV 
-s1yMgJz2oBxqG15ydsgwhkiPT6blSnAlhJHlRWimwuDpQEg15avAUE/FFnV6S/jr1E6ri2xbCA6jl8qo -LyNptXKiZ0RXYac6CmJ+AD9k1CjRx9O5tXYiy3X+m1NSq1c8AiaI0DoYyno40Ii6vGPEb8BUMcT8SSve -Rlo36UEYEA/ahWacuD8LtFYSSYBvokamrZKmIyzILG/OG9bmStu1kiM6xTaMbD+ODnkQ9GfCAkxT+9Vd -jJm0ZCPbxJRN2tKxY5OqMrs5VX4TRxgE/lOtbVllmS3jYIUC4x4YZi/M+WXCpGG6237HxFmLoZbFz71N -vblvL35GN4sR37JzEL2xjq8/F+8A7T7YxteGfJtl27F9cQ1A7phr2ZZpDzwDdA15JsyjvthF0HbMMNpi -ZMqN0R44SBQY8hAA9sbBBLnd8e+Ft3EC30OGsZ8xHiE00wfn4GMj3wp8aTKMVa3U2cSqxBMygX7NmkG5 -iEtz7tenIjUwFzB5RvGN3BF2CE38VUWPWHN8OffXd0/Z10dP/u8E/vmO/Zl7uMYCgedWMF+yV84Kl+FT -5SIF04gg/OxtWWoVrP9gbSzxtoTWrT/112gKhlOwtXjw1zXwCeakU7LonxaJPDwEKeZbkEnu0m4sGGSY -SiVx7sfFneYkCwh5sOPwZ6j6GquCrasYHlbAMO89trx0wuqdc0qKH/m33IMiCx5dWgGILDDi+R1mOh8N -6NvgoKamhToEEJW5dE6J8hleVcLR8SwIrLtRXV1RB+xCINmo4syyRUJ7wwaTPMtmtRI/RblWbQUZJzMJ -Zgr1hsPmonIvorXcm2c137cgzxh4TMhZoFcK+eDxLWshH4rSyIHS33x79LR68kdyCX0Oz5M03VA4ldOR -Y6tEM62adaeEkiW6Ee/rauMjc+CIgtOXF7jec2x1bIVPCho/NdIjU/kWqFmFi0ZyEimrEoPJsSkvtg5B -aeHp63CBVEG79ycrSaQGFKlRSCOrHpek/ehgCioP7NTRLyyVieOyjHw6qM6FEmwSmrVnwCKea99AZcSu -nsFSfNieYcpAtL13l0i/szMx2AHsJOPHDoRhB1BlLoIdiMMueOC79j8oDRYAPmqSmX9ghOMYrFsoV9VK -NXNMopWuhqKNazHXSlB2pkLrFKdzw0YlSEVsrrXmkAKAjOTrGr2rPgSLJhfVAyJUOMFgvSaXZ+VjoiGV -n4WeU3+S2kr5kXSO8ovUHNeqqT9hqiDkjB018Q8pXsWYldF1aOp/cnTEDgUT6qMcgdm75TDPWS6dlvn/ -39GZmY3v2Mxis3jBHA+WUX4URoG1TgPXN4Gb4Spqu3TA1pdnZULACuHgvgudy5is8PokFGyCc4OOXR7Q -Xkcc4fYI/+iEMHjmfMz4ho7W+PFiifh7eB6nCZjgIEZ0RrY08pB4YQP/1hwW6F70Dv8ORlejHHO/apCp -gzFrKZqTsLbCqby1Fsykr61oIott5TLJPLgeg2TUCGjCN7CyMcBPxri39CIYCYaO2dcNAFTsRAV6PZJg -r46uTarn5rcMxBMDEOk0llX/2qS6mK2yyt8YVE4mpaz2nwxqJ3NPVvvbuto1urNeBeMCtl6fSA1eU+KT -5txXv7ZJrlOesivFZFAo9Mr3b2nR90vdbFfJ72m2HnUWHm4+iwYqRUDjhDxigAHqvC2fhT7osmpOIVTu -W8ez/e30b3z2jgrBKuOUYcfhkcPmNVtu7T5dx+FyNPi7HwdsFvhbeMtsH1bZmMo4jNdrIJelbYQqV8In -xt2QN7W3TRarKaDRYBuGx4eHA5jYXH9O8R+mS5BfdLnBu8Fx4QthAW8PBeb/2Ibfk2fjdJBMjPRnjbhK -HKa+56/JU9JqkeRrhSh6//HuzV+mmE/LWzg3dyCJ8i7MMRvM4yCg48qfDuqGSxtacxi5xWVqK2LVLjz3 -PY+L6jAVo/ysLM/CQ51LC48sAOWoIB6pujB5Dg8xizVMjOI07NqHeRiP4ETBHR1a5ROgGYTcCcVBkXna -5nQ6rVEVzaSvFGv0xhX2B7z1cMqoQ9ZgMvARn6Ib86C2Bg4WrDUFPrzZepcBSEEQ3Y2GPwT+ipw3w4Om -FgVj5EldL17N0PlChyzm4p5eY80Alv+E9NUwURnD68YaNClK91NjQSQsIO/C4LHluo8HbVQQPpljq6Cv -mwNayjGeWupFfVnmbLBoZagKlVRTXynauAoW1zWmfAlJo4bbC+MzdHCNHizUa6dK6d14YRTN7MYrU2lo -F16aaiM78dpUmtmBF6fSxk68OpVWZJqyXTeTJj3aPTl1TivT8XAvKA2OKH1Jvlf9eueSvvzdl5MyO1t3 -ELkUb/fBg3ZOygCkia0JRMP7pWi4zRtWfmqMvPyTTDudHWXlhwyAFKiBz6yKGa3AMlit7rPyU7MkLD95 -91oJ89Szln9fdKplX/L+tNzbgiste5/zomUvMzdFqU2hVcvvUzVY63ErP9oeOBWTzD1y5UfDQ2cCq+rM -K3vsTKB1cu4p4LQ6+0yAlfyCus6/8qPvDFSOgIp7rWY8NJSr9/4px0pDqVqfn2ocNWKejqqGUvkx1uo7 -VLFdy5doJBLJkKHrmAImLldR9M3ggCjRfYJEnJgVwdL6DtbYjhcZjkUMnTlmto/nxZnN5+IcGEKPxVEV -oyGEJ5ifSn9PwMVFVidMrnIsubs2gif4FeLhHceDRTMMxRAHZjZU9eZgCQ4v6IIZuUIVUedkUD0oDrf8 -jrx+mW05LlmJ45y9N04tt3Fmg40za2qct4vGRQtHc5bEB88IjRA7B1A7ego/Tth38OPxY5M5Ap/C9I+0 -XjnX13TtIfHgOkZWQPJUYebgmSW+ULt/7ldy9ww8+e0yUNNOU1qCzV78Kogmr34Fr+5e/vJT9CUJ72hC -jwb3a3xPFSfV1OXeIlqyCXuigRRqMnmBEXQhettdAj1Ob9Ix3FlgfmA37EjmoK1isJZQaQsnpIhkAKaL -uFGKN7zkQUSNRObo3fQxevcYfiIQy4WfyDiaAD1Q5KnW1AFWWqnpsbyysWLUcy1yjeriJvBXYyCosWC4 -daL5ciQctpmDWEsNzC3o3cz5pzVKECn1WkhvlM1g+rptL5qgljoMuyKXGqA7QE+6GbuhJm3eXaCVOCY7 -IpYY2jtATTgzu+ElTPsdIJV4P7uhlSwnekOsRTNkh49oZ7a8lVHeuTnA4F+58lflAtdqCO/9VJG0Abgq -1bhmZ8kO0jlei9RTRqCG5V4zWfPDyB8yWL57oYMupnE6G8FXb9Fu4KNhD7DkIptmKdoZpMmCxh6z5nRr -E5ZfYKFp4dc+mRgyalJiVLsQlbpfp5HTU313jlgwGJKh7156M/vA59EUzcxmKg4Sa8UEeV0CdD2E9yuh -vbtXmMJz406P6C6TOD5gKN1jGsdHU8nmkTSbzpVoGk7onRA1mdgVSBpN7d0QNJriVSiaTfKdkDSY7BUY 
-mkz3ndAzmvYVCJpN/J1QzLYytduQZyweGZ2xyD8lKjMXpz6d+j6ADipE7iE/GENSz/AD8qNl4mk2IGs3 -4Mhdwr5nT9gxO2pGApmKlrAOL3Ep6/GtNJzxx+iATbrYPQmUMwObgNqTFTWcKdqTNj7khlhx9G6HOVs1 -BFn1wPoMnE1igOqCIzv1KRipQ9dlIGfCFvY9zhZ4RC7A/Z4x2rG6AFdWcIu9mprWGKSR40X2PMa60CjQ -I8XEQoodj+GdX739L+y/R8xk4WIyThvNvZrTsapHb6S22uBq2vLemd6Iu6rAvmaPjVcVRLuJ6HfCqxta -eg5gGudHOiO8RXc2fW1SnRoaM/J1uj3yoSBt5hfX0G1eg9ov2VFI5alSzROl5udC02GS3idGV4I4AKq6 -uqzpJUAdhieJ6ZgwRUWCFbwPCrew0a+7podaVhA589jNnWJ9yizbJrUZYSgywlJrhhL8kQJfCIWsO+AB -JTqqm2QcRMqS2LsUyw1D7050QTme3GvVqoAz9TZpN+nsBBHd8YpAZnxhefIIvciuqV/X87eVu+8ZHE1A -AvV85sb7H17CR+4RpUx6zEYjQJgMGiL6gB3iZrmOIsJHdZla9Sgv1Iv9Bmi+/ih5/snNwCVIxpNRqT5w -Vl7uCHn00ouw29xuDMYHpcDCfZhX0gVUQ77wEJltT6r2YnNtddqVxUeJ4ZVzbS66yaNpsVDRsZHM3XOK -xKeo1oUg4pjb2ST18lLrzoMTDUPGHQo9ZJH6mVm2DNcwBusa9wbpwBWo0zZYWU0RptIJSTfhbaXWaU/o -n5fhc8vW8+SVQ1Noc1TbyagIm5GgeQE47qjfXoeLjh1HISliF6NeiSs31H9yw7gNHEzd4mQRrZ1gORVk -bv8s3k3rDqyAgcezKK5ia/lcyJdCcI62+U+lldLAHlLNscePHd3ldohwEgCghTS3FZwk+IeQC+w7bTc0 -VH5lhRGpOjn7yj/bhCsHgUzdUdHs1aqbdRRGPNLfidutp0XMuBI37b5LQ7Ho3/bBnjrO95rmqXEKBkp9 -lNTO3ujCSLu5fGi+IgWaAEXHq6ElQtEOSnMOS0cZKdxczJzOE5nyS/WtQnRwNTKPksQctMQJkgxmVnrs -plJNanYq+DaLHpWaX6BcVi9cWgXUyeDc90Lf5VPXX4wGEhQuOKBNJi6epfdWEzTAnlGC0ryFOhRR0oZj -lqB8XIZffz8VGIXXPvEw0R0HhqGHGskDBSAPaMubpOP0avBSpe5rbjSXO4HWmKEMRA3Tx80Nxwu0FJqN -TorWRosQUSJIvbd1IKZhSxbCF2IvLt+JSeXma9IAg0rRAjStM862B1W3oevujRcQkrtuvaKU7OR1ROot -zeb9ISR27boiI5fofaJDtiD2mXC94rUFWM+7sQ1Sl27gdcL2Fd5c6A9V2qrryLjntIvWIzJyW64jOudy -u6tHhNIdNEOUMmgqZMbifndriKJ0kdZkf6SlDR0DneL85R/pNqAUjKnjQIlJM5war4QqwmHy1E/hRb6N -rgyjishD2tRLU8eu83bSsaokK1QlPGMT5+luv79mKCRNq5gUCQm4npIy1S3BJFVVmoJKqsq/edZSWLja -jDhfQ0KuM9TgVHRQ17QXJzLKjG6opmEZJXdI86ZRjoSxyCV2LIVHaSTpGZfSroG1MgaEzUXOr9FJxSj4 -FSetSD1Qp9CKYe11g5tmAe21a8CgeBcVJhS008boPG+JMZPHkCo1hWdJMRsB4CssXTe4VMwbUaqNnjru -IjmAruZIMRi/WbfJmPhmgXZFNvw02FDaD209IBrDYtO0fhNPi4T1zdI0Rn4NZ4px+g3ZKmPmd+FrlnLA -hLWiwYS3KYxG9hYp7JW/WTR9Na2leP5m3E2i6xtzN5cn3oC3srnRFTI3A9GoDkr09cpbCryvJrIS99+M -sVnQfWPWimwABlxN2yKZLaRzb2JshcJeWcu9jZrEUjoAM7YmEfmNmfrC25iwVLZDDIWqTWws0dMLE9ET -74vXMsekSPkTSqeNCpS0hbBecXtfzZRqrkmznqjmkdQ1O4p5Hesct6JU2bVZ480U3NEsfMvvNEsGqVGp -VTwUxqZWWZF40KAw5k7ULJ5lS9SsQHc7KmW1V+AwSN77z0q9mh9qY9mbY9lRjUOvIB7yr5H40TQMi9Vk -Ni/ZnHY1EA0a8j/xO/1KqRsWaybrEP3qJDVUV6xmtSsmUiGUFMnTPSpj7k796pmIEYAf0j/1QYg0cES3 -s3LwSNFj9sTA75PP65aXN8t16+SLgpPQxJhTrbWOiQZAKRJ1a9RGD0y6fq2X95Zdm9JGQI08tgBJ1sZ1 -ItlSPREaJYRUvFqA/JBTVc1i1gCoPjhn247/Q/bhTzgN1eigTsSqHT0iIA8XEh87PXhA9X1+bbU1fGcm -fjNtn1mNAVRr8NSroCyFuol1WZ0wxSw5DBDSMP2lXZnm3TBDgYfcJTiX6WF1gbSZr20sSFLb98QHBDfM -fjPmBNYi7fJArLjg633iRLYr+RDMuIS294kblzJf8sMIBuap3iNmJDvon5cZP2EaiD64gLm8h8lPQw4Q -Esl29Oel/wJQ6JV+CdeUBeeiWko9RQJA5Ppjg5bfQ6Ahs5xbyWFSJ6R0862crKQ2a8lPpru/JJtIT4EC -o8UvLy+OWZbZrNYmU54kTespV9g63JJ5oTmedZKnsmqc56JgNVnaKKw9IKm99yZhhwvgCvx7zOShSB1u -SIzkOcpWV0PRlKSjf5uV3DyvJFssVr31p9Z67d49d0jnhyOoOWZ/GA3/jwizPzwophEppr0/OcR8oWdf -nFAyz7Mv/icAAP//J89MAJ/4AAA= +H4sIAAAJbogA/+x9f3fbNrLo//0UiN5uJTeS7LTb9/oc2z2JnW5zm2x8k2z37vHx2aVEWGJMkVoSlOLb +zXe/MwPw9y+ApmK1tzxtbJPAYGYwGAwGwMzJo4s35+//fvmCLcXKPfviBH8w1/IWpwPuDc6+YPCcLLll +y1/pzxUXFpsvrSDk4nQQiZvJd4PMZ+EIl5/97S17JywRhSeH8sUXaYlHkwn78J8RD+7YjR+wjRU4fhSy +SDiuI+7GzPJs5nFuc5vN7tjM90UoAms9/RCyySTTUjgPnLVgYTA/HRx+CA8//AthTr6efj3903TleFBh +cHZyKIsVEXgegyUc1gEPuQcIO75H7YfiznW8Rb5BonwpxHrC/xU5m9PBf03++mxy7q/WUHHm8gGb+54A +OKeDly9Oub3gg2Jtz1rx08HG4du1H4hMha1ji+WpzTfOnE/ojzFzPEc4ljsJ55bLT59kgQFytyzg7ukA +MeXhknOAtgz4DfBiHoaHCdsm30y/mf4/4ge8HzTwr6pKEwt/8vz5rR8J4iDfABlsCbwr863Y0K2qCO38 
+aXqk147sK+GzlXXL2SwSwvdC6iqxhAZDtvWDW/b1ZGuByHCx5dxjcTtULKFOAzfJhSfAha9bsXvnrzjz +b5gfBczfemzBPR5YLltyd80DdhN5c5SqFtndBpMjYMWTQlP6/Z0ASDv55DAduScz377Lom47G+bYpwPP +2oAUulYY0u8zK2Dyx8TmN1bkQiuBD9KHH50FDZCMDCWgFAQUZ8sBBhTKFMupJhC/yrKSR2vLK1SYBdCV +g6x2wUIVbR1CYwU086/Un2WGhAR40EZRoTwPAj+AWrYlrMnM8eADjApuzZfHLFOihS0wzAOQVvx3YoMW +RvkBDoEiqOPROtui4B/FMfsDvkEhWpvwpZq4mWUD4hteR1rme9+UZSpDF3OX0b8wvgMPxntNrcqaJGbN +dfB5R4Q0FiHwOOhvfebcHLPLwAe1v2Knp2wwyA3wRghRjJ7tC8HtHGuF77vCWR+zXxhNnMds+PIGdVzI +4L8PUQhcZIKvYPqwYAIF8fQ4KJgNzJxQIIz4WBZe8TC0FpxtHddlC59ZpBihjAi5ezMdsk+Ds5WzWArQ +lswGBp0cRu3oE/GHQL0OrVlOPfo8rHq/5AHQbMHMAHO6bDEKcUIipkhZnbKXQvLF84l8GJw2Ti1B5DFf +AAj2wZ+FUMzb8FCg1gNBFTDzeJHlusDDG3bnR8x1boHbM46jgS0dIWQ7nP3zJwTuiH+qeUpyG9r3fOb6 +JPxRaAFy/fG8YmDnPpfGBM4HLQPiL2CrHCs1XNIy+JFmKtS/J7OgGdTLi1pALy8MwFzWg7nUB3O/IfzK +hzFI08Jc1KJzATIzFT7+GB0kmLX3tRQYJu7WMOXKP5KpaCY8Bv/H+nMdue4kwCGcGxVz15nfwiwQgL0z +BTRvnGB1AeNbqrfB2UsxDMGSIEGW4142o8EynYF/z0Ef1+De3I/ANA64XctjVVa/32saYNavsR+Vjumx ++xp0SM0nU3NCTUo1xkTy9ddvSsyX3I4AQ/YSp2SjmfIcxXJ0wM7YE+1p8gqEA5RSwHFR2SzQP2DJaqm+ +3t+pqIaY1+FCf/S/1eDOK0syZ3RgOOjv04OIeYxcLWYENMEFDB4YLT1pbB1dpcaKlrKynXAFpuhrOZwH +Zxfy73ZNtVsFRAtn5Wg5Zk+Ojv74NCFzy0GZ4j+TcAWW4HqysoJFpULJgpKFjtkRsyLhP61TP8tvSxWe +ggqyUZHA7zAlw1y0WrsczMzcohdWV8C/siw43o2LXQAyKiw3HQGHy2/bNWCGuixkFNw8XJLgI13tGPiL +ADp8kCcVxjZ0+eq4EU4drAk6I7J/TEIROGscwbji4flvsTZX7or4G3zK0Uno4ZJByUFCs81d6+5yjgP3 +MRv+kUx2I8Wdh8RtyT/9pU71uC9CTVWAetHfmq1FcVORPeimNfds7omeukpB672zFNxsd6lXv7IOA5r8 +zr0Flp3dz6AiSD33EsFMewj7B0Rz7/une29EXj99EXk4hvvuDQk17Q/1Yu/7I/9NLkk695Hrh/2oNgTU +cw8hyLR73IwfZA/76J79MIuCfhQXAHJ6NwYk0LQv5N+frRd2Z6gjgl999RV5Zu+4YA7axSuYNQvUZWUg +8LdM2pktZnuypeNOPoaTb+vs9Rs/WOVkJJqtHOB+wP8V8VDAUu3PgR+tNS1jx1tHYrJoqUG1srZ/ptoE +lgp+bK0Lf7FAgVbOb/U22aWCRQOupqVD/HTwAj1cDKA6aHk4Nw78JXxmuaHPQs7JWy23p3AL04JFEKxE +VpZnhwwaBQ23dcQSSlkiA2E6OEv/0FocEzFqgYmSnKy7kNWEPIzS3LjcWG7EkeWtvG7kHCxddcatWgEX +/XPxBqhEXIoBjLlsYwv3br10gAKW/DZZg10+mTvB3M14yPXcdC3MbBx3yMsuO6H4oD4oD0SlykI/ELhb +EQt+ODqYutxbgGTU6bSTZcEZk9M85ZFcuW1a0Sy+G8Vb6iN3HByA6g64iAKPuVPHBoQC/PE9e8KO2eQJ ++3TQsoZvdQc0ORWN/ACSL22+gDrNn1H2Wj4CXddAQku7e0DPK1AEeF/PQB28TstORo4qi87qVBgGVuBY +E1I9K8c7HRzl3lgfTwcgJo3mQ9mJMGaxb2xtBaA0p+HS34JIk366kEv4MbOECBDMMG3P87fDHEAdC4SY +lRm63VwRBKVaDjt7IRLMtHeP2w1BKvbrEY0qx0WLeKgqjQKSA9tNSLo5QQhOnZjcw/+R4Ld3ooK+kF3L +Sdll0igjb7F4g3xkwHWRjS5uF4JRLRcdPS4JVvsgEbvu/4KTprn3pYukqf9jcJ16v5Ojh6DU9H9XH0+C +2T5IQIWfYddSUXILNYoFHlFpkIkUWBeh6OBYIhDVEnEPn1KC2IPIxOfp95IbqrHfn5MbqKHnU3Bder6T +K4uAVPd9Ry9WgtYD9vvOlg9c8EJ/N60NktIdFwdQv9/FAQLMLQ540XPXiNnDaPdoPoffdz2U4z1+/eF8 +rmo0yEAeaBcpiCH0JwYxxFQO4jcPIgh6vuz674pXiV/K5sJy3LDdh17pVZFnz+qdIblTNWFInZ47rgad +jncfOB6qHKrV95D9+9+5t2qpNRzHlXHlkqtJlnj6fR04gMpdvoi0zdJCUvXlykiVXWgfZ/G0lhpeuWqx +QGjuq3Q8hpfUb/O6VRy3WpEaa/KalTBUTfgbHty4/nby8Zj8gZpIEphwZbnu2YlT5wY839rPrTDjVq4t +lkjY3Hd90B2gyO4y7kAHf6XG9OjT07dUNKNbXuOhtfaRkqveDycJVMLNFeFRe7ZOotmdO9SYIYd2OdMl +pyrZLb+DyaLqRG01EBOCbXH2TOBNFBECksKkpl3ugxgU9oJta0ulrvyaUvbi45rP8bDo22eve6AuBgfQ +pqvZyxfn8lzpPhH63lnxHilFcHiGNgrozuDO6M1om7dyf5bbF054a27MmHBOVhBnSZMM2zRjn4Rh1+nw +HDWpKfXn5/pspBbMWKmrliTyhrJ2DiZUH7qC4Oxenl77niP84MKf38JC/xGYLcPdS5RqlMlWe5WoHD2Z +2W5fxCnD+h/Awn7LrdD3dszxTJtlw9eo7RQHcXYZ8A3FNEA6osBQryaImXAvVzFH0aM+KFKdgTf9H4Cm +KiWQioiJJiBw3WTYWIhffHRwZti5ysB2YIltd+mS6incEQhud3yl0gVOYYsoq2aztiTCkGmykjh7J+w3 +kTDnmqxvwJlcxdwARQQ6DcoEXPYwSMaDVXc5B/1L0OwUP40oVAAs1CUeQ7DRvnTFUyzy5UI81b3sWEtl +l7GeAxDfI+2DUUiZ53scKfv8JJmNJFXDWBd0Hwcvgg62hqzfzzgABPZiHAAe+z0O7suo3/Y46IRcp1n3 +klu35t4BWbti0kVwHb0DBNOMS10IBmsSb6j2RK+Clrtmu2cEv/Ds3sglWPtM7N8s1xXGLiBZu4LeGFxn 
+FxBB3j3Z55d/7ZFqBW3fif7RDztYvZUU/6iOb+whhezlZY9Eyvgyn2c5RO1d4GLIIFRSSkFHK1Dy7KKz +GVjDtwtTvhG4fZ30nb4mhEt5on8f/RaPYs/Fl1+yUeIVG2CIzGCDMbiym72D+Ehf/i0d6zrYfYf8rzNK +7jFPV/k6ZUd1dAvuat5PsO3NAdo3ma+cDY9JlbFiPj+xvxsKxWZ/NxTMKPjdUNDD+HdDoVCjwlBIZxR1 +qle+NHZXdbQCujkwOzkv98zTuJ+icRFf2t195ydN7XH/Jzj+hvubjpzOHf55ujxpbb97PUHzN9XxxofM +vI3xsR9D5nToHsDqfr1iegDJeFSdbz/D8YEfMYHB+RLPdvdlsyBIBXFfbZfnfGnhGZ3gM6irtK09VlYp +kr8pVZXp8jcY2l0dq/wMvf6Wh8DNOaeTnE5AUYz2WQCIPb+SvtcA2+3U/A1wg251ciu4cT52uE/1zlk5 +rmW26HlcdzNBAUuP/8r0BHGQpk5Og3TNdr9jVxQaKrRg8uDxATQ2qqEje6SMCDmgnDxBeqrwRp4q3N1S +/l7nUdPVbRwAxUx/7CYc/Fu+8jecgsgMzuQfenGmeuaJjOqwPxy55Jgk6AEZkoY/2R+mwGLkYYUk3ifa +A45g7gSZQeFBWGG+GaGusr3HJDYf/Bmz1muYoEJK4DHGLDMyv83cj1ybEvpEnOL8ZTIFUXIgFkbzJaP0 +OB4XmDINIx8p3fsUE9tgREBsAaBZcyHz3dw4Hh9jBhxKmhPwDSZgkPlyKHJSSJThDb2VJZw51dkuuUfA +4jQ8ABAmVG5P46t1Wuk6EgbsLqHG4Oxc/sEutNOh5Pr3/gIRu0yNL0qmDJABD7O0G5pt+gzWVDh4QL+b +xjHCSd1c1kBKBDRNwg9zdB7wemfb/fW25nqIyGpRvEW28m2r4uJ7MYIjFTtmv5Sa3Dghpsk8VvBeY7mf +5btxqbDtWK6/OMcr8EOCOAlXw3IxmUIQr8kjBvjTtWbczbXxI5Vhn9incn28Jou1PEp2NczUeg5f3oP6 +dGGUDscKvPx+oUIAVMCTC4hqiD/QtzaYOZCfyH9S6iiVPjKNqHqImVsHlGSnhoSqOJi52C44IEYHtJmo +hky1QnoWcEp/Fkbql63l0XRQY/tLfDKpS5a8PnJELslJEotWRaHl2TC2g9oQY3HIWAVmUD0qMrTz9os6 +FAJ3admZtU5N+1jgPLvUoZUOTrGYz5bPrSjktcjf5C41SfS/r8e/cdjnNuo0SOzQTvvHonSdGknXZxcV +ZkGrmeRoZZ40k1xl0tTy4Rat0Pr+k1bSSMiUhmh5gWFnyYCbcdZBJHS+ArJD4a+hk/k8wiyET5l1g24M +bAENtK0FQgv8ctzYvgtRFNHxK02Pg9p4B926OKBZv504Kme5GHw66UE11Da84OxQESSRHp9My5XkSggj +yxNopsLg6UBIOduuxFBPxeZ1ekvk8cROqwsqnIvLo5dFqi8jabVyxDOiK7dTLYKIH8APFbBL9vF0bq0d +YbnOf3PKJ/aKC2CCjGqEUcSHA42A1ztG/AZMFUPMn7TibaR14x6EAfGgXWjGifuzQGslEcdWJ2pUxjBl +OsKCzPLmvGFtXmm7ltJzJ9iGwvYjcciDoD8TFmCa2q/uYsyUJStsE1M2bkvHjo2rqsTyVPlNJDD+/qda +27LMMluFIAslxj0wzF6Y88uEScNkt/2OybMWQy2Ln3ubenPfXvyMbhYjvqXnIHpjHV9/Lt4B2n2wja8N ++TZLt2P74hqA3DHX0i3THngG6BryTJpHfbGLoO2YYbTFyCo3RnvgIFFgyEMA2BsHY+R2x78X3sYJfA8Z +xn7GUJDQTB+cg4+NfMvxpckwrmqlziauyvlBFkudcVy9iFNVmrLAGpgLmLck/0btCDuEJv5aRY9cc3w5 +99d3T9nXR0/+7wT++Y79mXu4xgKB51YwX7JXzgqX4dPKRQpmcEH46dui1Faw/oO1seTbAlq3/tRfoykY +TsHW4sFf18AnmJNOyaJ/mify8BCkmG9BJrlLu7FgkGEWm9i5H+V3muMELOTBjsKfoeprrAq2bsXwsAIW +cvcGW146YfnOOX6cCv+We1BkwcWlFYDIAiOe32GS+dGAvg0OampaqEMAUZXG6JQon+FVJRwdz4LAuhvV +1ZV1wC4Eko0qziybLkMFhg3GKa7NasV+imKt2goqRGkcR5Zh6KrmomovorXcm2c137cgzxjzTcpZoFcK ++eDxLWshH4rSyIHS33x79LR88kdxCX0Oz+MM6VA4kdORY1eJZlI17U4FJc0xJN/X1cZHpR+SBacvL3C9 +59jVsRU+VdD4qZEelUU5R80qXDSSE0tZmRjMS04pyXUISgpPX4cLpAravT9ZcQ47oKgahSSo7XFB2o8O +pqDywE4d/cISmTguysing/JcqMDGUXF7BixD6fYNVEXs6hkshebtGaaKAdx7d8nMRzsTgx3AjpOt7EAY +dgBVpYHYgTjsgge+a/+DMpAB4KMmmfkHBpeOwLqFcmWtVDPHxFrpaijbuJZzrQJlpyq0TnE6N2xUgJTH +5lprDskBSEm+rtG71Ydg0eSiekBEFU4wWK/J5Vn6GGvIys9Sz1V/Utqq8iPpnMovSnNcV039MVMlIWfs +qIl/SPEqwoSYrkNT/5OjI3YomVAf5QjM3i2Hec5y6bTM//+OzsxsfMdmFptFC+Z4sIzyRSgCa53kDGgC +N8NV1HbpgK2vzsqEgBXCwX0XOpcxWeH1SSjYBOcGHbs8oL2OSOD2CP/ohDB45nzM+IaO1vjRYon4e3ge +pwmY5CAG00a2NPKQeGED/9YcFuieeId/B6OrUYa5XzXI1MGYtRTNSFhb4UTeWgum0tdWNJbFtnKpZB5c +j0EyagQ05htY2RjgJ2XcW3oRjCRDx+zrBgBV7EQFej1SYK+Ork2qZ+a3FMQTAxDJNJZW/9qkupyt0srf +GFSOJ6W09p8MasdzT1r727raNbqzXgXjArZenygNXlPik+bcV7+2ia9TnrKriskgV+iV79/Sou+Xutmu +lFrVbD3qLDzcfJYNlIqAxgm5YIAB6rwtn4UYcLiczgmV+9bxbH87/RufvaNCsMo4ZdhxeOSwec2WWbtP +11G4HA3+7kcBmwX+Ft4y24dVNmaRDqP1GshlSRthlSvhE+NuyJva28aL1QTQaLANw+PDwwFMbK4/p/gP +0yXIL7rc4N3gOPeFsIC3hxLzf2zD78mzcTqIJ0b6s0ZcFQ5T3/PX5ClptUiytUIUvf949+YvU0xl5i2c +mzuQRHUX5pgN5lEQ0HHlTwd1w6UNrTmM3PwytRWxchee+57HZXWYilF+VpZn4aHOpYVHFoByVBCPqrow +fg4PMYE4TIzyNOzah3kYj+CI4I4OrfIJ0AxC7oTyoMg8aXM6ndaoimbSVxVr9MYV9ge89XDKqEPWYDLw 
+EZ+iG/OgtgYOFqw1BT682XqXAUhBIO5Gwx8Cf0XOm+FBU4uSMeqkrhetZuh8oUMWc3lPr7FmAMt/Qvpq +GKuM4XVjDZoUlfupsSASFpB3YfDYct3HgzYqCJ/UsZXT180BLdUYTyz1vL4scjZYtDK0CpVEU19VtHEV +LK5rTPkCkkYNtxfGZ+jgGj1YVK+dSqV344WpaGY3XplSQ7vw0pQb2YnXptTMDrw4pTZ24tUptaIyxO26 +mSTf1O7JqXNamY6He0FpcETpS/K96tc7l/Tl776cVInxuoPIZNe7Dx60c1IEoExsTSAa3q+Khtu8YcWn +xsjLPvG009lRVnzIAEiAGvjMypjRCiyF1eo+Kz41S8Lik3WvFTBPPGvZ93mnWvol60/LvM250tL3GS9a ++jJ1UxTalFq1+D5Rg7Uet+Kj7YGrYpK5R674aHjoTGCVnXlFj50JtE7OvQo4rc4+E2AFv6Cu86/46DsD +K0dAyb1WMx4aytV7/yrHSkOpWp9f1ThqxDwZVQ2lsmOs1XdYxXYtX6KRSMRDhq5jSpi4XEXRN4MDokT3 +CWJxYpaApfUdrLEdTxiORQydOWa2j+fFmc3n8hwYQo/kURWjIYQnmJ8qf0/A5UVWJ4yvciy5uzaCJ/kV +4uEdx4NFMwzFEAdmOlT15mAFDi/oghm5QhVR52SoelAcbvkdef1S23JcsBLHGXtvnFhu49QGG6fW1Dhr +F43zFo7mLIkPnhEaIXYOoHb0FH6csO/gx+PHJnMEPrnpH2m9cq6v6dpD7MF1jKyA+CnDzMAzS3xR7f65 +X8ndM/Dkt8tATTut0hJs9uKXQTR59Ut4dffyF5+8L0l6R2N6NLhf43sqOammLvcWYskm7IkGUqjJ1AVG +0IXobXcJ9Di5ScdwZ4H5gd2wI5mBtorAWkKlLZ2QMpIBmC7yRine8FIHETVyyKN308fo3WP4iUAsF34i +42gC9ECRJ1pTB1hhpabH8tLGilHPtcg1qoubwF+NgaDGguHWEfPlSDpsUwexlhqYW9C7qfNPa5QgUtVr +Ib1RNoPp67a9aIxa4jDsilxigO4APeVm7Iaasnl3gVbsmOyIWGxo7wA16czshpc07XeAVOz97IZWvJzo +DbEWzZAePqKd2eJWRnHn5gCDf2XKXxULXFdDeO8niqQNwFWhxjU7i3eQzvFapJ4yAjWs9prJmh8Kf8hg ++e6FDrqYxslsBF+9RbuBj4Y9wFKLbJqlaGeQJgsae8ya061NWH6BhaaFX/tkYsioSYFR7UJU6H6dRk5P +9d05csFgSIa+e+nN7AOfiymamc1UHMTWignyugToegjvV0J7dy83hWfGnR7RXSZxfMBQusc0jo+mks0i +aTadV6JpOKF3QtRkYq9A0mhq74ag0RRfhaLZJN8JSYPJvgJDk+m+E3pG034FgmYTfycU061M7TbUGYtH +Rmcssk+BytTFqU+nvg+ggwpRe8gPxpDEM/yA/GiZeJoNyNoNOHKXsO/ZE3bMjpqRQKaiJazDS1zKenyr +DGf8MTpgky52TwzlzMAmoPZURQ1nivakjQ+5IVYcvdthxlYNQVY9sD4DZxMboLrgyE59Ckbq0HUZyJm0 +hX2PswUekQtwv2eMdqwuwJUV3GKvJqY1BmnkeJE9i7EuNAr0SDGxkGLHY3jnV2//C/vvETNZuJiM00Zz +r+Z0bNWjN1JbbfBq2rLemd6IuyrBvmaPjVcVRLuJ6HfCqxtaeg5gGudHOiO8RXc2fW1SnRoaU/g63S58 +KEib+fk1dJvXoPZLehSy8lSp5olS83OhyTBJ7hOjK0EeAK26uqzpJUAdhieJ6ZgwRUWCFbwPCje30a+7 +podaViCceeRmTrE+ZZZtk9oUGIqMsNSaoSR/lMDnQiHrDnhAiY7qxhkHkbI49i7FcsPQuxNdUI6n9lq1 +KuBMvY3bjTs7RkR3vCKQGV9YnjpCL7Nr6tf1/G3p7nsKRxOQRD2bufH+h5fwUXtECZMes9EIECaDhog+ +YIe4Wa6jiPCpukxd9VReqJf7DdB8/VHy7JOZgQuQjCejQn3grLrcEXLx0hPYbW43BuODUmDhPswr5QKq +IV96iMy2J6v2YjNtddqVxacSwyvn2lx040fTYqGiYyOZu+cUiU9erUtBxDG3s0nq5aXWnQdHDEPGHQo9 +ZJH6mVm2CtcwBusa9wbpwBWo0zZYaU0ZptIJSTfhbaXWaU/qn5fhc8vW8+QVQ1Noc1TbyVgRNiNG8wJw +3FG/vQ4XHTuOQlJELka9klduqP/UhnEbOJi65ckiWjvBcipI3f5pvJvWHVgJA49nUVzF1vKZkC+54Bxt +81+VVkoCeyg1xx4/dnSX2yHCiQGAFtLcVnDi4B9SLrDvtN3QUPmVFQpSdWr2VX+2CVcGApm6o7zZq1U3 +7SiMeKS/E7dbT4uccRVu2n2XhGLRv+2DPXWc7TXNU+MUDJT6KK6dvtGFkXRz8dB8SQo0AcqOr4YWC0U7 +KM05LBllpHAzMXM6T2SVX8pvK0QHVyNzESfmoCVOEGcws5JjN6VqSrNTwbdp9KjE/ALlsnrh0iqgTgbn +vhf6Lp+6/mI0UKBwwQFtMnnxLLm3GqMB9kwlKM1bqEMZJW04ZjHKx0X49fdTgVF47RMPE91xYBh6qJE8 +UADqgLa6STpOrgYvq9R9zY3mYifQGjNUgahh+ri54XiBlkKz0UnR2mgRMkoEqfe2DsQ0bPFC+ELuxWU7 +Ma7cfE0aYFApWoAmdcbp9mDVbei6e+M5hNSuW68oxTt5HZF6S7N5fwjJXbuuyKglep/okC2IfSZdr3ht +AdbzbmSD1CUbeJ2wfYU3F/pDlbbqOjLuOe2i9YiM2pbriM652u7qEaFkB80QpRRaFTJjeb+7NURRskhr +sj+S0oaOgU5x/rKPchtQCsbEcVCJSTOcGq9EVYTD+KmfwvN8G10ZRhVRh7Spl6aOXeftpGNVcVaoUnjG +Js7T3X5/zVBImlYxCRIKcD0lRapbgklWVWkKKllV/s2zlsLS1WbE+RoSMp1RDa6KDuqa9uJERpHRDdU0 +LKP4DmnWNMqQMJa5xI6V8FQaSXrGpbJrYK2MAWEzkfNrdFI+Cn7JSStTD9QptHxYe93gpmlAe+0aMCje +idyEgnbaGJ3nLTFmshhSpabwLAlmIwB8haXrBlcV80aUaqOnjruID6BXcyQfjN+s21RMfLNAuzIbfhJs +KOmHth6QjWGxaVK/iad5wvpmaRIjv4Yz+Tj9hmxVMfO78DVNOWDCWtlgzNsERiN78xT2yt80mn41rYV4 +/mbcjaPrG3M3kyfegLequdEVMjcF0agOCvT1ylsKvF9NZCnuvxlj06D7xqyV2QAMuJq0RTKbS+fexNgS +hb2ylnubahIL6QDM2BpH5Ddm6gtvY8JS1Q4xFKo2sbFATy9MRE+8L1+rHJMy5U+onDZVoJQthPXy2/vV 
+TCnnmjTriXIeSV2zI5/Xsc5xK0sVXZs13kzJHc3Ct/xOs2SQGJVaxUNpbGqVlYkHDQpj7kTN4mm2RM0K +dLejVFZ7BQ6D5L3/rNCr2aE2Vr05Vh3VOPRy4qH+GskfTcMwX01l81LNaVcD0aAh/xO/06+UuGGxZrwO +0a9OUkN15WpWu2IsFVJJkTzdozLm7tSvnooYAfgh+VMfhEwDR3Q7KwePFD1mTwz8Ptm8bll5s1y3Tr4o +OAlNjBnVWuuYaACUIFG3Rm30wCTr13p5b9m1KWwE1MhjC5B4bVwnki3VY6GphJCIVwuQHzKqqlnMGgDV +B+ds2/F/yD78CaehGh3UidhqR48MyMOlxEdODx5QfZ9fW20N35mJ30zbZ1ZjANUaPPUqKE2hbmJdlidM +OUsOA4Q0TH5pV6ZZN8xQ4qF2Cc5VelhdIG3maxsL4tT2PfEBwQ3T34w5gbVIuzwQKy74ep84ke5KPgQz +LqHtfeLGpcqX/DCCgXmq94gZ8Q7652XGT5gGog8uYC7vYfzTkAOERLwd/XnpvwAUeqVfwTVlwbmsllBP +kQAQuf7YoOX3kGioLOdWfJjUCSndfCsnS6nNWvKT6e4vqSaSU6DAaPnLy4tjlmY2q7XJKk+SJvUqV9g6 +3FJ5oTmedVKnsmqc57JgOVnaKKw9IKm996ZghwvgCvx7zNShSB1uKIzUOcpWV0PelKSjf5uV2jwvJVvM +V731p9Z67d49d0jnhyOoOWZ/GA3/jwyzPzzIpxHJp70/OcR8oWdfnFAyz7Mv/icAAP//TPT6Khr6AAA= `, }, diff --git a/static/status.html b/static/status.html index e8c6f05ac..03927ef7a 100644 --- a/static/status.html +++ b/static/status.html @@ -236,6 +236,12 @@
Cores
+ [HTML markup lost in extraction: the six added lines insert "Monitor Docker" form controls below the Cores field]
From c68ddd1f2653ec8e0a3354f99c543d5cce73ec5b Mon Sep 17 00:00:00 2001 From: Sendu Bala Date: Thu, 3 May 2018 15:40:36 +0100 Subject: [PATCH 07/33] Start of work on wildcarded repgrp lookups; non-functional atm. --- jobqueue/db.go | 32 ++++++++++++++++++++++++++++++++ jobqueue/server.go | 18 ++++++++++++++---- 2 files changed, 46 insertions(+), 4 deletions(-) diff --git a/jobqueue/db.go b/jobqueue/db.go index 02479ca1c..900cfb6ed 100644 --- a/jobqueue/db.go +++ b/jobqueue/db.go @@ -55,6 +55,7 @@ var ( bucketJobsLive = []byte("jobslive") bucketJobsComplete = []byte("jobscomplete") bucketRTK = []byte("repgroupToKey") + bucketRGs = []byte("repgroups") bucketDTK = []byte("depgroupToKey") bucketRDTK = []byte("reverseDepgroupToKey") bucketEnvs = []byte("envs") @@ -245,6 +246,10 @@ func initDB(dbFile string, dbBkFile string, deployment string, logger log15.Logg if errf != nil { return fmt.Errorf("create bucket %s: %s", bucketRTK, errf) } + _, errf = tx.CreateBucketIfNotExists(bucketRGs) + if errf != nil { + return fmt.Errorf("create bucket %s: %s", bucketRGs, errf) + } _, errf = tx.CreateBucketIfNotExists(bucketDTK) if errf != nil { return fmt.Errorf("create bucket %s: %s", bucketDTK, errf) @@ -331,6 +336,7 @@ func (db *db) storeNewJobs(jobs []*Job, ignoreAdded bool) (jobsToQueue []*Job, j var rgLookups sobsd var dgLookups sobsd var rdgLookups sobsd + repGroups := make(map[string]bool) depGroups := make(map[string]bool) newJobKeys := make(map[string]bool) var keptJobs []*Job @@ -355,6 +361,7 @@ func (db *db) storeNewJobs(jobs []*Job, ignoreAdded bool) (jobsToQueue []*Job, j job.RLock() rgLookups = append(rgLookups, [2][]byte{db.generateLookupKey(job.RepGroup, key), nil}) + repGroups[job.RepGroup] = true for _, depGroup := range job.DepGroups { if depGroup != "" { @@ -430,6 +437,17 @@ func (db *db) storeNewJobs(jobs []*Job, ignoreAdded bool) (jobsToQueue []*Job, j errors <- db.storeBatched(bucketRTK, rgLookups, db.storeLookups) }() + db.wg.Add(1) + go func() { + defer db.wg.Done() + var rgs sobsd + for rg := range repGroups { + rgs = append(rgs, [2][]byte{[]byte(rg), nil}) + } + sort.Sort(rgs) + errors <- db.storeBatched(bucketRGs, rgs, db.storeLookups) + }() + if len(dgLookups) > 0 { db.wg.Add(1) go func() { @@ -633,6 +651,20 @@ func (db *db) retrieveCompleteJobsByKeys(keys []string) ([]*Job, error) { return jobs, err } +// retrieveRepGroups gets the rep groups of all jobs that have ever been added. +func (db *db) retrieveRepGroups() ([]string, error) { + var rgs []string + err := db.bolt.View(func(tx *bolt.Tx) error { + b := tx.Bucket(bucketRGs) + b.ForEach(func(k, v []byte) error { + rgs = append(rgs, string(k)) + return nil + }) + return nil + }) + return rgs, err +} + // retrieveCompleteJobsByRepGroup gets jobs with the given RepGroup from the // completed jobs bucket (ie. those that have gone through the queue and been // Archive()d), but not those that are also currently live (ie. are being diff --git a/jobqueue/server.go b/jobqueue/server.go index 6afe8dab4..c4b5d83eb 100644 --- a/jobqueue/server.go +++ b/jobqueue/server.go @@ -1378,7 +1378,7 @@ func (s *Server) killJob(jobkey string) (bool, error) { return true, err } -// getJobsByKeys gets jobs with the given keys (current and complete) +// getJobsByKeys gets jobs with the given keys (current and complete). 
func (s *Server) getJobsByKeys(keys []string, getStd bool, getEnv bool) (jobs []*Job, srerr string, qerr string) { var notfound []string for _, jobkey := range keys { @@ -1415,7 +1415,7 @@ func (s *Server) getJobsByKeys(keys []string, getStd bool, getEnv bool) (jobs [] return jobs, srerr, qerr } -// getJobsByRepGroup gets jobs in the given group (current and complete) +// getJobsByRepGroup gets jobs in the given group (current and complete). func (s *Server) getJobsByRepGroup(repgroup string, limit int, state JobState, getStd bool, getEnv bool) (jobs []*Job, srerr string, qerr string) { // look in the in-memory queue for matching jobs s.rpl.RLock() @@ -1451,7 +1451,17 @@ func (s *Server) getJobsByRepGroup(repgroup string, limit int, state JobState, g return jobs, srerr, qerr } -// getCompleteJobsByRepGroup gets complete jobs in the given group +// searchRepGroups looks up the rep groups of all jobs that have ever been added +// and returns those that contain the given sub string. +func (s *Server) searchRepGroups(partialRepGroup string) ([]string, error) { + rgs, err := s.db.retrieveRepGroups() + if err != nil { + return nil, err + } + return rgs, err +} + +// getCompleteJobsByRepGroup gets complete jobs in the given group. func (s *Server) getCompleteJobsByRepGroup(repgroup string) (jobs []*Job, srerr string, qerr string) { jobs, err := s.db.retrieveCompleteJobsByRepGroup(repgroup) if err != nil { @@ -1461,7 +1471,7 @@ func (s *Server) getCompleteJobsByRepGroup(repgroup string) (jobs []*Job, srerr return jobs, srerr, qerr } -// getJobsCurrent gets all current (incomplete) jobs +// getJobsCurrent gets all current (incomplete) jobs. func (s *Server) getJobsCurrent(limit int, state JobState, getStd bool, getEnv bool) []*Job { var jobs []*Job for _, item := range s.q.AllItems() { From ea86d753bda42e7fa6c5c3509c5ef3c34813c641 Mon Sep 17 00:00:00 2001 From: Sendu Bala Date: Fri, 4 May 2018 10:31:36 +0100 Subject: [PATCH 08/33] Set WR_MANAGER_HOST and PORT env vars in bsub mode. --- jobqueue/client.go | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/jobqueue/client.go b/jobqueue/client.go index f5772ace6..79aa81506 100644 --- a/jobqueue/client.go +++ b/jobqueue/client.go @@ -121,6 +121,8 @@ type Client struct { teMutex sync.Mutex // to protect Touch() from other methods during Execute() token []byte ServerInfo *ServerInfo + host string + port string } // envStr holds the []string from os.Environ(), for codec compatibility. @@ -186,7 +188,15 @@ func Connect(addr, caFile, certDomain string, token []byte, timeout time.Duratio if err != nil { return nil, err } - c := &Client{sock: sock, ch: new(codec.BincHandle), token: token, clientid: u} + addrParts := strings.Split(addr, ":") + c := &Client{ + sock: sock, + ch: new(codec.BincHandle), + token: token, + clientid: u, + host: addrParts[0], + port: addrParts[1], + } // Dial succeeds even when there's no server up, so we test the connection // works with a Ping() @@ -610,6 +620,8 @@ func (c *Client) Execute(job *Job, shell string) error { } env = envOverride(env, []string{ "WR_BSUB_CONFIG=" + string(jobJSON), + "WR_MANAGER_HOST=" + c.host, + "WR_MANAGER_PORT=" + c.port, "LSF_SERVERDIR=/dev/null", "LSF_LIBDIR=/dev/null", "LSF_ENVDIR=/dev/null", From 5ace7148f04e98c7614e5142d2023af5993653ce Mon Sep 17 00:00:00 2001 From: Sendu Bala Date: Fri, 4 May 2018 10:33:51 +0100 Subject: [PATCH 09/33] Just use the supplied input as a script, not multiple cmds, and always rerun jobs. 
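To illustrate the new behaviour (the directives and commands below are
made up; only -J, -n and -M are recognised, as documented in the help
text): given the interactive input

    #BSUB -J align
    #BSUB -n 4
    #BSUB -M 8000
    module load samtools
    samtools sort in.bam

the two non-directive lines are now stored as a single newline-separated
Cmd and executed as one script, instead of being joined with "; ".
ReqGroup defaults to the first word of the first command line as typed
("module" here) rather than being passed through filepath.Base, and the
third argument to Add() is now false so that identical jobs are always
rerun on resubmission.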
--- cmd/lsf.go | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/cmd/lsf.go b/cmd/lsf.go index 5b070a996..b93a7800c 100644 --- a/cmd/lsf.go +++ b/cmd/lsf.go @@ -23,7 +23,6 @@ import ( "encoding/json" "fmt" "os" - "path/filepath" "regexp" "strconv" "strings" @@ -56,7 +55,7 @@ NB: currently the emulation is extremely limited, supporting only the interactive "console" mode where you run bsub without any arguments, and it only supports single flags per #BSUB line, and it only pays attention to -J, -n and -M flags. (This is sufficient for compatibility with 10x Genomic's cellranger -software.) +software, and to work as the scheduler for nextflow.) The best way to use this LSF emulation is not to call this command yourself directly, but to use 'wr add --bsubs [other opts]' to add the command that you @@ -106,6 +105,7 @@ var lsfBsubCmd = &cobra.Command{ fmt.Printf("bsub> ") scanner := bufio.NewScanner(os.Stdin) + var possibleExe string for scanner.Scan() { line := strings.TrimSpace(scanner.Text()) @@ -127,13 +127,15 @@ var lsfBsubCmd = &cobra.Command{ job.Override = 2 } } + } else { + job.Cmd += line + "\n" } } else { - if job.Cmd == "" { - job.Cmd = line - } else { - job.Cmd += "; " + line + if possibleExe == "" { + parts := strings.Split(line, " ") + possibleExe = parts[0] } + job.Cmd += line + "\n" } fmt.Printf("bsub> ") @@ -149,8 +151,7 @@ var lsfBsubCmd = &cobra.Command{ } if job.ReqGroup == "" { - parts := strings.Split(job.Cmd, " ") - job.ReqGroup = filepath.Base(parts[0]) + job.ReqGroup = possibleExe } // connect to the server @@ -158,7 +159,7 @@ var lsfBsubCmd = &cobra.Command{ defer jq.Disconnect() // add the job to the queue - inserts, _, err := jq.Add([]*jobqueue.Job{job}, os.Environ(), true) + inserts, _, err := jq.Add([]*jobqueue.Job{job}, os.Environ(), false) if err != nil { die(err.Error()) } From 41e2436f03180db1c77e070963bff5e5029342f9 Mon Sep 17 00:00:00 2001 From: Sendu Bala Date: Tue, 8 May 2018 13:16:54 +0100 Subject: [PATCH 10/33] Resolve #113: -z and -y options to treat -i as repgrp substr or key. wr status now shows the internal job id (key). job.key() made public by renaming it Key(). --- cmd/kill.go | 7 ++ cmd/remove.go | 8 ++ cmd/retry.go | 7 ++ cmd/status.go | 25 +++++- jobqueue/client.go | 39 +++++---- jobqueue/db.go | 12 +-- jobqueue/job.go | 6 +- jobqueue/jobqueue_test.go | 163 ++++++++++++++++++++------------------ jobqueue/server.go | 96 +++++++++++++--------- jobqueue/serverCLI.go | 10 +-- jobqueue/serverREST.go | 16 ++-- jobqueue/serverWebI.go | 10 +-- 12 files changed, 241 insertions(+), 158 deletions(-) diff --git a/cmd/kill.go b/cmd/kill.go index 7e51e9e03..dbae9e390 100644 --- a/cmd/kill.go +++ b/cmd/kill.go @@ -39,6 +39,11 @@ buried and you can "wr remove" them if desired. Specify one of the flags -f, -l, -i or -a to choose which commands you want to remove. Amongst those, only running jobs will be affected. +-i is the report group (-i) you supplied to "wr add" when you added the job(s) +you want to now kill. Combining with -z lets you kill jobs in multiple report +groups, assuming you have arranged that related groups share some substring. Or +-y lets you specify -i as the internal job id reported during "wr status". + The file to provide -f is in the format taken by "wr add". 
In -f and -l mode you must provide the cwd the commands were set to run in, if @@ -87,6 +92,8 @@ func init() { killCmd.Flags().BoolVarP(&cmdAll, "all", "a", false, "kill all running jobs") killCmd.Flags().StringVarP(&cmdFileStatus, "file", "f", "", "file containing commands you want to kill; - means read from STDIN") killCmd.Flags().StringVarP(&cmdIDStatus, "identifier", "i", "", "identifier of the commands you want to kill") + killCmd.Flags().BoolVarP(&cmdIDIsSubStr, "search", "z", false, "treat -i as a substring to match against all report groups") + killCmd.Flags().BoolVarP(&cmdIDIsInternal, "internal", "y", false, "treat -i as an internal job id") killCmd.Flags().StringVarP(&cmdLine, "cmdline", "l", "", "a command line you want to kill") killCmd.Flags().StringVarP(&cmdCwd, "cwd", "c", "", "working dir that the command(s) specified by -l or -f were set to run in") killCmd.Flags().StringVarP(&mountJSON, "mount_json", "j", "", "mounts that the command(s) specified by -l or -f were set to use (JSON format)") diff --git a/cmd/remove.go b/cmd/remove.go index 5b7470897..bcbdd6183 100644 --- a/cmd/remove.go +++ b/cmd/remove.go @@ -40,6 +40,12 @@ Specify one of the flags -f, -l, -i or -a to choose which commands you want to remove. Amongst those, only currently incomplete, non-running jobs will be affected. +-i is the report group (-i) you supplied to "wr add" when you added the job(s) +you want to now remove. Combining with -z lets you remove jobs in multiple +report groups, assuming you have arranged that related groups share some +substring. Or -y lets you specify -i as the internal job id reported during +"wr status". + The file to provide -f is in the format taken by "wr add". In -f and -l mode you must provide the cwd the commands were set to run in, if @@ -88,6 +94,8 @@ func init() { removeCmd.Flags().BoolVarP(&cmdAll, "all", "a", false, "remove all incomplete, non-running jobs") removeCmd.Flags().StringVarP(&cmdFileStatus, "file", "f", "", "file containing commands you want to remove; - means read from STDIN") removeCmd.Flags().StringVarP(&cmdIDStatus, "identifier", "i", "", "identifier of the commands you want to remove") + removeCmd.Flags().BoolVarP(&cmdIDIsSubStr, "search", "z", false, "treat -i as a substring to match against all report groups") + removeCmd.Flags().BoolVarP(&cmdIDIsInternal, "internal", "y", false, "treat -i as an internal job id") removeCmd.Flags().StringVarP(&cmdLine, "cmdline", "l", "", "a command line you want to remove") removeCmd.Flags().StringVarP(&cmdCwd, "cwd", "c", "", "working dir that the command(s) specified by -l or -f were set to run in") removeCmd.Flags().StringVarP(&mountJSON, "mount_json", "j", "", "mounts that the command(s) specified by -l or -f were set to use (JSON format)") diff --git a/cmd/retry.go b/cmd/retry.go index c0745d365..84c6e7eb6 100644 --- a/cmd/retry.go +++ b/cmd/retry.go @@ -38,6 +38,11 @@ have since failed and become "buried" using this command. Specify one of the flags -f, -l, -i or -a to choose which commands you want to retry. Amongst those, only currently buried jobs will be affected. +-i is the report group (-i) you supplied to "wr add" when you added the job(s) +you want to now retry. Combining with -z lets you retry jobs in multiple report +groups, assuming you have arranged that related groups share some substring. Or +-y lets you specify -i as the internal job id reported during "wr status". + The file to provide -f is in the format taken by "wr add". 
In -f and -l mode you must provide the cwd the commands were set to run in, if @@ -86,6 +91,8 @@ func init() { retryCmd.Flags().BoolVarP(&cmdAll, "all", "a", false, "retry all buried jobs") retryCmd.Flags().StringVarP(&cmdFileStatus, "file", "f", "", "file containing commands you want to retry; - means read from STDIN") retryCmd.Flags().StringVarP(&cmdIDStatus, "identifier", "i", "", "identifier of the commands you want to retry") + retryCmd.Flags().BoolVarP(&cmdIDIsSubStr, "search", "z", false, "treat -i as a substring to match against all report groups") + retryCmd.Flags().BoolVarP(&cmdIDIsInternal, "internal", "y", false, "treat -i as an internal job id") retryCmd.Flags().StringVarP(&cmdLine, "cmdline", "l", "", "a command line you want to retry") retryCmd.Flags().StringVarP(&cmdCwd, "cwd", "c", "", "working dir that the command(s) specified by -l or -f were set to run in") retryCmd.Flags().StringVarP(&mountJSON, "mount_json", "j", "", "mounts that the command(s) specified by -l or -f were set to use (JSON format)") diff --git a/cmd/status.go b/cmd/status.go index 5dcab2668..dc6613709 100644 --- a/cmd/status.go +++ b/cmd/status.go @@ -32,6 +32,8 @@ const shortTimeFormat = "06/1/2-15:04:05" // options for this cmd var cmdFileStatus string var cmdIDStatus string +var cmdIDIsSubStr bool +var cmdIDIsInternal bool var cmdLine string var showBuried bool var showStd bool @@ -50,6 +52,12 @@ Specify one of the flags -f, -l or -i to choose which commands you want the status of. If none are supplied, it gives you an overview of all your currently incomplete commands. +-i is the report group (-i) you supplied to "wr add" when you added the job(s) +you want the status of now. Combining with -z lets you get the status of jobs +in multiple report groups, assuming you have arranged that related groups share +some substring. Or -y lets you specify -i as the internal job id reported when +using this command. + The file to provide -f is in the format taken by "wr add". 
In -f and -l mode you must provide the cwd the commands were set to run in, if @@ -143,7 +151,7 @@ very many (tens of thousands+) commands.`, } other = fmt.Sprintf("Resource requirements: %s\n", strings.Join(others, ", ")) } - fmt.Printf("\n# %s\nCwd: %s\n%s%s%s%s%sId: %s; Requirements group: %s; Priority: %d; Attempts: %d\nExpected requirements: { memory: %dMB; time: %s; cpus: %d disk: %dGB }\n", job.Cmd, cwd, mounts, homeChanged, dockerMonitored, behaviours, other, job.RepGroup, job.ReqGroup, job.Priority, job.Attempts, job.Requirements.RAM, job.Requirements.Time, job.Requirements.Cores, job.Requirements.Disk) + fmt.Printf("\n# %s\nCwd: %s\n%s%s%s%s%sId: %s (%s); Requirements group: %s; Priority: %d; Attempts: %d\nExpected requirements: { memory: %dMB; time: %s; cpus: %d disk: %dGB }\n", job.Cmd, cwd, mounts, homeChanged, dockerMonitored, behaviours, other, job.RepGroup, job.Key(), job.ReqGroup, job.Priority, job.Attempts, job.Requirements.RAM, job.Requirements.Time, job.Requirements.Cores, job.Requirements.Disk) switch job.State { case jobqueue.JobStateDelayed: @@ -247,6 +255,8 @@ func init() { // flags specific to this sub-command statusCmd.Flags().StringVarP(&cmdFileStatus, "file", "f", "", "file containing commands you want the status of; - means read from STDIN") statusCmd.Flags().StringVarP(&cmdIDStatus, "identifier", "i", "", "identifier of the commands you want the status of") + statusCmd.Flags().BoolVarP(&cmdIDIsSubStr, "search", "z", false, "treat -i as a substring to match against all report groups") + statusCmd.Flags().BoolVarP(&cmdIDIsInternal, "internal", "y", false, "treat -i as an internal job id") statusCmd.Flags().StringVarP(&cmdLine, "cmdline", "l", "", "a command line you want the status of") statusCmd.Flags().StringVarP(&cmdCwd, "cwd", "c", "", "working dir that the command(s) specified by -l or -f were set to run in") statusCmd.Flags().StringVarP(&mountJSON, "mount_json", "j", "", "mounts that the command(s) specified by -l or -f were set to use (JSON format)") @@ -286,8 +296,17 @@ func getJobs(jq *jobqueue.Client, cmdState jobqueue.JobState, all bool, statusLi // get all jobs jobs, err = jq.GetIncomplete(statusLimit, cmdState, showStd, showEnv) case cmdIDStatus != "": - // get all jobs with this identifier (repgroup) - jobs, err = jq.GetByRepGroup(cmdIDStatus, statusLimit, cmdState, showStd, showEnv) + if cmdIDIsInternal { + // get the job with this internal id + var job *jobqueue.Job + job, err = jq.GetByEssence(&jobqueue.JobEssence{ + JobKey: cmdIDStatus, + }, showStd, showEnv) + jobs = append(jobs, job) + } else { + // get all jobs with this identifier (repgroup) + jobs, err = jq.GetByRepGroup(cmdIDStatus, cmdIDIsSubStr, statusLimit, cmdState, showStd, showEnv) + } case cmdFileStatus != "": // parse the supplied commands parsedJobs, _, _ := parseCmdFile(jq) diff --git a/jobqueue/client.go b/jobqueue/client.go index 79aa81506..20fc3f6f3 100644 --- a/jobqueue/client.go +++ b/jobqueue/client.go @@ -100,6 +100,7 @@ type clientRequest struct { JobEndState *JobEndState Jobs []*Job Keys []string + Search bool Limit int Method string SchedulerGroup string @@ -407,7 +408,7 @@ func (c *Client) Execute(job *Job, shell string) error { // but in this case we want to avoid starting to execute the command before // finding out about this problem if !uuid.Equal(c.clientid, job.ReservedBy) { - return Error{"Execute", job.key(), ErrMustReserve} + return Error{"Execute", job.Key(), ErrMustReserve} } // we support arbitrary shell commands that may include semi-colons, @@ -508,7 
+509,7 @@ func (c *Client) Execute(job *Job, shell string) error { cmd.Dir = job.Cwd } else { // we'll create a unique location to work in - actualCwd, tmpDir, err = mkHashedDir(job.Cwd, job.key()) + actualCwd, tmpDir, err = mkHashedDir(job.Cwd, job.Key()) if err != nil { buryErr := fmt.Errorf("could not create working directory: %s", err) errb := c.Bury(job, nil, FailReasonCwd, buryErr) @@ -569,7 +570,7 @@ func (c *Client) Execute(job *Job, shell string) error { if erru != nil { extra += fmt.Sprintf(" (and unmounting the job failed: %s)", erru) } - return fmt.Errorf("failed to extract environment variables for job [%s]: %s%s", job.key(), err, extra) + return fmt.Errorf("failed to extract environment variables for job [%s]: %s%s", job.Key(), err, extra) } if tmpDir != "" { // (this works fine even if tmpDir has a space in one of the dir names) @@ -949,19 +950,19 @@ func (c *Client) Execute(job *Job, shell string) error { dorelease = true if ranoutMem { failreason = FailReasonRAM - myerr = Error{"Execute", job.key(), FailReasonRAM} + myerr = Error{"Execute", job.Key(), FailReasonRAM} } else if signalled { if ranoutTime { failreason = FailReasonTime - myerr = Error{"Execute", job.key(), FailReasonTime} + myerr = Error{"Execute", job.Key(), FailReasonTime} } else { failreason = FailReasonSignal - myerr = Error{"Execute", job.key(), FailReasonSignal} + myerr = Error{"Execute", job.Key(), FailReasonSignal} } } else if killCalled { dobury = true failreason = FailReasonKilled - myerr = Error{"Execute", job.key(), FailReasonKilled} + myerr = Error{"Execute", job.Key(), FailReasonKilled} } else { failreason = FailReasonExit myerr = fmt.Errorf("command [%s] exited with code %d%s", job.Cmd, exitcode, mayBeTemp) @@ -1379,14 +1380,20 @@ func (c *Client) jesToKeys(jes []*JobEssence) []string { // GetByRepGroup gets multiple Jobs at once given their RepGroup (an arbitrary // user-supplied identifier for the purpose of grouping related jobs together -// for reporting purposes). 'limit', if greater than 0, limits the number of -// jobs returned that have the same State, FailReason and Exitcode, and on the -// the last job of each State+FailReason group it populates 'Similar' with the -// number of other excluded jobs there were in that group. Providing 'state' -// only returns jobs in that State. 'getStd' and 'getEnv', if true, retrieve the -// stdout, stderr and environement variables for the Jobs. -func (c *Client) GetByRepGroup(repgroup string, limit int, state JobState, getStd bool, getEnv bool) ([]*Job, error) { - resp, err := c.request(&clientRequest{Method: "getbr", Job: &Job{RepGroup: repgroup}, Limit: limit, State: state, GetStd: getStd, GetEnv: getEnv}) +// for reporting purposes). +// +// If 'subStr' is true, gets Jobs in all RepGroups that the supplied repgroup is +// a substring of. +// +// 'limit', if greater than 0, limits the number of jobs returned that have the +// same State, FailReason and Exitcode, and on the the last job of each +// State+FailReason group it populates 'Similar' with the number of other +// excluded jobs there were in that group. +// +// Providing 'state' only returns jobs in that State. 'getStd' and 'getEnv', if +// true, retrieve the stdout, stderr and environement variables for the Jobs. 
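+//
+// For example (jq here stands for any connected Client; the values are
+// illustrative), to fetch every job whose RepGroup contains "batch",
+// regardless of state and without stdout/stderr or env vars:
+//
+//	jobs, err := jq.GetByRepGroup("batch", true, 0, "", false, false)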
+func (c *Client) GetByRepGroup(repgroup string, subStr bool, limit int, state JobState, getStd bool, getEnv bool) ([]*Job, error) { + resp, err := c.request(&clientRequest{Method: "getbr", Job: &Job{RepGroup: repgroup}, Search: subStr, Limit: limit, State: state, GetStd: getStd, GetEnv: getEnv}) if err != nil { return nil, err } @@ -1467,7 +1474,7 @@ func (c *Client) request(cr *clientRequest) (*serverResponse, error) { if sr.Err != "" { key := "" if cr.Job != nil { - key = cr.Job.key() + key = cr.Job.Key() } return sr, Error{cr.Method, key, sr.Err} } diff --git a/jobqueue/db.go b/jobqueue/db.go index 900cfb6ed..dcc387049 100644 --- a/jobqueue/db.go +++ b/jobqueue/db.go @@ -341,7 +341,7 @@ func (db *db) storeNewJobs(jobs []*Job, ignoreAdded bool) (jobsToQueue []*Job, j newJobKeys := make(map[string]bool) var keptJobs []*Job for _, job := range jobs { - keyStr := job.key() + keyStr := job.Key() if ignoreAdded { var added bool @@ -399,7 +399,7 @@ func (db *db) storeNewJobs(jobs []*Job, ignoreAdded bool) (jobsToQueue []*Job, j // arrange to have resurrected complete jobs stored in the live // bucket again for _, job := range jobsToQueue { - key := []byte(job.key()) + key := []byte(job.Key()) var encoded []byte enc := codec.NewEncoderBytes(&encoded, db.ch) job.RLock() @@ -421,7 +421,7 @@ func (db *db) storeNewJobs(jobs []*Job, ignoreAdded bool) (jobsToQueue []*Job, j } // now go ahead and store the lookups and jobs - numStores := 2 + numStores := 3 if len(dgLookups) > 0 { numStores++ } @@ -444,7 +444,9 @@ func (db *db) storeNewJobs(jobs []*Job, ignoreAdded bool) (jobsToQueue []*Job, j for rg := range repGroups { rgs = append(rgs, [2][]byte{[]byte(rg), nil}) } - sort.Sort(rgs) + if len(rgs) > 1 { + sort.Sort(rgs) + } errors <- db.storeBatched(bucketRGs, rgs, db.storeLookups) }() @@ -842,7 +844,7 @@ func (db *db) updateJobAfterExit(job *Job, stdo []byte, stde []byte, forceStorag if db.closed { return } - jobkey := job.key() + jobkey := job.Key() job.RLock() secs := int(math.Ceil(job.EndTime.Sub(job.StartTime).Seconds())) jrg := job.ReqGroup diff --git a/jobqueue/job.go b/jobqueue/job.go index 5a6a4028e..54325d9bc 100644 --- a/jobqueue/job.go +++ b/jobqueue/job.go @@ -628,7 +628,7 @@ func (j *Job) Unmount(stopUploads ...bool) (logs string, err error) { // ToEssense converts a Job to its matching JobEssense, taking less space and // being required as input for certain methods. func (j *Job) ToEssense() *JobEssence { - return &JobEssence{JobKey: j.key()} + return &JobEssence{JobKey: j.Key()} } // updateAfterExit sets some properties on the job, only if the supplied @@ -674,8 +674,8 @@ func (j *Job) updateRecsAfterFailure() { } } -// key calculates a unique key to describe the job. -func (j *Job) key() string { +// Key calculates a unique key to describe the job. 
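+// When CwdMatters, the key is derived from the job's Cwd, Cmd and mount
+// configs, so the same command run from different working directories gets
+// a different key.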
+func (j *Job) Key() string { if j.CwdMatters { return byteKey([]byte(fmt.Sprintf("%s.%s.%s", j.Cwd, j.Cmd, j.MountConfigs.Key()))) } diff --git a/jobqueue/jobqueue_test.go b/jobqueue/jobqueue_test.go index cc23d4201..ec85dc5a0 100644 --- a/jobqueue/jobqueue_test.go +++ b/jobqueue/jobqueue_test.go @@ -377,11 +377,11 @@ func TestJobqueue(t *testing.T) { So(job.State, ShouldEqual, JobStateReady) } - jobs, err = jq.GetByRepGroup("manually_added", 0, "", false, false) + jobs, err = jq.GetByRepGroup("manually_added", false, 0, "", false, false) So(err, ShouldBeNil) So(len(jobs), ShouldEqual, 10) - jobs, err = jq.GetByRepGroup("foo", 0, "", false, false) + jobs, err = jq.GetByRepGroup("foo", false, 0, "", false, false) So(err, ShouldBeNil) So(len(jobs), ShouldEqual, 0) }) @@ -795,7 +795,7 @@ func TestJobqueue(t *testing.T) { So(job2.Attempts, ShouldEqual, 1) Convey("Both current and archived jobs can be retrieved with GetByRepGroup", func() { - jobs, err = jq.GetByRepGroup("manually_added", 0, "", false, false) + jobs, err = jq.GetByRepGroup("manually_added", false, 0, "", false, false) So(err, ShouldBeNil) So(len(jobs), ShouldEqual, 2) @@ -1035,7 +1035,7 @@ func TestJobqueue(t *testing.T) { if errp == nil && maxRAM > 80000 { // authors high memory system Convey("If a job uses close to all memory on machine it is killed and we recommend more next time", func() { jobs = nil - cmd := "perl -e '@a; for (1..1000) { push(@a, q[a] x 80000000000) }'" + cmd := "perl -e '@a; for (1..1000) { push(@a, q[a] x 8000000000) }'" jobs = append(jobs, &Job{Cmd: cmd, Cwd: "/tmp", ReqGroup: "fake_group", Requirements: standardReqs, Retries: uint8(3), RepGroup: "run_out_of_mem"}) RecMBRound = 1 inserts, already, err := jq.Add(jobs, envVars, true) @@ -1048,7 +1048,7 @@ func TestJobqueue(t *testing.T) { So(job.Cmd, ShouldEqual, cmd) So(job.State, ShouldEqual, JobStateReserved) - ClientPercentMemoryKill = 10 + ClientPercentMemoryKill = 1 err = jq.Execute(job, config.RunnerExecShell) ClientPercentMemoryKill = 90 So(err, ShouldNotBeNil) @@ -1059,7 +1059,7 @@ func TestJobqueue(t *testing.T) { So(job.Exited, ShouldBeTrue) So(job.Exitcode, ShouldEqual, -1) So(job.FailReason, ShouldEqual, FailReasonRAM) - So(job.Requirements.RAM, ShouldBeGreaterThanOrEqualTo, 10000) + So(job.Requirements.RAM, ShouldBeGreaterThanOrEqualTo, 1000) deleted, errd := jq.Delete([]*JobEssence{{Cmd: cmd}}) So(errd, ShouldBeNil) So(deleted, ShouldEqual, 1) @@ -1300,7 +1300,7 @@ func TestJobqueue(t *testing.T) { for { select { case <-ticker.C: - jobs, err = jq.GetByRepGroup("bigerr", 0, JobStateBuried, false, false) + jobs, err = jq.GetByRepGroup("bigerr", false, 0, JobStateBuried, false, false) if err != nil { continue } @@ -1501,12 +1501,12 @@ func TestJobqueue(t *testing.T) { } Convey("The jobs can be retrieved by either RepGroup and will have the expected RepGroup", func() { - jobsg, err := jq.GetByRepGroup("rp1", 0, JobStateComplete, false, false) + jobsg, err := jq.GetByRepGroup("rp1", false, 0, JobStateComplete, false, false) So(err, ShouldBeNil) So(len(jobsg), ShouldEqual, 3) So(jobsg[0].RepGroup, ShouldEqual, "rp1") - jobsg, err = jq.GetByRepGroup("rp2", 0, JobStateComplete, false, false) + jobsg, err = jq.GetByRepGroup("rp2", false, 0, JobStateComplete, false, false) So(err, ShouldBeNil) So(len(jobsg), ShouldEqual, 4) So(jobsg[0].RepGroup, ShouldEqual, "rp2") @@ -1543,12 +1543,12 @@ func TestJobqueue(t *testing.T) { So(job, ShouldBeNil) Convey("The jobs can be retrieved by either RepGroup and will have the expected RepGroup", func() { - jobs, 
err := jq.GetByRepGroup("rp1", 0, JobStateComplete, false, false) + jobs, err := jq.GetByRepGroup("rp1", false, 0, JobStateComplete, false, false) So(err, ShouldBeNil) So(len(jobs), ShouldEqual, 3) So(jobs[0].RepGroup, ShouldEqual, "rp1") - jobs, err = jq.GetByRepGroup("rp2", 0, JobStateComplete, false, false) + jobs, err = jq.GetByRepGroup("rp2", false, 0, JobStateComplete, false, false) So(err, ShouldBeNil) So(len(jobs), ShouldEqual, 4) So(jobs[0].RepGroup, ShouldEqual, "rp2") @@ -1571,6 +1571,15 @@ func TestJobqueue(t *testing.T) { So(inserts, ShouldEqual, 3) So(already, ShouldEqual, 0) + Convey("You can search for the jobs using a common substring of their repgroups", func() { + gottenJobs, err := jq.GetByRepGroup("dep", true, 0, "", false, false) + So(err, ShouldBeNil) + So(len(gottenJobs), ShouldEqual, 3) + gottenJobs, err = jq.GetByRepGroup("2", true, 0, "", false, false) + So(err, ShouldBeNil) + So(len(gottenJobs), ShouldEqual, 1) + }) + Convey("You can reserve and execute one of them", func() { j1, err := jq.Reserve(50 * time.Millisecond) So(err, ShouldBeNil) @@ -1580,7 +1589,7 @@ func TestJobqueue(t *testing.T) { <-time.After(6 * time.Millisecond) - gottenJobs, err := jq.GetByRepGroup("dep1", 0, "", false, false) + gottenJobs, err := jq.GetByRepGroup("dep1", false, 0, "", false, false) So(err, ShouldBeNil) So(len(gottenJobs), ShouldEqual, 1) So(gottenJobs[0].State, ShouldEqual, JobStateComplete) @@ -1608,7 +1617,7 @@ func TestJobqueue(t *testing.T) { // dep4 was added with a dependency on dep1, but after dep1 // was already completed; it should start off in the ready // queue, not the dependent queue - gottenJobs, err = jq.GetByRepGroup("dep4", 0, "", false, false) + gottenJobs, err = jq.GetByRepGroup("dep4", false, 0, "", false, false) So(err, ShouldBeNil) So(len(gottenJobs), ShouldEqual, 1) So(gottenJobs[0].State, ShouldEqual, JobStateReady) @@ -1652,7 +1661,7 @@ func TestJobqueue(t *testing.T) { } }() - gottenJobs, err = jq.GetByRepGroup("dep6", 0, "", false, false) + gottenJobs, err = jq.GetByRepGroup("dep6", false, 0, "", false, false) So(err, ShouldBeNil) So(len(gottenJobs), ShouldEqual, 1) So(gottenJobs[0].State, ShouldEqual, JobStateDependent) @@ -1660,7 +1669,7 @@ func TestJobqueue(t *testing.T) { err = jq.Execute(j4, config.RunnerExecShell) So(err, ShouldBeNil) - gottenJobs, err = jq.GetByRepGroup("dep6", 0, "", false, false) + gottenJobs, err = jq.GetByRepGroup("dep6", false, 0, "", false, false) So(err, ShouldBeNil) So(len(gottenJobs), ShouldEqual, 1) So(gottenJobs[0].State, ShouldEqual, JobStateDependent) @@ -1671,12 +1680,12 @@ func TestJobqueue(t *testing.T) { err = jq.Execute(j3, config.RunnerExecShell) So(err, ShouldBeNil) - gottenJobs, err = jq.GetByRepGroup("dep6", 0, "", false, false) + gottenJobs, err = jq.GetByRepGroup("dep6", false, 0, "", false, false) So(err, ShouldBeNil) So(len(gottenJobs), ShouldEqual, 1) So(gottenJobs[0].State, ShouldEqual, JobStateReady) - gottenJobs, err = jq.GetByRepGroup("dep5", 0, "", false, false) + gottenJobs, err = jq.GetByRepGroup("dep5", false, 0, "", false, false) So(err, ShouldBeNil) So(len(gottenJobs), ShouldEqual, 1) So(gottenJobs[0].State, ShouldEqual, JobStateDependent) @@ -1687,7 +1696,7 @@ func TestJobqueue(t *testing.T) { err = jq.Execute(j2, config.RunnerExecShell) So(err, ShouldBeNil) - gottenJobs, err = jq.GetByRepGroup("dep5", 0, "", false, false) + gottenJobs, err = jq.GetByRepGroup("dep5", false, 0, "", false, false) So(err, ShouldBeNil) So(len(gottenJobs), ShouldEqual, 1) So(gottenJobs[0].State, ShouldEqual, 
JobStateReady) @@ -1719,7 +1728,7 @@ func TestJobqueue(t *testing.T) { } }() - gottenJobs, err = jq.GetByRepGroup("dep8", 0, "", false, false) + gottenJobs, err = jq.GetByRepGroup("dep8", false, 0, "", false, false) So(err, ShouldBeNil) So(len(gottenJobs), ShouldEqual, 1) So(gottenJobs[0].State, ShouldEqual, JobStateDependent) @@ -1727,12 +1736,12 @@ func TestJobqueue(t *testing.T) { err = jq.Execute(j5, config.RunnerExecShell) So(err, ShouldBeNil) - gottenJobs, err = jq.GetByRepGroup("dep8", 0, "", false, false) + gottenJobs, err = jq.GetByRepGroup("dep8", false, 0, "", false, false) So(err, ShouldBeNil) So(len(gottenJobs), ShouldEqual, 1) So(gottenJobs[0].State, ShouldEqual, JobStateReady) - gottenJobs, err = jq.GetByRepGroup("dep7", 0, "", false, false) + gottenJobs, err = jq.GetByRepGroup("dep7", false, 0, "", false, false) So(err, ShouldBeNil) So(len(gottenJobs), ShouldEqual, 1) So(gottenJobs[0].State, ShouldEqual, JobStateDependent) @@ -1743,7 +1752,7 @@ func TestJobqueue(t *testing.T) { err = jq.Execute(j6, config.RunnerExecShell) So(err, ShouldBeNil) - gottenJobs, err = jq.GetByRepGroup("dep7", 0, "", false, false) + gottenJobs, err = jq.GetByRepGroup("dep7", false, 0, "", false, false) So(err, ShouldBeNil) So(len(gottenJobs), ShouldEqual, 1) So(gottenJobs[0].State, ShouldEqual, JobStateReady) @@ -1814,7 +1823,7 @@ func TestJobqueue(t *testing.T) { <-time.After(6 * time.Millisecond) - gottenJobs, err := jq.GetByRepGroup("dep1", 0, "", false, false) + gottenJobs, err := jq.GetByRepGroup("dep1", false, 0, "", false, false) So(err, ShouldBeNil) So(len(gottenJobs), ShouldEqual, 1) So(gottenJobs[0].State, ShouldEqual, JobStateComplete) @@ -1842,7 +1851,7 @@ func TestJobqueue(t *testing.T) { // dep4 was added with a dependency on dep1, but after dep1 // was already completed; it should start off in the ready // queue, not the dependent queue - gottenJobs, err = jq.GetByRepGroup("dep4", 0, "", false, false) + gottenJobs, err = jq.GetByRepGroup("dep4", false, 0, "", false, false) So(err, ShouldBeNil) So(len(gottenJobs), ShouldEqual, 1) So(gottenJobs[0].State, ShouldEqual, JobStateReady) @@ -1886,7 +1895,7 @@ func TestJobqueue(t *testing.T) { } }() - gottenJobs, err = jq.GetByRepGroup("dep6", 0, "", false, false) + gottenJobs, err = jq.GetByRepGroup("dep6", false, 0, "", false, false) So(err, ShouldBeNil) So(len(gottenJobs), ShouldEqual, 1) So(gottenJobs[0].State, ShouldEqual, JobStateDependent) @@ -1894,7 +1903,7 @@ func TestJobqueue(t *testing.T) { err = jq.Execute(j4, config.RunnerExecShell) So(err, ShouldBeNil) - gottenJobs, err = jq.GetByRepGroup("dep6", 0, "", false, false) + gottenJobs, err = jq.GetByRepGroup("dep6", false, 0, "", false, false) So(err, ShouldBeNil) So(len(gottenJobs), ShouldEqual, 1) So(gottenJobs[0].State, ShouldEqual, JobStateDependent) @@ -1905,12 +1914,12 @@ func TestJobqueue(t *testing.T) { err = jq.Execute(j3, config.RunnerExecShell) So(err, ShouldBeNil) - gottenJobs, err = jq.GetByRepGroup("dep6", 0, "", false, false) + gottenJobs, err = jq.GetByRepGroup("dep6", false, 0, "", false, false) So(err, ShouldBeNil) So(len(gottenJobs), ShouldEqual, 1) So(gottenJobs[0].State, ShouldEqual, JobStateReady) - gottenJobs, err = jq.GetByRepGroup("dep5", 0, "", false, false) + gottenJobs, err = jq.GetByRepGroup("dep5", false, 0, "", false, false) So(err, ShouldBeNil) So(len(gottenJobs), ShouldEqual, 1) So(gottenJobs[0].State, ShouldEqual, JobStateDependent) @@ -1921,7 +1930,7 @@ func TestJobqueue(t *testing.T) { err = jq.Execute(j2, config.RunnerExecShell) So(err, 
ShouldBeNil) - gottenJobs, err = jq.GetByRepGroup("dep5", 0, "", false, false) + gottenJobs, err = jq.GetByRepGroup("dep5", false, 0, "", false, false) So(err, ShouldBeNil) So(len(gottenJobs), ShouldEqual, 1) So(gottenJobs[0].State, ShouldEqual, JobStateReady) @@ -1953,7 +1962,7 @@ func TestJobqueue(t *testing.T) { } }() - gottenJobs, err = jq.GetByRepGroup("dep8", 0, "", false, false) + gottenJobs, err = jq.GetByRepGroup("dep8", false, 0, "", false, false) So(err, ShouldBeNil) So(len(gottenJobs), ShouldEqual, 1) So(gottenJobs[0].State, ShouldEqual, JobStateDependent) @@ -1961,12 +1970,12 @@ func TestJobqueue(t *testing.T) { err = jq.Execute(j5, config.RunnerExecShell) So(err, ShouldBeNil) - gottenJobs, err = jq.GetByRepGroup("dep8", 0, "", false, false) + gottenJobs, err = jq.GetByRepGroup("dep8", false, 0, "", false, false) So(err, ShouldBeNil) So(len(gottenJobs), ShouldEqual, 1) So(gottenJobs[0].State, ShouldEqual, JobStateReady) - gottenJobs, err = jq.GetByRepGroup("dep7", 0, "", false, false) + gottenJobs, err = jq.GetByRepGroup("dep7", false, 0, "", false, false) So(err, ShouldBeNil) So(len(gottenJobs), ShouldEqual, 1) So(gottenJobs[0].State, ShouldEqual, JobStateDependent) @@ -1977,7 +1986,7 @@ func TestJobqueue(t *testing.T) { err = jq.Execute(j6, config.RunnerExecShell) So(err, ShouldBeNil) - gottenJobs, err = jq.GetByRepGroup("dep7", 0, "", false, false) + gottenJobs, err = jq.GetByRepGroup("dep7", false, 0, "", false, false) So(err, ShouldBeNil) So(len(gottenJobs), ShouldEqual, 1) So(gottenJobs[0].State, ShouldEqual, JobStateReady) @@ -1993,12 +2002,12 @@ func TestJobqueue(t *testing.T) { So(inserts, ShouldEqual, 2) So(already, ShouldEqual, 0) - gottenJobs, err = jq.GetByRepGroup("afterfinal", 0, "", false, false) + gottenJobs, err = jq.GetByRepGroup("afterfinal", false, 0, "", false, false) So(err, ShouldBeNil) So(len(gottenJobs), ShouldEqual, 1) So(gottenJobs[0].State, ShouldEqual, JobStateDependent) - gottenJobs, err = jq.GetByRepGroup("after-afterfinal", 0, "", false, false) + gottenJobs, err = jq.GetByRepGroup("after-afterfinal", false, 0, "", false, false) So(err, ShouldBeNil) So(len(gottenJobs), ShouldEqual, 1) So(gottenJobs[0].State, ShouldEqual, JobStateDependent) @@ -2014,7 +2023,7 @@ func TestJobqueue(t *testing.T) { err = jq.Execute(j8, config.RunnerExecShell) So(err, ShouldBeNil) - gottenJobs, err = jq.GetByRepGroup("afterfinal", 0, "", false, false) + gottenJobs, err = jq.GetByRepGroup("afterfinal", false, 0, "", false, false) So(err, ShouldBeNil) So(len(gottenJobs), ShouldEqual, 1) So(gottenJobs[0].State, ShouldEqual, JobStateReady) @@ -2026,7 +2035,7 @@ func TestJobqueue(t *testing.T) { So(inserts, ShouldEqual, 1) So(already, ShouldEqual, 0) - gottenJobs, err = jq.GetByRepGroup("afterfinal", 0, "", false, false) + gottenJobs, err = jq.GetByRepGroup("afterfinal", false, 0, "", false, false) So(err, ShouldBeNil) So(len(gottenJobs), ShouldEqual, 1) So(gottenJobs[0].State, ShouldEqual, JobStateDependent) @@ -2037,7 +2046,7 @@ func TestJobqueue(t *testing.T) { err = jq.Execute(j9, config.RunnerExecShell) So(err, ShouldBeNil) - gottenJobs, err = jq.GetByRepGroup("afterfinal", 0, "", false, false) + gottenJobs, err = jq.GetByRepGroup("afterfinal", false, 0, "", false, false) So(err, ShouldBeNil) So(len(gottenJobs), ShouldEqual, 1) So(gottenJobs[0].State, ShouldEqual, JobStateReady) @@ -2048,12 +2057,12 @@ func TestJobqueue(t *testing.T) { err = jq.Execute(faf, config.RunnerExecShell) So(err, ShouldBeNil) - gottenJobs, err = jq.GetByRepGroup("afterfinal", 0, "", false, 
false) + gottenJobs, err = jq.GetByRepGroup("afterfinal", false, 0, "", false, false) So(err, ShouldBeNil) So(len(gottenJobs), ShouldEqual, 1) So(gottenJobs[0].State, ShouldEqual, JobStateComplete) - gottenJobs, err = jq.GetByRepGroup("after-afterfinal", 0, "", false, false) + gottenJobs, err = jq.GetByRepGroup("after-afterfinal", false, 0, "", false, false) So(err, ShouldBeNil) So(len(gottenJobs), ShouldEqual, 1) So(gottenJobs[0].State, ShouldEqual, JobStateReady) @@ -2063,12 +2072,12 @@ func TestJobqueue(t *testing.T) { So(inserts, ShouldEqual, 2) // the job I added, and the resurrected afterfinal job So(already, ShouldEqual, 0) - gottenJobs, err = jq.GetByRepGroup("afterfinal", 0, "", false, false) + gottenJobs, err = jq.GetByRepGroup("afterfinal", false, 0, "", false, false) So(err, ShouldBeNil) So(len(gottenJobs), ShouldEqual, 1) So(gottenJobs[0].State, ShouldEqual, JobStateDependent) - gottenJobs, err = jq.GetByRepGroup("after-afterfinal", 0, "", false, false) + gottenJobs, err = jq.GetByRepGroup("after-afterfinal", false, 0, "", false, false) So(err, ShouldBeNil) So(len(gottenJobs), ShouldEqual, 1) So(gottenJobs[0].State, ShouldEqual, JobStateDependent) @@ -2079,7 +2088,7 @@ func TestJobqueue(t *testing.T) { err = jq.Execute(j9, config.RunnerExecShell) So(err, ShouldBeNil) - gottenJobs, err = jq.GetByRepGroup("afterfinal", 0, "", false, false) + gottenJobs, err = jq.GetByRepGroup("afterfinal", false, 0, "", false, false) So(err, ShouldBeNil) So(len(gottenJobs), ShouldEqual, 1) So(gottenJobs[0].State, ShouldEqual, JobStateReady) @@ -2090,12 +2099,12 @@ func TestJobqueue(t *testing.T) { err = jq.Execute(faf, config.RunnerExecShell) So(err, ShouldBeNil) - gottenJobs, err = jq.GetByRepGroup("afterfinal", 0, "", false, false) + gottenJobs, err = jq.GetByRepGroup("afterfinal", false, 0, "", false, false) So(err, ShouldBeNil) So(len(gottenJobs), ShouldEqual, 1) So(gottenJobs[0].State, ShouldEqual, JobStateComplete) - gottenJobs, err = jq.GetByRepGroup("after-afterfinal", 0, "", false, false) + gottenJobs, err = jq.GetByRepGroup("after-afterfinal", false, 0, "", false, false) So(err, ShouldBeNil) So(len(gottenJobs), ShouldEqual, 1) So(gottenJobs[0].State, ShouldEqual, JobStateReady) @@ -2106,7 +2115,7 @@ func TestJobqueue(t *testing.T) { err = jq.Execute(faaf, config.RunnerExecShell) So(err, ShouldBeNil) - gottenJobs, err = jq.GetByRepGroup("after-afterfinal", 0, "", false, false) + gottenJobs, err = jq.GetByRepGroup("after-afterfinal", false, 0, "", false, false) So(err, ShouldBeNil) So(len(gottenJobs), ShouldEqual, 1) So(gottenJobs[0].State, ShouldEqual, JobStateComplete) @@ -2118,12 +2127,12 @@ func TestJobqueue(t *testing.T) { So(inserts, ShouldEqual, 3) So(already, ShouldEqual, 0) - gottenJobs, err = jq.GetByRepGroup("afterfinal", 0, "", false, false) + gottenJobs, err = jq.GetByRepGroup("afterfinal", false, 0, "", false, false) So(err, ShouldBeNil) So(len(gottenJobs), ShouldEqual, 1) So(gottenJobs[0].State, ShouldEqual, JobStateDependent) - gottenJobs, err = jq.GetByRepGroup("after-afterfinal", 0, "", false, false) + gottenJobs, err = jq.GetByRepGroup("after-afterfinal", false, 0, "", false, false) So(err, ShouldBeNil) So(len(gottenJobs), ShouldEqual, 1) So(gottenJobs[0].State, ShouldEqual, JobStateDependent) @@ -2253,7 +2262,7 @@ func TestJobqueue(t *testing.T) { jq, err = Connect(addr, config.ManagerCAFile, config.ManagerCertDomain, token, clientConnectTime) So(err, ShouldBeNil) - jobsByRepGroup, err := jq.GetByRepGroup("manually_added", 0, "", false, false) + jobsByRepGroup, err := 
jq.GetByRepGroup("manually_added", false, 0, "", false, false) So(err, ShouldBeNil) So(len(jobsByRepGroup), ShouldEqual, 0) @@ -2268,13 +2277,13 @@ func TestJobqueue(t *testing.T) { jq, err = Connect(addr, config.ManagerCAFile, config.ManagerCertDomain, token, clientConnectTime) So(err, ShouldBeNil) - jobsByRepGroup, err = jq.GetByRepGroup("manually_added", 0, "", false, false) + jobsByRepGroup, err = jq.GetByRepGroup("manually_added", false, 0, "", false, false) So(err, ShouldBeNil) So(len(jobsByRepGroup), ShouldEqual, 2) }) Convey("You can stop the server, delete or corrupt the database, and it will be restored from backup", func() { - jobsByRepGroup, err := jq.GetByRepGroup("manually_added", 0, "", false, false) + jobsByRepGroup, err := jq.GetByRepGroup("manually_added", false, 0, "", false, false) So(err, ShouldBeNil) So(len(jobsByRepGroup), ShouldEqual, 2) @@ -2288,7 +2297,7 @@ func TestJobqueue(t *testing.T) { jq, err = Connect(addr, config.ManagerCAFile, config.ManagerCertDomain, token, clientConnectTime) So(err, ShouldBeNil) - jobsByRepGroup, err = jq.GetByRepGroup("manually_added", 0, "", false, false) + jobsByRepGroup, err = jq.GetByRepGroup("manually_added", false, 0, "", false, false) So(err, ShouldBeNil) So(len(jobsByRepGroup), ShouldEqual, 2) @@ -2312,7 +2321,7 @@ func TestJobqueue(t *testing.T) { jq, err = Connect(addr, config.ManagerCAFile, config.ManagerCertDomain, token, clientConnectTime) So(err, ShouldBeNil) - jobsByRepGroup, err = jq.GetByRepGroup("manually_added", 0, "", false, false) + jobsByRepGroup, err = jq.GetByRepGroup("manually_added", false, 0, "", false, false) So(err, ShouldBeNil) So(len(jobsByRepGroup), ShouldEqual, 2) @@ -2336,7 +2345,7 @@ func TestJobqueue(t *testing.T) { jq, err = Connect(addr, config.ManagerCAFile, config.ManagerCertDomain, token, clientConnectTime) So(err, ShouldBeNil) - jobsByRepGroup, err = jq.GetByRepGroup("manually_added", 0, "", false, false) + jobsByRepGroup, err = jq.GetByRepGroup("manually_added", false, 0, "", false, false) So(err, ShouldBeNil) So(len(jobsByRepGroup), ShouldEqual, 2) }) @@ -2408,7 +2417,7 @@ func TestJobqueue(t *testing.T) { So(job, ShouldNotBeNil) errb := jq.Bury(job, nil, "") So(errb, ShouldBeNil) - deleted, err := jq.Delete([]*JobEssence{{JobKey: job.key()}}) + deleted, err := jq.Delete([]*JobEssence{{JobKey: job.Key()}}) server.Stop(true) So(deleted, ShouldEqual, 1) So(err, ShouldBeNil) @@ -2582,7 +2591,7 @@ func TestJobqueue(t *testing.T) { for { select { case <-ticker.C: - jobs, err = jq.GetByRepGroup("manually_added", 0, JobStateRunning, false, false) + jobs, err = jq.GetByRepGroup("manually_added", false, 0, JobStateRunning, false, false) if err != nil { continue } @@ -2613,7 +2622,7 @@ func TestJobqueue(t *testing.T) { for { select { case <-ticker.C: - jobs, err = jq.GetByRepGroup("manually_added", 0, JobStateBuried, false, false) + jobs, err = jq.GetByRepGroup("manually_added", false, 0, JobStateBuried, false, false) if err != nil { continue } @@ -2632,7 +2641,7 @@ func TestJobqueue(t *testing.T) { }() So(<-killed, ShouldBeTrue) - jobs, err = jq.GetByRepGroup("manually_added", 0, JobStateBuried, false, false) + jobs, err = jq.GetByRepGroup("manually_added", false, 0, JobStateBuried, false, false) So(err, ShouldBeNil) So(len(jobs), ShouldEqual, 1) So(jobs[0].State, ShouldEqual, JobStateBuried) @@ -2685,7 +2694,7 @@ func TestJobqueue(t *testing.T) { }() So(<-done, ShouldBeTrue) // we shouldn't have hit our time limit - jobs, err = jq.GetByRepGroup("manually_added", 0, "", false, false) + jobs, err = 
jq.GetByRepGroup("manually_added", false, 0, "", false, false) So(err, ShouldBeNil) So(len(jobs), ShouldEqual, count) ran := 0 @@ -2742,7 +2751,7 @@ func TestJobqueue(t *testing.T) { for { select { case <-ticker.C: - jobs, err = jq.GetByRepGroup("manually_added", 0, JobStateBuried, false, false) + jobs, err = jq.GetByRepGroup("manually_added", false, 0, JobStateBuried, false, false) if err != nil { continue } @@ -2792,11 +2801,11 @@ func TestJobqueue(t *testing.T) { go waitForNoRunners() So(<-done2, ShouldBeTrue) - jobs, err = jq.GetByRepGroup("manually_added", 0, JobStateBuried, false, false) + jobs, err = jq.GetByRepGroup("manually_added", false, 0, JobStateBuried, false, false) So(err, ShouldBeNil) So(len(jobs), ShouldEqual, 1) - buriedKey := jobs[0].key() - jobs, err = jq.GetByRepGroup("manually_added", 0, JobStateComplete, false, false) + buriedKey := jobs[0].Key() + jobs, err = jq.GetByRepGroup("manually_added", false, 0, JobStateComplete, false, false) So(err, ShouldBeNil) So(len(jobs), ShouldEqual, 1) @@ -3067,7 +3076,7 @@ func TestJobqueue(t *testing.T) { for { select { case <-ticker.C: - jobs, err = jq.GetByRepGroup("manually_added", 0, JobStateComplete, false, false) + jobs, err = jq.GetByRepGroup("manually_added", false, 0, JobStateComplete, false, false) if err != nil { continue } @@ -3126,7 +3135,7 @@ func TestJobqueue(t *testing.T) { // check they're really all complete, since the // switch to a new job array could leave us with no // runners temporarily - jobs, err = jq.GetByRepGroup("manually_added", 0, JobStateComplete, false, false) + jobs, err = jq.GetByRepGroup("manually_added", false, 0, JobStateComplete, false, false) if err == nil && len(jobs) == count+count2 { ticker.Stop() done <- true @@ -3150,7 +3159,7 @@ func TestJobqueue(t *testing.T) { So(<-done, ShouldBeTrue) So(twoHundredCount, ShouldBeBetween, fourHundredCount/2, count+count2) - jobs, err = jq.GetByRepGroup("manually_added", 0, "", false, false) + jobs, err = jq.GetByRepGroup("manually_added", false, 0, "", false, false) So(err, ShouldBeNil) So(len(jobs), ShouldEqual, count+count2) ran := 0 @@ -3281,7 +3290,7 @@ func TestJobqueue(t *testing.T) { So(<-hadRunner, ShouldBeTrue) <-time.After(1 * time.Second) - jobs, err = jq.GetByRepGroup("manually_added", 0, JobStateReady, false, false) + jobs, err = jq.GetByRepGroup("manually_added", false, 0, JobStateReady, false, false) So(err, ShouldBeNil) So(len(jobs), ShouldEqual, 1) @@ -3453,19 +3462,19 @@ sudo usermod -aG docker ` + osUser So(<-done, ShouldBeTrue) expectedRAM := 2000 - got, err := jq.GetByRepGroup("first_docker", 0, JobStateComplete, false, false) + got, err := jq.GetByRepGroup("first_docker", false, 0, JobStateComplete, false, false) So(err, ShouldBeNil) So(len(got), ShouldEqual, 1) So(got[0].PeakRAM, ShouldBeGreaterThanOrEqualTo, expectedRAM) So(got[0].WallTime(), ShouldBeBetweenOrEqual, 5*time.Second, 15*time.Second) So(got[0].CPUtime, ShouldBeLessThan, 4*time.Second) - got, err = jq.GetByRepGroup("named_docker", 0, JobStateComplete, false, false) + got, err = jq.GetByRepGroup("named_docker", false, 0, JobStateComplete, false, false) So(err, ShouldBeNil) So(len(got), ShouldEqual, 1) So(got[0].PeakRAM, ShouldBeGreaterThanOrEqualTo, expectedRAM) - got, err = jq.GetByRepGroup("cidfile_docker", 0, JobStateComplete, false, false) + got, err = jq.GetByRepGroup("cidfile_docker", false, 0, JobStateComplete, false, false) So(err, ShouldBeNil) So(len(got), ShouldEqual, 1) So(got[0].PeakRAM, ShouldBeLessThan, 100) @@ -3530,7 +3539,7 @@ sudo usermod -aG docker 
` + osUser }() So(<-done, ShouldBeTrue) - got, err := jq.GetByRepGroup("with_config_file", 0, JobStateBuried, true, false) + got, err := jq.GetByRepGroup("with_config_file", false, 0, JobStateBuried, true, false) So(err, ShouldBeNil) So(len(got), ShouldEqual, 1) stderr, err := got[0].StdErr() @@ -3623,7 +3632,7 @@ sudo usermod -aG docker ` + osUser select { case <-ticker.C: if server.HasRunners() { - got, errf := jq.GetByRepGroup("sleep", 0, JobStateRunning, false, false) + got, errf := jq.GetByRepGroup("sleep", false, 0, JobStateRunning, false, false) if errf != nil { ticker.Stop() started <- false @@ -3650,7 +3659,7 @@ sudo usermod -aG docker ` + osUser // them, while monitoring that we never request more than 2 // runners, and that we eventually spawn exactly 1 new server // to get the killed job running again - got, err := jq.GetByRepGroup("sleep", 0, JobStateRunning, false, false) + got, err := jq.GetByRepGroup("sleep", false, 0, JobStateRunning, false, false) So(err, ShouldBeNil) So(len(got), ShouldEqual, 2) @@ -3686,7 +3695,7 @@ sudo usermod -aG docker ` + osUser err = p.DestroyServer(job.HostID) So(err, ShouldBeNil) destroyed = true - killedJobEssence = &JobEssence{JobKey: job.key()} + killedJobEssence = &JobEssence{JobKey: job.Key()} break } } @@ -3866,7 +3875,7 @@ func TestJobqueueWithMounts(t *testing.T) { So(info2.Size(), ShouldEqual, 28672) Convey("You can stop the server, delete the database, and it will be restored from S3 backup", func() { - jobsByRepGroup, err := jq.GetByRepGroup("manually_added", 0, "", false, false) + jobsByRepGroup, err := jq.GetByRepGroup("manually_added", false, 0, "", false, false) So(err, ShouldBeNil) So(len(jobsByRepGroup), ShouldEqual, 1) @@ -3881,7 +3890,7 @@ func TestJobqueueWithMounts(t *testing.T) { jq, err = Connect(addr, config.ManagerCAFile, config.ManagerCertDomain, token, clientConnectTime) So(err, ShouldBeNil) - jobsByRepGroup, err = jq.GetByRepGroup("manually_added", 0, "", false, false) + jobsByRepGroup, err = jq.GetByRepGroup("manually_added", false, 0, "", false, false) So(err, ShouldBeNil) So(len(jobsByRepGroup), ShouldEqual, 1) @@ -3906,7 +3915,7 @@ func TestJobqueueWithMounts(t *testing.T) { jq, err = Connect(addr, config.ManagerCAFile, config.ManagerCertDomain, token, clientConnectTime) So(err, ShouldBeNil) - jobsByRepGroup, err = jq.GetByRepGroup("manually_added", 0, "", false, false) + jobsByRepGroup, err = jq.GetByRepGroup("manually_added", false, 0, "", false, false) So(err, ShouldBeNil) So(len(jobsByRepGroup), ShouldEqual, 1) }) diff --git a/jobqueue/server.go b/jobqueue/server.go index 633d5f950..ec262069b 100644 --- a/jobqueue/server.go +++ b/jobqueue/server.go @@ -33,6 +33,7 @@ import ( "os/signal" "path" "path/filepath" + "strings" "sync" "sync/atomic" "syscall" @@ -533,7 +534,7 @@ func Serve(config ServerConfig) (s *Server, msg string, token []byte, err error) if err != nil { return nil, msg, token, err } - itemdefs = append(itemdefs, &queue.ItemDef{Key: job.key(), ReserveGroup: job.getSchedulerGroup(), Data: job, Priority: job.Priority, Delay: 0 * time.Second, TTR: ServerItemTTR, Dependencies: deps}) + itemdefs = append(itemdefs, &queue.ItemDef{Key: job.Key(), ReserveGroup: job.getSchedulerGroup(), Data: job, Priority: job.Priority, Delay: 0 * time.Second, TTR: ServerItemTTR, Dependencies: deps}) } _, _, err = s.enqueueItems(itemdefs) if err != nil { @@ -1012,7 +1013,7 @@ func (s *Server) createQueue() { job.setScheduledRunner(false) } if s.rc != "" { - errs := q.SetReserveGroup(job.key(), schedulerGroup) + errs := 
q.SetReserveGroup(job.Key(), schedulerGroup) if errs != nil { // we could be trying to set the reserve group after the // job has already completed, if they complete @@ -1303,7 +1304,7 @@ func (s *Server) createJobs(inputJobs []*Job, envkey string, ignoreComplete bool qerr = err break } - itemdefs = append(itemdefs, &queue.ItemDef{Key: job.key(), ReserveGroup: job.getSchedulerGroup(), Data: job, Priority: job.Priority, Delay: 0 * time.Second, TTR: ServerItemTTR, Dependencies: deps}) + itemdefs = append(itemdefs, &queue.ItemDef{Key: job.Key(), ReserveGroup: job.getSchedulerGroup(), Data: job, Priority: job.Priority, Delay: 0 * time.Second, TTR: ServerItemTTR, Dependencies: deps}) } // storeNewJobs also returns jobsToUpdate, which are those jobs @@ -1316,7 +1317,7 @@ func (s *Server) createJobs(inputJobs []*Job, envkey string, ignoreComplete bool qerr = err break } - thisErr := s.q.Update(job.key(), job.getSchedulerGroup(), job, job.Priority, 0*time.Second, ServerItemTTR, deps) + thisErr := s.q.Update(job.Key(), job.getSchedulerGroup(), job, job.Priority, 0*time.Second, ServerItemTTR, deps) if thisErr != nil { qerr = thisErr break @@ -1333,7 +1334,6 @@ func (s *Server) createJobs(inputJobs []*Job, envkey string, ignoreComplete bool } } } - return added, dups, alreadyComplete, srerr, qerr } @@ -1424,33 +1424,63 @@ func (s *Server) getJobsByKeys(keys []string, getStd bool, getEnv bool) (jobs [] return jobs, srerr, qerr } +// searchRepGroups looks up the rep groups of all jobs that have ever been added +// and returns those that contain the given sub string. +func (s *Server) searchRepGroups(partialRepGroup string) ([]string, error) { + rgs, err := s.db.retrieveRepGroups() + if err != nil { + return nil, err + } + + var matching []string + for _, rg := range rgs { + if strings.Contains(rg, partialRepGroup) { + matching = append(matching, rg) + } + } + return matching, err +} + // getJobsByRepGroup gets jobs in the given group (current and complete). -func (s *Server) getJobsByRepGroup(repgroup string, limit int, state JobState, getStd bool, getEnv bool) (jobs []*Job, srerr string, qerr string) { - // look in the in-memory queue for matching jobs - s.rpl.RLock() - for key := range s.rpl.lookup[repgroup] { - item, err := s.q.Get(key) - if err == nil && item != nil { - job := s.itemToJob(item, false, false) - jobs = append(jobs, job) +func (s *Server) getJobsByRepGroup(repgroup string, search bool, limit int, state JobState, getStd bool, getEnv bool) (jobs []*Job, srerr string, qerr string) { + var rgs []string + if search { + var errs error + rgs, errs = s.searchRepGroups(repgroup) + if errs != nil { + return nil, ErrDBError, errs.Error() } + } else { + rgs = append(rgs, repgroup) } - s.rpl.RUnlock() - - // look in the permanent store for matching jobs - if state == "" || state == JobStateComplete { - var complete []*Job - complete, srerr, qerr = s.getCompleteJobsByRepGroup(repgroup) - if len(complete) > 0 { - // a job is stored in the db with only the single most recent - // RepGroup it had, but we're able to retrieve jobs based on any of - // the RepGroups it ever had; set the RepGroup to the one the user - // requested *** may want to change RepGroup to store a slice of - // RepGroups? But that could be massive... 
- for _, cj := range complete { - cj.RepGroup = repgroup + + for _, rg := range rgs { + // look in the in-memory queue for matching jobs + s.rpl.RLock() + for key := range s.rpl.lookup[rg] { + item, err := s.q.Get(key) + if err == nil && item != nil { + job := s.itemToJob(item, false, false) + jobs = append(jobs, job) + } + } + s.rpl.RUnlock() + + // look in the permanent store for matching jobs + if state == "" || state == JobStateComplete { + var complete []*Job + complete, srerr, qerr = s.getCompleteJobsByRepGroup(rg) + if len(complete) > 0 { + // a job is stored in the db with only the single most recent + // RepGroup it had, but we're able to retrieve jobs based on any of + // the RepGroups it ever had; set the RepGroup to the one the user + // requested *** may want to change RepGroup to store a slice of + // RepGroups? But that could be massive... + for _, cj := range complete { + cj.RepGroup = rg + } + jobs = append(jobs, complete...) } - jobs = append(jobs, complete...) } } @@ -1460,16 +1490,6 @@ func (s *Server) getJobsByRepGroup(repgroup string, limit int, state JobState, g return jobs, srerr, qerr } -// searchRepGroups looks up the rep groups of all jobs that have ever been added -// and returns those that contain the given sub string. -func (s *Server) searchRepGroups(partialRepGroup string) ([]string, error) { - rgs, err := s.db.retrieveRepGroups() - if err != nil { - return nil, err - } - return rgs, err -} - // getCompleteJobsByRepGroup gets complete jobs in the given group. func (s *Server) getCompleteJobsByRepGroup(repgroup string) (jobs []*Job, srerr string, qerr string) { jobs, err := s.db.retrieveCompleteJobsByRepGroup(repgroup) diff --git a/jobqueue/serverCLI.go b/jobqueue/serverCLI.go index b58e6b0a0..c9b36808b 100644 --- a/jobqueue/serverCLI.go +++ b/jobqueue/serverCLI.go @@ -330,7 +330,7 @@ func (s *Server) handleRequest(m *mangos.Message) error { srerr = ErrBadRequest job.Unlock() } else { - key := job.key() + key := job.Key() job.State = JobStateComplete job.FailReason = "" sgroup := job.schedulerGroup @@ -539,7 +539,7 @@ func (s *Server) handleRequest(m *mangos.Message) error { srerr = ErrBadRequest } else { var jobs []*Job - jobs, srerr, qerr = s.getJobsByRepGroup(cr.Job.RepGroup, cr.Limit, cr.State, cr.GetStd, cr.GetEnv) + jobs, srerr, qerr = s.getJobsByRepGroup(cr.Job.RepGroup, cr.Search, cr.Limit, cr.State, cr.GetStd, cr.GetEnv) if len(jobs) > 0 { sr = &serverResponse{Jobs: jobs} } @@ -567,7 +567,7 @@ func (s *Server) handleRequest(m *mangos.Message) error { } key := "" if cr.Job != nil { - key = cr.Job.key() + key = cr.Job.Key() } return Error{cr.Method, key, qerr} } @@ -633,7 +633,7 @@ func (s *Server) getij(cr *clientRequest) (*queue.Item, *Job, string) { return nil, nil, ErrBadRequest } - item, err := s.q.Get(cr.Job.key()) + item, err := s.q.Get(cr.Job.Key()) if err != nil || item.Stats().State != queue.ItemStateRun { return item, nil, ErrBadJob } @@ -721,7 +721,7 @@ func (s *Server) jobPopulateStdEnv(job *Job, getStd bool, getEnv bool) { job.Lock() defer job.Unlock() if getStd && ((job.Exited && job.Exitcode != 0) || job.State == JobStateBuried) { - job.StdOutC, job.StdErrC = s.db.retrieveJobStd(job.key()) + job.StdOutC, job.StdErrC = s.db.retrieveJobStd(job.Key()) } if getEnv { job.EnvC = s.db.retrieveEnv(job.EnvKey) diff --git a/jobqueue/serverREST.go b/jobqueue/serverREST.go index 8bf692e0c..9d613027b 100644 --- a/jobqueue/serverREST.go +++ b/jobqueue/serverREST.go @@ -513,16 +513,20 @@ func restJobs(s *Server) http.HandlerFunc { // restJobsStatus 
gets the status of the requested jobs in the queue. The // request url can be suffixed with comma separated job keys or RepGroups. -// Possible query parameters are std, env (which can take a "true" value), limit -// (a number) and state (one of delayed|ready|reserved|running|lost|buried| -// dependent|complete). Returns the Jobs, a http.Status* value and error. +// Possible query parameters are search, std, env (which can take a "true" +// value), limit (a number) and state (one of +// delayed|ready|reserved|running|lost|buried| dependent|complete). Returns the +// Jobs, a http.Status* value and error. func restJobsStatus(r *http.Request, s *Server) ([]*Job, int, error) { // handle possible ?query parameters - var getStd, getEnv bool + var search, getStd, getEnv bool var limit int var state JobState var err error + if r.Form.Get("search") == restFormTrue { + search = true + } if r.Form.Get("std") == restFormTrue { getStd = true } @@ -571,7 +575,7 @@ func restJobsStatus(r *http.Request, s *Server) ([]*Job, int, error) { } // id might be a Job.RepGroup - theseJobs, _, qerr := s.getJobsByRepGroup(id, limit, state, getStd, getEnv) + theseJobs, _, qerr := s.getJobsByRepGroup(id, search, limit, state, getStd, getEnv) if qerr != "" { return nil, http.StatusInternalServerError, fmt.Errorf(qerr) } @@ -718,7 +722,7 @@ func restJobsAdd(r *http.Request, s *Server) ([]*Job, int, error) { // slow and wasteful?... var jobs []*Job for _, job := range inputJobs { - item, qerr := s.q.Get(job.key()) + item, qerr := s.q.Get(job.Key()) if qerr == nil && item != nil { // append the q's version of the job, not the input job, since the // job may have been a duplicate and we want to return its current diff --git a/jobqueue/serverWebI.go b/jobqueue/serverWebI.go index 8cf1e24cf..1b87b3344 100644 --- a/jobqueue/serverWebI.go +++ b/jobqueue/serverWebI.go @@ -270,7 +270,7 @@ func webInterfaceStatusWS(s *Server) http.HandlerFunc { // *** probably want to take the count as a req option, // so user can request to see more than just 1 job per // State+Exitcode+FailReason - jobs, _, errstr := s.getJobsByRepGroup(req.RepGroup, 1, req.State, true, true) + jobs, _, errstr := s.getJobsByRepGroup(req.RepGroup, false, 1, req.State, true, true) if errstr == "" && len(jobs) > 0 { writeMutex.Lock() failed := false @@ -291,7 +291,7 @@ func webInterfaceStatusWS(s *Server) http.HandlerFunc { case "retry": jobs := s.reqToJobs(req, []queue.ItemState{queue.ItemStateBury}) for _, job := range jobs { - err := s.q.Kick(job.key()) + err := s.q.Kick(job.Key()) if err != nil { continue } @@ -301,7 +301,7 @@ func webInterfaceStatusWS(s *Server) http.HandlerFunc { jobs := s.reqToJobs(req, []queue.ItemState{queue.ItemStateBury, queue.ItemStateDelay, queue.ItemStateDependent, queue.ItemStateReady}) var toDelete []string for _, job := range jobs { - key := job.key() + key := job.Key() // we can't allow the removal of jobs that have // dependencies, as *queue would regard that as @@ -332,7 +332,7 @@ func webInterfaceStatusWS(s *Server) http.HandlerFunc { case "kill": jobs := s.reqToJobs(req, []queue.ItemState{queue.ItemStateRun}) for _, job := range jobs { - _, err := s.killJob(job.key()) + _, err := s.killJob(job.Key()) if err != nil { s.Warn("web interface kill job failed", "err", err) } @@ -464,7 +464,7 @@ func jobToStatus(job *Job) jstatus { ot = append(ot, key+":"+val) } return jstatus{ - Key: job.key(), + Key: job.Key(), RepGroup: job.RepGroup, DepGroups: job.DepGroups, Dependencies: job.Dependencies.Stringify(), From 
f6ed049792d6cac278e6c3f9c1b8edf01479ae72 Mon Sep 17 00:00:00 2001 From: Sendu Bala Date: Tue, 8 May 2018 16:09:01 +0100 Subject: [PATCH 11/33] Resolve #114: status now takes -o option and outputs in more formats. jstatus made public be renaming to JStatus. job gets new method ToStatus(), replacing jobqueue.jobToStatus(). --- cmd/status.go | 148 ++++++++++++++++++++++++++++++++++------- glide.lock | 6 +- glide.yaml | 1 + jobqueue/job.go | 64 +++++++++++++++++- jobqueue/rest_test.go | 28 ++++---- jobqueue/serverREST.go | 4 +- jobqueue/serverWebI.go | 66 ++---------------- 7 files changed, 209 insertions(+), 108 deletions(-) diff --git a/cmd/status.go b/cmd/status.go index dc6613709..323a8756c 100644 --- a/cmd/status.go +++ b/cmd/status.go @@ -19,11 +19,15 @@ package cmd import ( + "encoding/json" "fmt" + "os" + "sort" "strings" "time" "github.com/VertebrateResequencing/wr/jobqueue" + "github.com/carbocation/runningvariance" "github.com/spf13/cobra" ) @@ -38,7 +42,7 @@ var cmdLine string var showBuried bool var showStd bool var showEnv bool -var quietMode bool +var outputFormat string var statusLimit int // statusCmd represents the status command @@ -49,7 +53,7 @@ var statusCmd = &cobra.Command{ "wr add" or "wr setup" by running this command. Specify one of the flags -f, -l or -i to choose which commands you want the -status of. If none are supplied, it gives you an overview of all your currently +status of. If none are supplied, you will get the status of all your currently incomplete commands. -i is the report group (-i) you supplied to "wr add" when you added the job(s) @@ -66,12 +70,22 @@ options that was used when the command was added, if any. You can do this by using the -c and --mounts/--mounts_json options in -l mode, or by providing the same file you gave to "wr add" in -f mode. -By default, commands with the same state, reason for failure and exitcode are -grouped together and only a random 1 of them is displayed (and you are told how -many were skipped). --limit changes how many commands in each of these groups -are displayed. A limit of 0 turns off grouping and shows all your desired -commands individually, but you could hit a timeout if retrieving the details of -very many (tens of thousands+) commands.`, +There are 4 output formats to choose from with -o (you can shorten the output +name to just the first letter, eg. -o c): + "counts" just displays the count of jobs in each possible state. + "summary" shows the counts broken down by report group, along with the mean + (and standard deviation) resource usage of completed jobs in each report + group, and the internal identifiers of any buried jobs, broken down by exit + code+failure reason. + "details" groups jobs with the same state, reason for failure and exitcode + together and shows the complete details of --limit random jobs in each group + (and you are told how many are not being displayed). A limit of 0 turns off + grouping and shows all your desired commands individually, but you could hit + a timeout if retrieving the details of very many (tens of thousands+) + commands. + "json" simply dumps the complete details of every job out as an array of + JSON objects. 
The properties of the JSON objects are described in the + documentation for wr's REST API.`, Run: func(cmd *cobra.Command, args []string) { set := countGetJobArgs() if set > 1 { @@ -92,10 +106,16 @@ very many (tens of thousands+) commands.`, } }() + if outputFormat != "details" && outputFormat != "d" { + statusLimit = 0 + showStd = false + showEnv = false + } jobs := getJobs(jq, cmdState, set == 0, statusLimit, showStd, showEnv) showextra := cmdFileStatus == "" - if quietMode { + switch outputFormat { + case "counts", "c": var d, re, b, ru, l, c, dep int for _, job := range jobs { switch job.State { @@ -116,7 +136,71 @@ very many (tens of thousands+) commands.`, } } fmt.Printf("complete: %d\nrunning: %d\nready: %d\ndependent: %d\nlost contact: %d\ndelayed: %d\nburied: %d\n", c, ru, re, dep, l, d, b) - } else { + case "summary", "s": + counts := make(map[string]map[jobqueue.JobState]int) + buried := make(map[string]map[string][]string) + memory := make(map[string]*runningvariance.RunningStat) + walltime := make(map[string]*runningvariance.RunningStat) + cputime := make(map[string]*runningvariance.RunningStat) + for _, job := range jobs { + if _, exists := counts[job.RepGroup]; !exists { + counts[job.RepGroup] = make(map[jobqueue.JobState]int) + } + state := job.State + if state == jobqueue.JobStateReserved { + state = jobqueue.JobStateRunning + } + counts[job.RepGroup][job.State]++ + + if state == jobqueue.JobStateBuried { + if _, exists := buried[job.RepGroup]; !exists { + buried[job.RepGroup] = make(map[string][]string) + } + group := fmt.Sprintf("exitcode.%d,\"%s\"", job.Exitcode, job.FailReason) + buried[job.RepGroup][group] = append(buried[job.RepGroup][group], job.Key()) + } else if state == jobqueue.JobStateComplete { + if _, exists := memory[job.RepGroup]; !exists { + memory[job.RepGroup] = runningvariance.NewRunningStat() + walltime[job.RepGroup] = runningvariance.NewRunningStat() + cputime[job.RepGroup] = runningvariance.NewRunningStat() + } + memory[job.RepGroup].Push(float64(job.PeakRAM)) + walltime[job.RepGroup].Push(float64(job.WallTime())) + cputime[job.RepGroup].Push(float64(job.CPUtime)) + } + } + + // sort RepGroups for a nicer display + var rgs []string + for rg := range counts { + rgs = append(rgs, rg) + } + sort.Strings(rgs) + + // display summary for each RepGroup + for _, rg := range rgs { + var usage string + if counts[rg][jobqueue.JobStateComplete] > 0 { + usage = fmt.Sprintf(" memory=%dMB(+/-%dMB) walltime=%s(+/-%s) cputime=%s(+/-%s)", int(memory[rg].Mean()), int(memory[rg].StandardDeviation()), time.Duration(walltime[rg].Mean()), time.Duration(walltime[rg].StandardDeviation()), time.Duration(cputime[rg].Mean()), time.Duration(cputime[rg].StandardDeviation())) + } + + var dead string + if counts[rg][jobqueue.JobStateBuried] > 0 { + // sort the bury groups + var bgs []string + for bg := range buried[rg] { + bgs = append(bgs, bg) + } + sort.Strings(bgs) + + for _, bg := range bgs { + dead += fmt.Sprintf(" %s=%s", bg, strings.Join(buried[rg][bg], ",")) + } + } + + fmt.Printf("%s : complete=%d running=%d ready=%d dependent=%d lost=%d delayed=%d buried=%d%s%s\n", rg, counts[rg][jobqueue.JobStateComplete], counts[rg][jobqueue.JobStateRunning], counts[rg][jobqueue.JobStateReady], counts[rg][jobqueue.JobStateDependent], counts[rg][jobqueue.JobStateLost], counts[rg][jobqueue.JobStateDelayed], counts[rg][jobqueue.JobStateBuried], usage, dead) + } + case "details", "d": // print out status information for each job for _, job := range jobs { cwd := job.Cwd @@ -186,17 +270,17 
@@ very many (tens of thousands+) commands.`, } fmt.Printf("%s: { Exit code: %d; Peak memory: %dMB; Wall time: %s; CPU time: %s }\nHost: %s (IP: %s%s); Pid: %d\n", prefix, job.Exitcode, job.PeakRAM, job.WallTime(), job.CPUtime, job.Host, job.HostIP, hostID, job.Pid) if showextra && showStd && job.Exitcode != 0 { - stdout, err := job.StdOut() - if err != nil { - warn("problem reading the cmd's STDOUT: %s", err) + stdout, errs := job.StdOut() + if errs != nil { + warn("problem reading the cmd's STDOUT: %s", errs) } else if stdout != "" { fmt.Printf("StdOut:\n%s\n", stdout) } else { fmt.Printf("StdOut: [none]\n") } - stderr, err := job.StdErr() - if err != nil { - warn("problem reading the cmd's STDERR: %s", err) + stderr, errs := job.StdErr() + if errs != nil { + warn("problem reading the cmd's STDERR: %s", errs) } else if stderr != "" { fmt.Printf("StdErr:\n%s\n", stderr) } else { @@ -212,16 +296,16 @@ very many (tens of thousands+) commands.`, } else if showextra && showStd { // it's possible for jobs that got buried before they even // ran to have details of the bury in their stderr - stderr, err := job.StdErr() - if err == nil && stderr != "" { + stderr, errs := job.StdErr() + if errs == nil && stderr != "" { fmt.Printf("Details: %s\n", stderr) } } if showextra && showEnv { - env, err := job.Env() - if err != nil { - warn("problem reading the cmd's Env: %s", err) + env, erre := job.Env() + if erre != nil { + warn("problem reading the cmd's Env: %s", erre) } else { fmt.Printf("Env: %s\n", env) } @@ -243,6 +327,20 @@ very many (tens of thousands+) commands.`, fmt.Printf("+ %d other commands with the same status%s%s\n", job.Similar, er, fr) } } + case "json", "j": + jstati := make([]jobqueue.JStatus, len(jobs)) + for i, job := range jobs { + jstati[i] = job.ToStatus() + } + + encoder := json.NewEncoder(os.Stdout) + encoder.SetEscapeHTML(false) + err = encoder.Encode(jstati) + if err != nil { + die("failed to encode jobs: %s", err) + } + default: + die("invalid -o format specified") } fmt.Printf("\n") @@ -262,10 +360,10 @@ func init() { statusCmd.Flags().StringVarP(&mountJSON, "mount_json", "j", "", "mounts that the command(s) specified by -l or -f were set to use (JSON format)") statusCmd.Flags().StringVar(&mountSimple, "mounts", "", "mounts that the command(s) specified by -l or -f were set to use (simple format)") statusCmd.Flags().BoolVarP(&showBuried, "buried", "b", false, "in default or -i mode only, only show the status of buried commands") - statusCmd.Flags().BoolVarP(&showStd, "std", "s", false, "except in -f mode, also show the most recent STDOUT and STDERR of incomplete commands") - statusCmd.Flags().BoolVarP(&showEnv, "env", "e", false, "except in -f mode, also show the environment variables the command(s) ran with") - statusCmd.Flags().BoolVarP(&quietMode, "quiet", "q", false, "minimal verbosity: just display status counts") - statusCmd.Flags().IntVar(&statusLimit, "limit", 1, "number of commands that share the same properties to display; 0 displays all") + statusCmd.Flags().BoolVarP(&showStd, "std", "s", false, "in -o d mode, except in -f mode, also show the most recent STDOUT and STDERR of incomplete commands") + statusCmd.Flags().BoolVarP(&showEnv, "env", "e", false, "in -o d mode, except in -f mode, also show the environment variables the command(s) ran with") + statusCmd.Flags().StringVarP(&outputFormat, "output", "o", "details", "['counts','summary','details','json'] output format") + statusCmd.Flags().IntVar(&statusLimit, "limit", 1, "in -o d mode, number of commands that 
share the same properties to display; 0 displays all") statusCmd.Flags().IntVar(&timeoutint, "timeout", 120, "how long (seconds) to wait to get a reply from 'wr manager'") } diff --git a/glide.lock b/glide.lock index e32403c0a..bfe6dd6d9 100644 --- a/glide.lock +++ b/glide.lock @@ -1,5 +1,5 @@ -hash: b5179681947c4f7148802f6b580d9e0914b9d18a767f8d3067851dcb8634334e -updated: 2018-05-03T09:22:29.966469534+01:00 +hash: b720e204ab97d64f5cb6cd5a12c8e1619c2a5652ace8b6774038b1161c31a31b +updated: 2018-05-08T14:04:55.197606254+01:00 imports: - name: code.cloudfoundry.org/bytefmt version: b31f603f5e1e047fdb38584e1a922dcc5c4de5c8 @@ -7,6 +7,8 @@ imports: version: d358565f3c3f5334209f1e80693e4f621650c489 - name: github.com/BurntSushi/toml version: a368813c5e648fee92e5f6c30e3944ff9d5e8895 +- name: github.com/carbocation/runningvariance + version: fdcce8a03b6b0a7738fb52457381c120426e0fc3 - name: github.com/coreos/bbolt version: 48ea1b39c25fc1bab3506fbc712ecbaa842c4d2d - name: github.com/dgryski/go-farm diff --git a/glide.yaml b/glide.yaml index bddc5a8b2..2ad941f59 100644 --- a/glide.yaml +++ b/glide.yaml @@ -68,6 +68,7 @@ import: subpackages: - client - api/types +- package: github.com/carbocation/runningvariance testImport: - package: github.com/smartystreets/goconvey version: master diff --git a/jobqueue/job.go b/jobqueue/job.go index 54325d9bc..c826956eb 100644 --- a/jobqueue/job.go +++ b/jobqueue/job.go @@ -372,10 +372,10 @@ func (j *Job) EnvAddOverride(env []string) error { // Getenv is like os.Getenv(), but for the environment variables stored in the // the job, including any overrides. Returns blank if Env() would have returned // an error. -func (j *Job) Getenv(key string) (value string) { +func (j *Job) Getenv(key string) string { env, err := j.Env() if err != nil { - return + return "" } for _, envvar := range env { pair := strings.Split(envvar, "=") @@ -383,7 +383,7 @@ func (j *Job) Getenv(key string) (value string) { return pair[1] } } - return + return "" } // StdOut returns the decompressed job.StdOutC, which is the head and tail of @@ -714,6 +714,64 @@ func (j *Job) setSchedulerGroup(newval string) { j.schedulerGroup = newval } +// ToStatus converts a job to a simplified JStatus, useful for output as JSON. 
+func (j *Job) ToStatus() JStatus { + stderr, _ := j.StdErr() + stdout, _ := j.StdOut() + env, _ := j.Env() + var cwdLeaf string + j.RLock() + defer j.RUnlock() + if j.ActualCwd != "" { + cwdLeaf, _ = filepath.Rel(j.Cwd, j.ActualCwd) + cwdLeaf = "/" + cwdLeaf + } + state := j.State + if state == JobStateRunning && j.Lost { + state = JobStateLost + } + var ot []string + for key, val := range j.Requirements.Other { + ot = append(ot, key+":"+val) + } + return JStatus{ + Key: j.Key(), + RepGroup: j.RepGroup, + DepGroups: j.DepGroups, + Dependencies: j.Dependencies.Stringify(), + Cmd: j.Cmd, + State: state, + CwdBase: j.Cwd, + Cwd: cwdLeaf, + HomeChanged: j.ChangeHome, + Behaviours: j.Behaviours.String(), + Mounts: j.MountConfigs.String(), + MonitorDocker: j.MonitorDocker, + ExpectedRAM: j.Requirements.RAM, + ExpectedTime: j.Requirements.Time.Seconds(), + RequestedDisk: j.Requirements.Disk, + OtherRequests: ot, + Cores: j.Requirements.Cores, + PeakRAM: j.PeakRAM, + Exited: j.Exited, + Exitcode: j.Exitcode, + FailReason: j.FailReason, + Pid: j.Pid, + Host: j.Host, + HostID: j.HostID, + HostIP: j.HostIP, + Walltime: j.WallTime().Seconds(), + CPUtime: j.CPUtime.Seconds(), + Started: j.StartTime.Unix(), + Ended: j.EndTime.Unix(), + Attempts: j.Attempts, + Similar: j.Similar, + StdErr: stderr, + StdOut: stdout, + Env: env, + } +} + // JobEssence struct describes the essential aspects of a Job that make it // unique, used to describe a Job when eg. you want to search for one. type JobEssence struct { diff --git a/jobqueue/rest_test.go b/jobqueue/rest_test.go index 3e0c336c5..148ce07c7 100644 --- a/jobqueue/rest_test.go +++ b/jobqueue/rest_test.go @@ -140,7 +140,7 @@ func TestREST(t *testing.T) { responseData, err := ioutil.ReadAll(response.Body) So(err, ShouldBeNil) - var jstati []jstatus + var jstati []JStatus err = json.Unmarshal(responseData, &jstati) So(err, ShouldBeNil) So(len(jstati), ShouldEqual, 0) @@ -164,7 +164,7 @@ func TestREST(t *testing.T) { So(err, ShouldBeNil) responseData, err := ioutil.ReadAll(response.Body) So(err, ShouldBeNil) - var jstati []jstatus + var jstati []JStatus err = json.Unmarshal(responseData, &jstati) So(err, ShouldBeNil) So(len(jstati), ShouldEqual, 3) @@ -196,7 +196,7 @@ func TestREST(t *testing.T) { responseData, err := ioutil.ReadAll(response.Body) So(err, ShouldBeNil) - var jstati []jstatus + var jstati []JStatus err = json.Unmarshal(responseData, &jstati) So(err, ShouldBeNil) So(len(jstati), ShouldEqual, 3) @@ -211,7 +211,7 @@ func TestREST(t *testing.T) { responseData, err := ioutil.ReadAll(response.Body) So(err, ShouldBeNil) - var jstati []jstatus + var jstati []JStatus err = json.Unmarshal(responseData, &jstati) So(err, ShouldBeNil) So(len(jstati), ShouldEqual, 1) @@ -225,7 +225,7 @@ func TestREST(t *testing.T) { responseData, err = ioutil.ReadAll(response.Body) So(err, ShouldBeNil) - var jstati2 []jstatus + var jstati2 []JStatus err = json.Unmarshal(responseData, &jstati2) So(err, ShouldBeNil) So(len(jstati2), ShouldEqual, 2) @@ -242,7 +242,7 @@ func TestREST(t *testing.T) { responseData, err := ioutil.ReadAll(response.Body) So(err, ShouldBeNil) - var jstati []jstatus + var jstati []JStatus err = json.Unmarshal(responseData, &jstati) So(err, ShouldBeNil) So(len(jstati), ShouldEqual, 2) @@ -261,7 +261,7 @@ func TestREST(t *testing.T) { responseData, err := ioutil.ReadAll(response.Body) So(err, ShouldBeNil) - var jstati []jstatus + var jstati []JStatus err = json.Unmarshal(responseData, &jstati) So(err, ShouldBeNil) So(len(jstati), ShouldEqual, 1) @@ -300,7 
+300,7 @@ func TestREST(t *testing.T) { responseData, err := ioutil.ReadAll(response.Body) So(err, ShouldBeNil) - var jstati []jstatus + var jstati []JStatus err = json.Unmarshal(responseData, &jstati) So(err, ShouldBeNil) So(len(jstati), ShouldEqual, 2) @@ -318,7 +318,7 @@ func TestREST(t *testing.T) { responseData, err = ioutil.ReadAll(response.Body) So(err, ShouldBeNil) - var jstati2 []jstatus + var jstati2 []JStatus err = json.Unmarshal(responseData, &jstati2) So(err, ShouldBeNil) So(len(jstati2), ShouldEqual, 1) @@ -336,7 +336,7 @@ func TestREST(t *testing.T) { responseData, err = ioutil.ReadAll(response.Body) So(err, ShouldBeNil) - var jstati3 []jstatus + var jstati3 []JStatus err = json.Unmarshal(responseData, &jstati3) So(err, ShouldBeNil) So(len(jstati3), ShouldEqual, 1) @@ -356,7 +356,7 @@ func TestREST(t *testing.T) { responseData, err := ioutil.ReadAll(response.Body) So(err, ShouldBeNil) - var jstati []jstatus + var jstati []JStatus err = json.Unmarshal(responseData, &jstati) So(err, ShouldBeNil) So(len(jstati), ShouldEqual, 1) @@ -379,7 +379,7 @@ func TestREST(t *testing.T) { So(err, ShouldBeNil) responseData, err := ioutil.ReadAll(response.Body) So(err, ShouldBeNil) - var jstati []jstatus + var jstati []JStatus err = json.Unmarshal(responseData, &jstati) So(err, ShouldBeNil) So(len(jstati), ShouldEqual, 1) @@ -400,7 +400,7 @@ func TestREST(t *testing.T) { responseData, err := ioutil.ReadAll(response.Body) So(err, ShouldBeNil) - var jstati []jstatus + var jstati []JStatus err = json.Unmarshal(responseData, &jstati) So(err, ShouldBeNil) So(len(jstati), ShouldEqual, 1) @@ -440,7 +440,7 @@ func TestREST(t *testing.T) { So(err, ShouldBeNil) responseData, err := ioutil.ReadAll(response.Body) So(err, ShouldBeNil) - var jstati []jstatus + var jstati []JStatus err = json.Unmarshal(responseData, &jstati) So(err, ShouldBeNil) So(len(jstati), ShouldEqual, 1) diff --git a/jobqueue/serverREST.go b/jobqueue/serverREST.go index 9d613027b..90a3692bd 100644 --- a/jobqueue/serverREST.go +++ b/jobqueue/serverREST.go @@ -494,9 +494,9 @@ func restJobs(s *Server) http.HandlerFunc { } // convert jobs to jstatus - jstati := make([]jstatus, len(jobs)) + jstati := make([]JStatus, len(jobs)) for i, job := range jobs { - jstati[i] = jobToStatus(job) + jstati[i] = job.ToStatus() } // return job details as JSON diff --git a/jobqueue/serverWebI.go b/jobqueue/serverWebI.go index 1b87b3344..fcf7ccd38 100644 --- a/jobqueue/serverWebI.go +++ b/jobqueue/serverWebI.go @@ -22,7 +22,6 @@ package jobqueue import ( "net/http" - "path/filepath" "strings" "sync" @@ -61,9 +60,9 @@ type jstatusReq struct { Msg string // required argument for dismissMsg } -// jstatus is the job info we send to the status webpage (only real difference +// JStatus is the job info we send to the status webpage (only real difference // to Job is that some of the values are converted to easy-to-display forms). 
-type jstatus struct { +type JStatus struct { Key string RepGroup string DepGroups []string @@ -275,7 +274,7 @@ func webInterfaceStatusWS(s *Server) http.HandlerFunc { writeMutex.Lock() failed := false for _, job := range jobs { - status := jobToStatus(job) + status := job.ToStatus() status.RepGroup = req.RepGroup // since we want to return the group the user asked for, not the most recent group the job was made for err := conn.WriteJSON(status) if err != nil { @@ -362,7 +361,7 @@ func webInterfaceStatusWS(s *Server) http.HandlerFunc { case req.Key != "": jobs, _, errstr := s.getJobsByKeys([]string{req.Key}, true, true) if errstr == "" && len(jobs) == 1 { - status := jobToStatus(jobs[0]) + status := jobs[0].ToStatus() writeMutex.Lock() err := conn.WriteJSON(status) writeMutex.Unlock() @@ -444,63 +443,6 @@ func webInterfaceStatusWS(s *Server) http.HandlerFunc { } } -func jobToStatus(job *Job) jstatus { - stderr, _ := job.StdErr() - stdout, _ := job.StdOut() - env, _ := job.Env() - var cwdLeaf string - job.RLock() - defer job.RUnlock() - if job.ActualCwd != "" { - cwdLeaf, _ = filepath.Rel(job.Cwd, job.ActualCwd) - cwdLeaf = "/" + cwdLeaf - } - state := job.State - if state == JobStateRunning && job.Lost { - state = JobStateLost - } - var ot []string - for key, val := range job.Requirements.Other { - ot = append(ot, key+":"+val) - } - return jstatus{ - Key: job.Key(), - RepGroup: job.RepGroup, - DepGroups: job.DepGroups, - Dependencies: job.Dependencies.Stringify(), - Cmd: job.Cmd, - State: state, - CwdBase: job.Cwd, - Cwd: cwdLeaf, - HomeChanged: job.ChangeHome, - Behaviours: job.Behaviours.String(), - Mounts: job.MountConfigs.String(), - MonitorDocker: job.MonitorDocker, - ExpectedRAM: job.Requirements.RAM, - ExpectedTime: job.Requirements.Time.Seconds(), - RequestedDisk: job.Requirements.Disk, - OtherRequests: ot, - Cores: job.Requirements.Cores, - PeakRAM: job.PeakRAM, - Exited: job.Exited, - Exitcode: job.Exitcode, - FailReason: job.FailReason, - Pid: job.Pid, - Host: job.Host, - HostID: job.HostID, - HostIP: job.HostIP, - Walltime: job.WallTime().Seconds(), - CPUtime: job.CPUtime.Seconds(), - Started: job.StartTime.Unix(), - Ended: job.EndTime.Unix(), - Attempts: job.Attempts, - Similar: job.Similar, - StdErr: stderr, - StdOut: stdout, - Env: env, - } -} - // reqToJobs takes a request from the status webpage and returns the requested // jobs. func (s *Server) reqToJobs(req jstatusReq, allowedItemStates []queue.ItemState) []*Job { From 65068a3874d3f6209775f008028c50c676c2fb90 Mon Sep 17 00:00:00 2001 From: Sendu Bala Date: Tue, 8 May 2018 16:39:20 +0100 Subject: [PATCH 12/33] Delint following changes for LSF emulation. 
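(Aside on the output change above: the new -o json format simply encodes the now-public JStatus structs, so external tooling can parse `wr status` output directly. The sketch below is illustrative only and not part of these patches; the struct fields mirror a subset of the JStatus fields shown above, which carry no json tags, so the JSON keys are just the Go field names. Everything else, including reading from stdin and the file name, is an assumption.)

// consume_status.go: illustrative only (not part of these patches). It
// decodes the array that `wr status -o json` prints and summarises each job.
package main

import (
    "encoding/json"
    "fmt"
    "os"
)

// jobStatus mirrors a subset of jobqueue.JStatus; since that exported struct
// has no json tags, the JSON keys are simply the Go field names.
type jobStatus struct {
    Key      string
    RepGroup string
    State    string
    Exitcode int
    PeakRAM  int     // MB
    Walltime float64 // seconds
}

func main() {
    var jobs []jobStatus
    if err := json.NewDecoder(os.Stdin).Decode(&jobs); err != nil {
        fmt.Fprintf(os.Stderr, "could not decode status JSON: %s\n", err)
        os.Exit(1)
    }
    for _, j := range jobs {
        fmt.Printf("%s %s %s exit=%d ram=%dMB walltime=%.0fs\n",
            j.Key, j.RepGroup, j.State, j.Exitcode, j.PeakRAM, j.Walltime)
    }
}

(Usage would be along the lines of `wr status -i myjobs -o json | go run consume_status.go`, assuming a working Go toolchain.)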
--- cmd/lsf.go | 14 +++++-- cmd/root.go | 2 + jobqueue/client.go | 91 +++++++++++++++++++++++++++++++--------------- jobqueue/db.go | 3 +- jobqueue/server.go | 4 +- main.go | 3 +- 6 files changed, 78 insertions(+), 39 deletions(-) diff --git a/cmd/lsf.go b/cmd/lsf.go index b93a7800c..a57a4d129 100644 --- a/cmd/lsf.go +++ b/cmd/lsf.go @@ -70,9 +70,9 @@ var lsfBsubCmd = &cobra.Command{ Short: "Add a job using bsub syntax", Long: `Add a job to the queue using bsub syntax.`, Run: func(cmd *cobra.Command, args []string) { - wd, err := os.Getwd() - if err != nil { - die(err.Error()) + wd, errg := os.Getwd() + if errg != nil { + die(errg.Error()) } job := &jobqueue.Job{ @@ -156,7 +156,13 @@ var lsfBsubCmd = &cobra.Command{ // connect to the server jq := connect(10 * time.Second) - defer jq.Disconnect() + var err error + defer func() { + err = jq.Disconnect() + if err != nil { + warn("Disconnecting from the server failed: %s", err) + } + }() // add the job to the queue inserts, _, err := jq.Add([]*jobqueue.Job{job}, os.Environ(), false) diff --git a/cmd/root.go b/cmd/root.go index cae5d8382..7cc05f96d 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -77,6 +77,8 @@ func Execute() { } } +// ExecuteLSF is for treating a call to wr as if `wr lsf xxx` was called, for +// the LSF emulation to work. func ExecuteLSF(cmd string) { args := []string{"lsf", cmd} command, _, err := RootCmd.Find(args) diff --git a/jobqueue/client.go b/jobqueue/client.go index 20fc3f6f3..dc76f6971 100644 --- a/jobqueue/client.go +++ b/jobqueue/client.go @@ -73,6 +73,9 @@ const ( // symlinks in const lsfEmulationDir = ".wr_lsf_emulation" +// localhost is the name of host we're running on +const localhost = "localhost" + // these global variables are primarily exported for testing purposes; you // probably shouldn't change them (*** and they should probably be re-factored // as fields of a config struct...) @@ -440,52 +443,76 @@ func (c *Client) Execute(job *Job, shell string) error { if job.BsubMode != "" { // create parent of job.Cwd so we can later mount at job.Cwd parent := filepath.Dir(job.Cwd) - os.MkdirAll(parent, os.ModePerm) - if fi, err := os.Stat(parent); err != nil || !fi.Mode().IsDir() { - c.Bury(job, nil, FailReasonCwd) - return fmt.Errorf("parent of working directory [%s] could not be created", parent) + errm := os.MkdirAll(parent, os.ModePerm) + if fi, errs := os.Stat(parent); errs != nil || !fi.Mode().IsDir() { + errb := c.Bury(job, nil, FailReasonCwd) + extra := "" + if errb != nil { + extra = fmt.Sprintf(" (and burying the job failed: %s)", errb) + } + return fmt.Errorf("parent of working directory [%s] could not be created: %s%s", parent, errm, extra) } } var mountCouldFail bool host, err := os.Hostname() if err != nil { - host = "localhost" + host = localhost } if job.BsubMode != "" { jobCwd := job.Cwd if jobCwd == "" { jobCwd = "." 
} - absJobCwd, err := filepath.Abs(jobCwd) - if err != nil { - c.Bury(job, nil, FailReasonCwd) - return fmt.Errorf("failed to make cmd dir absolute: %s", err) + absJobCwd, erra := filepath.Abs(jobCwd) + if erra != nil { + errb := c.Bury(job, nil, FailReasonCwd) + extra := "" + if errb != nil { + extra = fmt.Sprintf(" (and burying the job failed: %s)", errb) + } + return fmt.Errorf("failed to make cmd dir absolute: %s%s", erra, extra) } parent := filepath.Dir(absJobCwd) // create bsub and bjobs symlinks in a sister dir of job.Cwd prependPath = filepath.Join(parent, lsfEmulationDir) - os.MkdirAll(prependPath, os.ModePerm) - if fi, err := os.Stat(prependPath); err != nil || !fi.Mode().IsDir() { - c.Bury(job, nil, FailReasonCwd) - return fmt.Errorf("sister of working directory [%s] could not be created", prependPath) + errm := os.MkdirAll(prependPath, os.ModePerm) + if fi, errs := os.Stat(prependPath); errs != nil || !fi.Mode().IsDir() { + errb := c.Bury(job, nil, FailReasonCwd) + extra := "" + if errb != nil { + extra = fmt.Sprintf(" (and burying the job failed: %s)", errb) + } + return fmt.Errorf("sister of working directory [%s] could not be created: %s%s", prependPath, errm, extra) } - wr, err := os.Executable() - if err != nil { - c.Bury(job, nil, FailReasonCwd) - return fmt.Errorf("could not get path to wr: %s", err) + wr, erre := os.Executable() + if erre != nil { + errb := c.Bury(job, nil, FailReasonCwd) + extra := "" + if errb != nil { + extra = fmt.Sprintf(" (and burying the job failed: %s)", errb) + } + return fmt.Errorf("could not get path to wr: %s%s", erre, extra) } bsub := filepath.Join(prependPath, "bsub") bjobs := filepath.Join(prependPath, "bjobs") err = os.Symlink(wr, bsub) if err != nil && !os.IsExist(err) { - c.Bury(job, nil, FailReasonCwd) - return fmt.Errorf("could not create bsub symlink: %s", err) + errb := c.Bury(job, nil, FailReasonCwd) + extra := "" + if errb != nil { + extra = fmt.Sprintf(" (and burying the job failed: %s)", errb) + } + return fmt.Errorf("could not create bsub symlink: %s%s", err, extra) } err = os.Symlink(wr, bjobs) if err != nil && !os.IsExist(err) { - c.Bury(job, nil, FailReasonCwd) - return fmt.Errorf("could not create bjobs symlink: %s", err) + errb := c.Bury(job, nil, FailReasonCwd) + extra := "" + if errb != nil { + extra = fmt.Sprintf(" (and burying the job failed: %s)", errb) + } + return fmt.Errorf("could not create bjobs symlink: %s%s", err, extra) } onCwd = job.CwdMatters @@ -494,14 +521,14 @@ func (c *Client) Execute(job *Job, shell string) error { // we'll run the command from the desired directory, which must exist or // it will fail if fi, errf := os.Stat(job.Cwd); errf != nil || !fi.Mode().IsDir() { - os.MkdirAll(job.Cwd, os.ModePerm) - if _, errf = os.Stat(job.Cwd); errf != nil { + errm := os.MkdirAll(job.Cwd, os.ModePerm) + if _, errs := os.Stat(job.Cwd); errs != nil { errb := c.Bury(job, nil, FailReasonCwd) extra := "" if errb != nil { extra = fmt.Sprintf(" (and burying the job failed: %s)", errb) } - return fmt.Errorf("working directory [%s] does not exist%s", job.Cwd, extra) + return fmt.Errorf("working directory [%s] does not exist%s: %s", job.Cwd, extra, errm) } } var actualCwd, tmpDir string @@ -527,7 +554,7 @@ func (c *Client) Execute(job *Job, shell string) error { // parent if jsonStr := job.Getenv("WR_BSUB_CONFIG"); jsonStr != "" { configJob := &Job{} - if err := json.Unmarshal([]byte(jsonStr), configJob); err == nil && configJob.Host == host { + if erru := json.Unmarshal([]byte(jsonStr), configJob); erru == nil && 
configJob.Host == host { mountCouldFail = true // *** but the problem with this is, the parent job could finish // while we're still running, and unmount!... @@ -614,10 +641,14 @@ func (c *Client) Execute(job *Job, shell string) error { BsubMode: job.BsubMode, Host: host, } - jobJSON, err := json.Marshal(simplified) - if err != nil { - c.Bury(job, nil, fmt.Sprintf("could not convert job to JSON: %s", err)) - return fmt.Errorf("could not convert job to JSON: %s", err) + jobJSON, errm := json.Marshal(simplified) + if errm != nil { + errb := c.Bury(job, nil, fmt.Sprintf("could not convert job to JSON: %s", errm)) + extra := "" + if errb != nil { + extra = fmt.Sprintf(" (and burying the job failed: %s)", errb) + } + return fmt.Errorf("could not convert job to JSON: %s%s", errm, extra) } env = envOverride(env, []string{ "WR_BSUB_CONFIG=" + string(jobJSON), @@ -1142,7 +1173,7 @@ func (c *Client) Started(job *Job, pid int) error { // host details host, err := os.Hostname() if err != nil { - host = "localhost" + host = localhost } job.Host = host job.HostIP, err = CurrentIP("") diff --git a/jobqueue/db.go b/jobqueue/db.go index dcc387049..73a39f784 100644 --- a/jobqueue/db.go +++ b/jobqueue/db.go @@ -658,11 +658,10 @@ func (db *db) retrieveRepGroups() ([]string, error) { var rgs []string err := db.bolt.View(func(tx *bolt.Tx) error { b := tx.Bucket(bucketRGs) - b.ForEach(func(k, v []byte) error { + return b.ForEach(func(k, v []byte) error { rgs = append(rgs, string(k)) return nil }) - return nil }) return rgs, err } diff --git a/jobqueue/server.go b/jobqueue/server.go index ec262069b..2bc5cc258 100644 --- a/jobqueue/server.go +++ b/jobqueue/server.go @@ -89,7 +89,7 @@ var ( // BsubID is used to give added jobs a unique (atomically incremented) id when // pretending to be bsub. -var BsubID uint64 = 0 +var BsubID uint64 // Error records an error and the operation and item that caused it. type Error struct { @@ -384,7 +384,7 @@ func Serve(config ServerConfig) (s *Server, msg string, token []byte, err error) keyFile := config.KeyFile certDomain := config.CertDomain if certDomain == "" { - certDomain = "localhost" + certDomain = localhost } err = internal.CheckCerts(certFile, keyFile) var certMsg string diff --git a/main.go b/main.go index 20bce3c79..bfed70e54 100644 --- a/main.go +++ b/main.go @@ -99,9 +99,10 @@ options. package main import ( - "github.com/VertebrateResequencing/wr/cmd" "os" "path/filepath" + + "github.com/VertebrateResequencing/wr/cmd" ) func main() { From 007d00c96d626b3e908e55b002ae63e7bcad228d Mon Sep 17 00:00:00 2001 From: Sendu Bala Date: Wed, 9 May 2018 15:49:03 +0100 Subject: [PATCH 13/33] Do not log the authentication token to file. 
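(Aside on the delint changes above: the errcheck-driven rewrite of Execute() now builds the same "(and burying the job failed: ...)" suffix by hand at every Bury call site. A small helper along the following lines would remove that repetition; it is illustrative only and not part of these patches.)

// wrap_cleanup.go: illustrative sketch, not part of these patches.
package main

import (
    "errors"
    "fmt"
)

// wrapWithBuryErr combines a primary failure with an optional error from the
// subsequent attempt to bury the job, mirroring the message format used in
// client.go above.
func wrapWithBuryErr(primary, buryErr error) error {
    if buryErr == nil {
        return primary
    }
    return fmt.Errorf("%s (and burying the job failed: %s)", primary, buryErr)
}

func main() {
    primary := errors.New("could not create bsub symlink: permission denied")
    fmt.Println(wrapWithBuryErr(primary, errors.New("connection closed")))
    fmt.Println(wrapWithBuryErr(primary, nil))
}

(Each call site in Execute() would then reduce to something like `return wrapWithBuryErr(fmt.Errorf("could not create bsub symlink: %s", err), c.Bury(job, nil, FailReasonCwd))`.)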
--- cmd/manager.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cmd/manager.go b/cmd/manager.go index 0e0ddfc70..88a331d29 100644 --- a/cmd/manager.go +++ b/cmd/manager.go @@ -412,6 +412,10 @@ func init() { func logStarted(s *jobqueue.ServerInfo, token []byte) { info("wr manager started on %s, pid %d", sAddr(s), s.PID) + + // go back to just stderr so we don't log token to file (this doesn't affect + // server logging) + appLogger.SetHandler(log15.LvlFilterHandler(log15.LvlInfo, log15.StderrHandler)) info("wr's web interface can be reached at https://%s:%s/?token=%s", s.Host, s.WebPort, string(token)) if setDomainIP { @@ -534,6 +538,7 @@ func startJQ(postCreation []byte) { } logStarted(server.ServerInfo, token) + l15h.AddHandler(appLogger, fh) // logStarted disabled logging to file; reenable to get final message below // block forever while the jobqueue does its work err = server.Block() From 401ad5c13323885f82efec8f0f1d61a74b9fabe4 Mon Sep 17 00:00:00 2001 From: Sendu Bala Date: Wed, 9 May 2018 16:51:50 +0100 Subject: [PATCH 14/33] Resolve #115: queue-based schedulers now do bin packing. --- cmd/add.go | 9 +- jobqueue/scheduler/local.go | 205 +++++++++++++++------------ jobqueue/scheduler/openstack.go | 12 ++ jobqueue/scheduler/scheduler_test.go | 91 +++++++++++- 4 files changed, 223 insertions(+), 94 deletions(-) diff --git a/cmd/add.go b/cmd/add.go index 9c40f80c1..7ef1e444a 100644 --- a/cmd/add.go +++ b/cmd/add.go @@ -195,8 +195,13 @@ size if necessary] "priority" defines how urgent a particular command is; those with higher priorities will start running before those with lower priorities. The range of -possible values is 0 (default) to 255. Commands with the same priority will be -started in the order they were added. +possible values is 0 (default, for lowest priority) to 255 (highest priority). +Commands with the same priority will be started in the order they were added. +(Note, however, that order of starting is only guaranteed to hold true amongst +jobs with similar resource requirements, since your chosen job scheduler may, +for example, run your highest priority job on a machine where it takes up 90% of +memory, and then find another job to run on that machine that needs 10% or less +memory - and that job might be one of your low priority ones.) "retries" defines how many times a command will be retried automatically if it fails. Automatic retries are helpful in the case of transient errors, or errors diff --git a/jobqueue/scheduler/local.go b/jobqueue/scheduler/local.go index c45fde592..faaded8bb 100644 --- a/jobqueue/scheduler/local.go +++ b/jobqueue/scheduler/local.go @@ -37,6 +37,7 @@ import ( const ( localPlace = "localhost" localReserveTimeout = 1 + priorityScaler = float64(255) / float64(100) ) // reqCheckers are functions used by schedule() to see if it is at all possible @@ -45,6 +46,12 @@ const ( // use local's schedule(), but have their own reqChecker implementation.) type reqChecker func(req *Requirements) error +// maxResourceGetter are functions used by schedule() to see what the maximum of +// of a resource like memory or time is. (We make use of this in the local +// struct so that other implementers of scheduleri can embed local, use local's +// schedule(), but have their own maxResourceGetter implementation.) +type maxResourceGetter func() int + // canCounters are functions used by processQueue() to see how many of a job // can be run. 
(We make use of this in the local struct so that other // implementers of scheduleri can embed local, use local's processQueue(), but @@ -85,6 +92,8 @@ type local struct { running map[string]int cleaned bool reqCheckFunc reqChecker + maxMemFunc maxResourceGetter + maxCPUFunc maxResourceGetter canCountFunc canCounter stateUpdateFunc stateUpdater stateUpdateFreq time.Duration @@ -136,6 +145,8 @@ func (s *local) initialize(config interface{}, logger log15.Logger) error { // set our functions for use in schedule() and processQueue() s.reqCheckFunc = s.reqCheck + s.maxMemFunc = s.maxMem + s.maxCPUFunc = s.maxCPU s.canCountFunc = s.canCount s.runCmdFunc = s.runCmd s.cancelRunCmdFunc = s.cancelRun @@ -176,6 +187,21 @@ func (s *local) schedule(cmd string, req *Requirements, count int) error { } // else, just in case a job with these reqs somehow got through in the // past, allow it to be cancelled + // priority of this cmd will be based on how "large" it is, which is the max + // of the percentage of available memory it needs and percentage of cpus it + // needs. A cmd that needs 100% of memory or cpu will be our highest + // priority command, which is expressed as priority 255, while one that + // needs 0% of resources will be expressed as priority 0. + maxMem := s.maxMemFunc() + maxCPU := s.maxCPUFunc() + percentMemNeeded := (float64(req.RAM) / float64(maxMem)) * float64(100) + percentCPUNeeded := (float64(req.Cores) / float64(maxCPU)) * float64(100) + percentMachineNeeded := percentMemNeeded + if percentCPUNeeded > percentMachineNeeded { + percentMachineNeeded = percentCPUNeeded + } + priority := uint8(math.Round(priorityScaler * percentMachineNeeded)) + // add to the queue key := jobName(cmd, "n/a", false) data := &job{ @@ -184,7 +210,8 @@ func (s *local) schedule(cmd string, req *Requirements, count int) error { count: count, } s.mutex.Lock() - item, err := s.queue.Add(key, "", data, 0, 0*time.Second, 30*time.Second) // the ttr just has to be long enough for processQueue() to process a job, not actually run the cmds + + item, err := s.queue.Add(key, "", data, priority, 0*time.Second, 30*time.Second) // the ttr just has to be long enough for processQueue() to process a job, not actually run the cmds if err != nil { if qerr, ok := err.(queue.Error); ok && qerr.Err == queue.ErrAlreadyExists { // update the job's count (only) @@ -201,7 +228,7 @@ func (s *local) schedule(cmd string, req *Requirements, count int) error { s.startAutoProcessing() - // try and run the oldest job in the queue + // try and run the jobs in the queue return s.processQueue() } @@ -213,6 +240,16 @@ func (s *local) reqCheck(req *Requirements) error { return nil } +// maxMem returns the maximum memory available on the machine in MB. +func (s *local) maxMem() int { + return s.maxRAM +} + +// maxCPU returns the total number of CPU cores available on the machine. +func (s *local) maxCPU() int { + return s.maxCores +} + // removeKey removes a key from the queue, for when there are no more jobs for // that key. If this results in an empty queue, stops autoProcessing. You must // hold the lock on s before calling this! @@ -229,8 +266,9 @@ func (s *local) removeKey(key string) { } } -// processQueue gets the oldest job in the queue, sees if it's possible to run -// it, does so if it is, otherwise returns the job to the queue. +// processQueue goes through the jobs in the queue by size, sees if it's +// possible to run any, does so if it is, otherwise returns the jobs to the +// queue. 
func (s *local) processQueue() error { // first perform any global state update needed by the scheduler s.stateUpdateFunc() @@ -247,12 +285,6 @@ func (s *local) processQueue() error { return nil } - var key, cmd string - var req *Requirements - var count, canCount int - var j *job - - // get the oldest job var toRelease []string defer func() { for _, key := range toRelease { @@ -262,6 +294,8 @@ func (s *local) processQueue() error { } } }() + + // go through the jobs largest to smallest (standard bin packing approach) for { item, err := s.queue.Reserve() if err != nil { @@ -270,12 +304,12 @@ func (s *local) processQueue() error { } return err } - key = item.Key - j = item.Data.(*job) + key := item.Key + j := item.Data.(*job) j.RLock() - cmd = j.cmd - req = j.req - count = j.count + cmd := j.cmd + req := j.req + count := j.count j.RUnlock() running := s.running[key] @@ -295,101 +329,92 @@ func (s *local) processQueue() error { toRelease = append(toRelease, key) shouldCount := count - running if shouldCount <= 0 { - // we're already running everything for this job, try the next most - // oldest + // we're already running everything for this job, try the next + // largest cmd continue } // now see if there's remaining capacity to run the job - canCount = s.canCountFunc(req) + canCount := s.canCountFunc(req) s.Debug("processQueue canCount", "can", canCount, "running", running, "should", shouldCount) if canCount > shouldCount { canCount = shouldCount } if canCount == 0 { - // we don't want to go to the next most oldest, but will wait until - // something calls processQueue() again to get the cmd for this - // job running: dumb fifo behaviour - //*** could easily make this less dumb by considering how long we - // most likely have to wait for a currently running job to finish, - // and seeing if there are any other jobs in the queue that will - // finish in less time... 
- return nil + // try and fill any "gaps" (spare memory/ cpu) by seeing if a cmd + // with lesser resource requirements can be run + continue } - break - } - - // start running what we can - s.Debug("processQueue runCmdFunc", "count", canCount) - reserved := make(chan bool, canCount) - for i := 0; i < canCount; i++ { - s.running[key]++ + // start running what we can + s.Debug("processQueue runCmdFunc", "count", canCount) + reserved := make(chan bool, canCount) + for i := 0; i < canCount; i++ { + s.running[key]++ - go func() { - defer internal.LogPanic(s.Logger, "runCmd", true) + go func() { + defer internal.LogPanic(s.Logger, "runCmd", true) + + err := s.runCmdFunc(cmd, req, reserved) + + s.mutex.Lock() + s.resourceMutex.Lock() + s.ram -= req.RAM + s.cores -= req.Cores + s.resourceMutex.Unlock() + s.running[key]-- + if s.running[key] <= 0 { + delete(s.running, key) + } - err := s.runCmdFunc(cmd, req, reserved) + if err == nil { + j.Lock() + j.count-- + jCount := j.count + j.Unlock() + if jCount <= 0 { + s.removeKey(key) + } + } else if err.Error() != standinNotNeeded { + // users are notified of relevant errors during runCmd; here + // we just debug log everything + s.Debug("runCmd error", "err", err) + } + s.mutex.Unlock() + err = s.processQueue() + if err != nil { + s.Error("processQueue recall failed", "err", err) + } + }() + } - s.mutex.Lock() - s.resourceMutex.Lock() - s.ram -= req.RAM - s.cores -= req.Cores - s.resourceMutex.Unlock() - s.running[key]-- - if s.running[key] <= 0 { - delete(s.running, key) + // before allowing this function to be called again, wait for all the + // above runCmdFuncs to at least get as far as reserving their + // resources, so subsequent calls to canCountFunc will be accurate + s.processing = true + go func() { + for i := 0; i < canCount; i++ { + <-reserved } - if err == nil { - j.Lock() - j.count-- - jCount := j.count - j.Unlock() - if jCount <= 0 { - s.removeKey(key) - } - } else if err.Error() != standinNotNeeded { - // users are notified of relevant errors during runCmd; here we - // just debug log everything - s.Debug("runCmd error", "err", err) - } - s.mutex.Unlock() - err = s.processQueue() - if err != nil { - s.Error("processQueue recall failed", "err", err) + s.mutex.Lock() + defer s.mutex.Unlock() + s.processing = false + recall := s.recall + s.recall = false + if recall { + go func() { + errp := s.processQueue() + if errp != nil { + s.Warn("processQueue recall failed", "err", errp) + } + }() } }() - } - - // before allowing this function to be called again, wait for all the above - // runCmdFuncs to at least get as far as reserving their resources, so - // subsequent calls to canCountFunc will be accurate - s.processing = true - go func() { - for i := 0; i < canCount; i++ { - <-reserved - } - - s.mutex.Lock() - defer s.mutex.Unlock() - s.processing = false - recall := s.recall - s.recall = false - if recall { - go func() { - errp := s.processQueue() - if errp != nil { - s.Warn("processQueue recall failed", "err", errp) - } - }() - } - }() - // the item will now be released, so on the next call to this method we'll - // try to run the remainder - return nil + // keep looping, in case any smaller job can also be run + } } // canCount tells you how many jobs with the given RAM and core requirements it diff --git a/jobqueue/scheduler/openstack.go b/jobqueue/scheduler/openstack.go index 5fc8f6eac..21ba7f937 100644 --- a/jobqueue/scheduler/openstack.go +++ b/jobqueue/scheduler/openstack.go @@ -453,6 +453,8 @@ func (s *opst) initialize(config 
interface{}, logger log15.Logger) error { // set our functions for use in schedule() and processQueue() s.reqCheckFunc = s.reqCheck + s.maxMemFunc = s.maxMem + s.maxCPUFunc = s.maxCPU s.canCountFunc = s.canCount s.runCmdFunc = s.runCmd s.cancelRunCmdFunc = s.cancelRun @@ -507,6 +509,16 @@ func (s *opst) reqCheck(req *Requirements) error { return nil } +// maxMem returns the maximum memory available in quota. +func (s *opst) maxMem() int { + return s.quotaMaxRAM +} + +// maxCPU returns the maximum number of CPU cores available quota. +func (s *opst) maxCPU() int { + return s.quotaMaxCores +} + // determineFlavor picks a server flavor, preferring the smallest (cheapest) // amongst those that are capable of running it. func (s *opst) determineFlavor(req *Requirements) (*cloud.Flavor, error) { diff --git a/jobqueue/scheduler/scheduler_test.go b/jobqueue/scheduler/scheduler_test.go index f2153ad38..ac930a349 100644 --- a/jobqueue/scheduler/scheduler_test.go +++ b/jobqueue/scheduler/scheduler_test.go @@ -232,6 +232,79 @@ func TestLocal(t *testing.T) { } }) + if maxCPU > 2 { + Convey("Schedule() does bin packing and fills up the machine with different size cmds", func() { + smallTmpdir, err := ioutil.TempDir("", "wr_schedulers_local_test_small_output_dir_") + if err != nil { + log.Fatal(err) + } + defer os.RemoveAll(smallTmpdir) + bigTmpdir, err := ioutil.TempDir("", "wr_schedulers_local_test_big_output_dir_") + if err != nil { + log.Fatal(err) + } + defer os.RemoveAll(bigTmpdir) + + blockCmd := "perl -e 'select(undef, undef, undef, 0.25)'" // sleep for 0.25s + blockReq := &Requirements{1, 1 * time.Second, maxCPU, 0, otherReqs} + smallCmd := fmt.Sprintf("perl -MFile::Temp=tempfile -e '@a = tempfile(DIR => q[%s]); select(undef, undef, undef, 0.75); exit(0);'", smallTmpdir) // creates a file and sleeps for 0.75s + smallReq := &Requirements{1, 1 * time.Second, 1, 0, otherReqs} + bigCmd := fmt.Sprintf("perl -MFile::Temp=tempfile -e '@a = tempfile(DIR => q[%s]); select(undef, undef, undef, 0.75); exit(0);'", bigTmpdir) + bigReq := &Requirements{1, 1 * time.Second, maxCPU - 1, 0, otherReqs} + + // schedule 2 big cmds and then a small one to prove the small + // one fits the gap and runs before the second big one + err = s.Schedule(bigCmd, bigReq, 2) + So(err, ShouldBeNil) + err = s.Schedule(smallCmd, smallReq, 1) + So(err, ShouldBeNil) + + for { + if !s.Busy() { + break + } + <-time.After(1 * time.Millisecond) + } + + bigTimes := mtimesOfFilesInDir(bigTmpdir, 2) + So(len(bigTimes), ShouldEqual, 2) + smallTimes := mtimesOfFilesInDir(smallTmpdir, 1) + So(len(smallTimes), ShouldEqual, 1) + firstBig := bigTimes[0] + secondBig := bigTimes[1] + if secondBig.Before(firstBig) { + firstBig = bigTimes[1] + secondBig = bigTimes[0] + } + So(smallTimes[0], ShouldHappenOnOrAfter, firstBig) + So(smallTimes[0], ShouldHappenBefore, secondBig) + + // schedule a blocker so that subsequent schedules will be + // compared to each other, then schedule 2 small cmds and a big + // one to prove that the big one takes priority + err = s.Schedule(blockCmd, blockReq, 1) + So(err, ShouldBeNil) + err = s.Schedule(smallCmd, smallReq, 2) + So(err, ShouldBeNil) + err = s.Schedule(bigCmd, bigReq, 1) + So(err, ShouldBeNil) + + for { + if !s.Busy() { + break + } + <-time.After(1 * time.Millisecond) + } + + bigTimes = mtimesOfFilesInDir(bigTmpdir, 1) + So(len(bigTimes), ShouldEqual, 1) + smallTimes = mtimesOfFilesInDir(smallTmpdir, 2) + So(len(smallTimes), ShouldEqual, 2) + So(bigTimes[0], ShouldHappenOnOrBefore, smallTimes[0]) + 
So(bigTimes[0], ShouldHappenOnOrBefore, smallTimes[1]) + }) + } + // wait a while for any remaining jobs to finish So(waitToFinish(s, 30, 100), ShouldBeTrue) }) @@ -942,7 +1015,7 @@ func TestOpenstack(t *testing.T) { }) } -func testDirForFiles(tmpdir string, expected int) (numfiles int) { +func getInfoOfFilesInDir(tmpdir string, expected int) []os.FileInfo { files, err := ioutil.ReadDir(tmpdir) if err != nil { log.Fatal(err) @@ -956,7 +1029,21 @@ func testDirForFiles(tmpdir string, expected int) (numfiles int) { log.Fatal(err) } } - return len(files) + return files +} + +func testDirForFiles(tmpdir string, expected int) (numfiles int) { + return len(getInfoOfFilesInDir(tmpdir, expected)) +} + +func mtimesOfFilesInDir(tmpdir string, expected int) []time.Time { + files := getInfoOfFilesInDir(tmpdir, expected) + var times []time.Time + for _, info := range files { + times = append(times, info.ModTime()) + os.Remove(filepath.Join(tmpdir, info.Name())) + } + return times } func waitToFinish(s *Scheduler, maxS int, interval int) bool { From 9b0a2c5a926b1bd47b30add49cb64a10b2354c7e Mon Sep 17 00:00:00 2001 From: Sendu Bala Date: Thu, 10 May 2018 09:34:21 +0100 Subject: [PATCH 15/33] Fix typo in comment. --- cloud/openstack.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cloud/openstack.go b/cloud/openstack.go index 8f168e3bd..09026fdb8 100644 --- a/cloud/openstack.go +++ b/cloud/openstack.go @@ -108,7 +108,7 @@ func (p *openstackp) requiredEnv() []string { return openstackReqEnvs[:] } -// maybedEnv returns envs that might be required. +// maybeEnv returns envs that might be required. func (p *openstackp) maybeEnv() []string { return openstackMaybeEnvs[:] } From 9bcd82b8ad4e16fd37f337592f5d154eaaf9dea5 Mon Sep 17 00:00:00 2001 From: Sendu Bala Date: Fri, 11 May 2018 10:23:41 +0100 Subject: [PATCH 16/33] Add timeout on nova command calls, just in case. 
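
The timeout uses the standard context + exec pattern: the shelled-out nova
call is wrapped in a context carrying a deadline, run with
exec.CommandContext, and a deadline expiry is reported and treated as "0
servers counted" rather than letting a hung nova client stall the test
suite. A minimal stand-alone sketch of that pattern (the function name and
the echo command are illustrative only, not part of wr):

    package main

    import (
        "context"
        "fmt"
        "os/exec"
        "time"
    )

    // runWithTimeout runs a shell command, but gives up after d so a hung
    // external client cannot block the caller indefinitely.
    func runWithTimeout(cmdStr string, d time.Duration) (string, error) {
        ctx, cancel := context.WithTimeout(context.Background(), d)
        defer cancel()

        out, err := exec.CommandContext(ctx, "bash", "-c", cmdStr).Output()
        if ctx.Err() == context.DeadlineExceeded {
            // the child process was killed because the deadline passed
            return "", fmt.Errorf("exec of [%s] timed out", cmdStr)
        }
        return string(out), err
    }

    func main() {
        out, err := runWithTimeout("echo hello", 10*time.Second)
        fmt.Println(out, err)
    }
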
--- jobqueue/scheduler/scheduler_test.go | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/jobqueue/scheduler/scheduler_test.go b/jobqueue/scheduler/scheduler_test.go index ac930a349..37db428e3 100644 --- a/jobqueue/scheduler/scheduler_test.go +++ b/jobqueue/scheduler/scheduler_test.go @@ -19,6 +19,7 @@ package scheduler import ( + "context" "fmt" "io/ioutil" "log" @@ -529,7 +530,6 @@ func TestOpenstack(t *testing.T) { possibleReq := &Requirements{100, 1 * time.Minute, 1, 1, otherReqs} impossibleReq := &Requirements{9999999999, 999999 * time.Hour, 99999, 20, otherReqs} - Convey("ReserveTimeout() returns 25 seconds", func() { So(s.ReserveTimeout(), ShouldEqual, 1) }) @@ -767,7 +767,6 @@ func TestOpenstack(t *testing.T) { Convey("Schedule() lets you...", func() { oFile := filepath.Join(tmpdir, "out") oReqs := make(map[string]string) - if flavorRegex == `^m.*$` && os.Getenv("OS_TENANT_ID") == "" { Convey("Run a job on a specific flavor", func() { cmd := "sleep 10" @@ -1072,13 +1071,20 @@ func waitToFinish(s *Scheduler, maxS int, interval int) bool { } func novaCountServers(novaCmd string, rName, osPrefix string, flavor ...string) int { + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() var extra string if len(flavor) == 1 { extra = "--flavor " + flavor[0] + " " } if osPrefix == "" { - cmd := exec.Command("bash", "-c", novaCmd+" list "+extra+"| grep -c "+rName) + cmdStr := novaCmd + " list " + extra + "| grep -c " + rName + cmd := exec.CommandContext(ctx, "bash", "-c", cmdStr) out, err := cmd.Output() + if ctx.Err() != nil { + log.Printf("exec of [%s] timed out\n", cmdStr) + return 0 + } if err == nil { count, err := strconv.Atoi(strings.TrimSpace(string(out))) if err == nil { @@ -1086,8 +1092,13 @@ func novaCountServers(novaCmd string, rName, osPrefix string, flavor ...string) } } } else { - cmd := exec.Command("bash", "-c", novaCmd+" list "+extra+"| grep "+rName) + cmdStr := novaCmd + " list " + extra + "| grep " + rName + cmd := exec.CommandContext(ctx, "bash", "-c", cmdStr) out, err := cmd.Output() + if ctx.Err() != nil { + log.Printf("exec of [%s] timed out\n", cmdStr) + return 0 + } if err == nil { r := regexp.MustCompile(rName + "-\\S+") count := 0 From 29baa9c6866409d6aaa60a1e279e63a154d88c8a Mon Sep 17 00:00:00 2001 From: Sendu Bala Date: Fri, 11 May 2018 14:56:07 +0100 Subject: [PATCH 17/33] Fail immediately when starting manager in openstack mode outside of openstack. Manager now shows error/crit log lines on failure to start. Fix cloud command to correctly get and pass on max_servers and timeout to remote manager. 
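
The "shows error/crit log lines" part simply scans the manager's log file
for log15 level markers before dying, so the user can see why the
daemonised manager never came up (note that log15 abbreviates the error
level to "eror"). A stand-alone sketch of that idea, with an illustrative
path and function name that are not part of wr's API:

    package main

    import (
        "bufio"
        "fmt"
        "os"
        "strings"
    )

    // printLogProblems echoes any critical or error level lines found in a
    // log15-formatted log file.
    func printLogProblems(logPath string) error {
        f, err := os.Open(logPath)
        if err != nil {
            return err
        }
        defer f.Close()

        scanner := bufio.NewScanner(f)
        for scanner.Scan() {
            line := scanner.Text()
            if strings.Contains(line, "lvl=crit") || strings.Contains(line, "lvl=eror") {
                fmt.Println(line)
            }
        }
        return scanner.Err()
    }

    func main() {
        if err := printLogProblems("/tmp/wr_manager.log"); err != nil {
            fmt.Fprintln(os.Stderr, err)
        }
    }
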
--- cmd/cloud.go | 17 ++++++----------- cmd/manager.go | 26 +++++++++++++++++++++++++- 2 files changed, 31 insertions(+), 12 deletions(-) diff --git a/cmd/cloud.go b/cmd/cloud.go index 7adce8ba5..426557371 100644 --- a/cmd/cloud.go +++ b/cmd/cloud.go @@ -55,7 +55,7 @@ const wrEnvFileName = ".wr_envvars" // options for this cmd var providerName string -var maxServers int +var cloudMaxServers int var serverKeepAlive int var osPrefix string var osUsername string @@ -71,6 +71,7 @@ var cloudConfigFiles string var forceTearDown bool var setDomainIP bool var cloudDebug bool +var cloudManagerTimeoutSeconds int // cloudCmd represents the cloud command var cloudCmd = &cobra.Command{ @@ -597,12 +598,12 @@ func init() { cloudDeployCmd.Flags().StringVarP(&postCreationScript, "script", "s", defaultConfig.CloudScript, "path to a start-up script that will be run on each server created") cloudDeployCmd.Flags().StringVarP(&postDeploymentScript, "on_success", "x", defaultConfig.DeploySuccessScript, "path to a script to run locally after a successful deployment") cloudDeployCmd.Flags().IntVarP(&serverKeepAlive, "keepalive", "k", defaultConfig.CloudKeepAlive, "how long in seconds to keep idle spawned servers alive for; 0 means forever") - cloudDeployCmd.Flags().IntVarP(&maxServers, "max_servers", "m", defaultConfig.CloudServers+1, "maximum number of servers to spawn; 0 means unlimited (default 0)") + cloudDeployCmd.Flags().IntVarP(&cloudMaxServers, "max_servers", "m", defaultConfig.CloudServers+1, "maximum number of servers to spawn; 0 means unlimited (default 0)") cloudDeployCmd.Flags().StringVar(&cloudGatewayIP, "network_gateway_ip", defaultConfig.CloudGateway, "gateway IP for the created subnet") cloudDeployCmd.Flags().StringVar(&cloudCIDR, "network_cidr", defaultConfig.CloudCIDR, "CIDR of the created subnet") cloudDeployCmd.Flags().StringVar(&cloudDNS, "network_dns", defaultConfig.CloudDNS, "comma separated DNS name server IPs to use in the created subnet") cloudDeployCmd.Flags().StringVarP(&cloudConfigFiles, "config_files", "c", defaultConfig.CloudConfigFiles, "comma separated paths of config files to copy to spawned servers") - cloudDeployCmd.Flags().IntVarP(&managerTimeoutSeconds, "timeout", "t", 10, "how long to wait in seconds for the manager to start up") + cloudDeployCmd.Flags().IntVarP(&cloudManagerTimeoutSeconds, "timeout", "t", 15, "how long to wait in seconds for the manager to start up") cloudDeployCmd.Flags().BoolVar(&setDomainIP, "set_domain_ip", defaultConfig.ManagerSetDomainIP, "on success, use infoblox to set your domain's IP") cloudDeployCmd.Flags().BoolVar(&cloudDebug, "debug", false, "include extra debugging information in the logs") @@ -781,18 +782,12 @@ func bootstrapOnRemote(provider *cloud.Provider, server *cloud.Server, exe strin } // get the manager running - m := maxServers - 1 - if m == -2 { - // *** for unknown reason, if maxServers defaults to 0 in init(), - // here the value is -1?! User explicitly setting a value works as - // expected, and we don't get here. 
- m = -1 - } + m := cloudMaxServers - 1 debugStr := "" if cloudDebug { debugStr = " --debug" } - mCmd := fmt.Sprintf("source %s && %s manager start --deployment %s -s %s -k %d -o '%s' -r %d -m %d -u %s%s%s%s%s --cloud_gateway_ip '%s' --cloud_cidr '%s' --cloud_dns '%s' --local_username '%s' --timeout %d%s && rm %s", wrEnvFileName, remoteExe, config.Deployment, providerName, serverKeepAlive, osPrefix, osRAM, m, osUsername, postCreationArg, flavorArg, osDiskArg, configFilesArg, cloudGatewayIP, cloudCIDR, cloudDNS, realUsername(), managerTimeoutSeconds, debugStr, wrEnvFileName) + mCmd := fmt.Sprintf("source %s && %s manager start --deployment %s -s %s -k %d -o '%s' -r %d -m %d -u %s%s%s%s%s --cloud_gateway_ip '%s' --cloud_cidr '%s' --cloud_dns '%s' --local_username '%s' --timeout %d%s && rm %s", wrEnvFileName, remoteExe, config.Deployment, providerName, serverKeepAlive, osPrefix, osRAM, m, osUsername, postCreationArg, flavorArg, osDiskArg, configFilesArg, cloudGatewayIP, cloudCIDR, cloudDNS, realUsername(), cloudManagerTimeoutSeconds, debugStr, wrEnvFileName) var e string _, e, err = server.RunCmd(mCmd, false) diff --git a/cmd/manager.go b/cmd/manager.go index 88a331d29..31ec877e0 100644 --- a/cmd/manager.go +++ b/cmd/manager.go @@ -19,6 +19,7 @@ package cmd import ( + "bufio" "fmt" "io/ioutil" "os" @@ -29,6 +30,7 @@ import ( "syscall" "time" + "github.com/VertebrateResequencing/wr/cloud" "github.com/VertebrateResequencing/wr/internal" "github.com/VertebrateResequencing/wr/jobqueue" jqs "github.com/VertebrateResequencing/wr/jobqueue/scheduler" @@ -46,6 +48,7 @@ var localUsername string var backupPath string var managerTimeoutSeconds int var managerDebug bool +var maxServers int // managerCmd represents the manager command var managerCmd = &cobra.Command{ @@ -145,8 +148,19 @@ var managerStartCmd = &cobra.Command{ // before exiting mTimeout := time.Duration(managerTimeoutSeconds) * time.Second internal.WaitForFile(config.ManagerTokenFile, mTimeout) - jq := connect(mTimeout) + jq := connect(mTimeout, true) if jq == nil { + // display any error or crit lines in the log + f, errf := os.Open(config.ManagerLogFile) + if errf == nil { + scanner := bufio.NewScanner(f) + for scanner.Scan() { + line := scanner.Text() + if strings.Contains(line, "lvl=crit") || strings.Contains(line, "lvl=eror") { + fmt.Println(line) + } + } + } die("wr manager failed to start on port %s after %ds", config.ManagerPort, managerTimeoutSeconds) } token, err := token() @@ -507,6 +521,16 @@ func startJQ(postCreation []byte) { if config.ManagerCAFile != "" { cloudConfig.AddConfigFile(config.ManagerCAFile + ":~/.wr_" + config.Deployment + "/ca.pem") } + + // also check that we're actually in the cloud, or this is not going to + // work + provider, errc := cloud.New(scheduler, cloudResourceName(localUsername), filepath.Join(config.ManagerDir, "cloud_resources."+scheduler), appLogger) + if errc != nil { + die("cloud not connect to %s: %s", scheduler, errc) + } + if !provider.InCloud() { + die("according to hostname, this is not an instance in %s", scheduler) + } } // start the jobqueue server From 44acb22538f0d0dcdb5ed2e42dc0d2e9cf0fba69 Mon Sep 17 00:00:00 2001 From: Sendu Bala Date: Fri, 11 May 2018 14:57:21 +0100 Subject: [PATCH 18/33] Fix data race and flavor selection in tests. 
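
The race appears to have been the single oReqs map shared between test
blocks while the scheduler could still be reading it; the fix below makes
each block build its own map. As a general illustration (not wr code) of
why a shared map like that is flagged by the -race detector, and of the
per-goroutine alternative:

    package main

    import "sync"

    func main() {
        var wg sync.WaitGroup

        // racy: two goroutines write the same map with no synchronisation;
        // `go test -race` / `go run -race` reports this as a data race
        shared := make(map[string]string)
        wg.Add(2)
        go func() { defer wg.Done(); shared["cloud_os"] = "Ubuntu Xenial" }()
        go func() { defer wg.Done(); shared["cloud_os"] = "Centos 7" }()
        wg.Wait()

        // safe: each goroutine builds its own map, analogous to calling
        // make(map[string]string) inside each block instead of sharing one
        wg.Add(2)
        for _, osName := range []string{"Ubuntu Xenial", "Centos 7"} {
            osName := osName // capture the loop variable
            go func() {
                defer wg.Done()
                own := map[string]string{"cloud_os": osName}
                _ = own
            }()
        }
        wg.Wait()
    }
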
--- jobqueue/scheduler/scheduler_test.go | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/jobqueue/scheduler/scheduler_test.go b/jobqueue/scheduler/scheduler_test.go index 37db428e3..a6f9a9bf4 100644 --- a/jobqueue/scheduler/scheduler_test.go +++ b/jobqueue/scheduler/scheduler_test.go @@ -766,7 +766,6 @@ func TestOpenstack(t *testing.T) { Convey("Schedule() lets you...", func() { oFile := filepath.Join(tmpdir, "out") - oReqs := make(map[string]string) if flavorRegex == `^m.*$` && os.Getenv("OS_TENANT_ID") == "" { Convey("Run a job on a specific flavor", func() { cmd := "sleep 10" @@ -811,11 +810,12 @@ func TestOpenstack(t *testing.T) { Convey("Run jobs with no inputs/outputs", func() { // on authors setup, the following count is sufficient to // test spawning instances over the quota in the test - // environment if we reserve 54 cores per job - count := 10 + // environment if we reserve 26 cores per job + count := 18 eta := 200 // if it takes longer than this, it's a likely indicator of a bug where it has actually stalled on a stuck lock cmd := "sleep 10" - thisReq := &Requirements{100, 1 * time.Minute, 54, 1, oReqs} + oReqs := make(map[string]string) + thisReq := &Requirements{100, 1 * time.Minute, 26, 1, oReqs} err := s.Schedule(cmd, thisReq, count) So(err, ShouldBeNil) So(s.Busy(), ShouldBeTrue) @@ -865,6 +865,7 @@ func TestOpenstack(t *testing.T) { Convey("Run everything even when a server fails to spawn", func() { debugCounter = 0 debugEffect = "failFirstSpawn" + oReqs := make(map[string]string) newReq := &Requirements{100, 1 * time.Minute, 1, 1, oReqs} newCount := 3 eta := 120 @@ -878,6 +879,7 @@ func TestOpenstack(t *testing.T) { Convey("Run jobs and have servers still self-terminate when a server is slow to spawn", func() { debugCounter = 0 debugEffect = "slowSecondSpawn" + oReqs := make(map[string]string) newReq := &Requirements{100, 1 * time.Minute, 1, 1, oReqs} newCount := 3 eta := 120 @@ -898,6 +900,7 @@ func TestOpenstack(t *testing.T) { // *** test if we have a Centos 7 image to use... 
if osPrefix != "Centos 7" { + oReqs := make(map[string]string) oReqs["cloud_os"] = "Centos 7" oReqs["cloud_user"] = "centos" oReqs["cloud_os_ram"] = "4096" @@ -952,16 +955,18 @@ func TestOpenstack(t *testing.T) { }) } - numCores := 4 - multiCoreFlavor, err := oss.determineFlavor(&Requirements{1024, 1 * time.Minute, numCores, 6 * numCores, oReqs}) + numCores := 5 + oReqsm := make(map[string]string) + multiCoreFlavor, err := oss.determineFlavor(&Requirements{1024, 1 * time.Minute, numCores, 0, oReqsm}) if err == nil && multiCoreFlavor.Cores >= numCores { + oReqs := make(map[string]string) oReqs["cloud_os_ram"] = strconv.Itoa(multiCoreFlavor.RAM) - jobReq := &Requirements{int(multiCoreFlavor.RAM / numCores), 1 * time.Minute, 1, 6, oReqs} + jobReq := &Requirements{int(multiCoreFlavor.RAM / numCores), 1 * time.Minute, 1, 0, oReqs} confirmFlavor, err := oss.determineFlavor(oss.reqForSpawn(jobReq)) if err == nil && confirmFlavor.Cores >= numCores { Convey("Run multiple jobs at once on multi-core servers", func() { cmd := "sleep 30" - jobReq := &Requirements{int(multiCoreFlavor.RAM / numCores), 1 * time.Minute, 1, int(multiCoreFlavor.Disk / numCores), oReqs} + jobReq := &Requirements{int(multiCoreFlavor.RAM / numCores), 1 * time.Minute, 1, 0, oReqs} err = s.Schedule(cmd, jobReq, numCores) So(err, ShouldBeNil) So(s.Busy(), ShouldBeTrue) @@ -989,7 +994,7 @@ func TestOpenstack(t *testing.T) { }() // wait for enough time to have spawned a server - // and run both commands in parallel, but not + // and run the commands in parallel, but not // sequentially *** but how long does it take to // spawn?! (50s in authors test area, but this // will vary...) we need better confirmation of From fdb96675a49b078b0076d588bc70cf5b8704d990 Mon Sep 17 00:00:00 2001 From: Sendu Bala Date: Fri, 11 May 2018 14:59:13 +0100 Subject: [PATCH 19/33] Fix race condition where destroyed server can be considered alive and "not bad". --- cloud/server.go | 16 +++++++++++----- jobqueue/scheduler/openstack.go | 8 +++++--- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/cloud/server.go b/cloud/server.go index 37fe8f55d..27d53b7e5 100644 --- a/cloud/server.go +++ b/cloud/server.go @@ -66,6 +66,7 @@ type Server struct { cancelID int cancelRunCmd map[int]chan bool created bool // to distinguish instances we discovered or spawned + toBeDestroyed bool destroyed bool goneBad bool location *time.Location @@ -147,6 +148,7 @@ func (s *Server) Release(cores, ramMB, diskGB int) { // destroy the server s.mutex.Lock() s.onDeathrow = false + s.toBeDestroyed = true s.mutex.Unlock() err := s.Destroy() s.logger.Debug("server died on deathrow", "err", err) @@ -643,13 +645,16 @@ func (s *Server) GoneBad(permanentProblem ...string) { } // NotBad lets you change your mind about a server you called GoneBad() on. -// (Unless GoneBad() was called with a permanentProblem.) -func (s *Server) NotBad() { +// (Unless GoneBad() was called with a permanentProblem, or the server has been +// destroyed). +func (s *Server) NotBad() bool { s.mutex.Lock() defer s.mutex.Unlock() - if s.permanentProblem == "" { + if !s.destroyed && !s.toBeDestroyed && s.permanentProblem == "" { s.goneBad = false + return true } + return false } // IsBad tells you if GoneBad() has been called (more recently than NotBad()). 
@@ -686,6 +691,7 @@ func (s *Server) Destroy() error { ch <- true } + s.toBeDestroyed = false s.destroyed = true s.goneBad = true @@ -716,7 +722,7 @@ func (s *Server) Destroy() error { func (s *Server) Destroyed() bool { s.mutex.Lock() defer s.mutex.Unlock() - return s.destroyed + return s.destroyed || s.toBeDestroyed } // Alive tells you if a server is usable. It first does the same check as @@ -724,7 +730,7 @@ func (s *Server) Destroyed() bool { // will double check the server to make sure it can be ssh'd to. func (s *Server) Alive(checkSSH ...bool) bool { s.mutex.Lock() - if s.destroyed { + if s.destroyed || s.toBeDestroyed { s.mutex.Unlock() return false } diff --git a/jobqueue/scheduler/openstack.go b/jobqueue/scheduler/openstack.go index 21ba7f937..37bef2983 100644 --- a/jobqueue/scheduler/openstack.go +++ b/jobqueue/scheduler/openstack.go @@ -1173,9 +1173,11 @@ func (s *opst) stateUpdate() { if server.IsBad() { // check if the server is fine now if alive && server.PermanentProblem() == "" { - server.NotBad() - s.notifyBadServer(server) - s.Debug("server became good", "server", server.ID) + worked := server.NotBad() + if worked { + s.notifyBadServer(server) + s.Debug("server became good", "server", server.ID) + } } } else if !alive { server.GoneBad() From db49014e2b2d41204c7c04228e607c939720fa44 Mon Sep 17 00:00:00 2001 From: Sendu Bala Date: Fri, 11 May 2018 14:59:39 +0100 Subject: [PATCH 20/33] Fix data race in cloud.openstackp.flavors(). cloud.openstackp.initialize() now allows non-essential and time-consuming lookups to be done lazily. --- cloud/openstack.go | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/cloud/openstack.go b/cloud/openstack.go index 09026fdb8..80ec0149e 100644 --- a/cloud/openstack.go +++ b/cloud/openstack.go @@ -174,25 +174,10 @@ func (p *openstackp) initialize(logger log15.Logger) error { return err } - // get the external network id - p.externalNetworkID, err = networks.IDFromName(p.networkClient, p.poolName) - if err != nil { - return err - } - - // get the details of all the possible server flavors + // flavors and images are retrieved on-demand via caching methods that store + // in these maps p.fmap = make(map[string]*Flavor) - err = p.cacheFlavors() - if err != nil { - return err - } - - // get the details of all active images p.imap = make(map[string]*images.Image) - err = p.cacheImages() - if err != nil { - return err - } // to get a reasonable new server timeout we'll keep track of how long it // takes to spawn them using an exponentially weighted moving average. We @@ -515,6 +500,14 @@ func (p *openstackp) deploy(resources *Resources, requiredPorts []int, gatewayIP return err } if routerID == "" { + // get the external network id + if p.externalNetworkID == "" { + p.externalNetworkID, err = networks.IDFromName(p.networkClient, p.poolName) + if err != nil { + return err + } + } + var router *routers.Router router, err = routers.Create(p.networkClient, routers.CreateOpts{ Name: resources.ResourceName, @@ -617,7 +610,10 @@ func (p *openstackp) flavors() map[string]*Flavor { } p.fmapMutex.RLock() } - fmap := p.fmap + fmap := make(map[string]*Flavor) + for key, val := range p.fmap { + fmap[key] = val + } p.fmapMutex.RUnlock() return fmap } From ea8dc0d2774c3169b642d9086f6cdab556fdd3cf Mon Sep 17 00:00:00 2001 From: Sendu Bala Date: Fri, 11 May 2018 15:32:50 +0100 Subject: [PATCH 21/33] Fix LSF emulation by connecting to config.ManagerHost, not hardcoded localhost! 
--- cmd/root.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/root.go b/cmd/root.go index 7cc05f96d..16a86abbd 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -271,7 +271,7 @@ func connect(wait time.Duration, expectedToBeDown ...bool) *jobqueue.Client { die("could not read token file; has the manager been started? [%s]", err) } - jq, err := jobqueue.Connect("localhost:"+config.ManagerPort, caFile, config.ManagerCertDomain, token, wait) + jq, err := jobqueue.Connect(config.ManagerHost+":"+config.ManagerPort, caFile, config.ManagerCertDomain, token, wait) if err != nil && !(len(expectedToBeDown) == 1 && expectedToBeDown[0]) { die("%s", err) } From 60d4eac3799f5acdb9c5583952f075f3800e3487 Mon Sep 17 00:00:00 2001 From: Sendu Bala Date: Mon, 14 May 2018 14:53:37 +0100 Subject: [PATCH 22/33] Add minimal implementations of bjobs and bkill. --- cmd/lsf.go | 287 ++++++++++++++++++++++++++++++++++++++++-- cmd/root.go | 2 +- jobqueue/serverCLI.go | 1 + main.go | 2 + 4 files changed, 282 insertions(+), 10 deletions(-) diff --git a/cmd/lsf.go b/cmd/lsf.go index a57a4d129..fdb2600fa 100644 --- a/cmd/lsf.go +++ b/cmd/lsf.go @@ -21,19 +21,40 @@ package cmd import ( "bufio" "encoding/json" + goflag "flag" "fmt" + "io" "os" "regexp" "strconv" "strings" + "text/tabwriter" "time" + "github.com/VertebrateResequencing/wr/internal" "github.com/VertebrateResequencing/wr/jobqueue" jqs "github.com/VertebrateResequencing/wr/jobqueue/scheduler" "github.com/spf13/cobra" ) +const lsfTimeFormat = "Jan 02 15:04" + +var jobStateToLSFState = map[jobqueue.JobState]string{ + jobqueue.JobStateNew: "PEND", + jobqueue.JobStateDelayed: "PEND", + jobqueue.JobStateDependent: "PEND", + jobqueue.JobStateReady: "PEND", + jobqueue.JobStateReserved: "PEND", + jobqueue.JobStateRunning: "RUN", + jobqueue.JobStateLost: "UNKWN", + jobqueue.JobStateBuried: "EXIT", + jobqueue.JobStateComplete: "DONE", +} + // options for this cmd +var lsfNoHeader bool +var lsfFormat string +var lsfQueue string // lsfCmd represents the lsf command. var lsfCmd = &cobra.Command{ @@ -55,7 +76,8 @@ NB: currently the emulation is extremely limited, supporting only the interactive "console" mode where you run bsub without any arguments, and it only supports single flags per #BSUB line, and it only pays attention to -J, -n and -M flags. (This is sufficient for compatibility with 10x Genomic's cellranger -software, and to work as the scheduler for nextflow.) +software (which has Maritan built in), and to work as the scheduler for +nextflow.) There is only one "queue", called 'wr'. The best way to use this LSF emulation is not to call this command yourself directly, but to use 'wr add --bsubs [other opts]' to add the command that you @@ -184,6 +206,8 @@ var lsfBsubCmd = &cobra.Command{ }, } +type lsfFieldDisplay func(*jobqueue.Job) string + // bjobs sub-command emulates bjobs. var lsfBjobsCmd = &cobra.Command{ Use: "bjobs", @@ -191,21 +215,266 @@ var lsfBjobsCmd = &cobra.Command{ Long: `See jobs that have been added using the lsf bsub command, using bjobs syntax and being formatted the way bjobs display this information. -NB: Not yet implemented.`, +Only lists all incomplete jobs. Unlike real bjobs, does not list recently +completed jobs. Unlike real bjobs, does not truncate columns (always effectivly +in -w mode). + +Only supports this limited set of real bjobs options: +-noheader +-o +-q + +The output format only supports simple listing of desired columns (not choosing +their width), and specifying the delimiter. 
The only columns supported are +JOBID, USER, STAT, QUEUE, FROM_HOST, EXEC_HOST, JOB_NAME and SUBMIT_TIME. +eg. -o 'JOBID STAT SUBMIT_TIME delimiter=","' + +While -q can be provided, and that provided queue will be displayed in the +output, in reality there is only 1 queue called 'wr', so -q has no real function +other than providing compatability with real bjobs command line args.`, + Run: func(cmd *cobra.Command, args []string) { + user, err := internal.Username() + if err != nil { + die(err.Error()) + } + + // connect to the server + jq := connect(10 * time.Second) + defer func() { + err = jq.Disconnect() + if err != nil { + warn("Disconnecting from the server failed: %s", err) + } + }() + + // set up viewing of the allowed fields + fieldLookup := make(map[string]lsfFieldDisplay) + fieldLookup["JOBID"] = func(job *jobqueue.Job) string { + return strconv.Itoa(int(job.BsubID)) + } + fieldLookup["USER"] = func(job *jobqueue.Job) string { + return user + } + fieldLookup["STAT"] = func(job *jobqueue.Job) string { + return jobStateToLSFState[job.State] + } + fieldLookup["QUEUE"] = func(job *jobqueue.Job) string { + return lsfQueue + } + fieldLookup["FROM_HOST"] = func(job *jobqueue.Job) string { + return jq.ServerInfo.Host + } + fieldLookup["EXEC_HOST"] = func(job *jobqueue.Job) string { + return job.Host + } + fieldLookup["JOB_NAME"] = func(job *jobqueue.Job) string { + return job.RepGroup + } + fieldLookup["SUBMIT_TIME"] = func(job *jobqueue.Job) string { + return job.StartTime.Format(lsfTimeFormat) + } + + // parse -o + var delimiter string + var fields []string + var w io.Writer + if lsfFormat != "" { + // parse -o format + re := regexp.MustCompile(`(?i)\s*delimiter=["'](.*)["']\s*`) + matches := re.FindStringSubmatch(lsfFormat) + if matches != nil { + delimiter = matches[1] + lsfFormat = re.ReplaceAllString(lsfFormat, "") + } else { + delimiter = " " + } + for _, field := range strings.Split(lsfFormat, " ") { + field = strings.ToUpper(field) + if _, exists := fieldLookup[field]; !exists { + die("unsupported field '%s'", field) + } + fields = append(fields, field) + } + + // custom format just uses a single delimiter between fields + w = os.Stdout + } else { + // standard format uses aligned columns of the fields + delimiter = "\t" + fields = []string{"JOBID", "USER", "STAT", "QUEUE", "FROM_HOST", "EXEC_HOST", "JOB_NAME", "SUBMIT_TIME"} + w = tabwriter.NewWriter(os.Stdout, 2, 2, 3, ' ', 0) + } + + // get all incomplete jobs + jobs, err := jq.GetIncomplete(0, "", false, false) + if err != nil { + die(err.Error()) + } + + // print out details about the ones that have BsubIDs + found := false + for _, job := range jobs { + jid := job.BsubID + if jid == 0 { + continue + } + + if !found { + if !lsfNoHeader { + // print header + fmt.Fprintln(w, strings.Join(fields, delimiter)) + } + found = true + } + + var vals []string + for _, field := range fields { + vals = append(vals, fieldLookup[field](job)) + } + fmt.Fprintln(w, strings.Join(vals, delimiter)) + } + + if lsfFormat == "" { + tw := w.(*tabwriter.Writer) + tw.Flush() + } + + if !found { + fmt.Println("No unfinished job found") + } + }, +} + +// bkill sub-command emulates bkill. +var lsfBkillCmd = &cobra.Command{ + Use: "bkill", + Short: "Kill jobs added using bsub", + Long: `Kill jobs that have been added using the lsf bsub command. + +Only supports providing jobIds as command line arguements. Does not currently +understand any of the options that real bkill does. 
+ +Note that if a given jobId is not currently in the queue, always just claims +that the job has already finished, even if an invalid jobId was supplied.`, Run: func(cmd *cobra.Command, args []string) { - fmt.Println("bjobs not yet implemented") - os.Exit(-1) + // convert args to uint64s + desired := make(map[uint64]bool) + for _, arg := range args { + i, err := strconv.Atoi(arg) + if err != nil { + die("could not convert jobID [%s] to an int: %s", arg, err) + } + desired[uint64(i)] = true + } + if len(desired) == 0 { + die("job ID must be specified") + } + + // connect to the server + jq := connect(10 * time.Second) + var err error + defer func() { + err = jq.Disconnect() + if err != nil { + warn("Disconnecting from the server failed: %s", err) + } + }() + + // get all incomplete jobs *** this is hardly efficient... + jobs, err := jq.GetIncomplete(0, "", false, false) + if err != nil { + die(err.Error()) + } + + // remove the matching ones + JOBS: + for _, job := range jobs { + jid := job.BsubID + if !desired[jid] { + continue + } + + if job.State == jobqueue.JobStateRunning { + _, errk := jq.Kill([]*jobqueue.JobEssence{job.ToEssense()}) + if errk != nil { + warn("error trying to kill job %d: %s", jid, errk) + continue + } + + // wait until it gets buried + for { + <-time.After(500 * time.Millisecond) + got, errg := jq.GetByEssence(job.ToEssense(), false, false) + if errg != nil { + warn("error trying confirm job %d was killed: %s", jid, errg) + continue JOBS + } + + if got.State == jobqueue.JobStateBuried { + break + } + } + } + + _, errd := jq.Delete([]*jobqueue.JobEssence{job.ToEssense()}) + if errd != nil { + warn("error trying to delete job %d: %s", jid, errd) + continue + } + + fmt.Printf("Job <%d> is being terminated\n", jid) + delete(desired, jid) + } + + for jid := range desired { + fmt.Printf("Job <%d>: Job has already finished\n", jid) + } }, } func init() { + // custom handling of LSF args with their single dashes + args, lsfArgs := filterGoFlags(os.Args, map[string]bool{ + "noheader": false, + "o": true, + "q": true, + }) + os.Args = args + + goflag.BoolVar(&lsfNoHeader, "noheader", false, "disable header output") + goflag.StringVar(&lsfFormat, "o", "", "output format") + goflag.StringVar(&lsfQueue, "q", "wr", "queue") + if err := goflag.CommandLine.Parse(lsfArgs); err != nil { + die("error parsing LSF args: ", err) + } + RootCmd.AddCommand(lsfCmd) lsfCmd.AddCommand(lsfBsubCmd) lsfCmd.AddCommand(lsfBjobsCmd) + lsfCmd.AddCommand(lsfBkillCmd) +} - // flags specific to these sub-commands - // defaultConfig := internal.DefaultConfig() - // managerStartCmd.Flags().BoolVarP(&foreground, "foreground", "f", false, "do not daemonize") - // managerStartCmd.Flags().StringVarP(&scheduler, "scheduler", "s", defaultConfig.ManagerScheduler, "['local','lsf','openstack'] job scheduler") - // managerStartCmd.Flags().IntVarP(&osRAM, "cloud_ram", "r", defaultConfig.CloudRAM, "for cloud schedulers, ram (MB) needed by the OS image specified by --cloud_os") +// filterGoFlags splits lsf args, which use single dash named args, from wr +// args, which use single dash to mean a set of shorthand flags. 
+func filterGoFlags(args []string, prefixes map[string]bool) ([]string, []string) { + // from https://gist.github.com/doublerebel/8b95c5c118e958e495d2 + var goFlags []string + for i := 0; 0 < len(args) && i < len(args); i++ { + for prefix, hasValue := range prefixes { + if strings.HasPrefix(args[i], "-"+prefix) { + goFlags = append(goFlags, args[i]) + skip := 1 + if hasValue && i+1 < len(args) { + goFlags = append(goFlags, args[i+1]) + skip = 2 + } + if i+skip <= len(args) { + args = append(args[:i], args[i+skip:]...) + } + i-- + break + } + } + } + return args, goFlags } diff --git a/cmd/root.go b/cmd/root.go index 16a86abbd..97c7061fe 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -80,7 +80,7 @@ func Execute() { // ExecuteLSF is for treating a call to wr as if `wr lsf xxx` was called, for // the LSF emulation to work. func ExecuteLSF(cmd string) { - args := []string{"lsf", cmd} + args := append([]string{"lsf", cmd}, os.Args[1:]...) command, _, err := RootCmd.Find(args) if err != nil { die(err.Error()) diff --git a/jobqueue/serverCLI.go b/jobqueue/serverCLI.go index c9b36808b..464e241cf 100644 --- a/jobqueue/serverCLI.go +++ b/jobqueue/serverCLI.go @@ -705,6 +705,7 @@ func (s *Server) itemToJob(item *queue.Item, getStd bool, getEnv bool) *Job { MountConfigs: sjob.MountConfigs, MonitorDocker: sjob.MonitorDocker, BsubMode: sjob.BsubMode, + BsubID: sjob.BsubID, } if state == JobStateReserved && !sjob.StartTime.IsZero() { diff --git a/main.go b/main.go index bfed70e54..0711b12ce 100644 --- a/main.go +++ b/main.go @@ -113,6 +113,8 @@ func main() { cmd.ExecuteLSF("bsub") case "bjobs": cmd.ExecuteLSF("bjobs") + case "bkill": + cmd.ExecuteLSF("bkill") default: // otherwise we call our root command, which handles everything else cmd.Execute() From 0818438f1f1b115486f06f903edffd613ae2b1d3 Mon Sep 17 00:00:00 2001 From: Sendu Bala Date: Mon, 14 May 2018 17:00:38 +0100 Subject: [PATCH 23/33] Fix reqForSpawn() to not lower RAM when disk needs changing. --- jobqueue/scheduler/openstack.go | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/jobqueue/scheduler/openstack.go b/jobqueue/scheduler/openstack.go index 37bef2983..8cb529a39 100644 --- a/jobqueue/scheduler/openstack.go +++ b/jobqueue/scheduler/openstack.go @@ -745,20 +745,30 @@ func (s *opst) reqForSpawn(req *Requirements) *Requirements { osRAM = s.config.OSRAM } + if req.RAM < osRAM { + reqForSpawn = &Requirements{ + RAM: osRAM, + Time: req.Time, + Cores: req.Cores, + Disk: req.Disk, + Other: req.Other, + } + } + disk := req.Disk if disk == 0 { disk = s.config.OSDisk } - - if req.RAM < osRAM || req.Disk < disk { + if req.Disk < disk { reqForSpawn = &Requirements{ - RAM: osRAM, - Time: req.Time, - Cores: req.Cores, + RAM: reqForSpawn.RAM, + Time: reqForSpawn.Time, + Cores: reqForSpawn.Cores, Disk: disk, - Other: req.Other, + Other: reqForSpawn.Other, } } + return reqForSpawn } From e47d630366447ed4b19cc993a5db52f4d674ad95 Mon Sep 17 00:00:00 2001 From: Sendu Bala Date: Tue, 15 May 2018 09:49:12 +0100 Subject: [PATCH 24/33] Fix filtering of -o and -q on all subcmds. 
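
Because cmd/lsf.go's init() runs for every invocation of the same wr
binary, pre-filtering "-o" and "-q" out of os.Args there was also
stripping those flags from every other sub-command; only "-noheader" is
now handled that way, with -o and -q registered as normal cobra shorthands
scoped to the bjobs sub-command. A minimal sketch (not wr code) of that
scoping:

    package main

    import (
        "fmt"

        "github.com/spf13/cobra"
    )

    func main() {
        var format string

        root := &cobra.Command{Use: "demo"}
        bjobs := &cobra.Command{
            Use: "bjobs",
            Run: func(cmd *cobra.Command, args []string) {
                fmt.Println("format:", format)
            },
        }
        // the -o shorthand exists only on this sub-command, so other
        // sub-commands remain free to define their own -o
        bjobs.Flags().StringVarP(&format, "output", "o", "", "output format")
        root.AddCommand(bjobs)

        root.SetArgs([]string{"bjobs", "-o", "JOBID STAT"})
        if err := root.Execute(); err != nil {
            fmt.Println(err)
        }
    }
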
--- cmd/lsf.go | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/cmd/lsf.go b/cmd/lsf.go index fdb2600fa..67c3679d8 100644 --- a/cmd/lsf.go +++ b/cmd/lsf.go @@ -436,14 +436,10 @@ func init() { // custom handling of LSF args with their single dashes args, lsfArgs := filterGoFlags(os.Args, map[string]bool{ "noheader": false, - "o": true, - "q": true, }) os.Args = args goflag.BoolVar(&lsfNoHeader, "noheader", false, "disable header output") - goflag.StringVar(&lsfFormat, "o", "", "output format") - goflag.StringVar(&lsfQueue, "q", "wr", "queue") if err := goflag.CommandLine.Parse(lsfArgs); err != nil { die("error parsing LSF args: ", err) } @@ -452,6 +448,11 @@ func init() { lsfCmd.AddCommand(lsfBsubCmd) lsfCmd.AddCommand(lsfBjobsCmd) lsfCmd.AddCommand(lsfBkillCmd) + + // add lsf single character options using normal method, so these don't get + // stripped out from all other wr sub-cmds + lsfBjobsCmd.Flags().StringVarP(&lsfFormat, "output", "o", "", "output format") + lsfBjobsCmd.Flags().StringVarP(&lsfQueue, "queue", "q", "wr", "queue") } // filterGoFlags splits lsf args, which use single dash named args, from wr From 513f0711a50df18c48cbf01a757e7e54df7b1c87 Mon Sep 17 00:00:00 2001 From: Sendu Bala Date: Tue, 15 May 2018 12:42:44 +0100 Subject: [PATCH 25/33] Remove commented out code. --- jobqueue/job.go | 3 --- 1 file changed, 3 deletions(-) diff --git a/jobqueue/job.go b/jobqueue/job.go index c826956eb..a3d5106e1 100644 --- a/jobqueue/job.go +++ b/jobqueue/job.go @@ -437,9 +437,6 @@ func (j *Job) TriggerBehaviours(success bool) error { // itself. (This will fail if j.Cwd is not empty or already mounted by another // process.) func (j *Job) Mount(onCwd ...bool) error { - // j.Lock() - // defer j.Unlock() - cwd := j.Cwd defaultMount := filepath.Join(j.Cwd, "mnt") defaultCacheBase := cwd From 143d0055e1ed4a19d8f56aafdcc6e5dccc58c432 Mon Sep 17 00:00:00 2001 From: Sendu Bala Date: Tue, 15 May 2018 14:14:59 +0100 Subject: [PATCH 26/33] Improve error message when failing to upload cloud config files. --- cmd/add.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/add.go b/cmd/add.go index 7ef1e444a..dcae421eb 100644 --- a/cmd/add.go +++ b/cmd/add.go @@ -594,7 +594,7 @@ func copyCloudConfigFiles(jq *jobqueue.Client, configFiles string) string { remote, err := jq.UploadFile(local, desired) if err != nil { - warn("failed to open file %s: %s", local, err) + warn("failed to upload [%s] to [%s]: %s", local, desired, err) remoteConfigFiles = append(remoteConfigFiles, cf) continue } From 357063eed290ea23cce71c660f7f55f47eb03a59 Mon Sep 17 00:00:00 2001 From: Sendu Bala Date: Tue, 15 May 2018 16:11:48 +0100 Subject: [PATCH 27/33] Fix Execute() to work in bsub mode when --cwd is /tmp; create LSF symlinks in tmp dir. 
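
The shape of the new approach in the diff below: create the bsub, bjobs
and bkill symlinks to wr's own executable in a throw-away directory from
ioutil.TempDir (so the location no longer depends on the job's --cwd), and
remove that directory again when the job finishes. A stand-alone sketch,
with an illustrative directory prefix:

    package main

    import (
        "fmt"
        "io/ioutil"
        "os"
        "path/filepath"
    )

    func main() {
        // a private, temporary home for the LSF emulation symlinks
        dir, err := ioutil.TempDir("", "wr_lsf_emulation")
        if err != nil {
            panic(err)
        }
        defer os.RemoveAll(dir)

        // each emulated LSF command is just a symlink back to ourselves
        self, err := os.Executable()
        if err != nil {
            panic(err)
        }
        for _, name := range []string{"bsub", "bjobs", "bkill"} {
            if err := os.Symlink(self, filepath.Join(dir, name)); err != nil {
                panic(err)
            }
        }

        // the directory can then be prepended to PATH for the job's command
        fmt.Println("emulation dir:", dir)
    }
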
--- jobqueue/client.go | 168 +++++++++++++++++++++++---------------------- 1 file changed, 86 insertions(+), 82 deletions(-) diff --git a/jobqueue/client.go b/jobqueue/client.go index dc76f6971..3dbfff112 100644 --- a/jobqueue/client.go +++ b/jobqueue/client.go @@ -438,86 +438,6 @@ func (c *Client) Execute(job *Job, shell string) error { stdout := &prefixSuffixSaver{N: 4096} stdoutWait := stdFilter(outReader, stdout) - var onCwd bool - var prependPath string - if job.BsubMode != "" { - // create parent of job.Cwd so we can later mount at job.Cwd - parent := filepath.Dir(job.Cwd) - errm := os.MkdirAll(parent, os.ModePerm) - if fi, errs := os.Stat(parent); errs != nil || !fi.Mode().IsDir() { - errb := c.Bury(job, nil, FailReasonCwd) - extra := "" - if errb != nil { - extra = fmt.Sprintf(" (and burying the job failed: %s)", errb) - } - return fmt.Errorf("parent of working directory [%s] could not be created: %s%s", parent, errm, extra) - } - } - var mountCouldFail bool - host, err := os.Hostname() - if err != nil { - host = localhost - } - if job.BsubMode != "" { - jobCwd := job.Cwd - if jobCwd == "" { - jobCwd = "." - } - absJobCwd, erra := filepath.Abs(jobCwd) - if erra != nil { - errb := c.Bury(job, nil, FailReasonCwd) - extra := "" - if errb != nil { - extra = fmt.Sprintf(" (and burying the job failed: %s)", errb) - } - return fmt.Errorf("failed to make cmd dir absolute: %s%s", erra, extra) - } - parent := filepath.Dir(absJobCwd) - - // create bsub and bjobs symlinks in a sister dir of job.Cwd - prependPath = filepath.Join(parent, lsfEmulationDir) - errm := os.MkdirAll(prependPath, os.ModePerm) - if fi, errs := os.Stat(prependPath); errs != nil || !fi.Mode().IsDir() { - errb := c.Bury(job, nil, FailReasonCwd) - extra := "" - if errb != nil { - extra = fmt.Sprintf(" (and burying the job failed: %s)", errb) - } - return fmt.Errorf("sister of working directory [%s] could not be created: %s%s", prependPath, errm, extra) - } - wr, erre := os.Executable() - if erre != nil { - errb := c.Bury(job, nil, FailReasonCwd) - extra := "" - if errb != nil { - extra = fmt.Sprintf(" (and burying the job failed: %s)", errb) - } - return fmt.Errorf("could not get path to wr: %s%s", erre, extra) - } - bsub := filepath.Join(prependPath, "bsub") - bjobs := filepath.Join(prependPath, "bjobs") - err = os.Symlink(wr, bsub) - if err != nil && !os.IsExist(err) { - errb := c.Bury(job, nil, FailReasonCwd) - extra := "" - if errb != nil { - extra = fmt.Sprintf(" (and burying the job failed: %s)", errb) - } - return fmt.Errorf("could not create bsub symlink: %s%s", err, extra) - } - err = os.Symlink(wr, bjobs) - if err != nil && !os.IsExist(err) { - errb := c.Bury(job, nil, FailReasonCwd) - extra := "" - if errb != nil { - extra = fmt.Sprintf(" (and burying the job failed: %s)", errb) - } - return fmt.Errorf("could not create bjobs symlink: %s%s", err, extra) - } - - onCwd = job.CwdMatters - } - // we'll run the command from the desired directory, which must exist or // it will fail if fi, errf := os.Stat(job.Cwd); errf != nil || !fi.Mode().IsDir() { @@ -549,9 +469,48 @@ func (c *Client) Execute(job *Job, shell string) error { job.ActualCwd = actualCwd } + var myerr error + + var onCwd bool + var prependPath string + if job.BsubMode != "" { + // create our bsub symlinks in a tmp dir + prependPath, err = ioutil.TempDir("", lsfEmulationDir) + if err != nil { + buryErr := fmt.Errorf("could not create lsf emulation directory: %s", err) + errb := c.Bury(job, nil, FailReasonCwd, buryErr) + if errb != nil { + buryErr = 
fmt.Errorf("%s (and burying the job failed: %s)", buryErr.Error(), errb) + } + return buryErr + } + defer func() { + errr := os.RemoveAll(prependPath) + if errr != nil { + if myerr == nil { + myerr = errr + } else { + myerr = fmt.Errorf("%s (and removing the lsf emulation dir failed: %s)", myerr.Error(), errr) + } + } + }() + + err = c.createLSFSymlinks(prependPath, job) + if err != nil { + return err + } + + onCwd = job.CwdMatters + } + // if we are a child job of another running on the same host, we expect // mounting to fail since we're running in the same directory as our // parent + var mountCouldFail bool + host, err := os.Hostname() + if err != nil { + host = localhost + } if jsonStr := job.Getenv("WR_BSUB_CONFIG"); jsonStr != "" { configJob := &Job{} if erru := json.Unmarshal([]byte(jsonStr), configJob); erru == nil && configJob.Host == host { @@ -580,8 +539,6 @@ func (c *Client) Execute(job *Job, shell string) error { } } - var myerr error - // and we'll run it with the environment variables that were present when // the command was first added to the queue (or if none, current env vars, // and in either case, including any overrides) *** we need a way for users @@ -1162,6 +1119,53 @@ func (c *Client) Execute(job *Job, shell string) error { return myerr } +// createLSFSymlinks creates symlinks of bsub, bjobs and bkill to own exe, +// inside the given dir. +func (c *Client) createLSFSymlinks(prependPath string, job *Job) error { + wr, erre := os.Executable() + if erre != nil { + errb := c.Bury(job, nil, FailReasonCwd) + extra := "" + if errb != nil { + extra = fmt.Sprintf(" (and burying the job failed: %s)", errb) + } + return fmt.Errorf("could not get path to wr: %s%s", erre, extra) + } + + bsub := filepath.Join(prependPath, "bsub") + bjobs := filepath.Join(prependPath, "bjobs") + bkill := filepath.Join(prependPath, "bkill") + err := os.Symlink(wr, bsub) + if err != nil { + errb := c.Bury(job, nil, FailReasonCwd) + extra := "" + if errb != nil { + extra = fmt.Sprintf(" (and burying the job failed: %s)", errb) + } + return fmt.Errorf("could not create bsub symlink: %s%s", err, extra) + } + err = os.Symlink(wr, bjobs) + if err != nil { + errb := c.Bury(job, nil, FailReasonCwd) + extra := "" + if errb != nil { + extra = fmt.Sprintf(" (and burying the job failed: %s)", errb) + } + return fmt.Errorf("could not create bjobs symlink: %s%s", err, extra) + } + err = os.Symlink(wr, bkill) + if err != nil { + errb := c.Bury(job, nil, FailReasonCwd) + extra := "" + if errb != nil { + extra = fmt.Sprintf(" (and burying the job failed: %s)", errb) + } + return fmt.Errorf("could not create bkill symlink: %s%s", err, extra) + } + + return nil +} + // Started updates a Job on the server with information that you've started // running the Job's Cmd. Started also figures out some host name, ip and // possibly id (in cloud situations) to associate with the job, so that if From b20a39844f1d8bdae52ce707c230d5ef4e1ecc0d Mon Sep 17 00:00:00 2001 From: Sendu Bala Date: Thu, 17 May 2018 13:45:44 +0100 Subject: [PATCH 28/33] New cloud_shared option for added jobs, for setting up an NFS shared disk. New cloud.Server.[Create|Mount]SharedDisk() methods. OpenStack scheduler uses these to create NFS share on-demand and mount it on client servers. cloud.WaitUntilReady() changed to require the postCreationScript arg. cloud.LoclhostServer now fills in non-loopback IP, UserName and sets Name to "localhost". cloud.Server has new SharedDisk property, used during Matches(). 
cloud.Server.MkDir() fixed to make the given directory, not just its parent. Moved jobqueue.CurrentIP() to internal.CurrentIP(). --- cloud/cloud.go | 32 ++++-- cloud/server.go | 143 ++++++++++++++++++++++++--- cmd/add.go | 22 ++++- cmd/manager.go | 4 +- internal/utils.go | 84 ++++++++++++++++ jobqueue/client.go | 6 +- jobqueue/job.go | 7 +- jobqueue/jobqueue_test.go | 8 +- jobqueue/scheduler/openstack.go | 44 ++++++--- jobqueue/scheduler/scheduler_test.go | 34 +++++++ jobqueue/server.go | 2 +- jobqueue/serverREST.go | 11 ++- jobqueue/utils.go | 85 ---------------- 13 files changed, 348 insertions(+), 134 deletions(-) diff --git a/cloud/cloud.go b/cloud/cloud.go index 33c57c01b..f2c5e6d16 100644 --- a/cloud/cloud.go +++ b/cloud/cloud.go @@ -573,10 +573,10 @@ func (p *Provider) Spawn(os string, osUser string, flavorID string, diskGB int, // non-blank it will CopyOver the specified files (after the server is ready, // before any postCreationScript is run). // -// postCreationScript is the optional []byte content of a script that will be -// run on the server (as the user supplied to Spawn()) once it is ready, and it -// will complete before this function returns; empty slice means do nothing. -func (s *Server) WaitUntilReady(files string, postCreationScript ...[]byte) error { +// postCreationScript is the []byte content of a script that will be run on the +// server (as the user supplied to Spawn()) once it is ready, and it will +// complete before this function returns; empty slice means do nothing. +func (s *Server) WaitUntilReady(files string, postCreationScript []byte) error { // wait for ssh to come up _, err := s.SSHClient() if err != nil { @@ -619,9 +619,9 @@ SENTINEL: } // run the postCreationScript - if len(postCreationScript[0]) > 0 { + if len(postCreationScript) > 0 { pcsPath := "/tmp/.postCreationScript" - err = s.CreateFile(string(postCreationScript[0]), pcsPath) + err = s.CreateFile(string(postCreationScript), pcsPath) if err != nil { return fmt.Errorf("cloud server start up script failed to upload: %s", err) } @@ -647,7 +647,7 @@ SENTINEL: s.logger.Warn("failed to remove post creation script", "path", pcsPath, "err", rmErr) } - s.Script = postCreationScript[0] + s.Script = postCreationScript // because the postCreationScript may have altered PATH and other things // that subsequent RunCmd may rely on, clear the client @@ -727,15 +727,29 @@ func (p *Provider) HeadNode() *Server { // LocalhostServer returns a Server object with details of the host we are // currently running on. No cloud API calls are made to construct this. 
-func (p *Provider) LocalhostServer(os string, postCreationScript []byte, configFiles string) (*Server, error) { +func (p *Provider) LocalhostServer(os string, postCreationScript []byte, configFiles string, cidr ...string) (*Server, error) { maxRAM, err := internal.ProcMeminfoMBs() if err != nil { return nil, err } + diskSize := internal.DiskSize() + + ip, err := internal.CurrentIP(cidr[0]) + if err != nil { + return nil, err + } + + user, err := internal.Username() + if err != nil { + return nil, err + } + return &Server{ - IP: "127.0.0.1", + Name: "localhost", + IP: ip, OS: os, + UserName: user, Script: postCreationScript, ConfigFiles: configFiles, Flavor: &Flavor{ diff --git a/cloud/server.go b/cloud/server.go index 27d53b7e5..cdc1bb0e7 100644 --- a/cloud/server.go +++ b/cloud/server.go @@ -22,10 +22,12 @@ package cloud import ( "bytes" + "context" "errors" "fmt" "io" "os" + "os/exec" "path/filepath" "strings" "sync" @@ -37,6 +39,8 @@ import ( "golang.org/x/crypto/ssh" ) +const sharePath = "/shared" // mount point for the *SharedDisk methods + // Flavor describes a "flavor" of server, which is a certain (virtual) hardware // configuration type Flavor struct { @@ -60,6 +64,7 @@ type Server struct { OS string // the name of the Operating System image Script []byte // the content of a start-up script run on the server ConfigFiles string // files that you will CopyOver() and require to be on this Server, in CopyOver() format + SharedDisk bool // the server will mount /shared TTD time.Duration // amount of idle time allowed before destruction UserName string // the username needed to log in to the server cancelDestruction chan bool @@ -80,15 +85,17 @@ type Server struct { usedRAM int homeDir string hmutex sync.Mutex + createdShare bool + csmutex sync.Mutex logger log15.Logger // (not embedded to make gob happy) } // Matches tells you if in principle a Server has the given os, script, config -// files and flavor. Useful before calling HasSpaceFor, since if you don't match -// these things you can't use the Server regardless of how empty it is. -// configFiles is in the CopyOver() format. -func (s *Server) Matches(os string, script []byte, configFiles string, flavor *Flavor) bool { - return s.OS == os && bytes.Equal(s.Script, script) && s.ConfigFiles == configFiles && (flavor == nil || flavor.ID == s.Flavor.ID) +// files, flavor and has a shared disk mounted. Useful before calling +// HasSpaceFor, since if you don't match these things you can't use the Server +// regardless of how empty it is. configFiles is in the CopyOver() format. 
+func (s *Server) Matches(os string, script []byte, configFiles string, flavor *Flavor, sharedDisk bool) bool { + return s.OS == os && bytes.Equal(s.Script, script) && s.ConfigFiles == configFiles && (flavor == nil || flavor.ID == s.Flavor.ID) && s.SharedDisk == sharedDisk } // Allocate records that the given resources have now been used up on this @@ -408,7 +415,7 @@ func (s *Server) UploadFile(source string, dest string) error { defer internal.LogClose(s.logger, client, "upload file client session", "source", source, "dest", dest) // create all parent dirs of dest - err = s.MkDir(dest) + err = s.MkDir(filepath.Dir(dest)) if err != nil { return err } @@ -558,7 +565,7 @@ func (s *Server) CreateFile(content string, dest string) error { defer internal.LogClose(s.logger, client, "create file client session") // create all parent dirs of dest - err = s.MkDir(dest) + err = s.MkDir(filepath.Dir(dest)) if err != nil { return err } @@ -610,17 +617,131 @@ func (s *Server) DownloadFile(source string, dest string) error { } // MkDir creates a directory (and it's parents as necessary) on the server. -func (s *Server) MkDir(dest string) error { +// Requires sudo. +func (s *Server) MkDir(dir string) error { + if dir == "." { + return nil + } + //*** it would be nice to do this with client.Mkdir, but that doesn't do // the equivalent of mkdir -p, and errors out if dirs already exist... for // now it's easier to just call mkdir - dir := filepath.Dir(dest) - if dir != "." { - _, _, err := s.RunCmd("mkdir -p "+dir, false) + _, _, err := s.RunCmd(fmt.Sprintf("[ -d %s ]", dir), false) + if err == nil { + // dir already exists + return nil + } + + // try without sudo, so that if we create multiple dirs, they all have the + // correct permissions + _, _, err = s.RunCmd("mkdir -p "+dir, false) + if err == nil { + return nil + } + + // try again with sudo + _, e, err := s.RunCmd("sudo mkdir -p "+dir, false) + if err != nil { + return fmt.Errorf("%s; %s", e, err.Error()) + } + + // correct permission on leaf dir *** not currently correcting permission on + // any parent dirs we might have just made + _, e, err = s.RunCmd(fmt.Sprintf("sudo chown %s:%s %s", s.UserName, s.UserName, dir), false) + if err != nil { + return fmt.Errorf("%s; %s", e, err.Error()) + } + + return nil +} + +// CreateSharedDisk creates an NFS share at /shared, which must be empty or not +// exist. This does not work for remote Servers, so only call this on the return +// value of LocalhostServer(). Does nothing and returns nil if the share was +// already created. NB: this is currently hard-coded to only work on Ubuntu, and +// the ability to sudo is required! Also assumes you don't have any other shares +// configured, and no other process started the NFS server! 
+func (s *Server) CreateSharedDisk() error { + s.csmutex.Lock() + defer s.csmutex.Unlock() + if s.createdShare { + return nil + } + + ctx, cancel := context.WithTimeout(context.Background(), 120*time.Second) + defer cancel() + + cmd := exec.CommandContext(ctx, "bash", "-c", "sudo apt-get install nfs-kernel-server -y") + err := cmd.Run() + if err != nil { + return err + } + + cmd = exec.CommandContext(ctx, "bash", "-c", fmt.Sprintf("echo '%s *(rw,sync,no_root_squash)' | sudo tee --append /etc/exports > /dev/null", sharePath)) + err = cmd.Run() + if err != nil { + return err + } + + if _, err := os.Stat(sharePath); err != nil && os.IsNotExist(err) { + cmd = exec.CommandContext(ctx, "bash", "-c", "sudo mkdir "+sharePath) + err = cmd.Run() + if err != nil { + return err + } + + cmd = exec.CommandContext(ctx, "bash", "-c", fmt.Sprintf("sudo chown %s:%s %s", s.UserName, s.UserName, sharePath)) + err = cmd.Run() if err != nil { return err } } + + cmd = exec.CommandContext(ctx, "bash", "-c", "sudo systemctl start nfs-kernel-server.service && sudo exportfs -a") + err = cmd.Run() + if err != nil { + return err + } + + s.createdShare = true + s.SharedDisk = true + s.logger.Debug("created shared disk") + return nil +} + +// MountSharedDisk can be used to mount a share from another Server (identified +// by its IP address) that you called CreateSharedDisk() on. The shared disk +// will be accessible at /shared. Does nothing and returns nil if the share was +// already mounted (or created on this Server). NB: currently hard-coded to use +// apt-get to install nfs-common on the server first, so probably only +// compatible with Ubuntu. Requires sudo. +func (s *Server) MountSharedDisk(nfsServerIP string) error { + s.csmutex.Lock() + defer s.csmutex.Unlock() + if s.createdShare { + return nil + } + + _, _, err := s.RunCmd("sudo apt-get install nfs-common -y", false) + if err != nil { + return err + } + + err = s.MkDir(sharePath) + if err != nil { + return err + } + s.logger.Debug("ran MkDir") + + stdo, stde, err := s.RunCmd(fmt.Sprintf("sudo mount %s:%s %s", nfsServerIP, sharePath, sharePath), false) + if err != nil { + s.logger.Error("mount attempt failed", "stdout", stdo, "stderr", stde) + return err + } + + s.createdShare = true + s.SharedDisk = true + s.logger.Debug("mounted shared disk") return nil } diff --git a/cmd/add.go b/cmd/add.go index dcae421eb..37547f1e9 100644 --- a/cmd/add.go +++ b/cmd/add.go @@ -59,6 +59,7 @@ var cmdOsRAM int var cmdBsubMode bool var cmdPostCreationScript string var cmdCloudConfigs string +var cmdCloudSharedDisk bool var cmdFlavor string var cmdMonitorDocker string @@ -77,7 +78,7 @@ command as one of the name:value pairs. The possible options are: cmd cwd cwd_matters change_home on_failure on_success on_exit mounts req_grp memory time override cpus disk priority retries rep_grp dep_grps deps cmd_deps monitor_docker cloud_os cloud_username cloud_ram cloud_script cloud_config_files -cloud_flavor env bsub_mode +cloud_flavor cloud_shared env bsub_mode If any of these will be the same for all your commands, you can instead specify them as flags (which are treated as defaults in the case that they are @@ -255,6 +256,16 @@ help text for "wr cloud deploy"'s --config_files option. The per-job config files you specify will be treated as in addition to any specified during cloud deploy or when starting the manager. +cloud_shared only works when using a cloud scheduler where both the manager and +jobs will run on Ubuntu. 
It will cause /shared on the manager's server to be +NFS shared to /shared mounted on the server where your job runs. This gives you +an easy way of having a shared disk in the cloud, but the size of that disk is +limited to the size of the manager's volume. Performance may also be poor. This +is only intended when you need a little bit of shared state between jobs, not +for writing lots of large files. (If you need a high performance shared disk, +don't use this option, and instead set up your own shared filesystem, eg. +GlusterFS, and specify a cloud_script that mounts it.) + "env" is an array of "key=value" environment variables, which override or add to the environment variables the command will see when it runs. The base variables that are overwritten depend on if you run 'wr add' on the same machine as you @@ -292,7 +303,8 @@ new job will have this job's mount and cloud_* options.`, envVars = os.Environ() } - // add the jobs to the queue + // add the jobs to the queue *** should add at most 1,000,000 jobs at a + // time to avoid time out issues... inserts, dups, err := jq.Add(jobs, envVars, !cmdReRun) if err != nil { die("%s", err) @@ -338,6 +350,7 @@ func init() { addCmd.Flags().StringVar(&cmdFlavor, "cloud_flavor", "", "in the cloud, exact name of the server flavor that the commands must run on") addCmd.Flags().StringVar(&cmdPostCreationScript, "cloud_script", "", "in the cloud, path to a start-up script that will be run on the servers created to run these commands") addCmd.Flags().StringVar(&cmdCloudConfigs, "cloud_config_files", "", "in the cloud, comma separated paths of config files to copy to servers created to run these commands") + addCmd.Flags().BoolVar(&cmdCloudSharedDisk, "cloud_shared", false, "mount /shared") addCmd.Flags().StringVar(&cmdEnv, "env", "", "comma-separated list of key=value environment variables to set before running the commands") addCmd.Flags().BoolVar(&cmdReRun, "rerun", false, "re-run any commands that you add that had been previously added and have since completed") addCmd.Flags().BoolVar(&cmdBsubMode, "bsub", false, "enable bsub emulation mode") @@ -367,7 +380,7 @@ func groupsToDeps(groups string) (deps jobqueue.Dependencies) { // the default repgrp. func parseCmdFile(jq *jobqueue.Client) ([]*jobqueue.Job, bool, bool) { var isLocal bool - currentIP, errc := jobqueue.CurrentIP("") + currentIP, errc := internal.CurrentIP("") if errc != nil { warn("Could not get current IP: %s", errc) } @@ -406,6 +419,7 @@ func parseCmdFile(jq *jobqueue.Client) ([]*jobqueue.Job, bool, bool) { CloudConfigFiles: cmdCloudConfigs, CloudOSRam: cmdOsRAM, CloudFlavor: cmdFlavor, + CloudShared: cmdCloudSharedDisk, BsubMode: bsubMode, } @@ -546,7 +560,7 @@ func parseCmdFile(jq *jobqueue.Client) ([]*jobqueue.Job, bool, bool) { if jvj.Cwd == "" && jd.Cwd == "" { if remoteWarning { - warn("command working directories defaulting to /tmp since the manager is running remotely") + warn("command working directories defaulting to %s since the manager is running remotely", pwd) } jd.Cwd = pwd } diff --git a/cmd/manager.go b/cmd/manager.go index 31ec877e0..553ec93c9 100644 --- a/cmd/manager.go +++ b/cmd/manager.go @@ -235,7 +235,7 @@ commands they were running. 
It is more graceful to use 'drain' instead.`, // real pid; though it may actually be running on a remote host and we // managed to connect to it via ssh port forwarding; compare the server // ip to our own - currentIP, err := jobqueue.CurrentIP("") + currentIP, err := internal.CurrentIP("") if err != nil { warn("Could not get current IP: %s", err) } @@ -433,7 +433,7 @@ func logStarted(s *jobqueue.ServerInfo, token []byte) { info("wr's web interface can be reached at https://%s:%s/?token=%s", s.Host, s.WebPort, string(token)) if setDomainIP { - ip, err := jobqueue.CurrentIP("") + ip, err := internal.CurrentIP("") if err != nil { warn("could not get IP address of localhost: %s", err) } diff --git a/internal/utils.go b/internal/utils.go index a8f743895..2d23d1cf1 100644 --- a/internal/utils.go +++ b/internal/utils.go @@ -28,6 +28,7 @@ import ( "io/ioutil" "log" "math/rand" + "net" "net/url" "os" "os/exec" @@ -367,3 +368,86 @@ func RandomString() string { } return string(b) } + +// CurrentIP returns the IP address of the machine we're running on right now. +// The cidr argument can be an empty string, but if set to the CIDR of the +// machine's primary network, it helps us be sure of getting the correct IP +// address (for when there are multiple network interfaces on the machine). +func CurrentIP(cidr string) (string, error) { + var ipNet *net.IPNet + if cidr != "" { + _, ipn, err := net.ParseCIDR(cidr) + if err == nil { + ipNet = ipn + } + // *** ignoring error since I don't want to change the return value of + // this method... + } + + conn, err := net.Dial("udp", "8.8.8.8:80") // doesn't actually connect, dest doesn't need to exist + if err != nil { + // fall-back on the old method we had... + + // first just hope http://stackoverflow.com/a/25851186/675083 gives us a + // cross-linux&MacOS solution that works reliably... 
+ var out []byte + out, err = exec.Command("sh", "-c", "ip -4 route get 8.8.8.8 | head -1 | cut -d' ' -f8 | tr -d '\\n'").Output() // #nosec + var ip string + if err != nil { + ip = string(out) + + // paranoid confirmation this ip is in our CIDR + if ip != "" && ipNet != nil { + pip := net.ParseIP(ip) + if pip != nil { + if !ipNet.Contains(pip) { + ip = "" + } + } + } + } + + // if the above fails, fall back on manually going through all our + // network interfaces + if ip == "" { + var addrs []net.Addr + addrs, err = net.InterfaceAddrs() + if err != nil { + return "", err + } + for _, address := range addrs { + if thisIPNet, ok := address.(*net.IPNet); ok && !thisIPNet.IP.IsLoopback() { + if thisIPNet.IP.To4() != nil { + if ipNet != nil { + if ipNet.Contains(thisIPNet.IP) { + ip = thisIPNet.IP.String() + break + } + } else { + ip = thisIPNet.IP.String() + break + } + } + } + } + } + + return ip, nil + } + + defer func() { + err = conn.Close() + }() + localAddr := conn.LocalAddr().(*net.UDPAddr) + ip := localAddr.IP + + // paranoid confirmation this ip is in our CIDR + if ipNet != nil { + if ipNet.Contains(ip) { + return ip.String(), err + } + } else { + return ip.String(), err + } + return "", err +} diff --git a/jobqueue/client.go b/jobqueue/client.go index 3dbfff112..81721378d 100644 --- a/jobqueue/client.go +++ b/jobqueue/client.go @@ -593,11 +593,13 @@ func (c *Client) Execute(job *Job, shell string) error { // run on the same host as us and therefore any mounts are expected to // fail) simplified := &Job{ - MountConfigs: job.MountConfigs, Requirements: job.Requirements, BsubMode: job.BsubMode, Host: host, } + if _, exists := job.Requirements.Other["cloud_shared"]; !exists { + simplified.MountConfigs = job.MountConfigs + } jobJSON, errm := json.Marshal(simplified) if errm != nil { errb := c.Bury(job, nil, fmt.Sprintf("could not convert job to JSON: %s", errm)) @@ -1180,7 +1182,7 @@ func (c *Client) Started(job *Job, pid int) error { host = localhost } job.Host = host - job.HostIP, err = CurrentIP("") + job.HostIP, err = internal.CurrentIP("") if err != nil { return err } diff --git a/jobqueue/job.go b/jobqueue/job.go index a3d5106e1..52d14efd1 100644 --- a/jobqueue/job.go +++ b/jobqueue/job.go @@ -169,7 +169,12 @@ type Job struct { MountConfigs MountConfigs // BsubMode set to either Production or Development when Add()ing a job will - // result in the job being assigned a BsubID. + // result in the job being assigned a BsubID. Such jobs, when they run, will + // see bsub, bjobs and bkill as symlinks to wr, thus if they call bsub, they + // will actually add jobs to the jobqueue etc. Those jobs will pick up the + // same Requirements.Other as this job, and the same MountConfigs. If + // Requirements.Other["cloud_shared"] is "true", the MountConfigs are not + // reused. 
BsubMode string // MonitorDocker turns on monitoring of a docker container identified by its diff --git a/jobqueue/jobqueue_test.go b/jobqueue/jobqueue_test.go index ec85dc5a0..46896d8fa 100644 --- a/jobqueue/jobqueue_test.go +++ b/jobqueue/jobqueue_test.go @@ -80,13 +80,13 @@ func TestJobqueueUtils(t *testing.T) { } Convey("CurrentIP() works", t, func() { - ip, err := CurrentIP("") + ip, err := internal.CurrentIP("") So(err, ShouldBeNil) So(ip, ShouldNotBeBlank) - ip, err = CurrentIP("9.9.9.9/24") + ip, err = internal.CurrentIP("9.9.9.9/24") So(err, ShouldBeNil) So(ip, ShouldBeBlank) - ip, err = CurrentIP(ip + "/16") + ip, err = internal.CurrentIP(ip + "/16") So(err, ShouldBeNil) So(ip, ShouldEqual, ip) }) @@ -4410,7 +4410,7 @@ func runner() { func setDomainIP(domain string) { host, _ := os.Hostname() if host == "vr-2-2-02" { - ip, _ := CurrentIP("") + ip, _ := internal.CurrentIP("") internal.InfobloxSetDomainIP(domain, ip) } } diff --git a/jobqueue/scheduler/openstack.go b/jobqueue/scheduler/openstack.go index 8cb529a39..c704be276 100644 --- a/jobqueue/scheduler/openstack.go +++ b/jobqueue/scheduler/openstack.go @@ -190,6 +190,7 @@ type standin struct { os string script []byte configFiles string // in cloud.Server.CopyOver() format + sharedDisk bool usedRAM int usedCores int usedDisk int @@ -205,7 +206,7 @@ type standin struct { } // newStandin returns a new standin server -func newStandin(id string, flavor *cloud.Flavor, disk int, osPrefix string, script []byte, configFiles string, logger log15.Logger) *standin { +func newStandin(id string, flavor *cloud.Flavor, disk int, osPrefix string, script []byte, configFiles string, sharedDisk bool, logger log15.Logger) *standin { availableDisk := flavor.Disk if disk > availableDisk { availableDisk = disk @@ -217,6 +218,7 @@ func newStandin(id string, flavor *cloud.Flavor, disk int, osPrefix string, scri os: osPrefix, script: script, configFiles: configFiles, + sharedDisk: sharedDisk, waitingToSpawn: true, endWait: make(chan *cloud.Server), readyToSpawn: make(chan bool), @@ -226,8 +228,8 @@ func newStandin(id string, flavor *cloud.Flavor, disk int, osPrefix string, scri } // matches is like cloud.Server.Matches() -func (s *standin) matches(os string, script []byte, configFiles string, flavor *cloud.Flavor) bool { - return s.os == os && bytes.Equal(s.script, script) && s.configFiles == configFiles && (flavor == nil || flavor.ID == s.flavor.ID) +func (s *standin) matches(os string, script []byte, configFiles string, flavor *cloud.Flavor, sharedDisk bool) bool { + return s.os == os && bytes.Equal(s.script, script) && s.configFiles == configFiles && (flavor == nil || flavor.ID == s.flavor.ID) && s.sharedDisk == sharedDisk } // allocate is like cloud.Server.Allocate() @@ -445,7 +447,7 @@ func (s *opst) initialize(config interface{}, logger log15.Logger) error { // initialise our servers with details of ourself s.servers = make(map[string]*cloud.Server) - localhost, err := provider.LocalhostServer(s.config.OSPrefix, s.config.PostCreationScript, s.config.ConfigFiles) + localhost, err := provider.LocalhostServer(s.config.OSPrefix, s.config.PostCreationScript, s.config.ConfigFiles, s.config.CIDR) if err != nil { return err } @@ -547,7 +549,7 @@ func (s *opst) getFlavor(name string) (*cloud.Flavor, error) { // of server has been requested. If not specified, the returned os defaults to // the configured OSPrefix, script defaults to PostCreationScript, config files // defaults to ConfigFiles and flavor will be nil. 
-func (s *opst) serverReqs(req *Requirements) (osPrefix string, osScript []byte, osConfigFiles string, flavor *cloud.Flavor, err error) { +func (s *opst) serverReqs(req *Requirements) (osPrefix string, osScript []byte, osConfigFiles string, flavor *cloud.Flavor, sharedDisk bool, err error) { if val, defined := req.Other["cloud_os"]; defined { osPrefix = val } else { @@ -572,9 +574,19 @@ func (s *opst) serverReqs(req *Requirements) (osPrefix string, osScript []byte, if name, defined := req.Other["cloud_flavor"]; defined { flavor, err = s.getFlavor(name) + if err != nil { + return osPrefix, osScript, osConfigFiles, flavor, sharedDisk, err + } } - return osPrefix, osScript, osConfigFiles, flavor, err + if val, defined := req.Other["cloud_shared"]; defined && val == "true" { + sharedDisk = true + + // create a shared disk on our "head" node (if not already done) + err = s.servers["localhost"].CreateSharedDisk() + } + + return osPrefix, osScript, osConfigFiles, flavor, sharedDisk, err } // canCount tells you how many jobs with the given RAM and core requirements it @@ -583,7 +595,7 @@ func (s *opst) canCount(req *Requirements) int { s.resourceMutex.RLock() defer s.resourceMutex.RUnlock() - requestedOS, requestedScript, requestedConfigFiles, requestedFlavor, err := s.serverReqs(req) + requestedOS, requestedScript, requestedConfigFiles, requestedFlavor, needsSharedDisk, err := s.serverReqs(req) if err != nil { s.Warn("Failed to determine server requirements", "err", err) return 0 @@ -609,7 +621,7 @@ func (s *opst) canCount(req *Requirements) int { // by one in to the first bin that has room for it.” var canCount int for _, server := range s.servers { - if !server.IsBad() && server.Matches(requestedOS, requestedScript, requestedConfigFiles, requestedFlavor) { + if !server.IsBad() && server.Matches(requestedOS, requestedScript, requestedConfigFiles, requestedFlavor, needsSharedDisk) { space := server.HasSpaceFor(req.Cores, req.RAM, req.Disk) canCount += space } @@ -783,7 +795,7 @@ func (s *opst) runCmd(cmd string, req *Requirements, reservedCh chan bool) error // which runCmd call is doing what logger := s.Logger.New("call", logext.RandId(8)) - requestedOS, requestedScript, requestedConfigFiles, requestedFlavor, err := s.serverReqs(req) + requestedOS, requestedScript, requestedConfigFiles, requestedFlavor, needsSharedDisk, err := s.serverReqs(req) if err != nil { return err } @@ -813,7 +825,7 @@ func (s *opst) runCmd(cmd string, req *Requirements, reservedCh chan bool) error // of them var server *cloud.Server for sid, thisServer := range s.servers { - if !thisServer.IsBad() && thisServer.Matches(requestedOS, requestedScript, requestedConfigFiles, requestedFlavor) && thisServer.HasSpaceFor(req.Cores, req.RAM, req.Disk) > 0 { + if !thisServer.IsBad() && thisServer.Matches(requestedOS, requestedScript, requestedConfigFiles, requestedFlavor, needsSharedDisk) && thisServer.HasSpaceFor(req.Cores, req.RAM, req.Disk) > 0 { server = thisServer server.Allocate(req.Cores, req.RAM, req.Disk) logger = logger.New("server", sid) @@ -825,7 +837,7 @@ func (s *opst) runCmd(cmd string, req *Requirements, reservedCh chan bool) error // else see if there will be space on a soon-to-be-spawned server if server == nil { for _, standinServer := range s.standins { - if standinServer.matches(requestedOS, requestedScript, requestedConfigFiles, requestedFlavor) && standinServer.hasSpaceFor(req) > 0 { + if standinServer.matches(requestedOS, requestedScript, requestedConfigFiles, requestedFlavor, needsSharedDisk) && 
standinServer.hasSpaceFor(req) > 0 { s.recordStandin(standinServer, cmd) standinServer.allocate(req) s.mutex.Unlock() @@ -889,7 +901,7 @@ func (s *opst) runCmd(cmd string, req *Requirements, reservedCh chan bool) error u, _ := uuid.NewV4() standinID := u.String() - standinServer := newStandin(standinID, flavor, req.Disk, requestedOS, requestedScript, requestedConfigFiles, s.Logger) + standinServer := newStandin(standinID, flavor, req.Disk, requestedOS, requestedScript, requestedConfigFiles, needsSharedDisk, s.Logger) standinServer.allocate(req) s.recordStandin(standinServer, cmd) logger = logger.New("standin", standinID) @@ -998,10 +1010,14 @@ func (s *opst) runCmd(cmd string, req *Requirements, reservedCh chan bool) error s.mutex.Unlock() logger.Debug("waiting for server ready") if err == nil { - // wait until boot is finished, ssh is ready, and osScript has + // wait until boot is finished, ssh is ready and osScript has // completed err = server.WaitUntilReady(requestedConfigFiles, requestedScript) + if err == nil && needsSharedDisk { + err = server.MountSharedDisk(s.servers["localhost"].IP) + } + if err == nil { // check that the exe of the cmd we're supposed to run exists on the // new server, and if not, copy it over *** this is just a hack to @@ -1073,7 +1089,7 @@ func (s *opst) runCmd(cmd string, req *Requirements, reservedCh chan bool) error s.mutex.Unlock() // now we have a server, ssh over and run the cmd on it - if server.IP == "127.0.0.1" { + if server.Name == "localhost" { logger.Debug("running command locally", "cmd", cmd) reserved := make(chan bool) go func() { diff --git a/jobqueue/scheduler/scheduler_test.go b/jobqueue/scheduler/scheduler_test.go index a6f9a9bf4..04a735857 100644 --- a/jobqueue/scheduler/scheduler_test.go +++ b/jobqueue/scheduler/scheduler_test.go @@ -766,6 +766,40 @@ func TestOpenstack(t *testing.T) { Convey("Schedule() lets you...", func() { oFile := filepath.Join(tmpdir, "out") + + Convey("Run jobs that use a NFS shared disk", func() { + cmd := "touch /shared/test1" + other := make(map[string]string) + other["cloud_shared"] = "true" + localReq := &Requirements{100, 1 * time.Minute, 1, 1, other} + err := s.Schedule(cmd, localReq, 1) + So(err, ShouldBeNil) + + remoteReq := oss.reqForSpawn(localReq) + for _, server := range oss.servers { + if server.Flavor.RAM >= remoteReq.RAM { + remoteReq.RAM = server.Flavor.RAM + 1000 + } + } + remoteReq.Other = other + cmd = "touch /shared/test2" + err = s.Schedule(cmd, remoteReq, 1) + So(err, ShouldBeNil) + + So(s.Busy(), ShouldBeTrue) + So(waitToFinish(s, 240, 1000), ShouldBeTrue) + + _, err = os.Stat("/shared/test1") + So(err, ShouldBeNil) + _, err = os.Stat("/shared/test2") + So(err, ShouldBeNil) + + err = os.Remove("/shared/test1") + So(err, ShouldBeNil) + err = os.Remove("/shared/test2") + So(err, ShouldBeNil) + }) + if flavorRegex == `^m.*$` && os.Getenv("OS_TENANT_ID") == "" { Convey("Run a job on a specific flavor", func() { cmd := "sleep 10" diff --git a/jobqueue/server.go b/jobqueue/server.go index 2bc5cc258..527a38907 100644 --- a/jobqueue/server.go +++ b/jobqueue/server.go @@ -459,7 +459,7 @@ func Serve(config ServerConfig) (s *Server, msg string, token []byte, err error) // if we end up spawning clients on other machines, they'll need to know // our non-loopback ip address so they can connect to us - ip, err := CurrentIP(config.CIDR) + ip, err := internal.CurrentIP(config.CIDR) if err != nil { serverLogger.Error("getting current IP failed", "err", err) } diff --git a/jobqueue/serverREST.go 
b/jobqueue/serverREST.go index 90a3692bd..baaebd228 100644 --- a/jobqueue/serverREST.go +++ b/jobqueue/serverREST.go @@ -82,7 +82,8 @@ type JobViaJSON struct { CloudConfigFiles string `json:"cloud_config_files"` CloudOSRam *int `json:"cloud_ram"` CloudFlavor string `json:"cloud_flavor"` - BsubMode string `jsob:"bsub_mode"` + CloudShared bool `json:"cloud_shared"` + BsubMode string `json:"bsub_mode"` } // JobDefaults is supplied to JobViaJSON.Convert() to provide default values for @@ -124,6 +125,7 @@ type JobDefaults struct { // CloudOSRam is the number of Megabytes that CloudOS needs to run. Defaults // to 1000. CloudOSRam int + CloudShared bool BsubMode string compressedEnv []byte osRAM string @@ -408,6 +410,10 @@ func (jvj *JobViaJSON) Convert(jd *JobDefaults) (*Job, error) { other["cloud_os_ram"] = jd.DefaultCloudOSRam() } + if jvj.CloudShared || jd.CloudShared { + other["cloud_shared"] = "true" + } + return &Job{ RepGroup: repg, Cmd: cmd, @@ -627,6 +633,9 @@ func restJobsAdd(r *http.Request, s *Server) ([]*Job, int, error) { if r.Form.Get("change_home") == restFormTrue { jd.ChangeHome = true } + if r.Form.Get("cloud_shared") == restFormTrue { + jd.CloudShared = true + } if r.Form.Get("memory") != "" { mb, err := bytefmt.ToMegabytes(r.Form.Get("memory")) if err != nil { diff --git a/jobqueue/utils.go b/jobqueue/utils.go index af4aab858..753a15a71 100644 --- a/jobqueue/utils.go +++ b/jobqueue/utils.go @@ -31,9 +31,7 @@ import ( "fmt" "io" "io/ioutil" - "net" "os" - "os/exec" "path/filepath" "strconv" "strings" @@ -60,89 +58,6 @@ var cr = []byte("\r") var lf = []byte("\n") var ellipses = []byte("[...]\n") -// CurrentIP returns the IP address of the machine we're running on right now. -// The cidr argument can be an empty string, but if set to the CIDR of the -// machine's primary network, it helps us be sure of getting the correct IP -// address (for when there are multiple network interfaces on the machine). -func CurrentIP(cidr string) (string, error) { - var ipNet *net.IPNet - if cidr != "" { - _, ipn, err := net.ParseCIDR(cidr) - if err == nil { - ipNet = ipn - } - // *** ignoring error since I don't want to change the return value of - // this method... - } - - conn, err := net.Dial("udp", "8.8.8.8:80") // doesn't actually connect, dest doesn't need to exist - if err != nil { - // fall-back on the old method we had... - - // first just hope http://stackoverflow.com/a/25851186/675083 gives us a - // cross-linux&MacOS solution that works reliably... 
- var out []byte - out, err = exec.Command("sh", "-c", "ip -4 route get 8.8.8.8 | head -1 | cut -d' ' -f8 | tr -d '\\n'").Output() // #nosec - var ip string - if err != nil { - ip = string(out) - - // paranoid confirmation this ip is in our CIDR - if ip != "" && ipNet != nil { - pip := net.ParseIP(ip) - if pip != nil { - if !ipNet.Contains(pip) { - ip = "" - } - } - } - } - - // if the above fails, fall back on manually going through all our - // network interfaces - if ip == "" { - var addrs []net.Addr - addrs, err = net.InterfaceAddrs() - if err != nil { - return "", err - } - for _, address := range addrs { - if thisIPNet, ok := address.(*net.IPNet); ok && !thisIPNet.IP.IsLoopback() { - if thisIPNet.IP.To4() != nil { - if ipNet != nil { - if ipNet.Contains(thisIPNet.IP) { - ip = thisIPNet.IP.String() - break - } - } else { - ip = thisIPNet.IP.String() - break - } - } - } - } - } - - return ip, nil - } - - defer func() { - err = conn.Close() - }() - localAddr := conn.LocalAddr().(*net.UDPAddr) - ip := localAddr.IP - - // paranoid confirmation this ip is in our CIDR - if ipNet != nil { - if ipNet.Contains(ip) { - return ip.String(), err - } - } else { - return ip.String(), err - } - return "", err -} - // generateToken creates a cryptographically secure pseudorandom URL-safe base64 // encoded string 43 bytes long. Used by the server to create a token passed to // to the caller for subsequent client authentication. From bf47e762a5a666c4f1bdd116afffadb53d3ac4ec Mon Sep 17 00:00:00 2001 From: Sendu Bala Date: Thu, 17 May 2018 16:58:24 +0100 Subject: [PATCH 29/33] Delint. --- cloud/server.go | 24 ++++++++++++------------ cmd/lsf.go | 15 +++++++++------ 2 files changed, 21 insertions(+), 18 deletions(-) diff --git a/cloud/server.go b/cloud/server.go index cdc1bb0e7..b6712536f 100644 --- a/cloud/server.go +++ b/cloud/server.go @@ -671,33 +671,33 @@ func (s *Server) CreateSharedDisk() error { ctx, cancel := context.WithTimeout(context.Background(), 120*time.Second) defer cancel() - cmd := exec.CommandContext(ctx, "bash", "-c", "sudo apt-get install nfs-kernel-server -y") + cmd := exec.CommandContext(ctx, "bash", "-c", "sudo apt-get install nfs-kernel-server -y") // #nosec err := cmd.Run() if err != nil { return err } - cmd = exec.CommandContext(ctx, "bash", "-c", fmt.Sprintf("echo '%s *(rw,sync,no_root_squash)' | sudo tee --append /etc/exports > /dev/null", sharePath)) + cmd = exec.CommandContext(ctx, "bash", "-c", fmt.Sprintf("echo '%s *(rw,sync,no_root_squash)' | sudo tee --append /etc/exports > /dev/null", sharePath)) // #nosec err = cmd.Run() if err != nil { return err } - if _, err := os.Stat(sharePath); err != nil && os.IsNotExist(err) { - cmd = exec.CommandContext(ctx, "bash", "-c", "sudo mkdir "+sharePath) - err = cmd.Run() - if err != nil { - return err + if _, errs := os.Stat(sharePath); errs != nil && os.IsNotExist(errs) { + cmd = exec.CommandContext(ctx, "bash", "-c", "sudo mkdir "+sharePath) // #nosec + errs = cmd.Run() + if errs != nil { + return errs } - cmd = exec.CommandContext(ctx, "bash", "-c", fmt.Sprintf("sudo chown %s:%s %s", s.UserName, s.UserName, sharePath)) - err = cmd.Run() - if err != nil { - return err + cmd = exec.CommandContext(ctx, "bash", "-c", fmt.Sprintf("sudo chown %s:%s %s", s.UserName, s.UserName, sharePath)) // #nosec + errs = cmd.Run() + if errs != nil { + return errs } } - cmd = exec.CommandContext(ctx, "bash", "-c", "sudo systemctl start nfs-kernel-server.service && sudo exportfs -a") + cmd = exec.CommandContext(ctx, "bash", "-c", "sudo systemctl start 
nfs-kernel-server.service && sudo export"+"fs -a") // #nosec (the split is to avoid a false-positive spelling mistake) err = cmd.Run() if err != nil { return err diff --git a/cmd/lsf.go b/cmd/lsf.go index 67c3679d8..63842f067 100644 --- a/cmd/lsf.go +++ b/cmd/lsf.go @@ -76,8 +76,8 @@ NB: currently the emulation is extremely limited, supporting only the interactive "console" mode where you run bsub without any arguments, and it only supports single flags per #BSUB line, and it only pays attention to -J, -n and -M flags. (This is sufficient for compatibility with 10x Genomic's cellranger -software (which has Maritan built in), and to work as the scheduler for -nextflow.) There is only one "queue", called 'wr'. +software (which has Martian built in), and to work as the scheduler for +nextflow in LSF mode.) There is only one "queue", called 'wr'. The best way to use this LSF emulation is not to call this command yourself directly, but to use 'wr add --bsubs [other opts]' to add the command that you @@ -216,7 +216,7 @@ var lsfBjobsCmd = &cobra.Command{ syntax and being formatted the way bjobs display this information. Only lists all incomplete jobs. Unlike real bjobs, does not list recently -completed jobs. Unlike real bjobs, does not truncate columns (always effectivly +completed jobs. Unlike real bjobs, does not truncate columns (always effectively in -w mode). Only supports this limited set of real bjobs options: @@ -231,7 +231,7 @@ eg. -o 'JOBID STAT SUBMIT_TIME delimiter=","' While -q can be provided, and that provided queue will be displayed in the output, in reality there is only 1 queue called 'wr', so -q has no real function -other than providing compatability with real bjobs command line args.`, +other than providing compatibility with real bjobs command line args.`, Run: func(cmd *cobra.Command, args []string) { user, err := internal.Username() if err != nil { @@ -336,7 +336,10 @@ other than providing compatability with real bjobs command line args.`, if lsfFormat == "" { tw := w.(*tabwriter.Writer) - tw.Flush() + errf := tw.Flush() + if errf != nil { + warn("failed to flush output: %s", errf) + } } if !found { @@ -351,7 +354,7 @@ var lsfBkillCmd = &cobra.Command{ Short: "Kill jobs added using bsub", Long: `Kill jobs that have been added using the lsf bsub command. -Only supports providing jobIds as command line arguements. Does not currently +Only supports providing jobIds as command line arguments. Does not currently understand any of the options that real bkill does. Note that if a given jobId is not currently in the queue, always just claims From 60eeca4aaa783dc5ce62085c4c8fa166d42457fa Mon Sep 17 00:00:00 2001 From: Sendu Bala Date: Fri, 18 May 2018 17:05:29 +0100 Subject: [PATCH 30/33] Fix #116: reported job CPU time and memory usage now accurate for forking jobs. New getChildProcesses() method, used by client Execute() to fully kill jobs that fork. currentMemory() now includes memory usage of pid's children, recursively. Reported CPU time fixed to include user time, not just system time. 
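
For illustration, the full kill now works roughly like the sketch below, using the same
gopsutil calls as the new getChildProcesses() in jobqueue/utils.go (killTree is just an
illustrative name, not a function added by this patch, and the recursion into
grandchildren is omitted here):

    // killTree kills pid and its direct children, returning the first error seen.
    func killTree(pid int32) error {
        p, err := process.NewProcess(pid) // github.com/shirou/gopsutil/process
        if err != nil {
            return nil // process already gone; nothing to kill
        }
        children, _ := p.Children() // gather children before killing the parent
        errKill := p.Kill()
        for _, child := range children {
            if errc := child.Kill(); errc != nil && errKill == nil {
                errKill = errc
            }
        }
        return errKill
    }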
--- jobqueue/client.go | 52 +++++++++++++----- jobqueue/jobqueue_test.go | 110 ++++++++++++++++++++++++++++++++++++++ jobqueue/utils.go | 53 ++++++++++++++++-- 3 files changed, 198 insertions(+), 17 deletions(-) diff --git a/jobqueue/client.go b/jobqueue/client.go index 81721378d..78cb33777 100644 --- a/jobqueue/client.go +++ b/jobqueue/client.go @@ -720,8 +720,20 @@ func (c *Client) Execute(job *Job, shell string) error { var dockerContainerID string killCmd := func() error { + // get children first + children, errc := getChildProcesses(int32(cmd.Process.Pid)) + + // then kill *** race condition if cmd spawns more children... errk := cmd.Process.Kill() + if errc != nil { + if errk == nil { + errk = errc + } else { + errk = fmt.Errorf("%s, and getting child processes failed: %s", errk.Error(), errc.Error()) + } + } + if dockerContainerID != "" { // kill the docker container as well errd := dockerClient.ContainerKill(context.Background(), dockerContainerID, "SIGKILL") @@ -732,6 +744,17 @@ func (c *Client) Execute(job *Job, shell string) error { } } + for _, child := range children { + // try and kill any children in case the above didn't already + // result in their death + errc = child.Kill() + if errk == nil { + errk = errc + } else { + errk = fmt.Errorf("%s, and killing its child process failed: %s", errk.Error(), errc.Error()) + } + } + return errk } @@ -885,18 +908,21 @@ func (c *Client) Execute(job *Job, shell string) error { stateMutex.Lock() defer stateMutex.Unlock() - // we could get the max rss from ProcessState.SysUsage, but we'll stick with - // our better (?) pss-based Peakmem, unless the command exited so quickly - // we never ticked and calculated it - if peakmem == 0 { - ru := cmd.ProcessState.SysUsage().(*syscall.Rusage) - if runtime.GOOS == "darwin" { - // Maxrss values are bytes - peakmem = int((ru.Maxrss / 1024) / 1024) - } else { - // Maxrss values are kb - peakmem = int(ru.Maxrss / 1024) - } + // though we have tried to track peak memory while the cmd ran (mainly to + // know if we use too much memory and kill during a run), our method might + // miss a peak that cmd.ProcessState can tell us about, so use that if + // higher + peakRSS := cmd.ProcessState.SysUsage().(*syscall.Rusage).Maxrss + var peakRSSMB int + if runtime.GOOS == "darwin" { + // Maxrss values are bytes + peakRSSMB = int((peakRSS / 1024) / 1024) + } else { + // Maxrss values are kb + peakRSSMB = int(peakRSS / 1024) + } + if peakRSSMB > peakmem { + peakmem = peakRSSMB } // include our own memory usage in the peakmem of the command, since the @@ -1087,7 +1113,7 @@ func (c *Client) Execute(job *Job, shell string) error { Cwd: actualCwd, Exitcode: exitcode, PeakRAM: peakmem, - CPUtime: cmd.ProcessState.SystemTime() + time.Duration(dockerCPU)*time.Second, + CPUtime: cmd.ProcessState.SystemTime() + cmd.ProcessState.UserTime() + time.Duration(dockerCPU)*time.Second, Stdout: finalStdOut, Stderr: finalStdErr, Exited: true, diff --git a/jobqueue/jobqueue_test.go b/jobqueue/jobqueue_test.go index 46896d8fa..33378cbc9 100644 --- a/jobqueue/jobqueue_test.go +++ b/jobqueue/jobqueue_test.go @@ -1070,6 +1070,116 @@ func TestJobqueue(t *testing.T) { RecMBRound = 100 // revert back to normal + Convey("Jobs that fork and change processgroup can still be fully killed", func() { + jobs = nil + + tmpdir, err := ioutil.TempDir("", "wr_kill_test") + So(err, ShouldBeNil) + defer os.RemoveAll(tmpdir) + + cmd := fmt.Sprintf("perl -Mstrict -we 'open(OUT, qq[>%s/$$]); my $pid = fork; if ($pid == 0) { setpgrp; my $subpid = fork; if 
($subpid == 0) { sleep(60); exit 0; } open(OUT, qq[>%s/$subpid]); waitpid $subpid, 0; exit 0; } open(OUT, qq[>%s/$pid]); sleep(30); waitpid $pid, 0'", tmpdir, tmpdir, tmpdir) + jobs = append(jobs, &Job{Cmd: cmd, Cwd: "/tmp", ReqGroup: "fake_group", Requirements: standardReqs, Retries: uint8(0), RepGroup: "forker"}) + inserts, already, err := jq.Add(jobs, envVars, true) + So(err, ShouldBeNil) + So(inserts, ShouldEqual, 1) + So(already, ShouldEqual, 0) + + job, err := jq.Reserve(50 * time.Millisecond) + So(err, ShouldBeNil) + So(job.Cmd, ShouldEqual, cmd) + So(job.State, ShouldEqual, JobStateReserved) + + ich := make(chan int, 1) + ech := make(chan error, 1) + go func() { + <-time.After(1 * time.Second) + i, errk := jq.Kill([]*JobEssence{job.ToEssense()}) + ich <- i + ech <- errk + }() + + err = jq.Execute(job, config.RunnerExecShell) + So(err, ShouldNotBeNil) + jqerr, ok := err.(Error) + So(ok, ShouldBeTrue) + So(jqerr.Err, ShouldEqual, FailReasonKilled) + So(job.State, ShouldEqual, JobStateBuried) + So(job.Exited, ShouldBeTrue) + So(job.Exitcode, ShouldEqual, -1) + So(job.FailReason, ShouldEqual, FailReasonKilled) + + i := <-ich + So(i, ShouldEqual, 1) + err = <-ech + So(err, ShouldBeNil) + + files, err := ioutil.ReadDir(tmpdir) + So(err, ShouldBeNil) + count := 0 + for _, file := range files { + if file.IsDir() { + continue + } + count++ + pid, err := strconv.Atoi(file.Name()) + So(err, ShouldBeNil) + process, err := os.FindProcess(pid) + So(err, ShouldBeNil) + err = process.Signal(syscall.Signal(0)) + So(err, ShouldNotBeNil) + So(err.Error(), ShouldContainSubstring, "process already finished") + } + So(count, ShouldEqual, 3) + + deleted, errd := jq.Delete([]*JobEssence{{Cmd: cmd}}) + So(errd, ShouldBeNil) + So(deleted, ShouldEqual, 1) + }) + + Convey("Jobs that fork and change processgroup have correct memory usage reported", func() { + jobs = nil + cmd := `perl -Mstrict -we 'my $pid = fork; if ($pid == 0) { setpgrp; my $subpid = fork; if ($subpid == 0) { my @a; for (1..100) { push(@a, q[a] x 10000000); } exit 0; } waitpid $subpid, 0; exit 0; } my @b; for (1..100) { push(@b, q[b] x 1000000); } waitpid $pid, 0'` + jobs = append(jobs, &Job{Cmd: cmd, Cwd: "/tmp", ReqGroup: "fake_group", Requirements: standardReqs, Retries: uint8(0), RepGroup: "forker"}) + inserts, already, err := jq.Add(jobs, envVars, true) + So(err, ShouldBeNil) + So(inserts, ShouldEqual, 1) + So(already, ShouldEqual, 0) + + job, err := jq.Reserve(50 * time.Millisecond) + So(err, ShouldBeNil) + So(job.Cmd, ShouldEqual, cmd) + So(job.State, ShouldEqual, JobStateReserved) + + err = jq.Execute(job, config.RunnerExecShell) + So(err, ShouldBeNil) + So(job.State, ShouldEqual, JobStateComplete) + So(job.Exited, ShouldBeTrue) + So(job.Exitcode, ShouldEqual, 0) + So(job.PeakRAM, ShouldBeGreaterThan, 500) + }) + + Convey("Jobs that fork and change processgroup have correct CPU time reported", func() { + jobs = nil + cmd := `perl -Mstrict -we 'my $pid = fork; if ($pid == 0) { setpgrp; my $subpid = fork; if ($subpid == 0) { my $a = 2; for (1..10000000) { $a *= $a } exit 0; } waitpid $subpid, 0; exit 0; } my $b = 2; for (1..10000000) { $b *= $b } waitpid $pid, 0'` + jobs = append(jobs, &Job{Cmd: cmd, Cwd: "/tmp", ReqGroup: "fake_group", Requirements: standardReqs, Retries: uint8(0), RepGroup: "forker"}) + inserts, already, err := jq.Add(jobs, envVars, true) + So(err, ShouldBeNil) + So(inserts, ShouldEqual, 1) + So(already, ShouldEqual, 0) + + job, err := jq.Reserve(50 * time.Millisecond) + So(err, ShouldBeNil) + So(job.Cmd, 
ShouldEqual, cmd) + So(job.State, ShouldEqual, JobStateReserved) + + err = jq.Execute(job, config.RunnerExecShell) + So(err, ShouldBeNil) + So(job.State, ShouldEqual, JobStateComplete) + So(job.Exited, ShouldBeTrue) + So(job.Exitcode, ShouldEqual, 0) + So(job.CPUtime, ShouldBeGreaterThanOrEqualTo, job.WallTime()+(job.WallTime()/2)) + }) + Convey("The stdout/err of jobs is only kept for failed jobs, and cwd&TMPDIR&HOME get set appropriately", func() { jobs = nil baseDir, err := ioutil.TempDir("", "wr_jobqueue_test_runner_dir_") diff --git a/jobqueue/utils.go b/jobqueue/utils.go index 753a15a71..b9b40b4a6 100644 --- a/jobqueue/utils.go +++ b/jobqueue/utils.go @@ -39,6 +39,7 @@ import ( "github.com/VertebrateResequencing/wr/internal" "github.com/dgryski/go-farm" multierror "github.com/hashicorp/go-multierror" + "github.com/shirou/gopsutil/process" ) // AppName gets used in certain places like naming the base directory of created @@ -156,10 +157,9 @@ func decompress(compressed []byte) ([]byte, error) { return buf.Bytes(), err } -// get the current memory usage of a pid, relying on modern linux /proc/*/smaps -// (based on http://stackoverflow.com/a/31881979/675083). +// get the current memory usage of a pid and all its children, relying on modern +// linux /proc/*/smaps (based on http://stackoverflow.com/a/31881979/675083). func currentMemory(pid int) (int, error) { - var err error f, err := os.Open(fmt.Sprintf("/proc/%d/smaps", pid)) if err != nil { return 0, err @@ -195,7 +195,52 @@ func currentMemory(pid int) (int, error) { // convert kB to MB mem := int(kb / 1024) - return mem, err + // recurse for children + p, err := process.NewProcess(int32(pid)) + if err != nil { + return mem, err + } + children, err := p.Children() + if err != nil && err.Error() != "process does not have children" { // err != process.ErrorNoChildren + return mem, err + } + for _, child := range children { + childMem, errr := currentMemory(int(child.Pid)) + if errr != nil { + continue + } + mem += childMem + } + + return mem, nil +} + +// getChildProcesses gets the child processes of the given pid, recursively. +func getChildProcesses(pid int32) ([]*process.Process, error) { + var children []*process.Process + p, err := process.NewProcess(pid) + if err != nil { + // we ignore errors, since we allow for working on processes that we're in + // the process of killing + return children, nil + } + + children, err = p.Children() + if err != nil && err.Error() != "process does not have children" { + return children, err + } + + for _, child := range children { + theseKids, errk := getChildProcesses(child.Pid) + if errk != nil { + continue + } + if len(theseKids) > 0 { + children = append(children, theseKids...) + } + } + + return children, nil } // this prefixSuffixSaver-related code is taken from os/exec, since they are not From 814bcd0fc87da68710f52795ad4708b4f7376dcf Mon Sep 17 00:00:00 2001 From: Sendu Bala Date: Mon, 21 May 2018 09:51:27 +0100 Subject: [PATCH 31/33] Update test expectations following changes to authors system. 
--- jobqueue/scheduler/scheduler_test.go | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/jobqueue/scheduler/scheduler_test.go b/jobqueue/scheduler/scheduler_test.go index 04a735857..21f6f4d4a 100644 --- a/jobqueue/scheduler/scheduler_test.go +++ b/jobqueue/scheduler/scheduler_test.go @@ -609,8 +609,8 @@ func TestOpenstack(t *testing.T) { } else { // author's pike install So(flavor.ID, ShouldEqual, "2000") - So(flavor.RAM, ShouldEqual, 9100) - So(flavor.Disk, ShouldEqual, 16) + So(flavor.RAM, ShouldEqual, 8600) + So(flavor.Disk, ShouldEqual, 15) So(flavor.Cores, ShouldEqual, 1) flavor, err = oss.determineFlavor(&Requirements{100, 1 * time.Minute, 1, 20, otherReqs}) @@ -624,29 +624,29 @@ func TestOpenstack(t *testing.T) { flavor, err = oss.determineFlavor(&Requirements{100, 1 * time.Minute, 2, 1, otherReqs}) So(err, ShouldBeNil) So(flavor.ID, ShouldEqual, "2001") - So(flavor.RAM, ShouldEqual, 18200) - So(flavor.Disk, ShouldEqual, 32) + So(flavor.RAM, ShouldEqual, 17200) + So(flavor.Disk, ShouldEqual, 31) So(flavor.Cores, ShouldEqual, 2) flavor, err = oss.determineFlavor(&Requirements{30000, 1 * time.Minute, 1, 1, otherReqs}) So(err, ShouldBeNil) So(flavor.ID, ShouldEqual, "2002") - So(flavor.RAM, ShouldEqual, 36400) - So(flavor.Disk, ShouldEqual, 64) + So(flavor.RAM, ShouldEqual, 34400) + So(flavor.Disk, ShouldEqual, 62) So(flavor.Cores, ShouldEqual, 4) flavor, err = oss.determineFlavor(&Requirements{64000, 1 * time.Minute, 1, 1, otherReqs}) So(err, ShouldBeNil) So(flavor.ID, ShouldEqual, "2003") - So(flavor.RAM, ShouldEqual, 72800) - So(flavor.Disk, ShouldEqual, 129) + So(flavor.RAM, ShouldEqual, 68800) + So(flavor.Disk, ShouldEqual, 125) So(flavor.Cores, ShouldEqual, 8) flavor, err = oss.determineFlavor(&Requirements{120000, 1 * time.Minute, 1, 1, otherReqs}) So(err, ShouldBeNil) So(flavor.ID, ShouldEqual, "2004") - So(flavor.RAM, ShouldEqual, 145600) - So(flavor.Disk, ShouldEqual, 258) + So(flavor.RAM, ShouldEqual, 137600) + So(flavor.Disk, ShouldEqual, 250) So(flavor.Cores, ShouldEqual, 16) flavor, err = oss.determineFlavor(&Requirements{100, 1 * time.Minute, 3, 1, otherReqs}) From fe4ef65aea210b76ecfd941f4078008d0e05e089 Mon Sep 17 00:00:00 2001 From: Sendu Bala Date: Mon, 21 May 2018 11:35:10 +0100 Subject: [PATCH 32/33] Try (and fail) to improve consistency of local start ordering test. 
---
 jobqueue/scheduler/scheduler_test.go | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/jobqueue/scheduler/scheduler_test.go b/jobqueue/scheduler/scheduler_test.go
index 21f6f4d4a..b839594b1 100644
--- a/jobqueue/scheduler/scheduler_test.go
+++ b/jobqueue/scheduler/scheduler_test.go
@@ -246,11 +246,11 @@ func TestLocal(t *testing.T) {
 }
 defer os.RemoveAll(bigTmpdir)

- blockCmd := "perl -e 'select(undef, undef, undef, 0.25)'" // sleep for 0.25s
+ blockCmd := "sleep 0.25"
 blockReq := &Requirements{1, 1 * time.Second, maxCPU, 0, otherReqs}
- smallCmd := fmt.Sprintf("perl -MFile::Temp=tempfile -e '@a = tempfile(DIR => q[%s]); select(undef, undef, undef, 0.75); exit(0);'", smallTmpdir) // creates a file and sleeps for 0.75s
+ smallCmd := fmt.Sprintf("mktemp --tmpdir=%s tmp.XXXXXX && sleep 0.75", smallTmpdir)
 smallReq := &Requirements{1, 1 * time.Second, 1, 0, otherReqs}
- bigCmd := fmt.Sprintf("perl -MFile::Temp=tempfile -e '@a = tempfile(DIR => q[%s]); select(undef, undef, undef, 0.75); exit(0);'", bigTmpdir)
+ bigCmd := fmt.Sprintf("mktemp --tmpdir=%s tmp.XXXXXX && sleep 0.75", bigTmpdir)
 bigReq := &Requirements{1, 1 * time.Second, maxCPU - 1, 0, otherReqs}

 // schedule 2 big cmds and then a small one to prove the small
@@ -282,12 +282,13 @@ func TestLocal(t *testing.T) {

 // schedule a blocker so that subsequent schedules will be
 // compared to each other, then schedule 2 small cmds and a big
- // one to prove that the big one takes priority
+ // command that uses all cpus to prove that the biggest one
+ // takes priority
 err = s.Schedule(blockCmd, blockReq, 1)
 So(err, ShouldBeNil)
 err = s.Schedule(smallCmd, smallReq, 2)
 So(err, ShouldBeNil)
- err = s.Schedule(bigCmd, bigReq, 1)
+ err = s.Schedule(bigCmd, blockReq, 1)
 So(err, ShouldBeNil)

 for {
@@ -303,6 +304,11 @@ func TestLocal(t *testing.T) {
 So(len(smallTimes), ShouldEqual, 2)
 So(bigTimes[0], ShouldHappenOnOrBefore, smallTimes[0])
 So(bigTimes[0], ShouldHappenOnOrBefore, smallTimes[1])
+ // *** one of the above 2 tests can fail; the jobs start in the
+ // correct order, which is what we're trying to test for, but
+ // finish in the wrong order. That is, the big job takes a few
+ // extra ms before it does anything. Not sure how to test for
+ // actual job start time order...
 })
 }

From 4e36873c561b8c93f18dec77c2cf54aa64311a99 Mon Sep 17 00:00:00 2001
From: Sendu Bala
Date: Mon, 21 May 2018 14:03:36 +0100
Subject: [PATCH 33/33] Update CHANGELOG for v0.13.0

---
 CHANGELOG.md | 44 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 00fc71c40..962c98afd 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,50 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/)
 and this project adheres to [Semantic Versioning](http://semver.org/).


+## [0.13.0] - 2018-05-21
+### Added
+- Minimal LSF client (bsub, bjobs, bkill) emulation, for using wr as the
+  backend scheduler and runner for legacy or alternate workflow systems, such as
+  Nextflow or Martian.
+- `wr add` has new --monitor_docker option to get accurate memory and cpu usage
+  stats for jobs that run docker, and to kill those dockers when you kill the
+  job.
+- `wr add` has new --cloud_shared option, for turning on a simple NFS shared
+  disk when using the OpenStack scheduler with Ubuntu.
+- `wr status`, `retry`, `kill` and `remove` take new -z and -y modifiers to
+  treat -i as a repgroup substr (show status of jobs in multiple repgroups) or
+  as an internal job identifier (which is now displayed by `status`).
+- `wr status` has new -o option to define the output format, including new json
+  and summary formats (shows mean resource usage across a repgroup).
+
+### Changed
+- Jobs are now only killed if they both use more than expected memory and more
+  than 90% of total physical memory.
+- Local scheduler (and by extension some behaviour of the OpenStack scheduler)
+  now does bin packing, trying to run as many jobs as possible in parallel by
+  filling in "gaps" in resource usage. Commands that use more resources will be
+  scheduled to run before other commands. Job priority only decides the order
+  that jobs of equal resource usage run in.
+- Trying to start the manager in OpenStack mode outside of OpenStack now
+  immediately returns an error.
+- `wr manager start` now shows error/crit lines from the log, on failure to
+  start.
+- Backwards incompatible changes to cloud API.
+
+### Fixed
+- `wr manager start` no longer logs its authentication token.
+- Race condition where an OpenStack server could be destroyed yet be considered
+  usable.
+- `wr` client commands now obey managerhost config option when not running on
+  the same host as the manager.
+- OpenStack scheduler no longer ignores requested job memory when a
+  non-default OS disk is set.
+- Reported peak memory usage of jobs fixed to consider usage of all child
+  processes of the initial command (even if they change their process group and
+  fork).
+- Reported CPU time of jobs fixed to include user time, not just system time.
+
+
 ## [0.12.0] - 2018-04-27
 ### Added
 - All communications to the manager are now via TLS, and authentication is