-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.go
122 lines (115 loc) · 3.29 KB
/
main.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
package main
import (
"bufio"
"log"
"os"
"os/exec"
"strings"
"time"
"github.com/coreos/go-systemd/v22/sdjournal"
"github.com/jessevdk/go-flags"
)
// badLines lists the substrings that mark a service log line as fatal.
// watchLog requests a restart of the service whose log contains any of them.
var badLines = []string{
	"torch.cuda.OutOfMemoryError",
	"torch.OutOfMemoryError",
	"TypeError: VanillaTemporalModule.forward()",
	"RuntimeError: Expected all tensors",
	"RuntimeError: The size of tensor a",
	"RuntimeError: CUDA error",
	"einops.EinopsError",
	"ZeroDivisionError",
	"ValueError: range",
	"cudaMalloc failed: out of memory",
}
// params holds the command-line options of the program. The struct tags
// describe each option to the go-flags parser invoked from main.
var params struct {
	// DockerDir is the working directory for every docker compose command (-d, required).
	DockerDir string `short:"d" description:"Main directory with docker-compose.yml" required:"true"`

	// ServiceNames are the compose services to watch and restart (-s, required, repeatable).
	ServiceNames []string `short:"s" description:"Docker compose service name to watch and restart, can be specified multiple times" required:"true"`

	// FifoPath is the optional FIFO control file (-f); main only sets up the
	// FIFO control when this is non-empty.
	FifoPath string `short:"f" description:"FIFO control file"`

	// PrometheusPort is the port handed to addMetrics (-p).
	PrometheusPort int `short:"p" description:"Prometheus HTTP metrics port"`
}
// restarter starts a background worker that restarts docker compose services
// on request and returns the channel used to submit those requests.
//
// For every service name received on the returned channel the worker runs
// "docker compose restart <service> -t 0" with dockerDir as the working
// directory. Requests are handled one at a time in the order they arrive. The
// channel buffers up to 10 pending requests; the worker goroutine ends when
// the channel is closed.
func restarter(dockerDir string) chan string {
	svcChan := make(chan string, 10)
	go func() {
		for serviceName := range svcChan {
			restartCmd := exec.Command("docker", "compose", "restart", serviceName, "-t", "0")
			restartCmd.Dir = dockerDir
			// Capture the command output so a failed restart can be diagnosed
			// instead of being reported as a success.
			out, err := restartCmd.CombinedOutput()
			if err != nil {
				log.Printf("Restarting service %s failed: %v: %s", serviceName, err, out)
				continue
			}
			log.Printf("Service %s restarted", serviceName)
		}
	}()
	return svcChan
}
// watchLog follows the docker compose logs of every service in serviceNames
// and requests a restart whenever a log line contains one of the badLines
// patterns.
//
// One goroutine is started per service. Each one runs
// "docker compose logs <service> -n 1 -f" in dockerDir and scans its standard
// output line by line. On a match it sends the service name on restarter and
// MetricUpdate{Reason: "python", Value: 1} on promchan. When the log command
// ends (or cannot be started) the goroutine waits five seconds and tries
// again.
//
// watchLog never returns: after starting the goroutines it blocks forever, so
// callers should run it in its own goroutine.
func watchLog(dockerDir string, serviceNames []string, restarter chan string, promchan chan<- MetricUpdate) {
	quit := make(chan struct{})

	// follow runs a single log-following session for one service and returns
	// once that session is over.
	follow := func(serviceName string) {
		logCmd := exec.Command("docker", "compose", "logs", serviceName, "-n", "1", "-f")
		logCmd.Dir = dockerDir
		logPipe, err := logCmd.StdoutPipe()
		if err != nil {
			log.Fatal("Error watching log: ", err)
		}
		if err := logCmd.Start(); err != nil {
			log.Printf("Error starting log command for service %s: %v", serviceName, err)
			return
		}

		s := bufio.NewScanner(logPipe)
		for s.Scan() {
			line := s.Text()
			for _, l := range badLines {
				if strings.Contains(line, l) {
					log.Printf("Service %s misbehaving, restarting...", serviceName)
					restarter <- serviceName
					promchan <- MetricUpdate{Reason: "python", Value: 1}
					// One restart per offending line is enough, even when the
					// line matches several patterns.
					break
				}
			}
		}
		if err := s.Err(); err != nil {
			// The scanner stopped early (for example on an over-long line)
			// while the command may still be running. Kill it so that Wait
			// cannot block on a process whose output nobody reads anymore.
			log.Printf("Error reading log of service %s: %v", serviceName, err)
			// The kill error is ignored on purpose: the process may already
			// have exited, and Wait below reports the final state.
			_ = logCmd.Process.Kill()
		}
		if err := logCmd.Wait(); err != nil {
			log.Printf("Log command for service %s ended: %v", serviceName, err)
		}
	}

	for _, serviceName := range serviceNames {
		// The name is passed as an argument so that every goroutine watches
		// its own service even under pre-Go 1.22 loop variable semantics.
		go func(serviceName string) {
			for {
				follow(serviceName)
				time.Sleep(time.Second * 5)
				log.Println("Reconnecting to the log...")
			}
		}(serviceName)
	}
	<-quit
}
func main() {
_, err := flags.Parse(¶ms)
if err != nil {
os.Exit(1)
}
promchan := addMetrics(params.PrometheusPort)
restarterChan := restarter(params.DockerDir)
go watchLog(params.DockerDir, params.ServiceNames, restarterChan, promchan)
if params.FifoPath != "" {
err = fifo(params.FifoPath, params.ServiceNames, restarterChan, promchan)
if err != nil {
log.Fatal(err)
}
}
j, err := sdjournal.NewJournal()
if err != nil {
log.Fatal(err)
}
err = j.AddMatch(sdjournal.SD_JOURNAL_FIELD_SYSLOG_IDENTIFIER + "=kernel")
if err != nil {
log.Print(err)
}
err = j.SeekTail()
if err != nil {
log.Print(err)
}
_, err = j.Previous()
if err != nil {
log.Print(err)
}
for {
i, err := j.Next()
if err != nil {
log.Print(err)
}
if i == 0 {
j.Wait(sdjournal.IndefiniteWait)
continue
}
e, err := j.GetEntry()
if err != nil {
log.Print(err)
continue
}
v := e.Fields[sdjournal.SD_JOURNAL_FIELD_MESSAGE]
if strings.Contains(v, "Xid") && strings.Contains(v, "python") {
log.Printf("GPU error detected: %+v", v)
for _, s := range params.ServiceNames {
restarterChan <- s
}
promchan <- MetricUpdate{Reason: "xid", Value: 1}
}
}
}