-
Notifications
You must be signed in to change notification settings - Fork 2
/
sacct.py
120 lines (99 loc) · 3.16 KB
/
sacct.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#!/usr/bin/env python3
import datetime
import json
import subprocess
import re
from ClusterShell import NodeSet
# Columns requested from sacct; their order fixes the order of the
# pipe-separated fields in every output line.
_FORMAT_FIELDS = (
    "jobid,jobidraw,cluster,partition,account,group,gid,"
    "user,uid,submit,eligible,start,end,elapsed,elapsedraw,exitcode,state,nnodes,"
    "ncpus,reqcpus,reqmem,reqgres,reqtres,timelimit,nodelist,jobname"
)
# Base sacct invocation: one line per job (-X), all users, machine-readable
# output (--parsable2), restricted to jobs that have already finished.
args = [
    "sacct", "-X", "--allusers", "--parsable2",
    "--format", _FORMAT_FIELDS,
    "--state", "CANCELLED,COMPLETED,FAILED,NODE_FAIL,PREEMPTED,TIMEOUT",
]
# Timestamp layout sacct uses for submit/start/end fields.
SLURM_DATE_FORMAT = "%Y-%m-%dT%H:%M:%S"
# State file remembering where the previous run left off.
TIMESTAMP_FILE = "lasttimestamp"
# Work out the [starttime, endtime] window for this collection run.
# Times are kept in UTC (as the original utcnow() did), so successive
# runs line up regardless of local DST changes.
now = datetime.datetime.now(datetime.timezone.utc)  # utcnow() is deprecated since Python 3.12
end_str = now.strftime(SLURM_DATE_FORMAT)
try:
    # Resume from wherever the previous run stopped.
    with open(TIMESTAMP_FILE) as f:
        start_str = f.read()
except FileNotFoundError:
    # First run: default to start of today.
    # NOTE(review): presumably sacct parses a bare HH:MM:SS as today's
    # date at that time — confirm against the sacct man page.
    start_str = "00:00:00"
args += ["--starttime", start_str]
args += ["--endtime", end_str]
#print(" ".join(args))
# Merge stderr into stdout so any sacct error text is captured and can be
# reported by the header check below.
process = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, encoding="UTF-8")
# Use the title (header) line of the sacct output to work out the
# attribute order; each subsequent data line follows the same order.
lines = process.stdout.split("\n")
titles_line = lines[0]
attributes = titles_line.split("|")
# Fewer than 3 columns means sacct failed: because stderr was merged into
# stdout, the "header" is really an error message — show it and bail out.
if len(attributes) < 3:
    print(lines)
    # raise SystemExit instead of exit(): the latter is injected by the
    # `site` module and is absent under `python -S` or in frozen builds.
    raise SystemExit(-1)
# Parse each data line of sacct output into a dict keyed by column title,
# printing one JSON object per finished job.
items = []
for line in lines[1:]:
    components = line.split("|")
    # Skip malformed/blank lines (e.g. the trailing empty string after split).
    if len(components) != len(attributes):
        continue
    item = {}
    for key, value in zip(attributes, components):
        # Convert numeric columns to int.  JobID/JobIDRaw are kept as
        # strings so ids like "123_4" (array jobs) survive intact.
        if "JobID" not in key:
            try:
                value = int(value)
            except ValueError:
                # Non-numeric column — keep the original string.
                pass
        item[key] = value
    # Unpack the compressed NodeList format (e.g. "node[01-04]") so it is
    # easier to search for individual hostnames.
    nodelist = item.get("NodeList")
    if nodelist:
        nodeset = NodeSet.NodeSet(nodelist)
        nodes = list(nodeset)
        item["AllNodes"] = nodes
        # Produce a prometheus-style alternation regex matching any node.
        item["AllNodesRegex"] = "|".join(re.escape(x) for x in nodes)
    # Millisecond epoch timestamps for downstream tooling.
    # NOTE(review): .timestamp() on a naive datetime assumes local time,
    # and sacct may emit "Unknown" for Start/End of some cancelled jobs,
    # which would make strptime raise — confirm against the deployment.
    start = item.get("Start")
    if start:
        item["StartEpoch"] = int(datetime.datetime.strptime(start, SLURM_DATE_FORMAT).timestamp() * 1000)
    end = item.get("End")
    if end:
        item["EndEpoch"] = int(datetime.datetime.strptime(end, SLURM_DATE_FORMAT).timestamp() * 1000)
    # Exclude job steps (ids like "123.batch"); keep whole jobs only.
    jobid = item.get("JobID")
    if jobid and "." not in jobid:
        items.append(item)
        print(json.dumps(item))
# Write out the next start timestamp (one second past this run's end
# time) so the next invocation resumes without re-reporting the same
# window.  Renamed from `next`, which shadowed the builtin.
next_start = now + datetime.timedelta(seconds=1)
next_str = next_start.strftime(SLURM_DATE_FORMAT)
with open(TIMESTAMP_FILE, 'w') as f:
    f.write(next_str)
#print(len(items))
# Do a per node summary of job ids
# TODO: arguments to toggle this output
import collections  # NOTE(review): consider moving up to the top-of-file imports
node_jobs = collections.defaultdict(list)
jobs = {}
for job in items:
    jobs[job["JobID"]] = job
    # Jobs that never received a NodeList have no "AllNodes" key (the
    # parse loop only sets it for a truthy NodeList); default to an empty
    # list rather than raising KeyError.
    for node in job.get("AllNodes", []):
        node_jobs[node].append({
            "id": job["JobID"],
            "start": job["Start"],
            "end": job["End"],
        })
#print(jobs)
node_info = {
    "node_info": dict(node_jobs),
    "start": start_str,
    "end": end_str,
}
#if node_info["node_info"]:
#    print(node_info)