-
Notifications
You must be signed in to change notification settings - Fork 27
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add a memory monitor daemon for GPU and ARM memory.
- Loading branch information
Showing
3 changed files
with
313 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
#!/usr/bin/env python | ||
|
||
# kano-memory-monitor | ||
# | ||
# Copyright (C) 2015-2016 Kano Computing Ltd. | ||
# License: http://www.gnu.org/licenses/gpl-2.0.txt GNU GPL v2 | ||
# | ||
# DBus daemon which handles memory reservation and monitoring | ||
|
||
# The deamon uses a system bus under the bus name 'me.kano.memory' | ||
|
||
|
||
import sys | ||
import traceback | ||
import dbus.exceptions | ||
from dbus.mainloop.glib import DBusGMainLoop | ||
|
||
from gi.repository import GObject | ||
|
||
from kano.logging import logger | ||
|
||
from kano_monitor.service import MonitorService, BUS_NAME | ||
|
||
|
||
|
||
def main(): | ||
GObject.threads_init() | ||
DBusGMainLoop(set_as_default=True) | ||
|
||
try: | ||
# reserving the bus name, other instances of $0 will fail | ||
bus_name = dbus.service.BusName(BUS_NAME, bus=dbus.SystemBus(), do_not_queue=True) | ||
|
||
# available services, add more here | ||
service = MonitorService(bus_name) | ||
|
||
except dbus.exceptions.NameExistsException as e: | ||
logger.warn('Could not reserve the SystemBus name, most likely another instance' | ||
' of kano-boards-daemon already exists. - [{}]'.format(e)) | ||
return 1 | ||
|
||
except Exception as e: | ||
logger.error('Unexpected error when starting the services.\n{}' | ||
.format(traceback.format_exc())) | ||
return 2 | ||
|
||
GObject.MainLoop().run() | ||
|
||
|
||
if __name__ == '__main__': | ||
sys.exit(main()) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
# __init__.py | ||
# | ||
# Copyright (C) 2015-2016 Kano Computing Ltd. | ||
# License: http://www.gnu.org/licenses/gpl-2.0.txt GNU GPL v2 | ||
# |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,257 @@ | ||
# service.py | ||
# | ||
# Copyright (C) 2015-2016 Kano Computing Ltd. | ||
# License: http://www.gnu.org/licenses/gpl-2.0.txt GNU GPL v2 | ||
# | ||
# Memory tracking service. | ||
# | ||
# All numbers are in kB | ||
|
||
import os | ||
|
||
import dbus | ||
import dbus.service | ||
import json | ||
import subprocess | ||
|
||
from gi.repository import GObject | ||
|
||
from kano.logging import logger | ||
|
||
BUS_NAME = 'me.kano.monitor' | ||
MONITOR_OBJECT_PATH = '/me/kano/monitor/memory' | ||
MONITOR_IFACE = 'me.kano.monitor.memory' | ||
|
||
MEM_INFO_PATH = "/proc/meminfo" | ||
FUDGE_FACTOR = 9*1024 | ||
|
||
warnings = {} | ||
|
||
|
||
def do_warn(code, warning): | ||
global warnings | ||
print str(warnings) | ||
if code not in warnings: | ||
logger.error(warning) | ||
if logger.get_output_level() == 'debug': | ||
os.system("kano-start-splash -b 0 /usr/share/linux-story/media/images/rm.png &") | ||
|
||
warnings[code] = 1 | ||
|
||
|
||
def get_arm_mem(): | ||
total = None | ||
avail = None | ||
for line in open(MEM_INFO_PATH).readlines(): | ||
(name, num) = line.split(":") | ||
num = int(num.lstrip().split(' ', 1)[0]) | ||
if name == 'MemTotal': | ||
total = num | ||
elif name == 'MemAvailable': | ||
avail = num | ||
if total is not None and avail is not None: | ||
break | ||
return total, avail | ||
|
||
|
||
class pidTrack: | ||
def __init__(self, pid, gpu_reservation, arm_reservation, name): | ||
self.pid = pid | ||
self.gpu_reservation = gpu_reservation | ||
self.arm_reservation = arm_reservation | ||
self.name = name | ||
self.orig_dir = os.getcwd() | ||
try: | ||
self.proc_dir = os.open('/proc/{}'.format(pid), os.O_DIRECTORY) | ||
except OSError: | ||
# mark as already exitted | ||
self.proc_dir = None | ||
|
||
def has_exited(self): | ||
if not self.proc_dir: | ||
return True | ||
try: | ||
os.fchdir(self.proc_dir) | ||
os.chdir(self.orig_dir) | ||
except OSError: | ||
logger.debug("pid {} quit".format(self.pid)) | ||
os.close(self.proc_dir) | ||
return True | ||
return False | ||
|
||
def to_dict(self): | ||
res = {} | ||
res['pid'] = self.pid | ||
res['gpu_reservation'] = self.gpu_reservation | ||
res['arm_reservation'] = self.arm_reservation | ||
res['name'] = self.name | ||
res['rss'] = self.get_mem() | ||
return res | ||
|
||
def get_mem(self): | ||
try: | ||
stat = open('/proc/{}/stat'.format(self.pid)).read() | ||
# field 23 of proc stat is rss, which is in 4kb pages | ||
rss = int(stat.split(' ')[23]) * 4 | ||
except: | ||
rss = -1 | ||
return rss | ||
|
||
def check_arm_mem(self): | ||
try: | ||
rss = self.get_mem() | ||
logger.debug('rss {}'.format(rss)) | ||
if rss > self.arm_reservation: | ||
warning = " {} (pid {}) using more than allocated {} > {}".format( | ||
self.name, self.pid, rss, self.arm_reservation) | ||
do_warn('arm {}'.format(self.pid), warning) | ||
except: | ||
logger.warn("error parsing stat file for {}".format(self.pid)) | ||
|
||
|
||
|
||
class MonitorService(dbus.service.Object): | ||
""" | ||
This is a DBus Service provided by kano-boards-daemon. | ||
It exports an object to /me/kano/monitor and | ||
its interface to me.kano.monitor | ||
Does not require sudo. | ||
""" | ||
|
||
# the top priority level for an api lock | ||
MAX_PRIORITY_LEVEL = 10 # this is public | ||
|
||
def __init__(self, bus_name): | ||
dbus.service.Object.__init__(self, bus_name, MONITOR_OBJECT_PATH) | ||
|
||
self.MONITOR_POLL_RATE = 1000 * 5 | ||
self.current_gpu_reloc_free = 99 # Should be overridden | ||
self.current_gpu_reloc_total = 99 # Should be overridden | ||
self.current_gpu_reserved = 0 | ||
|
||
self.get_arm_mem_info() | ||
self.current_arm_reserved = 0 | ||
|
||
self.tracking_pids = [] | ||
|
||
|
||
GObject.threads_init() | ||
GObject.timeout_add(self.MONITOR_POLL_RATE, self._monitor_thread) | ||
|
||
def get_arm_mem_info(self): | ||
try: | ||
(total, avail) = get_arm_mem() | ||
self.current_arm_free = avail | ||
self.current_arm_total = total | ||
except: | ||
logger.error("error reading arm mem info") | ||
|
||
def check_gpu_mem(self): | ||
reservation_gpu_total = 0 | ||
for pid in self.tracking_pids: | ||
reservation_gpu_total = reservation_gpu_total + pid.gpu_reservation | ||
|
||
if reservation_gpu_total != self.current_gpu_reserved: | ||
do_warn('intmem', "Internal error in gpumem calc") | ||
|
||
gpu_used = self.current_gpu_reloc_total - self.current_gpu_reloc_free | ||
|
||
if reservation_gpu_total < gpu_used: | ||
warning = "More GPU used than reserved: {} > {}\n".format(gpu_used, reservation_gpu_total) | ||
warning = warning + self.status() + '\n' | ||
try: | ||
gpu_procs = subprocess.check_output('lsof /dev/vchiq') | ||
warning += gpu_procs | ||
except: | ||
pass | ||
do_warn('res: {}'.format(reservation_gpu_total), warning) | ||
|
||
|
||
def _monitor_thread(self): | ||
""" | ||
""" | ||
try: | ||
print "monitor" | ||
self.get_arm_mem_info() | ||
self.check_pids() | ||
|
||
self.check_gpu_mem() | ||
|
||
finally: | ||
return True # keep calling this method indefinitely | ||
|
||
@dbus.service.method(MONITOR_IFACE, in_signature='ii', out_signature='') | ||
def set_current_gpu_reloc_free(self, gpu_reloc_free, gpu_reloc_total): | ||
''' | ||
This is called by the GPU monitor in kano-notifications-daemon to inform | ||
us of the current free gpu memory. We also set the total every time (alhough | ||
it doesn't change) because this is more robust to the startup sequence of | ||
the different daemons. | ||
''' | ||
self.current_gpu_reloc_free = gpu_reloc_free | ||
self.current_gpu_reloc_total = gpu_reloc_total - FUDGE_FACTOR | ||
logger.debug('Free reloc: {} Total: {} '.format(gpu_reloc_free, gpu_reloc_total)) | ||
|
||
@dbus.service.method(MONITOR_IFACE, in_signature='iiis', out_signature='b') | ||
def reserve(self, pid, gpu_reservation, arm_reservation, name): | ||
|
||
predicted_gpu_free = self.current_gpu_reloc_total - self.current_gpu_reserved | ||
|
||
if (gpu_reservation > predicted_gpu_free or | ||
gpu_reservation > self.current_gpu_reloc_free): | ||
logger.warn("Failed reservation: {}(pid {}) wanted GPU mem {} " | ||
"current free {} predicted free {}".format(name, | ||
pid, | ||
gpu_reservation, | ||
self.current_gpu_reloc_total, | ||
predicted_gpu_free)) | ||
return False | ||
|
||
predicted_arm_free = self.current_arm_total - self.current_arm_reserved | ||
if (arm_reservation > predicted_arm_free or | ||
arm_reservation > self.current_arm_free): | ||
logger.warn("Failed reservation: {}(pid {}) wanted ARM mem {} " | ||
"current free {} predicted free {}".format(name, | ||
pid, | ||
arm_reservation, | ||
self.current_arm_total, | ||
predicted_arm_free)) | ||
return False | ||
|
||
self.tracking_pids.append(pidTrack(pid, | ||
gpu_reservation, | ||
arm_reservation, | ||
name)) | ||
self.current_gpu_reserved += gpu_reservation | ||
self.current_arm_reserved += arm_reservation | ||
return True | ||
|
||
@dbus.service.method(MONITOR_IFACE, in_signature='', out_signature='s') | ||
def status(self): | ||
res = {} | ||
res['current_gpu_reloc_free'] = self.current_gpu_reloc_free | ||
res['current_gpu_reloc_total'] = self.current_gpu_reloc_total | ||
res['current_arm_free'] = self.current_arm_free | ||
res['current_arm_total'] = self.current_arm_total | ||
res['processes'] = [] | ||
for tp in self.tracking_pids: | ||
res['processes'].append(tp.to_dict()) | ||
return json.dumps(res) | ||
|
||
def check_pids(self): | ||
i = 0 | ||
while i < len(self.tracking_pids): | ||
logger.debug(str(i)) | ||
if self.tracking_pids[i].has_exited(): | ||
self.free_reservation_accounting(self.tracking_pids[i]) | ||
del self.tracking_pids[i] | ||
else: | ||
logger.debug('s'+str(i)) | ||
self.tracking_pids[i].check_arm_mem() | ||
i = i + 1 | ||
|
||
def free_reservation_accounting(self, pidtrack): | ||
self.current_gpu_reserved -= pidtrack.gpu_reservation | ||
self.current_arm_reserved -= pidtrack.arm_reservation |