diff --git a/bin/kano-memory-monitor b/bin/kano-memory-monitor
new file mode 100644
index 0000000..eceee89
--- /dev/null
+++ b/bin/kano-memory-monitor
@@ -0,0 +1,51 @@
+#!/usr/bin/env python
+
+# kano-memory-monitor
+#
+# Copyright (C) 2015-2016 Kano Computing Ltd.
+# License: http://www.gnu.org/licenses/gpl-2.0.txt GNU GPL v2
+#
+# DBus daemon which handles memory reservation and monitoring
+
+# The daemon uses the system bus under the bus name 'me.kano.monitor'
+
+
+import sys
+import traceback
+import dbus.exceptions
+import dbus.service
+from dbus.mainloop.glib import DBusGMainLoop
+
+from gi.repository import GObject
+
+from kano.logging import logger
+
+from kano_monitor.service import MonitorService, BUS_NAME
+
+
+def main():
+    GObject.threads_init()
+    DBusGMainLoop(set_as_default=True)
+
+    try:
+        # reserve the bus name; any further instance of this daemon will fail
+        bus_name = dbus.service.BusName(BUS_NAME, bus=dbus.SystemBus(),
+                                        do_not_queue=True)
+
+        # available services, add more here
+        service = MonitorService(bus_name)
+
+    except dbus.exceptions.NameExistsException as e:
+        logger.warn('Could not reserve the SystemBus name, most likely another'
+                    ' instance of kano-memory-monitor already exists. - [{}]'
+                    .format(e))
+        return 1
+
+    except Exception:
+        logger.error('Unexpected error when starting the services.\n{}'
+                     .format(traceback.format_exc()))
+        return 2
+
+    GObject.MainLoop().run()
+    return 0
+
+
+if __name__ == '__main__':
+    sys.exit(main())
diff --git a/kano_monitor/__init__.py b/kano_monitor/__init__.py
new file mode 100644
index 0000000..1fe3125
--- /dev/null
+++ b/kano_monitor/__init__.py
@@ -0,0 +1,5 @@
+# __init__.py
+#
+# Copyright (C) 2015-2016 Kano Computing Ltd.
+# License: http://www.gnu.org/licenses/gpl-2.0.txt GNU GPL v2
+#
diff --git a/kano_monitor/service.py b/kano_monitor/service.py
new file mode 100644
index 0000000..444ee10
--- /dev/null
+++ b/kano_monitor/service.py
@@ -0,0 +1,257 @@
+# service.py
+#
+# Copyright (C) 2015-2016 Kano Computing Ltd.
+# License: http://www.gnu.org/licenses/gpl-2.0.txt GNU GPL v2
+#
+# Memory tracking service.
+#
+# All memory figures are in kB.
+
+import json
+import os
+import subprocess
+
+import dbus
+import dbus.service
+
+from gi.repository import GObject
+
+from kano.logging import logger
+
+BUS_NAME = 'me.kano.monitor'
+MONITOR_OBJECT_PATH = '/me/kano/monitor/memory'
+MONITOR_IFACE = 'me.kano.monitor.memory'
+
+MEM_INFO_PATH = '/proc/meminfo'
+
+# headroom subtracted from the reported GPU relocatable heap total
+FUDGE_FACTOR = 9 * 1024
+
+# codes we have already warned about, so each warning fires only once
+warnings = {}
+
+
+def do_warn(code, warning):
+    logger.debug('warnings so far: {}'.format(warnings))
+    if code not in warnings:
+        logger.error(warning)
+        if logger.get_output_level() == 'debug':
+            os.system("kano-start-splash -b 0 "
+                      "/usr/share/linux-story/media/images/rm.png &")
+        warnings[code] = 1
+
+
+def get_arm_mem():
+    '''Return (total, available) ARM memory in kB from /proc/meminfo.'''
+    total = None
+    avail = None
+    with open(MEM_INFO_PATH) as mem_info:
+        for line in mem_info:
+            (name, num) = line.split(':')
+            num = int(num.lstrip().split(' ', 1)[0])
+            if name == 'MemTotal':
+                total = num
+            elif name == 'MemAvailable':
+                avail = num
+            if total is not None and avail is not None:
+                break
+    return total, avail
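+
+# For reference, the /proc/meminfo lines that get_arm_mem() parses look like
+# this (values are illustrative; the kernel already reports them in kB):
+#
+#   MemTotal:         949448 kB
+#   MemAvailable:     613340 kB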
+
+
+class PidTrack(object):
+    '''Tracks one registered process and its memory reservations.'''
+
+    def __init__(self, pid, gpu_reservation, arm_reservation, name):
+        self.pid = pid
+        self.gpu_reservation = gpu_reservation
+        self.arm_reservation = arm_reservation
+        self.name = name
+        self.orig_dir = os.getcwd()
+        try:
+            # hold an fd on /proc/<pid>; it goes stale once the process exits
+            self.proc_dir = os.open('/proc/{}'.format(pid), os.O_DIRECTORY)
+        except OSError:
+            # mark as already exited
+            self.proc_dir = None
+
+    def has_exited(self):
+        if self.proc_dir is None:
+            return True
+        try:
+            # fchdir() through the held fd fails once the process is gone,
+            # which lets us detect exit without signalling the pid
+            os.fchdir(self.proc_dir)
+            os.chdir(self.orig_dir)
+        except OSError:
+            logger.debug("pid {} quit".format(self.pid))
+            os.close(self.proc_dir)
+            self.proc_dir = None
+            return True
+        return False
+
+    def to_dict(self):
+        return {
+            'pid': self.pid,
+            'gpu_reservation': self.gpu_reservation,
+            'arm_reservation': self.arm_reservation,
+            'name': self.name,
+            'rss': self.get_mem(),
+        }
+
+    def get_mem(self):
+        try:
+            with open('/proc/{}/stat'.format(self.pid)) as stat_file:
+                stat = stat_file.read()
+            # rss is the 24th field of /proc/<pid>/stat; split after the
+            # closing paren of the comm field so process names containing
+            # spaces do not shift the indices. rss is counted in pages
+            # (4 kB each here), so multiply by 4 to get kB.
+            rss = int(stat.rsplit(')', 1)[1].split()[21]) * 4
+        except (IOError, IndexError, ValueError):
+            rss = -1
+        return rss
+
+    def check_arm_mem(self):
+        try:
+            rss = self.get_mem()
+            logger.debug('rss {}'.format(rss))
+            if rss > self.arm_reservation:
+                warning = "{} (pid {}) using more than allocated {} > {}".format(
+                    self.name, self.pid, rss, self.arm_reservation)
+                do_warn('arm {}'.format(self.pid), warning)
+        except Exception:
+            logger.warn("error parsing stat file for {}".format(self.pid))
+
+
+class MonitorService(dbus.service.Object):
+    """
+    This is the DBus service provided by kano-memory-monitor.
+
+    It exports an object at /me/kano/monitor/memory with the
+    interface me.kano.monitor.memory
+
+    Does not require sudo.
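+
+    Example client usage (a sketch; 'my-app' and the reservation sizes,
+    both in kB, are placeholders):
+
+        import os
+        import dbus
+
+        bus = dbus.SystemBus()
+        proxy = bus.get_object('me.kano.monitor', '/me/kano/monitor/memory')
+        monitor = dbus.Interface(proxy, 'me.kano.monitor.memory')
+        ok = monitor.reserve(os.getpid(), 8 * 1024, 64 * 1024, 'my-app')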
+ """ + + # the top priority level for an api lock + MAX_PRIORITY_LEVEL = 10 # this is public + + def __init__(self, bus_name): + dbus.service.Object.__init__(self, bus_name, MONITOR_OBJECT_PATH) + + self.MONITOR_POLL_RATE = 1000 * 5 + self.current_gpu_reloc_free = 99 # Should be overridden + self.current_gpu_reloc_total = 99 # Should be overridden + self.current_gpu_reserved = 0 + + self.get_arm_mem_info() + self.current_arm_reserved = 0 + + self.tracking_pids = [] + + + GObject.threads_init() + GObject.timeout_add(self.MONITOR_POLL_RATE, self._monitor_thread) + + def get_arm_mem_info(self): + try: + (total, avail) = get_arm_mem() + self.current_arm_free = avail + self.current_arm_total = total + except: + logger.error("error reading arm mem info") + + def check_gpu_mem(self): + reservation_gpu_total = 0 + for pid in self.tracking_pids: + reservation_gpu_total = reservation_gpu_total + pid.gpu_reservation + + if reservation_gpu_total != self.current_gpu_reserved: + do_warn('intmem', "Internal error in gpumem calc") + + gpu_used = self.current_gpu_reloc_total - self.current_gpu_reloc_free + + if reservation_gpu_total < gpu_used: + warning = "More GPU used than reserved: {} > {}\n".format(gpu_used, reservation_gpu_total) + warning = warning + self.status() + '\n' + try: + gpu_procs = subprocess.check_output('lsof /dev/vchiq') + warning += gpu_procs + except: + pass + do_warn('res: {}'.format(reservation_gpu_total), warning) + + + def _monitor_thread(self): + """ + """ + try: + print "monitor" + self.get_arm_mem_info() + self.check_pids() + + self.check_gpu_mem() + + finally: + return True # keep calling this method indefinitely + + @dbus.service.method(MONITOR_IFACE, in_signature='ii', out_signature='') + def set_current_gpu_reloc_free(self, gpu_reloc_free, gpu_reloc_total): + ''' + This is called by the GPU monitor in kano-notifications-daemon to inform + us of the current free gpu memory. We also set the total every time (alhough + it doesn't change) because this is more robust to the startup sequence of + the different daemons. 
+ ''' + self.current_gpu_reloc_free = gpu_reloc_free + self.current_gpu_reloc_total = gpu_reloc_total - FUDGE_FACTOR + logger.debug('Free reloc: {} Total: {} '.format(gpu_reloc_free, gpu_reloc_total)) + + @dbus.service.method(MONITOR_IFACE, in_signature='iiis', out_signature='b') + def reserve(self, pid, gpu_reservation, arm_reservation, name): + + predicted_gpu_free = self.current_gpu_reloc_total - self.current_gpu_reserved + + if (gpu_reservation > predicted_gpu_free or + gpu_reservation > self.current_gpu_reloc_free): + logger.warn("Failed reservation: {}(pid {}) wanted GPU mem {} " + "current free {} predicted free {}".format(name, + pid, + gpu_reservation, + self.current_gpu_reloc_total, + predicted_gpu_free)) + return False + + predicted_arm_free = self.current_arm_total - self.current_arm_reserved + if (arm_reservation > predicted_arm_free or + arm_reservation > self.current_arm_free): + logger.warn("Failed reservation: {}(pid {}) wanted ARM mem {} " + "current free {} predicted free {}".format(name, + pid, + arm_reservation, + self.current_arm_total, + predicted_arm_free)) + return False + + self.tracking_pids.append(pidTrack(pid, + gpu_reservation, + arm_reservation, + name)) + self.current_gpu_reserved += gpu_reservation + self.current_arm_reserved += arm_reservation + return True + + @dbus.service.method(MONITOR_IFACE, in_signature='', out_signature='s') + def status(self): + res = {} + res['current_gpu_reloc_free'] = self.current_gpu_reloc_free + res['current_gpu_reloc_total'] = self.current_gpu_reloc_total + res['current_arm_free'] = self.current_arm_free + res['current_arm_total'] = self.current_arm_total + res['processes'] = [] + for tp in self.tracking_pids: + res['processes'].append(tp.to_dict()) + return json.dumps(res) + + def check_pids(self): + i = 0 + while i < len(self.tracking_pids): + logger.debug(str(i)) + if self.tracking_pids[i].has_exited(): + self.free_reservation_accounting(self.tracking_pids[i]) + del self.tracking_pids[i] + else: + logger.debug('s'+str(i)) + self.tracking_pids[i].check_arm_mem() + i = i + 1 + + def free_reservation_accounting(self, pidtrack): + self.current_gpu_reserved -= pidtrack.gpu_reservation + self.current_arm_reserved -= pidtrack.arm_reservation