Add a memory monitor daemon for GPU and ARM memory.

KanoComputing · Nov 16, 2016 · 0924c8c · 0924c8c
1 parent 00e6ab1
commit 0924c8c
Show file tree

Hide file tree

Showing 3 changed files with 313 additions and 0 deletions.
diff --git a/bin/kano-memory-monitor b/bin/kano-memory-monitor
@@ -0,0 +1,51 @@
+#!/usr/bin/env python
+
+# kano-memory-monitor
+#
+# Copyright (C) 2015-2016 Kano Computing Ltd.
+# License: http://www.gnu.org/licenses/gpl-2.0.txt GNU GPL v2
+#
+# DBus daemon which handles memory reservation and monitoring
+
+# The deamon uses a system bus under the bus name 'me.kano.memory'
+
+
+import sys
+import traceback
+import dbus.exceptions
+from dbus.mainloop.glib import DBusGMainLoop
+
+from gi.repository import GObject
+
+from kano.logging import logger
+
+from kano_monitor.service import MonitorService, BUS_NAME
+
+
+
+def main():
+    GObject.threads_init()
+    DBusGMainLoop(set_as_default=True)
+
+    try:
+        # reserving the bus name, other instances of $0 will fail
+        bus_name = dbus.service.BusName(BUS_NAME, bus=dbus.SystemBus(), do_not_queue=True)
+
+        # available services, add more here
+        service = MonitorService(bus_name)
+
+    except dbus.exceptions.NameExistsException as e:
+        logger.warn('Could not reserve the SystemBus name, most likely another instance'
+                    ' of kano-boards-daemon already exists. - [{}]'.format(e))
+        return 1
+
+    except Exception as e:
+        logger.error('Unexpected error when starting the services.\n{}'
+                     .format(traceback.format_exc()))
+        return 2
+
+    GObject.MainLoop().run()
+
+
+if __name__ == '__main__':
+    sys.exit(main())
diff --git a/kano_monitor/__init__.py b/kano_monitor/__init__.py
@@ -0,0 +1,5 @@
+# __init__.py
+#
+# Copyright (C) 2015-2016 Kano Computing Ltd.
+# License: http://www.gnu.org/licenses/gpl-2.0.txt GNU GPL v2
+#
diff --git a/kano_monitor/service.py b/kano_monitor/service.py
@@ -0,0 +1,257 @@
+# service.py
+#
+# Copyright (C) 2015-2016 Kano Computing Ltd.
+# License: http://www.gnu.org/licenses/gpl-2.0.txt GNU GPL v2
+#
+# Memory tracking service.
+#
+# All numbers are in kB
+
+import os
+
+import dbus
+import dbus.service
+import json
+import subprocess
+
+from gi.repository import GObject
+
+from kano.logging import logger
+
+BUS_NAME = 'me.kano.monitor'
+MONITOR_OBJECT_PATH = '/me/kano/monitor/memory'
+MONITOR_IFACE = 'me.kano.monitor.memory'
+
+MEM_INFO_PATH = "/proc/meminfo"
+FUDGE_FACTOR = 9*1024
+
+warnings = {}
+
+
+def do_warn(code, warning):
+    global warnings
+    print str(warnings)
+    if code not in warnings:
+        logger.error(warning)
+        if logger.get_output_level() == 'debug':
+            os.system("kano-start-splash -b 0 /usr/share/linux-story/media/images/rm.png &")
+
+        warnings[code] = 1
+
+
+def get_arm_mem():
+    total = None
+    avail = None
+    for line in open(MEM_INFO_PATH).readlines():
+        (name, num) = line.split(":")
+        num = int(num.lstrip().split(' ', 1)[0])
+        if name == 'MemTotal':
+            total = num
+        elif name == 'MemAvailable':
+            avail = num
+        if total is not None and avail is not None:
+            break
+    return total, avail
+
+
+class pidTrack:
+    def __init__(self, pid, gpu_reservation, arm_reservation, name):
+        self.pid = pid
+        self.gpu_reservation = gpu_reservation
+        self.arm_reservation = arm_reservation
+        self.name = name
+        self.orig_dir = os.getcwd()
+        try:
+            self.proc_dir = os.open('/proc/{}'.format(pid), os.O_DIRECTORY)
+        except OSError:
+            # mark as already exitted
+            self.proc_dir = None
+
+    def has_exited(self):
+        if not self.proc_dir:
+            return True
+        try:
+            os.fchdir(self.proc_dir)
+            os.chdir(self.orig_dir)
+        except OSError:
+            logger.debug("pid {} quit".format(self.pid))
+            os.close(self.proc_dir)
+            return True
+        return False
+
+    def to_dict(self):
+        res = {}
+        res['pid'] = self.pid
+        res['gpu_reservation'] = self.gpu_reservation
+        res['arm_reservation'] = self.arm_reservation
+        res['name'] = self.name
+        res['rss'] = self.get_mem()
+        return res
+
+    def get_mem(self):
+        try:
+            stat = open('/proc/{}/stat'.format(self.pid)).read()
+            # field 23 of proc stat is rss, which is in 4kb pages
+            rss = int(stat.split(' ')[23]) * 4
+        except:
+            rss = -1
+        return rss
+
+    def check_arm_mem(self):
+        try:
+            rss = self.get_mem()
+            logger.debug('rss {}'.format(rss))
+            if rss > self.arm_reservation:
+                warning = " {} (pid {}) using more than allocated {} > {}".format(
+                    self.name, self.pid, rss, self.arm_reservation)
+                do_warn('arm {}'.format(self.pid), warning)
+        except:
+            logger.warn("error parsing stat file for {}".format(self.pid))
+
+
+
+class MonitorService(dbus.service.Object):
+    """
+    This is a DBus Service provided by kano-boards-daemon.
+
+    It exports an object to /me/kano/monitor and
+    its interface to me.kano.monitor
+
+    Does not require sudo.
+    """
+
+    # the top priority level for an api lock
+    MAX_PRIORITY_LEVEL = 10  # this is public
+
+    def __init__(self, bus_name):
+        dbus.service.Object.__init__(self, bus_name, MONITOR_OBJECT_PATH)
+
+        self.MONITOR_POLL_RATE = 1000 * 5
+        self.current_gpu_reloc_free = 99   # Should be overridden 
+        self.current_gpu_reloc_total = 99 # Should be overridden
+        self.current_gpu_reserved = 0
+
+        self.get_arm_mem_info()
+        self.current_arm_reserved = 0
+
+        self.tracking_pids = []
+
+
+        GObject.threads_init()
+        GObject.timeout_add(self.MONITOR_POLL_RATE, self._monitor_thread)
+
+    def get_arm_mem_info(self):
+        try:
+            (total, avail) = get_arm_mem()
+            self.current_arm_free = avail
+            self.current_arm_total = total
+        except:
+            logger.error("error reading arm mem info")
+
+    def check_gpu_mem(self):
+        reservation_gpu_total = 0
+        for pid in self.tracking_pids:
+            reservation_gpu_total = reservation_gpu_total + pid.gpu_reservation
+
+        if reservation_gpu_total != self.current_gpu_reserved:
+            do_warn('intmem', "Internal error in gpumem calc")
+
+        gpu_used = self.current_gpu_reloc_total - self.current_gpu_reloc_free
+
+        if reservation_gpu_total < gpu_used:
+            warning = "More GPU used than reserved: {} > {}\n".format(gpu_used, reservation_gpu_total)
+            warning = warning + self.status() + '\n'
+            try:
+                gpu_procs = subprocess.check_output('lsof /dev/vchiq')
+                warning += gpu_procs
+            except:
+                pass
+            do_warn('res: {}'.format(reservation_gpu_total), warning)
+
+
+    def _monitor_thread(self):
+        """
+        """
+        try:
+            print "monitor"
+            self.get_arm_mem_info()
+            self.check_pids()
+
+            self.check_gpu_mem()
+
+        finally:
+            return True  # keep calling this method indefinitely
+
+    @dbus.service.method(MONITOR_IFACE, in_signature='ii', out_signature='')
+    def set_current_gpu_reloc_free(self, gpu_reloc_free, gpu_reloc_total):
+        '''
+        This is called by the GPU monitor in kano-notifications-daemon to inform
+        us of the current free gpu memory. We also set the total every time (alhough
+        it doesn't change) because this is more robust to the startup sequence of 
+        the different daemons.
+        '''
+        self.current_gpu_reloc_free = gpu_reloc_free
+        self.current_gpu_reloc_total = gpu_reloc_total - FUDGE_FACTOR
+        logger.debug('Free reloc: {} Total: {} '.format(gpu_reloc_free, gpu_reloc_total))
+
+    @dbus.service.method(MONITOR_IFACE, in_signature='iiis', out_signature='b')
+    def reserve(self, pid, gpu_reservation, arm_reservation, name):
+
+        predicted_gpu_free = self.current_gpu_reloc_total - self.current_gpu_reserved
+
+        if (gpu_reservation > predicted_gpu_free or
+            gpu_reservation > self.current_gpu_reloc_free):
+                logger.warn("Failed reservation: {}(pid {}) wanted GPU mem {} "
+                               "current free {} predicted free {}".format(name,
+                                                                          pid,
+                                                                          gpu_reservation,
+                                                                          self.current_gpu_reloc_total,
+                                                                          predicted_gpu_free))
+                return False
+
+        predicted_arm_free = self.current_arm_total - self.current_arm_reserved
+        if (arm_reservation > predicted_arm_free or
+            arm_reservation > self.current_arm_free):
+                logger.warn("Failed reservation: {}(pid {}) wanted ARM mem {} "
+                               "current free {} predicted free {}".format(name,
+                                                                          pid,
+                                                                          arm_reservation,
+                                                                          self.current_arm_total,
+                                                                          predicted_arm_free))
+                return False
+
+        self.tracking_pids.append(pidTrack(pid,
+                                           gpu_reservation,
+                                           arm_reservation,
+                                           name))
+        self.current_gpu_reserved += gpu_reservation
+        self.current_arm_reserved += arm_reservation
+        return True
+
+    @dbus.service.method(MONITOR_IFACE, in_signature='', out_signature='s')
+    def status(self):
+        res = {}
+        res['current_gpu_reloc_free'] = self.current_gpu_reloc_free
+        res['current_gpu_reloc_total'] = self.current_gpu_reloc_total
+        res['current_arm_free'] = self.current_arm_free
+        res['current_arm_total'] = self.current_arm_total
+        res['processes'] = []
+        for tp in self.tracking_pids:
+            res['processes'].append(tp.to_dict())
+        return json.dumps(res)
+
+    def check_pids(self):
+        i = 0
+        while i < len(self.tracking_pids):
+            logger.debug(str(i))
+            if self.tracking_pids[i].has_exited():
+                self.free_reservation_accounting(self.tracking_pids[i])
+                del self.tracking_pids[i]
+            else:
+                logger.debug('s'+str(i))
+                self.tracking_pids[i].check_arm_mem()
+                i = i + 1
+
+    def free_reservation_accounting(self, pidtrack):
+        self.current_gpu_reserved -= pidtrack.gpu_reservation
+        self.current_arm_reserved -= pidtrack.arm_reservation