Skip to content

Commit

Permalink
Add a memory monitor daemon for GPU and ARM memory.
Browse files Browse the repository at this point in the history
  • Loading branch information
Ealdwulf committed Nov 16, 2016
1 parent 00e6ab1 commit 0924c8c
Show file tree
Hide file tree
Showing 3 changed files with 313 additions and 0 deletions.
51 changes: 51 additions & 0 deletions bin/kano-memory-monitor
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#!/usr/bin/env python

# kano-memory-monitor
#
# Copyright (C) 2015-2016 Kano Computing Ltd.
# License: http://www.gnu.org/licenses/gpl-2.0.txt GNU GPL v2
#
# DBus daemon which handles memory reservation and monitoring

# The daemon uses the system bus under the bus name 'me.kano.monitor'


import sys
import traceback
import dbus.exceptions
from dbus.mainloop.glib import DBusGMainLoop

from gi.repository import GObject

from kano.logging import logger

from kano_monitor.service import MonitorService, BUS_NAME



def main():
    """
    Start the memory monitor daemon.

    Reserves BUS_NAME on the system bus, registers the MonitorService
    object on it and then runs the GLib main loop.

    Returns the process exit status:
        0 - the main loop finished
        1 - the bus name is already owned (another instance is running)
        2 - any other error while starting the service
    """
    # dbus.service is used below; import it explicitly rather than relying
    # on kano_monitor.service having imported it as a side effect.
    import dbus.service

    GObject.threads_init()
    DBusGMainLoop(set_as_default=True)

    try:
        # reserving the bus name, other instances of $0 will fail
        bus_name = dbus.service.BusName(BUS_NAME, bus=dbus.SystemBus(),
                                        do_not_queue=True)

        # available services, add more here
        # (the reference is held for the lifetime of the main loop)
        service = MonitorService(bus_name)  # noqa

    except dbus.exceptions.NameExistsException as e:
        logger.warn('Could not reserve the SystemBus name, most likely another instance'
                    ' of kano-memory-monitor already exists. - [{}]'.format(e))
        return 1

    except Exception:
        logger.error('Unexpected error when starting the services.\n{}'
                     .format(traceback.format_exc()))
        return 2

    GObject.MainLoop().run()
    return 0


if __name__ == '__main__':
    sys.exit(main())
5 changes: 5 additions & 0 deletions kano_monitor/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# __init__.py
#
# Copyright (C) 2015-2016 Kano Computing Ltd.
# License: http://www.gnu.org/licenses/gpl-2.0.txt GNU GPL v2
#
257 changes: 257 additions & 0 deletions kano_monitor/service.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,257 @@
# service.py
#
# Copyright (C) 2015-2016 Kano Computing Ltd.
# License: http://www.gnu.org/licenses/gpl-2.0.txt GNU GPL v2
#
# Memory tracking service.
#
# All numbers are in kB

import os

import dbus
import dbus.service
import json
import subprocess

from gi.repository import GObject

from kano.logging import logger

# Well-known bus name; bin/kano-memory-monitor imports it and reserves it
# on the system bus.
BUS_NAME = 'me.kano.monitor'
# Object path MonitorService registers itself under.
MONITOR_OBJECT_PATH = '/me/kano/monitor/memory'
# Interface name used for every exported MonitorService method.
MONITOR_IFACE = 'me.kano.monitor.memory'

# File get_arm_mem() reads MemTotal / MemAvailable from.
MEM_INFO_PATH = "/proc/meminfo"
# Subtracted from the reported total GPU relocatable memory in
# set_current_gpu_reloc_free(). In kB like every other number here.
# NOTE(review): the reason for 9 MB specifically is not visible in this
# file - presumably GPU memory that is never available; confirm.
FUDGE_FACTOR = 9*1024

# Warning codes do_warn() has already reported, so that each code is only
# logged once.
warnings = {}


def do_warn(code, warning):
    """
    Report a warning once per warning code.

    The first time a given `code` is seen, `warning` is logged as an error
    and, when the logger output level is 'debug', a splash image is shown
    in the background. Later calls with the same `code` do nothing apart
    from the debug trace of the known codes.

    code:    hashable key identifying the warning
    warning: message text to log
    """
    # Debug trace goes through the logger rather than the daemon's stdout.
    logger.debug(str(warnings))

    if code in warnings:
        return

    logger.error(warning)
    if logger.get_output_level() == 'debug':
        os.system("kano-start-splash -b 0 /usr/share/linux-story/media/images/rm.png &")

    warnings[code] = 1


def get_arm_mem(path=None):
    """
    Read the total and available ARM memory from a meminfo style file.

    path: file to parse; defaults to MEM_INFO_PATH when omitted.

    Returns a tuple (total, avail) holding the numeric values of the
    'MemTotal' and 'MemAvailable' entries. An entry that is not present in
    the file is returned as None.

    Raises IOError/OSError if the file cannot be opened and ValueError if
    one of the two entries does not start with an integer.
    """
    if path is None:
        path = MEM_INFO_PATH

    total = None
    avail = None
    # 'with' makes sure the file is closed, also when parsing fails
    with open(path) as meminfo:
        for line in meminfo:
            name, colon, value = line.partition(':')
            if not colon:
                # not a "Name: value" line, nothing to parse
                continue
            if name == 'MemTotal':
                total = int(value.split()[0])
            elif name == 'MemAvailable':
                avail = int(value.split()[0])
            if total is not None and avail is not None:
                break
    return total, avail


class pidTrack(object):
    """
    Book-keeping for one process that holds a memory reservation.

    Stores the pid, the reserved GPU and ARM amounts and a display name,
    and keeps a directory file descriptor on /proc/<pid> open which
    has_exited() uses to detect that the process has gone away.
    """

    def __init__(self, pid, gpu_reservation, arm_reservation, name):
        """
        pid:             process id to track
        gpu_reservation: reserved GPU memory
        arm_reservation: reserved ARM memory, compared against the rss
        name:            name used in log messages and in to_dict()
        """
        self.pid = pid
        self.gpu_reservation = gpu_reservation
        self.arm_reservation = arm_reservation
        self.name = name
        # directory has_exited() returns to after probing /proc/<pid>
        self.orig_dir = os.getcwd()
        try:
            self.proc_dir = os.open('/proc/{}'.format(pid), os.O_DIRECTORY)
        except OSError:
            # mark as already exited
            self.proc_dir = None

    def has_exited(self):
        """
        Return True if the tracked process is gone, False otherwise.

        The check relies on os.fchdir() on the /proc/<pid> descriptor
        raising OSError once the process no longer exists. When that
        happens the descriptor is closed and forgotten, so further calls
        keep returning True without touching it again.
        """
        # compare with None: 0 is a valid file descriptor
        if self.proc_dir is None:
            return True
        try:
            os.fchdir(self.proc_dir)
            os.chdir(self.orig_dir)
        except OSError:
            logger.debug("pid {} quit".format(self.pid))
            os.close(self.proc_dir)
            # the fd number may be reused by the OS, never use it again
            self.proc_dir = None
            return True
        return False

    def to_dict(self):
        """Return the tracked values plus the current rss as a dict."""
        return {
            'pid': self.pid,
            'gpu_reservation': self.gpu_reservation,
            'arm_reservation': self.arm_reservation,
            'name': self.name,
            'rss': self.get_mem(),
        }

    def get_mem(self):
        """
        Return the resident set size of the process in kB, or -1 if
        /proc/<pid>/stat cannot be read or parsed.
        """
        try:
            with open('/proc/{}/stat'.format(self.pid)) as stat_file:
                stat = stat_file.read()
            # Field 2 (comm) is wrapped in parentheses and may itself
            # contain spaces or ')', so only split what follows the last
            # ')'. That remainder starts at field 3 (state).
            fields = stat[stat.rindex(')') + 2:].split(' ')
            # rss is field 24 of proc stat, i.e. index 21 here;
            # it is counted in pages, assumed to be 4kB each
            rss = int(fields[21]) * 4
        except (IOError, OSError, ValueError, IndexError):
            rss = -1
        return rss

    def check_arm_mem(self):
        """
        Warn (once per pid, through do_warn) if the process currently
        uses more resident memory than its ARM reservation.
        """
        try:
            rss = self.get_mem()
            logger.debug('rss {}'.format(rss))
            if rss > self.arm_reservation:
                warning = " {} (pid {}) using more than allocated {} > {}".format(
                    self.name, self.pid, rss, self.arm_reservation)
                do_warn('arm {}'.format(self.pid), warning)
        except Exception:
            logger.warn("error parsing stat file for {}".format(self.pid))



class MonitorService(dbus.service.Object):
    """
    DBus service which tracks GPU and ARM memory reservations.

    It exports an object at MONITOR_OBJECT_PATH (/me/kano/monitor/memory)
    with the interface MONITOR_IFACE (me.kano.monitor.memory).
    Does not require sudo.

    Clients call reserve() to register a process together with the amount
    of GPU and ARM memory it expects to use. A periodic timer drops the
    reservations of processes that have exited and warns when more memory
    is in use than was reserved. All numbers are in kB.
    """

    # the top priority level for an api lock
    MAX_PRIORITY_LEVEL = 10  # this is public

    def __init__(self, bus_name):
        """
        Register the object on `bus_name` and start the periodic check.

        bus_name: the dbus.service.BusName the object is exported on
        """
        dbus.service.Object.__init__(self, bus_name, MONITOR_OBJECT_PATH)

        # interval between two runs of _monitor_thread, in milliseconds
        self.MONITOR_POLL_RATE = 1000 * 5
        self.current_gpu_reloc_free = 99  # Should be overridden
        self.current_gpu_reloc_total = 99  # Should be overridden
        self.current_gpu_reserved = 0

        # Make sure the attributes exist even if the first read fails;
        # with 0 free memory no non-zero ARM reservation is granted.
        self.current_arm_free = 0
        self.current_arm_total = 0
        self.get_arm_mem_info()
        self.current_arm_reserved = 0

        # list of pidTrack objects, one per granted reservation
        self.tracking_pids = []

        GObject.threads_init()
        GObject.timeout_add(self.MONITOR_POLL_RATE, self._monitor_thread)

    def get_arm_mem_info(self):
        """
        Refresh current_arm_free / current_arm_total from get_arm_mem().

        If the values cannot be read an error is logged and the previous
        values are kept.
        """
        try:
            (total, avail) = get_arm_mem()
        except Exception:
            logger.error("error reading arm mem info")
            return

        if total is None or avail is None:
            # entry missing from meminfo, keep the last known numbers
            logger.error("error reading arm mem info")
            return

        self.current_arm_free = avail
        self.current_arm_total = total

    def check_gpu_mem(self):
        """
        Cross-check the GPU accounting and warn about unreserved use.

        Warns if the sum of the tracked reservations differs from
        current_gpu_reserved, and if more GPU memory is in use than the
        tracked processes have reserved. The latter warning includes the
        status() dump and, when available, the output of
        `lsof /dev/vchiq`.
        """
        reservation_gpu_total = sum(tp.gpu_reservation
                                    for tp in self.tracking_pids)

        if reservation_gpu_total != self.current_gpu_reserved:
            do_warn('intmem', "Internal error in gpumem calc")

        gpu_used = self.current_gpu_reloc_total - self.current_gpu_reloc_free

        if reservation_gpu_total < gpu_used:
            warning = "More GPU used than reserved: {} > {}\n".format(
                gpu_used, reservation_gpu_total)
            warning = warning + self.status() + '\n'
            try:
                # The command must be an argument list: a single string
                # without shell=True is taken as the program name and
                # could never be executed.
                gpu_procs = subprocess.check_output(['lsof', '/dev/vchiq'])
                if not isinstance(gpu_procs, str):
                    gpu_procs = gpu_procs.decode('utf-8', 'replace')
                warning += gpu_procs
            except (OSError, subprocess.CalledProcessError):
                # best effort only: lsof missing or returned non-zero
                pass
            do_warn('res: {}'.format(reservation_gpu_total), warning)

    def _monitor_thread(self):
        """
        Periodic check, run from the GLib main loop via timeout_add.

        Always returns True so that the timer keeps calling this method
        indefinitely; a failing check is logged instead of being dropped
        silently.
        """
        try:
            self.get_arm_mem_info()
            self.check_pids()
            self.check_gpu_mem()
        except Exception as e:
            logger.error("memory monitor check failed: {}".format(e))

        return True  # keep calling this method indefinitely

    @dbus.service.method(MONITOR_IFACE, in_signature='ii', out_signature='')
    def set_current_gpu_reloc_free(self, gpu_reloc_free, gpu_reloc_total):
        '''
        This is called by the GPU monitor in kano-notifications-daemon to inform
        us of the current free gpu memory. We also set the total every time (although
        it doesn't change) because this is more robust to the startup sequence of
        the different daemons.

        FUDGE_FACTOR is subtracted from the reported total before it is stored.
        '''
        self.current_gpu_reloc_free = gpu_reloc_free
        self.current_gpu_reloc_total = gpu_reloc_total - FUDGE_FACTOR
        logger.debug('Free reloc: {} Total: {} '.format(gpu_reloc_free, gpu_reloc_total))

    @dbus.service.method(MONITOR_IFACE, in_signature='iiis', out_signature='b')
    def reserve(self, pid, gpu_reservation, arm_reservation, name):
        '''
        Try to reserve GPU and ARM memory for a process.

        pid:             id of the process the reservation is for
        gpu_reservation: requested GPU memory
        arm_reservation: requested ARM memory
        name:            name used in log messages and in status()

        The request is refused (returns False) if either amount is larger
        than the memory currently free, or larger than the total minus
        what is already reserved. Otherwise the process is tracked, both
        reserved totals are increased and True is returned.
        '''
        predicted_gpu_free = self.current_gpu_reloc_total - self.current_gpu_reserved

        if (gpu_reservation > predicted_gpu_free or
                gpu_reservation > self.current_gpu_reloc_free):
            logger.warn("Failed reservation: {}(pid {}) wanted GPU mem {} "
                        "current free {} predicted free {}".format(
                            name,
                            pid,
                            gpu_reservation,
                            self.current_gpu_reloc_free,
                            predicted_gpu_free))
            return False

        predicted_arm_free = self.current_arm_total - self.current_arm_reserved
        if (arm_reservation > predicted_arm_free or
                arm_reservation > self.current_arm_free):
            logger.warn("Failed reservation: {}(pid {}) wanted ARM mem {} "
                        "current free {} predicted free {}".format(
                            name,
                            pid,
                            arm_reservation,
                            self.current_arm_free,
                            predicted_arm_free))
            return False

        self.tracking_pids.append(pidTrack(pid,
                                           gpu_reservation,
                                           arm_reservation,
                                           name))
        self.current_gpu_reserved += gpu_reservation
        self.current_arm_reserved += arm_reservation
        return True

    @dbus.service.method(MONITOR_IFACE, in_signature='', out_signature='s')
    def status(self):
        '''
        Return the current memory figures and the list of tracked
        processes as a JSON encoded string.
        '''
        res = {}
        res['current_gpu_reloc_free'] = self.current_gpu_reloc_free
        res['current_gpu_reloc_total'] = self.current_gpu_reloc_total
        res['current_arm_free'] = self.current_arm_free
        res['current_arm_total'] = self.current_arm_total
        res['processes'] = [tp.to_dict() for tp in self.tracking_pids]
        return json.dumps(res)

    def check_pids(self):
        """
        Drop the reservations of processes that have exited and check the
        ARM memory use of those still running.
        """
        i = 0
        while i < len(self.tracking_pids):
            logger.debug(str(i))
            tracked = self.tracking_pids[i]
            if tracked.has_exited():
                self.free_reservation_accounting(tracked)
                # the next entry moves into slot i, so i is not advanced
                del self.tracking_pids[i]
            else:
                logger.debug('s'+str(i))
                tracked.check_arm_mem()
                i = i + 1

    def free_reservation_accounting(self, pidtrack):
        """Subtract the reservations of `pidtrack` from the reserved totals."""
        self.current_gpu_reserved -= pidtrack.gpu_reservation
        self.current_arm_reserved -= pidtrack.arm_reservation

0 comments on commit 0924c8c

Please sign in to comment.