From 58fdfff97ed7d4fe4af7ae80cc6ec01efe211053 Mon Sep 17 00:00:00 2001 From: Mustafa Kemal Gilor Date: Tue, 20 Sep 2022 16:26:00 +0300 Subject: [PATCH] unitdata: Only keep delta of env variable changes as revision (#736) The unitdata code keeps environment variable revisions in kv_revisions table, and all of the environment variables serialized into JSON and pushed into the table on each hook invocation. There is a check to see if current env variable set is same with the previous one, but some of the environment variables (e.g. JUJU_CONTEXT_ID) is always changing, which means whole env variable list is serialized and pushed to the DB on every single hook invocation. That has become a problem in deployments that has very large (e.g. ~70 KiB) environment variable lists. Given that update-status is running periodically (every 5 mins by default) and having many charms in the same environment, the disk space of the host running the charms runs out over time and needs manual intervention to clear up kv_revisions table. This fix aims to avoid that by keeping only the delta of environment variable list as a revision. Note that this is a breaking change and will break any charms that are relying on `env` value's layout on returned from `gethistory()` function. Closes-Bug: LP#1930173 --- charmhelpers/core/unitdata.py | 92 +++++++++++++++++++------- tests/core/test_unitdata.py | 120 ++++++++++++++++++++++++++++++++-- 2 files changed, 183 insertions(+), 29 deletions(-) diff --git a/charmhelpers/core/unitdata.py b/charmhelpers/core/unitdata.py index d9b8d0b09..a250939b7 100644 --- a/charmhelpers/core/unitdata.py +++ b/charmhelpers/core/unitdata.py @@ -147,6 +147,7 @@ def config_changed(): """ import collections +import collections.abc import contextlib import datetime import itertools @@ -271,12 +272,13 @@ def unsetrange(self, keys=None, prefix=""): 'insert into kv_revisions values (?, ?, ?)', ['%s%%' % prefix, self.revision, json.dumps('DELETED')]) - def set(self, key, value): + def set(self, key, value, delta_revisions=False): """ Set a value in the database. :param str key: Key to set the value for :param value: Any JSON-serializable value to be set + :param delta_revisions: Only keep delta of the changes as revision """ serialized = json.dumps(value) @@ -302,6 +304,21 @@ def set(self, key, value): if not self.revision: return value + serialized_revision = serialized + if delta_revisions: + # Delta revisions is only available for Mapping + # types at the moment. + if isinstance(value, collections.abc.Mapping): + # If the invocation opted in for delta-revisions + # only the delta between current and new will be + # serialized and pushed as a revision. + # This is for keeping a low footprint on disk space + current = None + if exists: + current = json.loads(exists[0]) + delta = self.mapping_delta(value, current) + serialized_revision = json.dumps(delta) + self.cursor.execute( 'select 1 from kv_revisions where key=? and revision=?', [key, self.revision]) @@ -311,7 +328,7 @@ def set(self, key, value): self.cursor.execute( '''insert into kv_revisions ( revision, key, data) values (?, ?, ?)''', - (self.revision, key, serialized)) + (self.revision, key, serialized_revision)) else: self.cursor.execute( ''' @@ -319,39 +336,52 @@ def set(self, key, value): set data = ? where key = ? and revision = ?''', - [serialized, key, self.revision]) + [serialized_revision, key, self.revision]) return value - def delta(self, mapping, prefix): + def mapping_delta(self, current_mapping, previous_mapping): """ - return a delta containing values that have changed. + return the difference between two Mapping objects. """ - previous = self.getrange(prefix, strip=True) - if not previous: - pk = set() - else: - pk = set(previous.keys()) - ck = set(mapping.keys()) delta = DeltaSet() + previous_keys = set() + current_keys = set() + if previous_mapping: + previous_keys = set(previous_mapping.keys()) + else: + previous_mapping = dict() + + if current_mapping: + current_keys = set(current_mapping.keys()) + else: + current_mapping = dict() - # added - for k in ck.difference(pk): - delta[k] = Delta(None, mapping[k]) + # Added + for k in current_keys.difference(previous_keys): + delta[k] = Delta(None, current_mapping[k]) - # removed - for k in pk.difference(ck): - delta[k] = Delta(previous[k], None) + # Removed + for k in previous_keys.difference(current_keys): + delta[k] = Delta(previous_mapping[k], None) - # changed - for k in pk.intersection(ck): - c = mapping[k] - p = previous[k] + # Changed + for k in previous_keys.intersection(current_keys): + c = current_mapping[k] + p = previous_mapping[k] if c != p: delta[k] = Delta(p, c) return delta + def delta(self, mapping, prefix): + """ + return a delta containing values that have changed. + """ + previous = self.getrange(prefix, strip=True) + + return self.mapping_delta(mapping, previous) + @contextlib.contextmanager def hook_scope(self, name=""): """Scope all future interactions to the current hook execution @@ -491,7 +521,25 @@ def _record_hook(self, hookenv): data = hookenv.execution_environment() self.conf = conf_delta = self.kv.delta(data['conf'], 'config') self.rels = rels_delta = self.kv.delta(data['rels'], 'rels') - self.kv.set('env', dict(data['env'])) + + # NOTE(mustafakemalgilor): The environment variable revisions were + # originally kept as a whole. That has caused disk space issues on + # some deployments where the size of environment variable list were + # large. This should not been an issue if the environment variable + # list were stable, but some of the environment variables (e.g. + # JUJU_CONTEXT_ID) are always changing. I've seen some deployments + # with environment variable lists ~70 KiB in size. Considering the + # fact that the record_hook function is being called on each hook + # invocation, it is no surprise that the unit-state database file + # grows with time. To give a more concrete example, even if only + # update-status hook is being called for a charm, that means at + # least 70 KiB of data is being pushed into kv_relations table every + # five minutes which is roughly equivalent to 7 GiB of kv_relations + # history in a year -- not to mention this is valid for every single + # charm. Multiple charms performing this in same environment over + # extended periods of time is enough to fill the storage space quickly. + + self.kv.set('env', dict(data['env']), delta_revisions=True) self.kv.set('unit', data['unit']) self.kv.set('relid', data.get('relid')) return conf_delta, rels_delta diff --git a/tests/core/test_unitdata.py b/tests/core/test_unitdata.py index c6a85ecf5..cfb8d0b2d 100644 --- a/tests/core/test_unitdata.py +++ b/tests/core/test_unitdata.py @@ -17,15 +17,15 @@ from mock import patch -from charmhelpers.core.unitdata import Storage, HookData, kv +from charmhelpers.core.unitdata import Storage, HookData class HookDataTest(unittest.TestCase): - def setUp(self): self.charm_dir = tempfile.mkdtemp() self.addCleanup(lambda: shutil.rmtree(self.charm_dir)) self.change_environment(CHARM_DIR=self.charm_dir) + self.kv = Storage() def change_environment(self, **kw): original_env = dict(os.environ) @@ -40,22 +40,67 @@ def cleanup_env(): @patch('charmhelpers.core.hookenv.hook_name') @patch('charmhelpers.core.hookenv.execution_environment') @patch('charmhelpers.core.hookenv.charm_dir') - def test_hook_data_records(self, cdir, ctx, name): + @patch('charmhelpers.core.unitdata.kv') + def test_hook_data_records(self, kv, cdir, ctx, name): + kv.return_value = self.kv + hook_data = HookData() name.return_value = 'config-changed' ctx.return_value = { 'rels': {}, 'conf': {'a': 1}, 'env': {}, 'unit': 'someunit'} cdir.return_value = self.charm_dir with open(os.path.join(self.charm_dir, 'revision'), 'w') as fh: fh.write('1') - hook_data = HookData() with hook_data(): - self.assertEqual(kv(), hook_data.kv) - self.assertEqual(kv().get('charm_revisions'), ['1']) - self.assertEqual(kv().get('unit'), 'someunit') + self.assertEqual(self.kv, hook_data.kv) + self.assertEqual(self.kv.get('charm_revisions'), ['1']) + self.assertEqual(self.kv.get('unit'), 'someunit') self.assertEqual(list(hook_data.conf), ['a']) self.assertEqual(tuple(hook_data.conf.a), (None, 1)) + @patch('charmhelpers.core.hookenv.hook_name') + @patch('charmhelpers.core.hookenv.execution_environment') + @patch('charmhelpers.core.hookenv.charm_dir') + @patch('charmhelpers.core.unitdata.kv') + def test_hook_data_environment(self, kv, cdir, ctx, name): + kv.return_value = self.kv + hook_data = HookData() + name.return_value = 'config-changed' + mock_env = { + "SHELL": "/bin/bash", + "SESSION_MANAGER": "local/workstation:@/tmp/.ICE-unix/8101,unix/workstation:/tmp/.ICE-unix/8101", + "COLORTERM": "truecolor", + "LC_ADDRESS": "tr_TR.UTF-8", + "LC_NAME": "tr_TR.UTF-8", + "DESKTOP_SESSION": "ubuntu", + "LC_MONETARY": "tr_TR.UTF-8", + "PWD": "/tmp", + "XDG_SESSION_DESKTOP": "ubuntu", + "LOGNAME": "user", + "HOME": "/home/user", + "USERNAME": "user", + "LC_PAPER": "tr_TR.UTF-8", + "LANG": "en_US.UTF-8" + } + + mock_env_delta = {} + for key in mock_env: + mock_env_delta[key] = [None, mock_env[key]] + + ctx.return_value = { + 'rels': {}, 'conf': {'a': 2}, 'env': mock_env, 'unit': 'someunit'} + cdir.return_value = self.charm_dir + with open(os.path.join(self.charm_dir, 'revision'), 'w') as fh: + fh.write('2') + + self.maxDiff = 50000 + with hook_data(): + self.assertEqual(self.kv.get('env'), mock_env) + + history = list(self.kv.gethistory('env', deserialize=True)) + self.assertEqual(1, len(history)) + self.assertCountEqual(history[0][2], mock_env_delta) + class StorageTest(unittest.TestCase): @@ -108,6 +153,44 @@ def test_hook_scope(self): [(1, 'a', 1, 'config-changed'), (2, 'a', True, 'start')]) + def test_hook_scope_delta_revisions(self): + kv = Storage(':memory:') + with kv.hook_scope('some-hook') as rev: + self.assertEqual(rev, 1) + kv.set('env', {'ENVVAR1': "DUMMY1", 'ENVVAR2': "DUMMY2"}, delta_revisions=True) + kv.set('env', {'ENVVAR1': "DUMMY1_2", 'ENVVAR3': "DUMMY3"}, delta_revisions=True) + self.assertEqual(kv.get('env'), {'ENVVAR1': "DUMMY1_2", 'ENVVAR3': "DUMMY3"}) + + with kv.hook_scope('some-other-hook') as rev: + self.assertEqual(rev, 2) + kv.set('env', {'ENVVAR3': "DUMMY3"}, delta_revisions=True) + self.assertEqual(kv.get('env'), {'ENVVAR3': "DUMMY3"}) + + history = [h[:-1] for h in kv.gethistory('env', deserialize=True)] + self.assertEqual( + history, + [(1, 'env', {"ENVVAR3": [None, "DUMMY3"], "ENVVAR2": ["DUMMY2", None], "ENVVAR1": ["DUMMY1", "DUMMY1_2"]}, 'some-hook'), + (2, 'env', {"ENVVAR1": ["DUMMY1_2", None]}, 'some-other-hook')]) + + def test_hook_scope_no_delta_revisions(self): + kv = Storage(':memory:') + with kv.hook_scope('some-hook') as rev: + self.assertEqual(rev, 1) + kv.set('env', {'ENVVAR1': "DUMMY1", 'ENVVAR2': "DUMMY2"}, delta_revisions=False) + kv.set('env', {'ENVVAR1': "DUMMY1_2", 'ENVVAR3': "DUMMY3"}, delta_revisions=False) + self.assertEqual(kv.get('env'), {'ENVVAR1': "DUMMY1_2", 'ENVVAR3': "DUMMY3"}) + + with kv.hook_scope('some-other-hook') as rev: + self.assertEqual(rev, 2) + kv.set('env', {'ENVVAR3': "DUMMY3"}, delta_revisions=False) + self.assertEqual(kv.get('env'), {'ENVVAR3': "DUMMY3"}) + + history = [h[:-1] for h in kv.gethistory('env', deserialize=True)] + self.assertEqual( + history, + [(1, 'env', {"ENVVAR3": "DUMMY3", "ENVVAR1": "DUMMY1_2"}, 'some-hook'), + (2, 'env', {"ENVVAR3": "DUMMY3"}, 'some-other-hook')]) + def test_delta_no_previous_and_history(self): kv = Storage(':memory:') with kv.hook_scope('install'): @@ -177,6 +260,29 @@ def test_record(self): else: self.fail('attribute error should fire on nonexistant') + def test_mapping_delta(self): + # Add + kv = Storage(':memory:') + current_1 = {'a': 0, 'c': False} + new_1 = {'a': 0, 'b': "test", 'c': False} + delta_1 = kv.mapping_delta(new_1, current_1) + + self.assertFalse(hasattr(delta_1, 'a')) + self.assertFalse(hasattr(delta_1, 'c')) + self.assertEqual(delta_1.b.previous, None) + self.assertEqual(delta_1.b.current, "test") + + current_2 = {'a': 0, 'c': False} + new_2 = {'a': 1, 'b': "test"} + delta_2 = kv.mapping_delta(new_2, current_2) + + self.assertEqual(delta_2.a.previous, 0) + self.assertEqual(delta_2.a.current, 1) + self.assertEqual(delta_2.b.previous, None) + self.assertEqual(delta_2.b.current, "test") + self.assertEqual(delta_2.c.previous, False) + self.assertEqual(delta_2.c.current, None) + def test_delta(self): kv = Storage(':memory:') kv.update({'a': 1, 'b': 2.2}, prefix="x")