Skip to content

Commit

Permalink
unitdata: Only keep delta of env variable changes as revision (juju#736)
Browse files Browse the repository at this point in the history
The unitdata code keeps environment variable revisions in kv_revisions
table, and all of the environment variables serialized into JSON and
pushed into the table on each hook invocation. There is a check to see
if current env variable set is same with the previous one, but some of
the environment variables (e.g. JUJU_CONTEXT_ID) is always changing, which
means whole env variable list is serialized and pushed to the DB on every
single hook invocation. That has become a problem in deployments that
has very large (e.g. ~70 KiB) environment variable lists. Given that
update-status is running periodically (every 5 mins by default) and
having many charms in the same environment, the disk space of the host
running the charms runs out over time and needs manual intervention to
clear up kv_revisions table.

This fix aims to avoid that by keeping only the delta of environment
variable list as a revision. Note that this is a breaking change and
will break any charms that are relying on `env` value's layout on
returned from `gethistory()` function.

Closes-Bug: LP#1930173
  • Loading branch information
xmkg committed Sep 21, 2022
1 parent 4137a3d commit 58fdfff
Show file tree
Hide file tree
Showing 2 changed files with 183 additions and 29 deletions.
92 changes: 70 additions & 22 deletions charmhelpers/core/unitdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,7 @@ def config_changed():
"""

import collections
import collections.abc
import contextlib
import datetime
import itertools
Expand Down Expand Up @@ -271,12 +272,13 @@ def unsetrange(self, keys=None, prefix=""):
'insert into kv_revisions values (?, ?, ?)',
['%s%%' % prefix, self.revision, json.dumps('DELETED')])

def set(self, key, value):
def set(self, key, value, delta_revisions=False):
"""
Set a value in the database.
:param str key: Key to set the value for
:param value: Any JSON-serializable value to be set
:param delta_revisions: Only keep delta of the changes as revision
"""
serialized = json.dumps(value)

Expand All @@ -302,6 +304,21 @@ def set(self, key, value):
if not self.revision:
return value

serialized_revision = serialized
if delta_revisions:
# Delta revisions is only available for Mapping
# types at the moment.
if isinstance(value, collections.abc.Mapping):
# If the invocation opted in for delta-revisions
# only the delta between current and new will be
# serialized and pushed as a revision.
# This is for keeping a low footprint on disk space
current = None
if exists:
current = json.loads(exists[0])
delta = self.mapping_delta(value, current)
serialized_revision = json.dumps(delta)

self.cursor.execute(
'select 1 from kv_revisions where key=? and revision=?',
[key, self.revision])
Expand All @@ -311,47 +328,60 @@ def set(self, key, value):
self.cursor.execute(
'''insert into kv_revisions (
revision, key, data) values (?, ?, ?)''',
(self.revision, key, serialized))
(self.revision, key, serialized_revision))
else:
self.cursor.execute(
'''
update kv_revisions
set data = ?
where key = ?
and revision = ?''',
[serialized, key, self.revision])
[serialized_revision, key, self.revision])

return value

def delta(self, mapping, prefix):
def mapping_delta(self, current_mapping, previous_mapping):
"""
return a delta containing values that have changed.
return the difference between two Mapping objects.
"""
previous = self.getrange(prefix, strip=True)
if not previous:
pk = set()
else:
pk = set(previous.keys())
ck = set(mapping.keys())
delta = DeltaSet()
previous_keys = set()
current_keys = set()
if previous_mapping:
previous_keys = set(previous_mapping.keys())
else:
previous_mapping = dict()

if current_mapping:
current_keys = set(current_mapping.keys())
else:
current_mapping = dict()

# added
for k in ck.difference(pk):
delta[k] = Delta(None, mapping[k])
# Added
for k in current_keys.difference(previous_keys):
delta[k] = Delta(None, current_mapping[k])

# removed
for k in pk.difference(ck):
delta[k] = Delta(previous[k], None)
# Removed
for k in previous_keys.difference(current_keys):
delta[k] = Delta(previous_mapping[k], None)

# changed
for k in pk.intersection(ck):
c = mapping[k]
p = previous[k]
# Changed
for k in previous_keys.intersection(current_keys):
c = current_mapping[k]
p = previous_mapping[k]
if c != p:
delta[k] = Delta(p, c)

return delta

def delta(self, mapping, prefix):
"""
return a delta containing values that have changed.
"""
previous = self.getrange(prefix, strip=True)

return self.mapping_delta(mapping, previous)

@contextlib.contextmanager
def hook_scope(self, name=""):
"""Scope all future interactions to the current hook execution
Expand Down Expand Up @@ -491,7 +521,25 @@ def _record_hook(self, hookenv):
data = hookenv.execution_environment()
self.conf = conf_delta = self.kv.delta(data['conf'], 'config')
self.rels = rels_delta = self.kv.delta(data['rels'], 'rels')
self.kv.set('env', dict(data['env']))

# NOTE(mustafakemalgilor): The environment variable revisions were
# originally kept as a whole. That has caused disk space issues on
# some deployments where the size of environment variable list were
# large. This should not been an issue if the environment variable
# list were stable, but some of the environment variables (e.g.
# JUJU_CONTEXT_ID) are always changing. I've seen some deployments
# with environment variable lists ~70 KiB in size. Considering the
# fact that the record_hook function is being called on each hook
# invocation, it is no surprise that the unit-state database file
# grows with time. To give a more concrete example, even if only
# update-status hook is being called for a charm, that means at
# least 70 KiB of data is being pushed into kv_relations table every
# five minutes which is roughly equivalent to 7 GiB of kv_relations
# history in a year -- not to mention this is valid for every single
# charm. Multiple charms performing this in same environment over
# extended periods of time is enough to fill the storage space quickly.

self.kv.set('env', dict(data['env']), delta_revisions=True)
self.kv.set('unit', data['unit'])
self.kv.set('relid', data.get('relid'))
return conf_delta, rels_delta
Expand Down
120 changes: 113 additions & 7 deletions tests/core/test_unitdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,15 @@

from mock import patch

from charmhelpers.core.unitdata import Storage, HookData, kv
from charmhelpers.core.unitdata import Storage, HookData


class HookDataTest(unittest.TestCase):

def setUp(self):
self.charm_dir = tempfile.mkdtemp()
self.addCleanup(lambda: shutil.rmtree(self.charm_dir))
self.change_environment(CHARM_DIR=self.charm_dir)
self.kv = Storage()

def change_environment(self, **kw):
original_env = dict(os.environ)
Expand All @@ -40,22 +40,67 @@ def cleanup_env():
@patch('charmhelpers.core.hookenv.hook_name')
@patch('charmhelpers.core.hookenv.execution_environment')
@patch('charmhelpers.core.hookenv.charm_dir')
def test_hook_data_records(self, cdir, ctx, name):
@patch('charmhelpers.core.unitdata.kv')
def test_hook_data_records(self, kv, cdir, ctx, name):
kv.return_value = self.kv
hook_data = HookData()
name.return_value = 'config-changed'
ctx.return_value = {
'rels': {}, 'conf': {'a': 1}, 'env': {}, 'unit': 'someunit'}
cdir.return_value = self.charm_dir
with open(os.path.join(self.charm_dir, 'revision'), 'w') as fh:
fh.write('1')
hook_data = HookData()

with hook_data():
self.assertEqual(kv(), hook_data.kv)
self.assertEqual(kv().get('charm_revisions'), ['1'])
self.assertEqual(kv().get('unit'), 'someunit')
self.assertEqual(self.kv, hook_data.kv)
self.assertEqual(self.kv.get('charm_revisions'), ['1'])
self.assertEqual(self.kv.get('unit'), 'someunit')
self.assertEqual(list(hook_data.conf), ['a'])
self.assertEqual(tuple(hook_data.conf.a), (None, 1))

@patch('charmhelpers.core.hookenv.hook_name')
@patch('charmhelpers.core.hookenv.execution_environment')
@patch('charmhelpers.core.hookenv.charm_dir')
@patch('charmhelpers.core.unitdata.kv')
def test_hook_data_environment(self, kv, cdir, ctx, name):
kv.return_value = self.kv
hook_data = HookData()
name.return_value = 'config-changed'
mock_env = {
"SHELL": "/bin/bash",
"SESSION_MANAGER": "local/workstation:@/tmp/.ICE-unix/8101,unix/workstation:/tmp/.ICE-unix/8101",
"COLORTERM": "truecolor",
"LC_ADDRESS": "tr_TR.UTF-8",
"LC_NAME": "tr_TR.UTF-8",
"DESKTOP_SESSION": "ubuntu",
"LC_MONETARY": "tr_TR.UTF-8",
"PWD": "/tmp",
"XDG_SESSION_DESKTOP": "ubuntu",
"LOGNAME": "user",
"HOME": "/home/user",
"USERNAME": "user",
"LC_PAPER": "tr_TR.UTF-8",
"LANG": "en_US.UTF-8"
}

mock_env_delta = {}
for key in mock_env:
mock_env_delta[key] = [None, mock_env[key]]

ctx.return_value = {
'rels': {}, 'conf': {'a': 2}, 'env': mock_env, 'unit': 'someunit'}
cdir.return_value = self.charm_dir
with open(os.path.join(self.charm_dir, 'revision'), 'w') as fh:
fh.write('2')

self.maxDiff = 50000
with hook_data():
self.assertEqual(self.kv.get('env'), mock_env)

history = list(self.kv.gethistory('env', deserialize=True))
self.assertEqual(1, len(history))
self.assertCountEqual(history[0][2], mock_env_delta)


class StorageTest(unittest.TestCase):

Expand Down Expand Up @@ -108,6 +153,44 @@ def test_hook_scope(self):
[(1, 'a', 1, 'config-changed'),
(2, 'a', True, 'start')])

def test_hook_scope_delta_revisions(self):
kv = Storage(':memory:')
with kv.hook_scope('some-hook') as rev:
self.assertEqual(rev, 1)
kv.set('env', {'ENVVAR1': "DUMMY1", 'ENVVAR2': "DUMMY2"}, delta_revisions=True)
kv.set('env', {'ENVVAR1': "DUMMY1_2", 'ENVVAR3': "DUMMY3"}, delta_revisions=True)
self.assertEqual(kv.get('env'), {'ENVVAR1': "DUMMY1_2", 'ENVVAR3': "DUMMY3"})

with kv.hook_scope('some-other-hook') as rev:
self.assertEqual(rev, 2)
kv.set('env', {'ENVVAR3': "DUMMY3"}, delta_revisions=True)
self.assertEqual(kv.get('env'), {'ENVVAR3': "DUMMY3"})

history = [h[:-1] for h in kv.gethistory('env', deserialize=True)]
self.assertEqual(
history,
[(1, 'env', {"ENVVAR3": [None, "DUMMY3"], "ENVVAR2": ["DUMMY2", None], "ENVVAR1": ["DUMMY1", "DUMMY1_2"]}, 'some-hook'),
(2, 'env', {"ENVVAR1": ["DUMMY1_2", None]}, 'some-other-hook')])

def test_hook_scope_no_delta_revisions(self):
kv = Storage(':memory:')
with kv.hook_scope('some-hook') as rev:
self.assertEqual(rev, 1)
kv.set('env', {'ENVVAR1': "DUMMY1", 'ENVVAR2': "DUMMY2"}, delta_revisions=False)
kv.set('env', {'ENVVAR1': "DUMMY1_2", 'ENVVAR3': "DUMMY3"}, delta_revisions=False)
self.assertEqual(kv.get('env'), {'ENVVAR1': "DUMMY1_2", 'ENVVAR3': "DUMMY3"})

with kv.hook_scope('some-other-hook') as rev:
self.assertEqual(rev, 2)
kv.set('env', {'ENVVAR3': "DUMMY3"}, delta_revisions=False)
self.assertEqual(kv.get('env'), {'ENVVAR3': "DUMMY3"})

history = [h[:-1] for h in kv.gethistory('env', deserialize=True)]
self.assertEqual(
history,
[(1, 'env', {"ENVVAR3": "DUMMY3", "ENVVAR1": "DUMMY1_2"}, 'some-hook'),
(2, 'env', {"ENVVAR3": "DUMMY3"}, 'some-other-hook')])

def test_delta_no_previous_and_history(self):
kv = Storage(':memory:')
with kv.hook_scope('install'):
Expand Down Expand Up @@ -177,6 +260,29 @@ def test_record(self):
else:
self.fail('attribute error should fire on nonexistant')

def test_mapping_delta(self):
# Add
kv = Storage(':memory:')
current_1 = {'a': 0, 'c': False}
new_1 = {'a': 0, 'b': "test", 'c': False}
delta_1 = kv.mapping_delta(new_1, current_1)

self.assertFalse(hasattr(delta_1, 'a'))
self.assertFalse(hasattr(delta_1, 'c'))
self.assertEqual(delta_1.b.previous, None)
self.assertEqual(delta_1.b.current, "test")

current_2 = {'a': 0, 'c': False}
new_2 = {'a': 1, 'b': "test"}
delta_2 = kv.mapping_delta(new_2, current_2)

self.assertEqual(delta_2.a.previous, 0)
self.assertEqual(delta_2.a.current, 1)
self.assertEqual(delta_2.b.previous, None)
self.assertEqual(delta_2.b.current, "test")
self.assertEqual(delta_2.c.previous, False)
self.assertEqual(delta_2.c.current, None)

def test_delta(self):
kv = Storage(':memory:')
kv.update({'a': 1, 'b': 2.2}, prefix="x")
Expand Down

0 comments on commit 58fdfff

Please sign in to comment.