From 2ba62a90c1629e4458390c67b3e6d1528c3a4203 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Mon, 30 Sep 2019 10:48:03 -0700 Subject: [PATCH] Encode variable values for use over URLs (#87) * TST: tests for encoded values * ENH/MAINT: centralize method for getting sample metadata values * Encode for URLs * Clean up some dev clutter * Fixed hardcoded test * Addressing @eldeveloper's comments and dropping py27 support * readd skbio install, and remove py37 due to dependency conflicts --- .travis.yml | 5 +-- redbiom/admin.py | 21 ++++------ redbiom/fetch.py | 78 +++++++++++++++++++++++-------------- redbiom/tests/test_admin.py | 16 ++++++++ redbiom/tests/test_fetch.py | 41 ++++++++++++++++++- redbiom/tests/test_rest.py | 2 +- 6 files changed, 113 insertions(+), 50 deletions(-) diff --git a/.travis.yml b/.travis.yml index 1c4326b..e4bbac9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,7 +3,6 @@ language: python env: - PYTHON_VERSION=3.6 - PYTHON_VERSION=3.5 - - PYTHON_VERSION=2.7 services: - redis-server before_install: @@ -16,9 +15,7 @@ before_install: install: - conda create --yes -n test-env -c bioconda python=$PYTHON_VERSION biom-format requests pandas click==6.7 nose sqlite joblib nltk msgpack-python cython - source activate test-env - - if [ ${PYTHON_VERSION} = "2.7" ]; then conda install -c biocore --yes scikit-bio==0.4.2; fi - - if [ ${PYTHON_VERSION} = "3.5" ]; then conda install -c conda-forge --yes scikit-bio; fi - - if [ ${PYTHON_VERSION} = "3.6" ]; then conda install -c conda-forge --yes scikit-bio; fi + - conda install -c conda-forge --yes scikit-bio - pip install flake8 msgpack - git clone https://github.com/nicolasff/webdis - pushd webdis diff --git a/redbiom/admin.py b/redbiom/admin.py index 7562973..8e12def 100644 --- a/redbiom/admin.py +++ b/redbiom/admin.py @@ -1,3 +1,6 @@ +from urllib.parse import quote_plus + + class ScriptManager: """Static singleton for managing Lua scripts in the Redis backend""" # derived from http://stackoverflow.com/a/43900922/19741 @@ -462,9 +465,9 @@ def load_sample_metadata(md, tag=None): put('metadata', 'SET', key, json.dumps(columns)) for col in indexed_columns: - bulk_set = ["%s/%s" % (idx, v) for idx, v in zip(md.index, md[col]) + bulk_set = ["%s/%s" % (idx, quote_plus(str(v))) + for idx, v in zip(md.index, md[col]) if _indexable(v, null_values)] - payload = "category:%s/%s" % (col, '/'.join(bulk_set)) post('metadata', 'HMSET', payload) @@ -545,18 +548,8 @@ def load_sample_metadata_full_search(md, tag=None): def _indexable(value, nullables): - """Returns true if the value appears to be something that storable - - IMPORTANT: we cannot store values which contain a "/" as that character - has a special meaning for a path. - """ - if value in nullables: - return False - - if isinstance(value, (float, int, bool)): - return True - else: - return '/' not in value + """Returns true if the value appears to be something that storable""" + return value not in nullables class AlreadyLoaded(ValueError): diff --git a/redbiom/fetch.py b/redbiom/fetch.py index 1a1d692..e8b6bde 100644 --- a/redbiom/fetch.py +++ b/redbiom/fetch.py @@ -222,16 +222,9 @@ def sample_metadata(samples, common=True, context=None, restrict_to=None, metadata[sample_ambiguity]['#SampleID'] = sample_ambiguity for category in columns_to_get: - key = 'category:%s' % category - getter = redbiom._requests.buffered(iter(all_samples), None, 'HMGET', - 'metadata', get=get, - buffer_size=100, - multikey=key) - - for samples, category_values in getter: - for sample, value in zip(samples, category_values): - for sample_ambiguity in ambig_assoc[sample]: - metadata[sample_ambiguity][category] = value + for sample, value in get_sample_values(all_samples, category): + for sample_ambiguity in ambig_assoc[sample]: + metadata[sample_ambiguity][category] = value md = pd.DataFrame(metadata).T @@ -564,20 +557,12 @@ def category_sample_values(category, samples=None): get = redbiom._requests.make_get(redbiom.get_config()) - key = 'category:%s' % category - if samples is None: - keys_vals = list(get('metadata', 'HGETALL', key).items()) - else: + if samples is not None: untagged, _, _, tagged_clean = \ redbiom.util.partition_samples_by_tags(samples) samples = untagged + tagged_clean - getter = redbiom._requests.buffered(iter(samples), None, 'HMGET', - 'metadata', get=get, - buffer_size=100, multikey=key) - # there is probably some niftier method than this. - keys_vals = [(sample, obs_val) for idx, vals in getter - for sample, obs_val in zip(idx, vals)] + keys_vals = get_sample_values(samples, category, get=get) index = (v[0] for v in keys_vals) data = (v[1] for v in keys_vals) @@ -697,16 +682,8 @@ def metadata(where=None, tag=None, restrict_to=None): metadata[sample]['#SampleID'] = sample for category in categories: - key = 'category:%s' % category - getter = redbiom._requests.buffered(iter(samples_to_get), None, - 'HMGET', - 'metadata', get=get, - buffer_size=100, - multikey=key) - - for chunk in getter: - for sample, value in zip(*chunk): - metadata[sample][category] = value + for sample, value in get_sample_values(samples_to_get, category, get): + metadata[sample][category] = value md = pd.DataFrame(metadata).T @@ -715,3 +692,44 @@ def metadata(where=None, tag=None, restrict_to=None): else: md = redbiom.metadata.Metadata(md.set_index('#SampleID')) return md.ids(where=where) + + +def get_sample_values(samples, category, get=None): + """Obtain the metadata values associated with the requested samples + + Parameters + ---------- + samples : Iterable of str or None + The samples to obtain + category : str + The category to obtain values for. + get : function, optional + A get method + + Returns + ------- + [(str, str), ...] + A list of (sample, value) tuples + + Redis command summary + --------------------- + HMGET metadata:category: ... + HMKEYS metadata:category: + """ + import redbiom + + if get is None: + config = redbiom.get_config() + get = redbiom._requests.make_get(config) + + key = 'category:%s' % category + if samples is None: + samples = get('metadata', 'HKEYS', key) + + getter = redbiom._requests.buffered(iter(samples), None, + 'HMGET', + 'metadata', get=get, + buffer_size=100, + multikey=key) + + return [item for chunk in getter for item in zip(*chunk)] diff --git a/redbiom/tests/test_admin.py b/redbiom/tests/test_admin.py index a833bc4..4b188b3 100644 --- a/redbiom/tests/test_admin.py +++ b/redbiom/tests/test_admin.py @@ -268,6 +268,22 @@ def test_load_sample_metadata(self): obs = set(self.get('metadata', 'SMEMBERS', 'samples-represented')) self.assertEqual(obs, exp) + def test_load_sample_metadata_encoded(self): + md = metadata.copy() + md['http_quoted_characters'] = ['foo', 'bar', 'foo/bar', 'baz$12', + 'thing', 'stuff', 'asd#asd', + 'a', 'b', 'c'] + redbiom.admin.load_sample_metadata(md) + + # NOTE: webdis decodes the encoded strings so they are stored in redis + # in their native representation + exp = ['foo', 'bar', 'foo/bar', 'baz$12', + 'thing', 'stuff', 'asd#asd', 'a', 'b', 'c'] + obs = self.get('metadata:category', 'HGETALL', + 'http_quoted_characters') + self.assertEqual(sorted([v for k, v in obs.items()]), + sorted(exp)) + def test_load_sample_metadata_full_search(self): redbiom.admin.load_sample_metadata(metadata) redbiom.admin.load_sample_metadata_full_search(metadata) diff --git a/redbiom/tests/test_fetch.py b/redbiom/tests/test_fetch.py index b582b3a..d920ba5 100644 --- a/redbiom/tests/test_fetch.py +++ b/redbiom/tests/test_fetch.py @@ -10,7 +10,7 @@ import redbiom.fetch from redbiom.fetch import (_biom_from_samples, sample_metadata, samples_in_context, features_in_context, - sample_counts_per_category) + sample_counts_per_category, get_sample_values) from redbiom.tests import assert_test_env assert_test_env() @@ -208,6 +208,45 @@ def test_sample_metadata_samples_not_represented_in_context(self): sample_metadata(['10317.000047188', '10317.000046868'], context='test') + def test_get_sample_values(self): + redbiom.admin.create_context('test', 'a nice test') + redbiom.admin.load_sample_metadata(metadata) + exp = {'10317.000047188': '50s', + '10317.000051129': '30s', + '10317.000012975': '40s', + '10317.000033804': '20s', + '10317.000001405': '30s', + '10317.000022252': '30s', + '10317.000001378': '20s', + '10317.000005080': '30s'} + obs = dict(get_sample_values(None, 'AGE_CAT')) + self.assertEqual(obs, exp) + obs1, obs2, obs3 = get_sample_values(['10317.000033804', 'missing', + '10317.000005080'], 'AGE_CAT') + self.assertEqual(obs1, ('10317.000033804', '20s')) + self.assertEqual(obs2, ('missing', None)) + self.assertEqual(obs3, ('10317.000005080', '30s')) + + def test_get_sample_values_encoded(self): + redbiom.admin.create_context('test', 'a nice test') + + df = metadata.copy() + df.set_index('#SampleID', inplace=True) + + df.loc[['10317.000047188', + '10317.000051129', + '10317.000012975'], 'encoded'] = ['foo/bar', + 'baz$', + '#bing'] + df.loc[df['encoded'].isnull(), 'encoded'] = None + + redbiom.admin.load_sample_metadata(df) + exp = {'10317.000047188': 'foo/bar', + '10317.000051129': 'baz$', + '10317.000012975': '#bing'} + obs = dict(get_sample_values(None, 'encoded')) + self.assertEqual(obs, exp) + def test_sample_metadata_all_cols(self): redbiom.admin.load_sample_metadata(metadata) exp = metadata.copy() diff --git a/redbiom/tests/test_rest.py b/redbiom/tests/test_rest.py index fce304c..fe8c481 100644 --- a/redbiom/tests/test_rest.py +++ b/redbiom/tests/test_rest.py @@ -85,7 +85,7 @@ def test_metadata_categories(self): for idx, row in md.iterrows(): exp = [c for c, v in zip(md.columns, row.values) - if v not in null_values and '/' not in str(v)] + if v not in null_values] obs = json.loads(get('GET', 'metadata:categories:%s' % idx)) self.assertEqual(obs, exp)