Encode variable values for use over URLs (#87)

* TST: tests for encoded values * ENH/MAINT: centralize method for getting sample metadata values * Encode for URLs * Clean up some dev clutter * Fixed hardcoded test * Addressing @ElDeveloper's comments and dropping py27 support * readd skbio install, and remove py37 due to dependency conflicts
biocore · Sep 30, 2019 · 2ba62a9 · 2ba62a9
1 parent fe2a1f9
commit 2ba62a9
Show file tree

Hide file tree

Showing 6 changed files with 113 additions and 50 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -3,7 +3,6 @@ language: python
 env:
   - PYTHON_VERSION=3.6
   - PYTHON_VERSION=3.5
-  - PYTHON_VERSION=2.7
 services:
   - redis-server
 before_install:
@@ -16,9 +15,7 @@ before_install:
 install:
   - conda create --yes -n test-env -c bioconda python=$PYTHON_VERSION biom-format requests pandas click==6.7 nose sqlite joblib nltk msgpack-python cython
   - source activate test-env
-  - if [ ${PYTHON_VERSION} = "2.7" ]; then conda install -c biocore --yes scikit-bio==0.4.2; fi
-  - if [ ${PYTHON_VERSION} = "3.5" ]; then conda install -c conda-forge --yes scikit-bio; fi
-  - if [ ${PYTHON_VERSION} = "3.6" ]; then conda install -c conda-forge --yes scikit-bio; fi
+  - conda install -c conda-forge --yes scikit-bio
   - pip install flake8 msgpack
   - git clone https://github.com/nicolasff/webdis
   - pushd webdis

diff --git a/redbiom/admin.py b/redbiom/admin.py
@@ -1,3 +1,6 @@
+from urllib.parse import quote_plus
+
+
 class ScriptManager:
     """Static singleton for managing Lua scripts in the Redis backend"""
     # derived from http://stackoverflow.com/a/43900922/19741
@@ -462,9 +465,9 @@ def load_sample_metadata(md, tag=None):
         put('metadata', 'SET', key, json.dumps(columns))
 
     for col in indexed_columns:
-        bulk_set = ["%s/%s" % (idx, v) for idx, v in zip(md.index, md[col])
+        bulk_set = ["%s/%s" % (idx, quote_plus(str(v)))
+                    for idx, v in zip(md.index, md[col])
                     if _indexable(v, null_values)]
-
         payload = "category:%s/%s" % (col, '/'.join(bulk_set))
         post('metadata', 'HMSET', payload)
 
@@ -545,18 +548,8 @@ def load_sample_metadata_full_search(md, tag=None):
 
 
 def _indexable(value, nullables):
-    """Returns true if the value appears to be something that storable
-
-    IMPORTANT: we cannot store values which contain a "/" as that character
-    has a special meaning for a path.
-    """
-    if value in nullables:
-        return False
-
-    if isinstance(value, (float, int, bool)):
-        return True
-    else:
-        return '/' not in value
+    """Returns true if the value appears to be something that storable"""
+    return value not in nullables
 
 
 class AlreadyLoaded(ValueError):

diff --git a/redbiom/fetch.py b/redbiom/fetch.py
@@ -222,16 +222,9 @@ def sample_metadata(samples, common=True, context=None, restrict_to=None,
             metadata[sample_ambiguity]['#SampleID'] = sample_ambiguity
 
     for category in columns_to_get:
-        key = 'category:%s' % category
-        getter = redbiom._requests.buffered(iter(all_samples), None, 'HMGET',
-                                            'metadata', get=get,
-                                            buffer_size=100,
-                                            multikey=key)
-
-        for samples, category_values in getter:
-            for sample, value in zip(samples, category_values):
-                for sample_ambiguity in ambig_assoc[sample]:
-                    metadata[sample_ambiguity][category] = value
+        for sample, value in get_sample_values(all_samples, category):
+            for sample_ambiguity in ambig_assoc[sample]:
+                metadata[sample_ambiguity][category] = value
 
     md = pd.DataFrame(metadata).T
 
@@ -564,20 +557,12 @@ def category_sample_values(category, samples=None):
 
     get = redbiom._requests.make_get(redbiom.get_config())
 
-    key = 'category:%s' % category
-    if samples is None:
-        keys_vals = list(get('metadata', 'HGETALL', key).items())
-    else:
+    if samples is not None:
         untagged, _, _, tagged_clean = \
             redbiom.util.partition_samples_by_tags(samples)
         samples = untagged + tagged_clean
-        getter = redbiom._requests.buffered(iter(samples), None, 'HMGET',
-                                            'metadata', get=get,
-                                            buffer_size=100, multikey=key)
 
-        # there is probably some niftier method than this.
-        keys_vals = [(sample, obs_val) for idx, vals in getter
-                     for sample, obs_val in zip(idx, vals)]
+    keys_vals = get_sample_values(samples, category, get=get)
 
     index = (v[0] for v in keys_vals)
     data = (v[1] for v in keys_vals)
@@ -697,16 +682,8 @@ def metadata(where=None, tag=None, restrict_to=None):
         metadata[sample]['#SampleID'] = sample
 
     for category in categories:
-        key = 'category:%s' % category
-        getter = redbiom._requests.buffered(iter(samples_to_get), None,
-                                            'HMGET',
-                                            'metadata', get=get,
-                                            buffer_size=100,
-                                            multikey=key)
-
-        for chunk in getter:
-            for sample, value in zip(*chunk):
-                metadata[sample][category] = value
+        for sample, value in get_sample_values(samples_to_get, category, get):
+            metadata[sample][category] = value
 
     md = pd.DataFrame(metadata).T
 
@@ -715,3 +692,44 @@ def metadata(where=None, tag=None, restrict_to=None):
     else:
         md = redbiom.metadata.Metadata(md.set_index('#SampleID'))
         return md.ids(where=where)
+
+
+def get_sample_values(samples, category, get=None):
+    """Obtain the metadata values associated with the requested samples
+
+    Parameters
+    ----------
+    samples : Iterable of str or None
+        The samples to obtain
+    category : str
+        The category to obtain values for.
+    get : function, optional
+        A get method
+
+    Returns
+    -------
+    [(str, str), ...]
+        A list of (sample, value) tuples
+
+    Redis command summary
+    ---------------------
+    HMGET metadata:category:<column> <sample_id> ... <sample_id>
+    HMKEYS metadata:category:<column>
+    """
+    import redbiom
+
+    if get is None:
+        config = redbiom.get_config()
+        get = redbiom._requests.make_get(config)
+
+    key = 'category:%s' % category
+    if samples is None:
+        samples = get('metadata', 'HKEYS', key)
+
+    getter = redbiom._requests.buffered(iter(samples), None,
+                                        'HMGET',
+                                        'metadata', get=get,
+                                        buffer_size=100,
+                                        multikey=key)
+
+    return [item for chunk in getter for item in zip(*chunk)]
diff --git a/redbiom/tests/test_admin.py b/redbiom/tests/test_admin.py
@@ -268,6 +268,22 @@ def test_load_sample_metadata(self):
         obs = set(self.get('metadata', 'SMEMBERS', 'samples-represented'))
         self.assertEqual(obs, exp)
 
+    def test_load_sample_metadata_encoded(self):
+        md = metadata.copy()
+        md['http_quoted_characters'] = ['foo', 'bar', 'foo/bar', 'baz$12',
+                                        'thing', 'stuff', 'asd#asd',
+                                        'a', 'b', 'c']
+        redbiom.admin.load_sample_metadata(md)
+
+        # NOTE: webdis decodes the encoded strings so they are stored in redis
+        # in their native representation
+        exp = ['foo', 'bar', 'foo/bar', 'baz$12',
+               'thing', 'stuff', 'asd#asd', 'a', 'b', 'c']
+        obs = self.get('metadata:category', 'HGETALL',
+                       'http_quoted_characters')
+        self.assertEqual(sorted([v for k, v in obs.items()]),
+                         sorted(exp))
+
     def test_load_sample_metadata_full_search(self):
         redbiom.admin.load_sample_metadata(metadata)
         redbiom.admin.load_sample_metadata_full_search(metadata)

diff --git a/redbiom/tests/test_fetch.py b/redbiom/tests/test_fetch.py
@@ -10,7 +10,7 @@
 import redbiom.fetch
 from redbiom.fetch import (_biom_from_samples, sample_metadata,
                            samples_in_context, features_in_context,
-                           sample_counts_per_category)
+                           sample_counts_per_category, get_sample_values)
 from redbiom.tests import assert_test_env
 
 assert_test_env()
@@ -208,6 +208,45 @@ def test_sample_metadata_samples_not_represented_in_context(self):
             sample_metadata(['10317.000047188', '10317.000046868'],
                             context='test')
 
+    def test_get_sample_values(self):
+        redbiom.admin.create_context('test', 'a nice test')
+        redbiom.admin.load_sample_metadata(metadata)
+        exp = {'10317.000047188': '50s',
+               '10317.000051129': '30s',
+               '10317.000012975': '40s',
+               '10317.000033804': '20s',
+               '10317.000001405': '30s',
+               '10317.000022252': '30s',
+               '10317.000001378': '20s',
+               '10317.000005080': '30s'}
+        obs = dict(get_sample_values(None, 'AGE_CAT'))
+        self.assertEqual(obs, exp)
+        obs1, obs2, obs3 = get_sample_values(['10317.000033804', 'missing',
+                                              '10317.000005080'], 'AGE_CAT')
+        self.assertEqual(obs1, ('10317.000033804', '20s'))
+        self.assertEqual(obs2, ('missing', None))
+        self.assertEqual(obs3, ('10317.000005080', '30s'))
+
+    def test_get_sample_values_encoded(self):
+        redbiom.admin.create_context('test', 'a nice test')
+
+        df = metadata.copy()
+        df.set_index('#SampleID', inplace=True)
+
+        df.loc[['10317.000047188',
+                '10317.000051129',
+                '10317.000012975'], 'encoded'] = ['foo/bar',
+                                                  'baz$',
+                                                  '#bing']
+        df.loc[df['encoded'].isnull(), 'encoded'] = None
+
+        redbiom.admin.load_sample_metadata(df)
+        exp = {'10317.000047188': 'foo/bar',
+               '10317.000051129': 'baz$',
+               '10317.000012975': '#bing'}
+        obs = dict(get_sample_values(None, 'encoded'))
+        self.assertEqual(obs, exp)
+
     def test_sample_metadata_all_cols(self):
         redbiom.admin.load_sample_metadata(metadata)
         exp = metadata.copy()

diff --git a/redbiom/tests/test_rest.py b/redbiom/tests/test_rest.py
@@ -85,7 +85,7 @@ def test_metadata_categories(self):
 
         for idx, row in md.iterrows():
             exp = [c for c, v in zip(md.columns, row.values)
-                   if v not in null_values and '/' not in str(v)]
+                   if v not in null_values]
             obs = json.loads(get('GET', 'metadata:categories:%s' % idx))
 
             self.assertEqual(obs, exp)