Skip to content

Commit

Permalink
Encode variable values for use over URLs (#87)
Browse files Browse the repository at this point in the history
* TST: tests for encoded values

* ENH/MAINT: centralize method for getting sample metadata values

* Encode for URLs

* Clean up some dev clutter

* Fixed hardcoded test

* Addressing @ElDeveloper's comments and dropping py27 support

* readd skbio install, and remove py37 due to dependency conflicts
  • Loading branch information
wasade authored and ElDeveloper committed Sep 30, 2019
1 parent fe2a1f9 commit 2ba62a9
Show file tree
Hide file tree
Showing 6 changed files with 113 additions and 50 deletions.
5 changes: 1 addition & 4 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ language: python
env:
- PYTHON_VERSION=3.6
- PYTHON_VERSION=3.5
- PYTHON_VERSION=2.7
services:
- redis-server
before_install:
Expand All @@ -16,9 +15,7 @@ before_install:
install:
- conda create --yes -n test-env -c bioconda python=$PYTHON_VERSION biom-format requests pandas click==6.7 nose sqlite joblib nltk msgpack-python cython
- source activate test-env
- if [ ${PYTHON_VERSION} = "2.7" ]; then conda install -c biocore --yes scikit-bio==0.4.2; fi
- if [ ${PYTHON_VERSION} = "3.5" ]; then conda install -c conda-forge --yes scikit-bio; fi
- if [ ${PYTHON_VERSION} = "3.6" ]; then conda install -c conda-forge --yes scikit-bio; fi
- conda install -c conda-forge --yes scikit-bio
- pip install flake8 msgpack
- git clone https://github.com/nicolasff/webdis
- pushd webdis
Expand Down
21 changes: 7 additions & 14 deletions redbiom/admin.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
from urllib.parse import quote_plus


class ScriptManager:
"""Static singleton for managing Lua scripts in the Redis backend"""
# derived from http://stackoverflow.com/a/43900922/19741
Expand Down Expand Up @@ -462,9 +465,9 @@ def load_sample_metadata(md, tag=None):
put('metadata', 'SET', key, json.dumps(columns))

for col in indexed_columns:
bulk_set = ["%s/%s" % (idx, v) for idx, v in zip(md.index, md[col])
bulk_set = ["%s/%s" % (idx, quote_plus(str(v)))
for idx, v in zip(md.index, md[col])
if _indexable(v, null_values)]

payload = "category:%s/%s" % (col, '/'.join(bulk_set))
post('metadata', 'HMSET', payload)

Expand Down Expand Up @@ -545,18 +548,8 @@ def load_sample_metadata_full_search(md, tag=None):


def _indexable(value, nullables):
"""Returns true if the value appears to be something that storable
IMPORTANT: we cannot store values which contain a "/" as that character
has a special meaning for a path.
"""
if value in nullables:
return False

if isinstance(value, (float, int, bool)):
return True
else:
return '/' not in value
"""Returns true if the value appears to be something that is storable"""
return value not in nullables


class AlreadyLoaded(ValueError):
Expand Down
78 changes: 48 additions & 30 deletions redbiom/fetch.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,16 +222,9 @@ def sample_metadata(samples, common=True, context=None, restrict_to=None,
metadata[sample_ambiguity]['#SampleID'] = sample_ambiguity

for category in columns_to_get:
key = 'category:%s' % category
getter = redbiom._requests.buffered(iter(all_samples), None, 'HMGET',
'metadata', get=get,
buffer_size=100,
multikey=key)

for samples, category_values in getter:
for sample, value in zip(samples, category_values):
for sample_ambiguity in ambig_assoc[sample]:
metadata[sample_ambiguity][category] = value
for sample, value in get_sample_values(all_samples, category):
for sample_ambiguity in ambig_assoc[sample]:
metadata[sample_ambiguity][category] = value

md = pd.DataFrame(metadata).T

Expand Down Expand Up @@ -564,20 +557,12 @@ def category_sample_values(category, samples=None):

get = redbiom._requests.make_get(redbiom.get_config())

key = 'category:%s' % category
if samples is None:
keys_vals = list(get('metadata', 'HGETALL', key).items())
else:
if samples is not None:
untagged, _, _, tagged_clean = \
redbiom.util.partition_samples_by_tags(samples)
samples = untagged + tagged_clean
getter = redbiom._requests.buffered(iter(samples), None, 'HMGET',
'metadata', get=get,
buffer_size=100, multikey=key)

# there is probably some niftier method than this.
keys_vals = [(sample, obs_val) for idx, vals in getter
for sample, obs_val in zip(idx, vals)]
keys_vals = get_sample_values(samples, category, get=get)

index = (v[0] for v in keys_vals)
data = (v[1] for v in keys_vals)
Expand Down Expand Up @@ -697,16 +682,8 @@ def metadata(where=None, tag=None, restrict_to=None):
metadata[sample]['#SampleID'] = sample

for category in categories:
key = 'category:%s' % category
getter = redbiom._requests.buffered(iter(samples_to_get), None,
'HMGET',
'metadata', get=get,
buffer_size=100,
multikey=key)

for chunk in getter:
for sample, value in zip(*chunk):
metadata[sample][category] = value
for sample, value in get_sample_values(samples_to_get, category, get):
metadata[sample][category] = value

md = pd.DataFrame(metadata).T

Expand All @@ -715,3 +692,44 @@ def metadata(where=None, tag=None, restrict_to=None):
else:
md = redbiom.metadata.Metadata(md.set_index('#SampleID'))
return md.ids(where=where)


def get_sample_values(samples, category, get=None):
    """Obtain the metadata values associated with the requested samples

    Parameters
    ----------
    samples : Iterable of str or None
        The samples to obtain. If None, the values for every sample stored
        under the category are obtained.
    category : str
        The category to obtain values for.
    get : function, optional
        A get method. If not provided, one is constructed from the current
        redbiom configuration.

    Returns
    -------
    [(str, str or None), ...]
        A list of (sample, value) tuples. A requested sample which has no
        value stored for the category is paired with None.

    Redis command summary
    ---------------------
    HMGET metadata:category:<column> <sample_id> ... <sample_id>
    HKEYS metadata:category:<column>
    """
    import redbiom
    # import the submodule explicitly: "import redbiom" alone only exposes
    # redbiom._requests if some other module happened to import it already
    import redbiom._requests

    if get is None:
        config = redbiom.get_config()
        get = redbiom._requests.make_get(config)

    key = 'category:%s' % category
    if samples is None:
        # no explicit request, so fall back to every sample (hash field)
        # recorded under this category
        samples = get('metadata', 'HKEYS', key)

    # fetch in batches of 100 samples per HMGET
    getter = redbiom._requests.buffered(iter(samples), None,
                                        'HMGET',
                                        'metadata', get=get,
                                        buffer_size=100,
                                        multikey=key)

    # each chunk is a (samples, values) pair; zip(*chunk) realigns it into
    # (sample, value) tuples which are flattened across chunks
    return [item for chunk in getter for item in zip(*chunk)]
16 changes: 16 additions & 0 deletions redbiom/tests/test_admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,22 @@ def test_load_sample_metadata(self):
obs = set(self.get('metadata', 'SMEMBERS', 'samples-represented'))
self.assertEqual(obs, exp)

def test_load_sample_metadata_encoded(self):
    """Values containing URL-reserved characters survive a metadata load"""
    frame = metadata.copy()
    frame['http_quoted_characters'] = [
        'foo', 'bar', 'foo/bar', 'baz$12', 'thing',
        'stuff', 'asd#asd', 'a', 'b', 'c',
    ]
    redbiom.admin.load_sample_metadata(frame)

    # NOTE: webdis decodes the encoded strings, so redis holds every value
    # in its native (unencoded) representation
    expected = sorted([
        'foo', 'bar', 'foo/bar', 'baz$12', 'thing',
        'stuff', 'asd#asd', 'a', 'b', 'c',
    ])

    stored = self.get('metadata:category', 'HGETALL',
                      'http_quoted_characters')
    observed = sorted(stored.values())

    self.assertEqual(observed, expected)

def test_load_sample_metadata_full_search(self):
redbiom.admin.load_sample_metadata(metadata)
redbiom.admin.load_sample_metadata_full_search(metadata)
Expand Down
41 changes: 40 additions & 1 deletion redbiom/tests/test_fetch.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import redbiom.fetch
from redbiom.fetch import (_biom_from_samples, sample_metadata,
samples_in_context, features_in_context,
sample_counts_per_category)
sample_counts_per_category, get_sample_values)
from redbiom.tests import assert_test_env

assert_test_env()
Expand Down Expand Up @@ -208,6 +208,45 @@ def test_sample_metadata_samples_not_represented_in_context(self):
sample_metadata(['10317.000047188', '10317.000046868'],
context='test')

def test_get_sample_values(self):
    """get_sample_values yields (sample, value) pairs for a category"""
    redbiom.admin.create_context('test', 'a nice test')
    redbiom.admin.load_sample_metadata(metadata)

    # passing None for the samples requests the whole category
    expected_all = {'10317.000047188': '50s',
                    '10317.000051129': '30s',
                    '10317.000012975': '40s',
                    '10317.000033804': '20s',
                    '10317.000001405': '30s',
                    '10317.000022252': '30s',
                    '10317.000001378': '20s',
                    '10317.000005080': '30s'}
    observed_all = dict(get_sample_values(None, 'AGE_CAT'))
    self.assertEqual(observed_all, expected_all)

    # an explicit request keeps its order, and the unknown sample is
    # reported with a None value
    requested = ['10317.000033804', 'missing', '10317.000005080']
    first, second, third = get_sample_values(requested, 'AGE_CAT')
    self.assertEqual(first, ('10317.000033804', '20s'))
    self.assertEqual(second, ('missing', None))
    self.assertEqual(third, ('10317.000005080', '30s'))

def test_get_sample_values_encoded(self):
    """Values needing URL encoding round trip through get_sample_values"""
    redbiom.admin.create_context('test', 'a nice test')

    # (sample, value) pairs whose values contain URL-reserved characters
    encoded_pairs = [('10317.000047188', 'foo/bar'),
                     ('10317.000051129', 'baz$'),
                     ('10317.000012975', '#bing')]
    sample_ids = [sample for sample, _ in encoded_pairs]
    values = [value for _, value in encoded_pairs]

    frame = metadata.copy().set_index('#SampleID')
    frame.loc[sample_ids, 'encoded'] = values

    # every other sample has no value for the new column; mark it as None
    unset = frame['encoded'].isnull()
    frame.loc[unset, 'encoded'] = None

    redbiom.admin.load_sample_metadata(frame)

    expected = dict(encoded_pairs)
    observed = dict(get_sample_values(None, 'encoded'))
    self.assertEqual(observed, expected)

def test_sample_metadata_all_cols(self):
redbiom.admin.load_sample_metadata(metadata)
exp = metadata.copy()
Expand Down
2 changes: 1 addition & 1 deletion redbiom/tests/test_rest.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ def test_metadata_categories(self):

for idx, row in md.iterrows():
exp = [c for c, v in zip(md.columns, row.values)
if v not in null_values and '/' not in str(v)]
if v not in null_values]
obs = json.loads(get('GET', 'metadata:categories:%s' % idx))

self.assertEqual(obs, exp)
Expand Down

0 comments on commit 2ba62a9

Please sign in to comment.