From c93c3cddaebc6161eb1135d84222cc1f6a6903d2 Mon Sep 17 00:00:00 2001 From: Tudorel Enache Date: Wed, 20 Nov 2019 13:45:58 +0200 Subject: [PATCH 1/8] added store_many_vectors on mongo storage --- nearpy/storage/storage_mongo.py | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/nearpy/storage/storage_mongo.py b/nearpy/storage/storage_mongo.py index 19fc01e..03551fd 100644 --- a/nearpy/storage/storage_mongo.py +++ b/nearpy/storage/storage_mongo.py @@ -34,7 +34,11 @@ except ImportError: import pickle -from future.builtins import bytes +try: + from pymongo import InsertOne +except ImportError: + pass + from nearpy.storage.storage import Storage @@ -45,7 +49,23 @@ def __init__(self, mongo_object): """ Uses specified pymongo object for storage. """ self.mongo_object = mongo_object + def store_many_vectors(self, hash_name, bucket_keys, vs, data): + requests = [] + + for v, d, bk in zip(vs, data, bucket_keys): + vc = self._get_vector(hash_name, bk, v, d) + + requests.append(InsertOne(vc)) + + self.mongo_object.bulk_write(requests, ordered=False) + def store_vector(self, hash_name, bucket_key, v, data): + val_dict = self._get_vector(hash_name, bucket_key, v, data) + + # Push JSON representation of dict to end of bucket list + self.mongo_object.insert_one(val_dict) + + def _get_vector(self, hash_name, bucket_key, v, data): """ Stores vector and JSON-serializable data in MongoDB with specified key. """ @@ -83,8 +103,7 @@ def store_vector(self, hash_name, bucket_key, v, data): if data is not None: val_dict['data'] = data - # Push JSON representation of dict to end of bucket list - self.mongo_object.insert_one(val_dict) + return val_dict def _format_mongo_key(self, hash_name, bucket_key): return '{}{}'.format(self._format_hash_prefix(hash_name), bucket_key) @@ -186,5 +205,6 @@ def load_hash_configuration(self, hash_name): conf = self.mongo_object.find_one( {'hash_conf_name': hash_name + '_conf'} ) + return pickle.loads(conf['hash_configuration']) if conf is not None\ else None From 86821b04fb715b8117e032c392904bd678202e5a Mon Sep 17 00:00:00 2001 From: Tudorel Enache Date: Thu, 21 Nov 2019 11:39:41 +0200 Subject: [PATCH 2/8] added tests, fixed deprecation warnings --- nearpy/storage/storage_mongo.py | 8 ++++---- tests/storage_tests.py | 4 ++++ 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/nearpy/storage/storage_mongo.py b/nearpy/storage/storage_mongo.py index 03551fd..d79a0c3 100644 --- a/nearpy/storage/storage_mongo.py +++ b/nearpy/storage/storage_mongo.py @@ -40,6 +40,7 @@ pass from nearpy.storage.storage import Storage +from future.builtins import zip class MongoStorage(Storage): @@ -62,7 +63,6 @@ def store_many_vectors(self, hash_name, bucket_keys, vs, data): def store_vector(self, hash_name, bucket_key, v, data): val_dict = self._get_vector(hash_name, bucket_key, v, data) - # Push JSON representation of dict to end of bucket list self.mongo_object.insert_one(val_dict) def _get_vector(self, hash_name, bucket_key, v, data): @@ -166,7 +166,7 @@ def get_bucket(self, hash_name, bucket_key): shape=(val_dict['dim'], 1)) else: - vector = numpy.fromstring(val_dict['vector'], + vector = numpy.frombuffer(val_dict['vector'], dtype=val_dict['dtype']) [val_dict.pop(k) for k in ['vector', 'dtype', '_id']] # Add data to result tuple, if present @@ -178,14 +178,14 @@ def clean_buckets(self, hash_name): """ Removes all buckets and their content for specified hash. """ - self.mongo_object.remove( + self.mongo_object.delete_many( {'lsh': {'$regex': self._format_hash_prefix(hash_name)}}) def clean_all_buckets(self): """ Removes all buckets from all hashes and their content. """ - self.mongo_object.remove( + self.mongo_object.delete_many( {'lsh': {'$regex': 'nearpy_'}}) def store_hash_configuration(self, lshash): diff --git a/tests/storage_tests.py b/tests/storage_tests.py index 8f5750d..c2137d1 100644 --- a/tests/storage_tests.py +++ b/tests/storage_tests.py @@ -185,6 +185,10 @@ def test_store_zero(self): _, data = bucket[0] self.assertEqual(data, 0) + def test_store_many_vectors(self): + x = numpy.random.randn(100, 10) + self.check_store_many_vectors(x) + if __name__ == '__main__': unittest.main() From 1579e2443050013a21cfb8b0c17dcf1a45b46d2c Mon Sep 17 00:00:00 2001 From: Tudorel Enache Date: Mon, 25 Nov 2019 17:45:02 +0200 Subject: [PATCH 3/8] added pymongo to test req --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 498f421..18146bd 100644 --- a/setup.py +++ b/setup.py @@ -28,5 +28,6 @@ "redis", "mockredispy", "mongomock", + "pymongo" ] ) From 8ad62c48ea666deb703ac534d759f749498802b0 Mon Sep 17 00:00:00 2001 From: Tudorel Enache Date: Fri, 29 Nov 2019 15:48:06 +0200 Subject: [PATCH 4/8] reverted the changes --- nearpy/storage/storage_mongo.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nearpy/storage/storage_mongo.py b/nearpy/storage/storage_mongo.py index d79a0c3..9f7293b 100644 --- a/nearpy/storage/storage_mongo.py +++ b/nearpy/storage/storage_mongo.py @@ -166,7 +166,7 @@ def get_bucket(self, hash_name, bucket_key): shape=(val_dict['dim'], 1)) else: - vector = numpy.frombuffer(val_dict['vector'], + vector = numpy.fromstring(val_dict['vector'], dtype=val_dict['dtype']) [val_dict.pop(k) for k in ['vector', 'dtype', '_id']] # Add data to result tuple, if present @@ -178,14 +178,14 @@ def clean_buckets(self, hash_name): """ Removes all buckets and their content for specified hash. """ - self.mongo_object.delete_many( + self.mongo_object.remove( {'lsh': {'$regex': self._format_hash_prefix(hash_name)}}) def clean_all_buckets(self): """ Removes all buckets from all hashes and their content. """ - self.mongo_object.delete_many( + self.mongo_object.remove( {'lsh': {'$regex': 'nearpy_'}}) def store_hash_configuration(self, lshash): From 7ce25eca3275014668c55fbe813af575f5ab14a5 Mon Sep 17 00:00:00 2001 From: Tudorel Enache Date: Mon, 2 Dec 2019 17:10:28 +0200 Subject: [PATCH 5/8] updated pymongo insert dict to unicode format --- nearpy/storage/storage_mongo.py | 4 +++- nearpy/utils/utils.py | 7 +++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/nearpy/storage/storage_mongo.py b/nearpy/storage/storage_mongo.py index 9f7293b..cbdd362 100644 --- a/nearpy/storage/storage_mongo.py +++ b/nearpy/storage/storage_mongo.py @@ -29,6 +29,8 @@ import numpy import scipy +from nearpy.utils.utils import convert2unicode + try: import cPickle as pickle except ImportError: @@ -63,7 +65,7 @@ def store_many_vectors(self, hash_name, bucket_keys, vs, data): def store_vector(self, hash_name, bucket_key, v, data): val_dict = self._get_vector(hash_name, bucket_key, v, data) - self.mongo_object.insert_one(val_dict) + self.mongo_object.insert_one(convert2unicode(val_dict)) def _get_vector(self, hash_name, bucket_key, v, data): """ diff --git a/nearpy/utils/utils.py b/nearpy/utils/utils.py index 69f9b4c..15a6a34 100644 --- a/nearpy/utils/utils.py +++ b/nearpy/utils/utils.py @@ -90,3 +90,10 @@ def want_string(arg, encoding='utf-8'): rv = arg return rv + +def convert2unicode(mydict): + for k, v in mydict.iteritems(): + if isinstance(v, str): + mydict[k] = unicode(v, errors='replace') + elif isinstance(v, dict): + convert2unicode(v) From 4506f5b335b3b6bfab0ddcb928aab649e0158502 Mon Sep 17 00:00:00 2001 From: Tudorel Enache Date: Mon, 2 Dec 2019 17:23:43 +0200 Subject: [PATCH 6/8] add convert to unicode --- nearpy/storage/storage_mongo.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nearpy/storage/storage_mongo.py b/nearpy/storage/storage_mongo.py index cbdd362..2c6a212 100644 --- a/nearpy/storage/storage_mongo.py +++ b/nearpy/storage/storage_mongo.py @@ -65,7 +65,7 @@ def store_many_vectors(self, hash_name, bucket_keys, vs, data): def store_vector(self, hash_name, bucket_key, v, data): val_dict = self._get_vector(hash_name, bucket_key, v, data) - self.mongo_object.insert_one(convert2unicode(val_dict)) + self.mongo_object.insert_one(val_dict) def _get_vector(self, hash_name, bucket_key, v, data): """ @@ -105,7 +105,7 @@ def _get_vector(self, hash_name, bucket_key, v, data): if data is not None: val_dict['data'] = data - return val_dict + return convert2unicode(val_dict) def _format_mongo_key(self, hash_name, bucket_key): return '{}{}'.format(self._format_hash_prefix(hash_name), bucket_key) From 25b4c48fc19adafe7c10eb3afe1d366d0f52af69 Mon Sep 17 00:00:00 2001 From: Tudorel Enache Date: Mon, 2 Dec 2019 17:25:08 +0200 Subject: [PATCH 7/8] fixed return --- nearpy/storage/storage_mongo.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/nearpy/storage/storage_mongo.py b/nearpy/storage/storage_mongo.py index 2c6a212..44f1db0 100644 --- a/nearpy/storage/storage_mongo.py +++ b/nearpy/storage/storage_mongo.py @@ -105,7 +105,9 @@ def _get_vector(self, hash_name, bucket_key, v, data): if data is not None: val_dict['data'] = data - return convert2unicode(val_dict) + convert2unicode(val_dict) + + return val_dict def _format_mongo_key(self, hash_name, bucket_key): return '{}{}'.format(self._format_hash_prefix(hash_name), bucket_key) From 152c49af60a5c7eed835f1c05b484c4188688b19 Mon Sep 17 00:00:00 2001 From: Tudorel Enache Date: Mon, 2 Dec 2019 17:35:12 +0200 Subject: [PATCH 8/8] fix encoding error --- nearpy/storage/storage_mongo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nearpy/storage/storage_mongo.py b/nearpy/storage/storage_mongo.py index 44f1db0..da5962e 100644 --- a/nearpy/storage/storage_mongo.py +++ b/nearpy/storage/storage_mongo.py @@ -170,7 +170,7 @@ def get_bucket(self, hash_name, bucket_key): shape=(val_dict['dim'], 1)) else: - vector = numpy.fromstring(val_dict['vector'], + vector = numpy.frombuffer(val_dict['vector'], dtype=val_dict['dtype']) [val_dict.pop(k) for k in ['vector', 'dtype', '_id']] # Add data to result tuple, if present