From 116645e2b2fd0a9606360ee502d08c01cf16eb56 Mon Sep 17 00:00:00 2001
From: ljunhui
Date: Fri, 13 Oct 2023 13:55:03 +0800
Subject: [PATCH] Allow nested fields

---
 esdedupe/esdedupe.py | 8 +++++---
 requirements.txt     | 3 ++-
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/esdedupe/esdedupe.py b/esdedupe/esdedupe.py
index 3e56e90..58bc568 100755
--- a/esdedupe/esdedupe.py
+++ b/esdedupe/esdedupe.py
@@ -9,6 +9,7 @@ import requests
 import sys
 
 
+from benedict import benedict
 from elasticsearch import Elasticsearch, helpers
 from elasticsearch.helpers import parallel_bulk
 from elasticsearch.helpers import streaming_bulk
@@ -28,15 +29,16 @@ def __init__(self):
     # Process documents returned by the current search/scroll
     def build_index(self, docs_hash, unique_fields, hit):
         hashval = None
-        _id = hit["_id"]
+        hit_benedict = benedict(hit)
+        _id = hit_benedict["_id"]
         # there's no need to hash, if we have just single unique key
         if len(unique_fields) > 1:
             combined_key = ""
             for field in unique_fields:
-                combined_key += str(hit['_source'][field])
+                combined_key += str(hit_benedict['_source'][field])
             hashval = hashlib.md5(combined_key.encode('utf-8')).digest()
         else:
-            hashval = str(hit['_source'][unique_fields[0]])
+            hashval = str(hit_benedict['_source'][unique_fields[0]])
 
         docs_hash.setdefault(hashval, []).append(_id)
 
diff --git a/requirements.txt b/requirements.txt
index bb42c8c..3bb247d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,4 +2,5 @@ ujson
 tqdm
 psutil
 elasticsearch
-requests
\ No newline at end of file
+requests
+python-benedict