Add a deduplicator #71

Merged
merged 5 commits on May 16, 2023
Changes from 2 commits
3 changes: 3 additions & 0 deletions .gitignore
@@ -32,3 +32,6 @@ coverage
*__pycache__*
.env
placeholders/mappings.yml

# Pickled dedup dictionaries
*.pickle
30 changes: 30 additions & 0 deletions utils/dedup/README.md
@@ -0,0 +1,30 @@
# Deduplicator
Once we have finished preprocessing all the corpora with our filters, we are left with many different files that might contain duplicates. To use the deduplicator, run:
`./dedup.sh ROOT_DIRECTORY_TO_CLEAN_FILES NEW_DIRECTORY_WHERE_ALL_DEDUPPED_FILES_WILL_BE_PLACED`

`dedup.sh` takes care of creating the same directory structure at the target location. The deduplicator also incrementally dumps `shashes.pickle` and `thashes.pickle` so that it keeps track of which lines it has seen so far across different files. The script deletes those files before it runs, to make sure stale hashes do not carry over when you move on to a new language. Example usage:

```bash
./dedup.sh ../../../CLEAN_ZH/ ../../../CLEAN_ZH_DEDUP
Deduplicating CLEAN/Tatoeba-v2022-03-03.clean.en-zh ...
Deduplicating CLEAN/MultiUN-v1.clean.en-zh ...
Deduplicating CLEAN/TED2013-v1.1.en-zh ...
Deduplicating CLEAN/WMT-News-v2019.clean.en-zh ...
Deduplicating CLEAN/TED2020-v1.clean.en-zh ...
Deduplicating CLEAN/ELRC-3056-wikipedia_health-v1.clean.en-zh ...
Deduplicating CLEAN/News-Commentary-v9.1.clean.en-zh ...
Deduplicating MEDIUM/ELRC_2922-v1.clean.en-zh ...
Deduplicating MEDIUM/WikiMatrix-v1.clean.en-zh ...
Deduplicating MEDIUM/tico-19-v2020-10-28.clean.en-zh ...
Deduplicating MEDIUM/UNPC-v20090831.clean.en-zh ...
Deduplicating MEDIUM/Tanzil-v1.clean.en-zh ...
Deduplicating MEDIUM/bible-uedin-v1.clean.en-zh ...
Deduplicating DIRTY/infopankki-v1.clean.en-zh ...
Deduplicating DIRTY/LinguaTools-WikiTitles-v2014.clean.en-zh ...
Deduplicating DIRTY/CCMatrix.clean.en-zh ...
Deduplicating DIRTY/CCAligned.clean.en-zh ...
Deduplicating DIRTY/OpenSubtitles-v2016.clean.en-zh ...
Deduplicating DIRTY/ParaCrawl-v9.clean.en-zh ...
```

Based on the work of @ZJaume, the deduplicator also takes care of near duplicates that differ only in casing: the `--aggressive` flag of `hash-seg.py` lowercases, strips non-letter characters and transliterates each side before hashing.
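
For intuition, here is a minimal sketch of that aggressive normalisation (not one of the scripts below; it just reuses the same `unidecode`/`xxhash` dependencies and the translate-table trick from `hash-seg.py`, and the example strings are made up). Two variants that differ only in casing and punctuation end up with the same digest, so only the first occurrence survives:

```python
from unicodedata import category as cat
from unidecode import unidecode
from xxhash import xxh64
import sys

# Same idea as hash-seg.py --aggressive: keep only letters, lowercase, transliterate.
tbl = [chr(i) for i in range(sys.maxunicode) if not cat(chr(i)).startswith('L')]
remove_non_alpha = str.maketrans('', '', ''.join(tbl))

def aggressive_hash(text: str) -> str:
    return xxh64(unidecode(text.lower().translate(remove_non_alpha))).hexdigest()

# Both normalise to "helloworld" and therefore share one digest.
assert aggressive_hash("Hello, World!") == aggressive_hash("hello world")
```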
24 changes: 24 additions & 0 deletions utils/dedup/dedup.sh
@@ -0,0 +1,24 @@
#!/usr/bin/env bash
# $1 old documents source
# $2 new documents target

# Delete old hashes
rm -f *.pickle

# Create the matching directory structure in the target location
dirs=$(find "$1" -type d | sed "s#${1}##g")

for dir in $dirs; do
    mkdir -p "${2}/${dir}"
done

# Create a list of files to be deduplicated
files=$(find "$1" -type f)

# Run the deduplicator on each of them. In the process, aggregate the hashes of seen lines so that
# we do not output the same sentence if it has already been seen in a previous file or dataset version.
for file in $files; do
    myfileout=$(sed "s#${1}##g" <<< "${file}")
    echo "Deduplicating ${myfileout} ..."
    cat "${file}" | ./hash-seg.py | ./superdedup.py | cut -f 1,2 > "${2}/${myfileout}"
done
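
To make the pipeline inside the loop easier to follow, here is a hypothetical walkthrough of a single line (the sentence pair and the shortened digests are invented for illustration): `hash-seg.py` appends two hash columns, `superdedup.py` filters on them, and `cut -f 1,2` strips them off again, so the output keeps the original two-column source/target format.

```python
# One TSV line as it moves through the pipeline (example data is invented).
line = "Good morning.\t早上好。"

# After hash-seg.py: the original columns plus xxh64(source) and xxh64(target).
hashed = line + "\tdeadbeefdeadbeef\tcafebabecafebabe"  # digests invented for illustration

# superdedup.py looks at columns 3 and 4 to decide whether the pair was already seen,
# and passes the whole line through unchanged if it was not.
parts = hashed.split("\t")
src_hash, trg_hash = parts[2], parts[3]

# cut -f 1,2 then drops the hash columns, restoring the original bitext line.
assert "\t".join(parts[:2]) == line
```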
35 changes: 35 additions & 0 deletions utils/dedup/hash-seg.py
@@ -0,0 +1,35 @@
#!/usr/bin/env python3
from argparse import ArgumentParser
from unicodedata import category as cat
from unidecode import unidecode
from xxhash import xxh64
import sys

parser = ArgumentParser()
parser.add_argument('-a', '--aggressive', action='store_true', default=False)
args = parser.parse_args()

# Translation table to remove non-alphabetic characters
tbl = [chr(i) for i in range(sys.maxunicode) if not cat(chr(i)).startswith('L')]
remove_non_alpha = str.maketrans('', '', ''.join(tbl))

def main():
    for line in sys.stdin:
        sline = line.rstrip('\n')
        parts = sline.split('\t')
        src = parts[0]
        trg = parts[1]

        if args.aggressive:
            src = unidecode(src.lower().translate(remove_non_alpha))
            trg = unidecode(trg.lower().translate(remove_non_alpha))

        src_hash = xxh64(src).hexdigest()
        trg_hash = xxh64(trg).hexdigest()

        # Emit the original line with the source and target digests appended as extra columns
        sys.stdout.write(f"{sline}\t{src_hash}\t{trg_hash}\n")


if __name__ == "__main__":
    main()
32 changes: 32 additions & 0 deletions utils/dedup/superdedup.py
@@ -0,0 +1,32 @@
#!/usr/bin/env python3
import sys
import os
import pickle

def main():
    shashes, thashes = set(), set()
    # Try to load existing hashes from previous runs
    if os.path.isfile('shashes.pickle'):
        with open('shashes.pickle', 'rb') as f:
            shashes = pickle.load(f)
    if os.path.isfile('thashes.pickle'):
        with open('thashes.pickle', 'rb') as f:
            thashes = pickle.load(f)
    for line in sys.stdin:
        parts = line.rstrip("\n").split('\t')

        src_hash = parts[2]
        trg_hash = parts[3]

        if src_hash not in shashes and trg_hash not in thashes:
            sys.stdout.write(line)
            shashes.add(src_hash)
            thashes.add(trg_hash)
    # Write the full set of seen hashes so the next file can pick them up
    with open('shashes.pickle', 'wb') as f:
        pickle.dump(shashes, f)
    with open('thashes.pickle', 'wb') as f:
        pickle.dump(thashes, f)

if __name__ == "__main__":
    main()
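
A quick way to see the cross-file behaviour is a sketch like the following (assuming it is run from `utils/dedup/` with the scripts executable and no stale `*.pickle` files around; the sentence pair is invented). Because `superdedup.py` reloads `shashes.pickle` and `thashes.pickle` on start-up, a pair emitted while processing one file is dropped when it reappears in a later one:

```python
import subprocess

# Hypothetical sentence pair; the same input is piped through the pipeline twice,
# simulating the pair appearing in two different corpus files.
pair = "Good morning.\t早上好。\n"

for label in ("first file", "second file"):
    out = subprocess.run(
        "./hash-seg.py | ./superdedup.py | cut -f 1,2",
        input=pair, capture_output=True, text=True, shell=True,
    ).stdout
    # The first run prints the pair and records its hashes; the second prints nothing.
    print(label, "->", repr(out))
```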