diff --git a/.gitignore b/.gitignore
index fd8f18e..9023d3a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -32,6 +32,11 @@ coverage
 *__pycache__*
 .env
 placeholders/mappings.yml
+
+
+# Pickled dedup dictionaries
+*.pickle
+
 *.sublime-workspace
 *.sublime-project
 *.whl
diff --git a/utils/dedup/README.md b/utils/dedup/README.md
new file mode 100644
index 0000000..310261b
--- /dev/null
+++ b/utils/dedup/README.md
@@ -0,0 +1,30 @@
+# Deduplicator
+Once all the corpora have been preprocessed with our filters, we are left with many different files that may still contain duplicates. To use the deduplicator, run:
+`./dedup.sh ROOT_DIRECTORY_TO_CLEAN_FILES NEW_DIRECTORY_WHERE_ALL_DEDUPPED_FILES_WILL_BE_PLACED`
+
+`dedup.sh` recreates the same directory structure at the target location. The deduplicator also incrementally dumps `shashes.pickle` and `thashes.pickle` so that it keeps track of the lines it has already seen across different files. The script deletes those files before it runs, so that hashes left over from a previous language pair cannot interfere with the next one. Example usage:
+
+```bash
+./dedup.sh ../../../CLEAN_ZH/ ../../../CLEAN_ZH_DEDUP
+Deduplicating CLEAN/Tatoeba-v2022-03-03.clean.en-zh ...
+Deduplicating CLEAN/MultiUN-v1.clean.en-zh ...
+Deduplicating CLEAN/TED2013-v1.1.en-zh ...
+Deduplicating CLEAN/WMT-News-v2019.clean.en-zh ...
+Deduplicating CLEAN/TED2020-v1.clean.en-zh ...
+Deduplicating CLEAN/ELRC-3056-wikipedia_health-v1.clean.en-zh ...
+Deduplicating CLEAN/News-Commentary-v9.1.clean.en-zh ...
+Deduplicating MEDIUM/ELRC_2922-v1.clean.en-zh ...
+Deduplicating MEDIUM/WikiMatrix-v1.clean.en-zh ...
+Deduplicating MEDIUM/tico-19-v2020-10-28.clean.en-zh ...
+Deduplicating MEDIUM/UNPC-v20090831.clean.en-zh ...
+Deduplicating MEDIUM/Tanzil-v1.clean.en-zh ...
+Deduplicating MEDIUM/bible-uedin-v1.clean.en-zh ...
+Deduplicating DIRTY/infopankki-v1.clean.en-zh ...
+Deduplicating DIRTY/LinguaTools-WikiTitles-v2014.clean.en-zh ...
+Deduplicating DIRTY/CCMatrix.clean.en-zh ...
+Deduplicating DIRTY/CCAligned.clean.en-zh ...
+Deduplicating DIRTY/OpenSubtitles-v2016.clean.en-zh ...
+Deduplicating DIRTY/ParaCrawl-v9.clean.en-zh ...
+```
+
+Based on the work of @ZJaume, the deduplicator also catches near-duplicates that differ only in casing, punctuation, or diacritics (the `-a` flag of `hash-seg.py`).
diff --git a/utils/dedup/dedup.sh b/utils/dedup/dedup.sh
new file mode 100755
index 0000000..4dafe6b
--- /dev/null
+++ b/utils/dedup/dedup.sh
@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+# $1: source directory with the cleaned files
+# $2: target directory for the deduplicated files
+
+# Delete hashes left over from a previous run
+rm -f *.pickle
+
+# Create the matching directory structure in the target location
+dirs=$(find "$1" -type d | sed "s#${1}##g")
+
+for dir in $dirs; do
+    mkdir -p "${2}/${dir}"
+done
+
+# Create a list of files to be deduplicated
+files=$(find "$1" -type f)
+
+# Run the deduplicator on each file, aggregating the hashes of seen lines as we go so that a sentence
+# pair is not output again if it already appeared in a previous file. cut -f 1,2 drops the hash columns.
+for file in $files; do
+    myfileout=$(sed "s#${1}##g" <<< "${file}")
+    echo "Deduplicating ${myfileout} ..."
+    cat "${file}" | ./hash-seg.py -a | ./superdedup.py | cut -f 1,2 > "${2}/${myfileout}"
+done
diff --git a/utils/dedup/hash-seg.py b/utils/dedup/hash-seg.py
new file mode 100755
index 0000000..9d250b7
--- /dev/null
+++ b/utils/dedup/hash-seg.py
@@ -0,0 +1,35 @@
+#!/usr/bin/env python3
+from argparse import ArgumentParser
+from unicodedata import category as cat
+from unidecode import unidecode
+from xxhash import xxh64
+import sys
+
+parser = ArgumentParser()
+parser.add_argument('-a', '--aggressive', action='store_true', help='lowercase, strip non-letters and transliterate before hashing')
+args = parser.parse_args()
+
+# Translation table that removes every non-alphabetic character
+tbl = [chr(i) for i in range(sys.maxunicode) if not cat(chr(i)).startswith('L')]
+remove_non_alpha = str.maketrans('', '', ''.join(tbl))
+
+def main():
+    # Append the source and target hashes to every tab-separated line
+    for line in sys.stdin:
+        sline = line.rstrip('\n')
+        parts = sline.split('\t')
+        src = parts[0]
+        trg = parts[1]
+
+        if args.aggressive:
+            src = unidecode(src.lower().translate(remove_non_alpha))
+            trg = unidecode(trg.lower().translate(remove_non_alpha))
+
+        src_hash = xxh64(src).hexdigest()
+        trg_hash = xxh64(trg).hexdigest()
+
+        sys.stdout.write(f"{sline}\t{src_hash}\t{trg_hash}\n")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/utils/dedup/superdedup.py b/utils/dedup/superdedup.py
new file mode 100755
index 0000000..333eb30
--- /dev/null
+++ b/utils/dedup/superdedup.py
@@ -0,0 +1,32 @@
+#!/usr/bin/env python3
+import sys
+import os
+import pickle
+
+def main():
+    shashes, thashes = set(), set()
+    # Try to load existing hashes from previously processed files
+    if os.path.isfile('shashes.pickle'):
+        with open('shashes.pickle', 'rb') as f:
+            shashes = pickle.load(f)
+    if os.path.isfile('thashes.pickle'):
+        with open('thashes.pickle', 'rb') as f:
+            thashes = pickle.load(f)
+    for line in sys.stdin:
+        parts = line.rstrip("\n").split('\t')
+
+        src_hash = parts[2]
+        trg_hash = parts[3]
+
+        if src_hash not in shashes and trg_hash not in thashes:
+            sys.stdout.write(line)
+            shashes.add(src_hash)
+            thashes.add(trg_hash)
+    # Dump the seen hashes so the next file can be deduplicated against them
+    with open('shashes.pickle', 'wb') as f:
+        pickle.dump(shashes, f)
+    with open('thashes.pickle', 'wb') as f:
+        pickle.dump(thashes, f)
+
+if __name__ == "__main__":
+    main()
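
For reference, the two stages compose like this: `hash-seg.py -a` appends an xxh64 hash of an aggressively normalised source and target to each tab-separated line, and `superdedup.py` only lets a line through if neither hash has been seen before. The sketch below reproduces that logic in memory for a list of sentence pairs; it is illustrative only, the names `aggressive_key` and `dedup_pairs` are not part of the patch, and it assumes the same `unidecode` and `xxhash` packages the scripts already depend on.

```python
#!/usr/bin/env python3
# Illustrative sketch only (not part of the patch): the combined effect of
# `hash-seg.py -a | superdedup.py` on an in-memory list of sentence pairs.
import sys
from unicodedata import category
from unidecode import unidecode
from xxhash import xxh64

# Same idea as hash-seg.py: a translation table that strips every non-letter.
_non_letters = ''.join(chr(i) for i in range(sys.maxunicode)
                       if not category(chr(i)).startswith('L'))
_remove_non_alpha = str.maketrans('', '', _non_letters)


def aggressive_key(text: str) -> str:
    """Lowercase, drop non-letters and transliterate, as the -a flag does."""
    return unidecode(text.lower().translate(_remove_non_alpha))


def dedup_pairs(pairs, shashes=None, thashes=None):
    """Yield pairs whose source AND target hashes are both unseen.

    shashes/thashes play the role of shashes.pickle/thashes.pickle:
    pass the same sets for several corpora to deduplicate across files.
    """
    shashes = set() if shashes is None else shashes
    thashes = set() if thashes is None else thashes
    for src, trg in pairs:
        src_hash = xxh64(aggressive_key(src)).hexdigest()
        trg_hash = xxh64(aggressive_key(trg)).hexdigest()
        if src_hash not in shashes and trg_hash not in thashes:
            shashes.add(src_hash)
            thashes.add(trg_hash)
            yield src, trg


if __name__ == "__main__":
    sample = [
        ("Hello, world!", "你好，世界！"),
        ("HELLO WORLD", "你好 世界"),  # differs only in case/punctuation
        ("Goodbye.", "再见。"),
    ]
    for src, trg in dedup_pairs(sample):
        print(f"{src}\t{trg}")
    # Prints the first and third pair; the near-duplicate second pair is dropped.
```

In the actual pipeline the seen-hash sets are persisted with `pickle` between invocations (see `superdedup.py`) rather than passed around, and `cut -f 1,2` in `dedup.sh` restores the original two-column format after filtering.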