From b4147246106b597c351b935200fd52323412f11c Mon Sep 17 00:00:00 2001
From: Philip May
Date: Fri, 31 Jul 2020 13:00:11 +0200
Subject: [PATCH 01/14] No _run_strip_accents

---
 model/tokenization.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/model/tokenization.py b/model/tokenization.py
index b345256..2975a19 100644
--- a/model/tokenization.py
+++ b/model/tokenization.py
@@ -162,7 +162,7 @@ def tokenize(self, text):
     for token in orig_tokens:
       if self.do_lower_case:
         token = token.lower()
-        token = self._run_strip_accents(token)
+        #token = self._run_strip_accents(token)
       split_tokens.extend(self._run_split_on_punc(token))

     output_tokens = whitespace_tokenize(" ".join(split_tokens))

From 988a077419b5eede9dd1a55a1e48be57fa8a25c7 Mon Sep 17 00:00:00 2001
From: Philip May
Date: Fri, 31 Jul 2020 13:21:23 +0200
Subject: [PATCH 02/14] add strip_accents param

---
 model/tokenization.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/model/tokenization.py b/model/tokenization.py
index 2975a19..4cf5269 100644
--- a/model/tokenization.py
+++ b/model/tokenization.py
@@ -112,10 +112,10 @@ def whitespace_tokenize(text):
 class FullTokenizer(object):
   """Runs end-to-end tokenziation."""

-  def __init__(self, vocab_file, do_lower_case=True):
+  def __init__(self, vocab_file, do_lower_case=True, strip_accents=True):
     self.vocab = load_vocab(vocab_file)
     self.inv_vocab = {v: k for k, v in self.vocab.items()}
-    self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
+    self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case, strip_accents=strip_accents)
     self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)

   def tokenize(self, text):
@@ -136,13 +136,15 @@ def convert_ids_to_tokens(self, ids):
 class BasicTokenizer(object):
   """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""

-  def __init__(self, do_lower_case=True):
+  def __init__(self, do_lower_case=True, strip_accents=True):
     """Constructs a BasicTokenizer.

     Args:
       do_lower_case: Whether to lower case the input.
+      strip_accents: Whether to strip the accents.
     """
     self.do_lower_case = do_lower_case
+    self.strip_accents = strip_accents

   def tokenize(self, text):
     """Tokenizes a piece of text."""
@@ -162,7 +164,8 @@ def tokenize(self, text):
     for token in orig_tokens:
       if self.do_lower_case:
         token = token.lower()
-        #token = self._run_strip_accents(token)
+        if strip_accents:
+          token = self._run_strip_accents(token)
       split_tokens.extend(self._run_split_on_punc(token))

     output_tokens = whitespace_tokenize(" ".join(split_tokens))

From 13b196cf153a68ff6b36b803559be1e866a21f69 Mon Sep 17 00:00:00 2001
From: Philip May
Date: Fri, 31 Jul 2020 13:26:23 +0200
Subject: [PATCH 03/14] Fix bug accessing strip_accents

---
 model/tokenization.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/model/tokenization.py b/model/tokenization.py
index 4cf5269..2774d5f 100644
--- a/model/tokenization.py
+++ b/model/tokenization.py
@@ -164,7 +164,7 @@ def tokenize(self, text):
     for token in orig_tokens:
       if self.do_lower_case:
         token = token.lower()
-        if strip_accents:
+        if self.strip_accents:
           token = self._run_strip_accents(token)
       split_tokens.extend(self._run_split_on_punc(token))
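Patches 01-03 turn the unconditional accent stripping into a strip_accents switch. For context, _run_strip_accents in the stock BERT/ELECTRA tokenizer NFD-normalizes the text and drops combining marks; the sketch below reproduces that behavior standalone (the helper name is illustrative, not code from this repo):

    import unicodedata

    def strip_accents_demo(text):
        # Decompose each character (e.g. "ä" -> "a" + combining diaeresis),
        # then drop the combining marks (Unicode category "Mn").
        text = unicodedata.normalize("NFD", text)
        return "".join(ch for ch in text if unicodedata.category(ch) != "Mn")

    print(strip_accents_demo("Käse läuft"))  # -> "Kase lauft"

Stripping conflates words that differ only in diacritics (e.g. "schon" vs. "schön"), which is exactly the behavior the new switch lets a caller turn off.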
From 229f7854be3c0bdbb1d5b8232106d6f9e9979833 Mon Sep 17 00:00:00 2001
From: Philip May
Date: Fri, 31 Jul 2020 13:40:20 +0200
Subject: [PATCH 04/14] Add strip_acc. opt. to build_pretraining_dataset

---
 build_pretraining_dataset.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/build_pretraining_dataset.py b/build_pretraining_dataset.py
index 547b4b1..f85e1b1 100644
--- a/build_pretraining_dataset.py
+++ b/build_pretraining_dataset.py
@@ -119,11 +119,12 @@ class ExampleWriter(object):

   def __init__(self, job_id, vocab_file, output_dir, max_seq_length,
                num_jobs, blanks_separate_docs, do_lower_case,
-               num_out_files=1000):
+               num_out_files=1000, strip_accents):
     self._blanks_separate_docs = blanks_separate_docs
     tokenizer = tokenization.FullTokenizer(
         vocab_file=vocab_file,
-        do_lower_case=do_lower_case)
+        do_lower_case=do_lower_case,
+        strip_accents=strip_accents)
     self._example_builder = ExampleBuilder(tokenizer, max_seq_length)
     self._writers = []
     for i in range(num_out_files):
@@ -171,7 +172,8 @@ def log(*args):
       max_seq_length=args.max_seq_length,
       num_jobs=args.num_processes,
       blanks_separate_docs=args.blanks_separate_docs,
-      do_lower_case=args.do_lower_case
+      do_lower_case=args.do_lower_case,
+      strip_accents=args.strip_accents,
   )
   log("Writing tf examples")
   fnames = sorted(tf.io.gfile.listdir(args.corpus_dir))
@@ -210,7 +212,12 @@ def main():
                       action='store_true', help="Lower case input text.")
   parser.add_argument("--no-lower-case", dest='do_lower_case',
                       action='store_false', help="Don't lower case input text.")
+  parser.add_argument("--strip_accents", dest='strip_accents',
+                      action='store_true', help="Strip accents.")
+  parser.add_argument("--no-strip_accents", dest='strip_accents',
+                      action='store_false', help="Don't strip accents.")
   parser.set_defaults(do_lower_case=True)
+  parser.set_defaults(strip_accents=True)
   args = parser.parse_args()

   utils.rmkdir(args.output_dir)

From cde962e9e40f99ef62022b3f851f97fa786dc327 Mon Sep 17 00:00:00 2001
From: Philip May
Date: Fri, 31 Jul 2020 22:14:29 +0200
Subject: [PATCH 05/14] Update build_pretraining_dataset.py

---
 build_pretraining_dataset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/build_pretraining_dataset.py b/build_pretraining_dataset.py
index f85e1b1..9ab8356 100644
--- a/build_pretraining_dataset.py
+++ b/build_pretraining_dataset.py
@@ -212,7 +212,7 @@ def main():
                       action='store_true', help="Lower case input text.")
   parser.add_argument("--no-lower-case", dest='do_lower_case',
                       action='store_false', help="Don't lower case input text.")
-  parser.add_argument("--strip_accents", dest='strip_accents',
+  parser.add_argument("--do-strip_accents", dest='strip_accents',
                       action='store_true', help="Strip accents.")
   parser.add_argument("--no-strip_accents", dest='strip_accents',
                       action='store_false', help="Don't strip accents.")

From 719d16b67f3ebb4a904409dfba2482dc3f8479f5 Mon Sep 17 00:00:00 2001
From: Philip May
Date: Sat, 1 Aug 2020 09:09:45 +0200
Subject: [PATCH 06/14] Update build_pretraining_dataset.py

---
 build_pretraining_dataset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/build_pretraining_dataset.py b/build_pretraining_dataset.py
index 9ab8356..8462553 100644
--- a/build_pretraining_dataset.py
+++ b/build_pretraining_dataset.py
@@ -119,7 +119,7 @@ class ExampleWriter(object):

   def __init__(self, job_id, vocab_file, output_dir, max_seq_length,
                num_jobs, blanks_separate_docs, do_lower_case,
-               num_out_files=1000, strip_accents):
+               num_out_files=1000, strip_accents=strip_accents):
     self._blanks_separate_docs = blanks_separate_docs
     tokenizer = tokenization.FullTokenizer(
         vocab_file=vocab_file,
From 2923de7f69b088aead15c1cc7a14f7201161cb13 Mon Sep 17 00:00:00 2001
From: Philip May
Date: Sat, 1 Aug 2020 09:10:47 +0200
Subject: [PATCH 07/14] Update build_pretraining_dataset.py

---
 build_pretraining_dataset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/build_pretraining_dataset.py b/build_pretraining_dataset.py
index 8462553..9ab8356 100644
--- a/build_pretraining_dataset.py
+++ b/build_pretraining_dataset.py
@@ -119,7 +119,7 @@ class ExampleWriter(object):

   def __init__(self, job_id, vocab_file, output_dir, max_seq_length,
                num_jobs, blanks_separate_docs, do_lower_case,
-               num_out_files=1000, strip_accents=strip_accents):
+               num_out_files=1000, strip_accents):
     self._blanks_separate_docs = blanks_separate_docs
     tokenizer = tokenization.FullTokenizer(
         vocab_file=vocab_file,

From 575f96a6a731a82589fe5c1875a8e2c0655a9872 Mon Sep 17 00:00:00 2001
From: Philip May
Date: Sat, 1 Aug 2020 09:11:51 +0200
Subject: [PATCH 08/14] Update build_pretraining_dataset.py

---
 build_pretraining_dataset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/build_pretraining_dataset.py b/build_pretraining_dataset.py
index 9ab8356..bb3c052 100644
--- a/build_pretraining_dataset.py
+++ b/build_pretraining_dataset.py
@@ -119,7 +119,7 @@ class ExampleWriter(object):

   def __init__(self, job_id, vocab_file, output_dir, max_seq_length,
                num_jobs, blanks_separate_docs, do_lower_case,
-               num_out_files=1000, strip_accents):
+               num_out_files=1000, strip_accents=True):
     self._blanks_separate_docs = blanks_separate_docs
     tokenizer = tokenization.FullTokenizer(
         vocab_file=vocab_file,
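Patches 04-08 circle around Python's parameter rules: a parameter without a default may not follow one with a default, so patch 04's num_out_files=1000, strip_accents is a SyntaxError; patch 06's attempted fix strip_accents=strip_accents raises a NameError instead, because the default expression is evaluated once at definition time in the enclosing scope; patch 07 reverts it, and patch 08 settles on strip_accents=True. A toy reduction of the three signatures:

    # def f(num_out_files=1000, strip_accents): ...                -> SyntaxError
    # def f(num_out_files=1000, strip_accents=strip_accents): ...  -> NameError
    def f(num_out_files=1000, strip_accents=True):  # patch 08's working form
        return strip_accents

    assert f() is True
    assert f(strip_accents=False) is False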
From 3bbce097659af54c0f186bd2af449d4d66b313c7 Mon Sep 17 00:00:00 2001
From: PhilipMay
Date: Sat, 1 Aug 2020 10:35:00 +0200
Subject: [PATCH 09/14] rename strip-accents command line argument

---
 build_pretraining_dataset.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/build_pretraining_dataset.py b/build_pretraining_dataset.py
index bb3c052..5789f4d 100644
--- a/build_pretraining_dataset.py
+++ b/build_pretraining_dataset.py
@@ -212,9 +212,9 @@ def main():
                       action='store_true', help="Lower case input text.")
   parser.add_argument("--no-lower-case", dest='do_lower_case',
                       action='store_false', help="Don't lower case input text.")
-  parser.add_argument("--do-strip_accents", dest='strip_accents',
+  parser.add_argument("--do-strip-accents", dest='strip_accents',
                       action='store_true', help="Strip accents.")
-  parser.add_argument("--no-strip_accents", dest='strip_accents',
+  parser.add_argument("--no-strip-accents", dest='strip_accents',
                       action='store_false', help="Don't strip accents.")
   parser.set_defaults(do_lower_case=True)
   parser.set_defaults(strip_accents=True)

From 229efd20fbf4fbace54363f0fc7de8ab5eca184e Mon Sep 17 00:00:00 2001
From: PhilipMay
Date: Sat, 1 Aug 2020 10:39:36 +0200
Subject: [PATCH 10/14] code doc for command line params

---
 build_pretraining_dataset.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/build_pretraining_dataset.py b/build_pretraining_dataset.py
index 5789f4d..6380eb2 100644
--- a/build_pretraining_dataset.py
+++ b/build_pretraining_dataset.py
@@ -208,16 +208,23 @@ def main():
                       help="Parallelize across multiple processes.")
   parser.add_argument("--blanks-separate-docs", default=True, type=bool,
                       help="Whether blank lines indicate document boundaries.")
+
+  # toggle lower-case 
   parser.add_argument("--do-lower-case", dest='do_lower_case',
                       action='store_true', help="Lower case input text.")
   parser.add_argument("--no-lower-case", dest='do_lower_case',
                       action='store_false', help="Don't lower case input text.")
+
+  # toggle strip-accents
   parser.add_argument("--do-strip-accents", dest='strip_accents',
                       action='store_true', help="Strip accents.")
   parser.add_argument("--no-strip-accents", dest='strip_accents',
-                      action='store_false', help="Don't strip accents.")
+                      action='store_false', help="Don't strip accents.") 
+
+  # set defaults for toggles
   parser.set_defaults(do_lower_case=True)
   parser.set_defaults(strip_accents=True)
+
   args = parser.parse_args()

   utils.rmkdir(args.output_dir)

From 1cd5c4c471b22e0be022eccefb3553617b0b2cb4 Mon Sep 17 00:00:00 2001
From: PhilipMay
Date: Sat, 1 Aug 2020 10:43:21 +0200
Subject: [PATCH 11/14] trim trailing whitespace

---
 build_pretraining_dataset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/build_pretraining_dataset.py b/build_pretraining_dataset.py
index 6380eb2..44f7cdd 100644
--- a/build_pretraining_dataset.py
+++ b/build_pretraining_dataset.py
@@ -209,7 +209,7 @@ def main():
   parser.add_argument("--blanks-separate-docs", default=True, type=bool,
                       help="Whether blank lines indicate document boundaries.")

-  # toggle lower-case 
+  # toggle lower-case
   parser.add_argument("--do-lower-case", dest='do_lower_case',
                       action='store_true', help="Lower case input text.")
   parser.add_argument("--no-lower-case", dest='do_lower_case',

From 5e66ae73ebc5614a3a5b23c75676fe9c51a38d86 Mon Sep 17 00:00:00 2001
From: PhilipMay
Date: Sat, 1 Aug 2020 10:47:32 +0200
Subject: [PATCH 12/14] add strip_accents toggle to build_openweb

---
 build_openwebtext_pretraining_dataset.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/build_openwebtext_pretraining_dataset.py b/build_openwebtext_pretraining_dataset.py
index dd96368..cfd3f3f 100644
--- a/build_openwebtext_pretraining_dataset.py
+++ b/build_openwebtext_pretraining_dataset.py
@@ -44,7 +44,8 @@ def log(*args):
       max_seq_length=args.max_seq_length,
       num_jobs=args.num_processes,
       blanks_separate_docs=False,
-      do_lower_case=args.do_lower_case
+      do_lower_case=args.do_lower_case,
+      strip_accents=args.strip_accents,
   )
   log("Writing tf examples")
   fnames = sorted(tf.io.gfile.listdir(owt_dir))
@@ -79,11 +80,23 @@ def main():
                       help="Number of tokens per example.")
   parser.add_argument("--num-processes", default=1, type=int,
                       help="Parallelize across multiple processes.")
+
+  # toggle lower-case
   parser.add_argument("--do-lower-case", dest='do_lower_case',
                       action='store_true', help="Lower case input text.")
   parser.add_argument("--no-lower-case", dest='do_lower_case',
                       action='store_false', help="Don't lower case input text.")
+
+  # toggle strip-accents
+  parser.add_argument("--do-strip-accents", dest='strip_accents',
+                      action='store_true', help="Strip accents.")
+  parser.add_argument("--no-strip-accents", dest='strip_accents',
+                      action='store_false', help="Don't strip accents.")
+
+  # set defaults for toggles
   parser.set_defaults(do_lower_case=True)
+  parser.set_defaults(strip_accents=True)
+
   args = parser.parse_args()

   utils.rmkdir(os.path.join(args.data_dir, "pretrain_tfrecords"))
From 9dd0028bbac878d37d4912284707f9df11c0eb4a Mon Sep 17 00:00:00 2001
From: PhilipMay
Date: Sat, 1 Aug 2020 10:52:17 +0200
Subject: [PATCH 13/14] command line doc

---
 build_openwebtext_pretraining_dataset.py | 2 +-
 build_pretraining_dataset.py             | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/build_openwebtext_pretraining_dataset.py b/build_openwebtext_pretraining_dataset.py
index cfd3f3f..9fd489a 100644
--- a/build_openwebtext_pretraining_dataset.py
+++ b/build_openwebtext_pretraining_dataset.py
@@ -89,7 +89,7 @@ def main():

   # toggle strip-accents
   parser.add_argument("--do-strip-accents", dest='strip_accents',
-                      action='store_true', help="Strip accents.")
+                      action='store_true', help="Strip accents (default).")
   parser.add_argument("--no-strip-accents", dest='strip_accents',
                       action='store_false', help="Don't strip accents.")

diff --git a/build_pretraining_dataset.py b/build_pretraining_dataset.py
index 44f7cdd..29e3621 100644
--- a/build_pretraining_dataset.py
+++ b/build_pretraining_dataset.py
@@ -217,7 +217,7 @@ def main():

   # toggle strip-accents
   parser.add_argument("--do-strip-accents", dest='strip_accents',
-                      action='store_true', help="Strip accents.")
+                      action='store_true', help="Strip accents (default).")
   parser.add_argument("--no-strip-accents", dest='strip_accents',
                       action='store_false', help="Don't strip accents.")

From ce4b8699750d9e5a2b4534ccc20815d6a41ca6c5 Mon Sep 17 00:00:00 2001
From: PhilipMay
Date: Sat, 1 Aug 2020 10:56:14 +0200
Subject: [PATCH 14/14] Docstring for accents

---
 model/tokenization.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/model/tokenization.py b/model/tokenization.py
index 2774d5f..2684369 100644
--- a/model/tokenization.py
+++ b/model/tokenization.py
@@ -113,6 +113,13 @@ class FullTokenizer(object):
   """Runs end-to-end tokenziation."""

   def __init__(self, vocab_file, do_lower_case=True, strip_accents=True):
+    """Constructs a FullTokenizer.
+
+    Args:
+      vocab_file: The vocabulary file.
+      do_lower_case: Whether to lower case the input.
+      strip_accents: Whether to strip the accents.
+    """
     self.vocab = load_vocab(vocab_file)
     self.inv_vocab = {v: k for k, v in self.vocab.items()}
     self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case, strip_accents=strip_accents)
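With the full series applied, accent stripping defaults to on and is toggled per run by the paired flags. A usage sketch for build_pretraining_dataset.py follows; --corpus-dir and --output-dir are grounded in args.corpus_dir and args.output_dir in the diffs above, while --vocab-file does not appear in these diffs and is an assumption:

    # Build a pretraining dataset without accent stripping
    # (e.g. to keep German umlauts distinct):
    python build_pretraining_dataset.py \
        --corpus-dir data/corpus \
        --vocab-file data/vocab.txt \
        --output-dir data/pretrain_tfrecords \
        --num-processes 4 \
        --no-strip-accents

Since --do-strip-accents and --no-strip-accents share dest='strip_accents' and parser.set_defaults(strip_accents=True) supplies the default, omitting both flags is equivalent to passing --do-strip-accents.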