diff --git a/build_openwebtext_pretraining_dataset.py b/build_openwebtext_pretraining_dataset.py
index dd96368..9fd489a 100644
--- a/build_openwebtext_pretraining_dataset.py
+++ b/build_openwebtext_pretraining_dataset.py
@@ -44,7 +44,8 @@ def log(*args):
       max_seq_length=args.max_seq_length,
       num_jobs=args.num_processes,
       blanks_separate_docs=False,
-      do_lower_case=args.do_lower_case
+      do_lower_case=args.do_lower_case,
+      strip_accents=args.strip_accents,
   )
   log("Writing tf examples")
   fnames = sorted(tf.io.gfile.listdir(owt_dir))
@@ -79,11 +80,23 @@ def main():
                       help="Number of tokens per example.")
   parser.add_argument("--num-processes", default=1, type=int,
                       help="Parallelize across multiple processes.")
+
+  # toggle lower-case
   parser.add_argument("--do-lower-case", dest='do_lower_case',
                       action='store_true', help="Lower case input text.")
   parser.add_argument("--no-lower-case", dest='do_lower_case',
                       action='store_false', help="Don't lower case input text.")
+
+  # toggle strip-accents
+  parser.add_argument("--do-strip-accents", dest='strip_accents',
+                      action='store_true', help="Strip accents (default).")
+  parser.add_argument("--no-strip-accents", dest='strip_accents',
+                      action='store_false', help="Don't strip accents.")
+
+  # set defaults for toggles
   parser.set_defaults(do_lower_case=True)
+  parser.set_defaults(strip_accents=True)
+
   args = parser.parse_args()
 
   utils.rmkdir(os.path.join(args.data_dir, "pretrain_tfrecords"))
diff --git a/build_pretraining_dataset.py b/build_pretraining_dataset.py
index 547b4b1..29e3621 100644
--- a/build_pretraining_dataset.py
+++ b/build_pretraining_dataset.py
@@ -119,11 +119,12 @@ class ExampleWriter(object):
 
   def __init__(self, job_id, vocab_file, output_dir, max_seq_length,
                num_jobs, blanks_separate_docs, do_lower_case,
-               num_out_files=1000):
+               num_out_files=1000, strip_accents=True):
     self._blanks_separate_docs = blanks_separate_docs
     tokenizer = tokenization.FullTokenizer(
         vocab_file=vocab_file,
-        do_lower_case=do_lower_case)
+        do_lower_case=do_lower_case,
+        strip_accents=strip_accents)
     self._example_builder = ExampleBuilder(tokenizer, max_seq_length)
     self._writers = []
     for i in range(num_out_files):
@@ -171,7 +172,8 @@ def log(*args):
       max_seq_length=args.max_seq_length,
       num_jobs=args.num_processes,
       blanks_separate_docs=args.blanks_separate_docs,
-      do_lower_case=args.do_lower_case
+      do_lower_case=args.do_lower_case,
+      strip_accents=args.strip_accents,
   )
   log("Writing tf examples")
   fnames = sorted(tf.io.gfile.listdir(args.corpus_dir))
@@ -206,11 +208,23 @@ def main():
                       help="Parallelize across multiple processes.")
   parser.add_argument("--blanks-separate-docs", default=True, type=bool,
                       help="Whether blank lines indicate document boundaries.")
+
+  # toggle lower-case
   parser.add_argument("--do-lower-case", dest='do_lower_case',
                       action='store_true', help="Lower case input text.")
   parser.add_argument("--no-lower-case", dest='do_lower_case',
                       action='store_false', help="Don't lower case input text.")
+
+  # toggle strip-accents
+  parser.add_argument("--do-strip-accents", dest='strip_accents',
+                      action='store_true', help="Strip accents (default).")
+  parser.add_argument("--no-strip-accents", dest='strip_accents',
+                      action='store_false', help="Don't strip accents.")
+
+  # set defaults for toggles
   parser.set_defaults(do_lower_case=True)
+  parser.set_defaults(strip_accents=True)
+
   args = parser.parse_args()
 
   utils.rmkdir(args.output_dir)
diff --git a/model/tokenization.py b/model/tokenization.py
index b345256..2684369 100644
--- a/model/tokenization.py
+++ b/model/tokenization.py
@@ -112,10 +112,17 @@ def whitespace_tokenize(text):
 class FullTokenizer(object):
   """Runs end-to-end tokenziation."""
 
-  def __init__(self, vocab_file, do_lower_case=True):
+  def __init__(self, vocab_file, do_lower_case=True, strip_accents=True):
+    """Constructs a FullTokenizer.
+
+    Args:
+      vocab_file: The vocabulary file.
+      do_lower_case: Whether to lower case the input.
+      strip_accents: Whether to strip the accents.
+    """
     self.vocab = load_vocab(vocab_file)
     self.inv_vocab = {v: k for k, v in self.vocab.items()}
-    self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
+    self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case, strip_accents=strip_accents)
     self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
 
   def tokenize(self, text):
@@ -136,13 +143,15 @@ def convert_ids_to_tokens(self, ids):
 class BasicTokenizer(object):
   """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
 
-  def __init__(self, do_lower_case=True):
+  def __init__(self, do_lower_case=True, strip_accents=True):
     """Constructs a BasicTokenizer.
 
     Args:
       do_lower_case: Whether to lower case the input.
+      strip_accents: Whether to strip the accents.
     """
     self.do_lower_case = do_lower_case
+    self.strip_accents = strip_accents
 
   def tokenize(self, text):
     """Tokenizes a piece of text."""
@@ -162,7 +171,8 @@ def tokenize(self, text):
     for token in orig_tokens:
       if self.do_lower_case:
         token = token.lower()
-        token = self._run_strip_accents(token)
+      if self.strip_accents:
+        token = self._run_strip_accents(token)
       split_tokens.extend(self._run_split_on_punc(token))
 
     output_tokens = whitespace_tokenize(" ".join(split_tokens))
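
For reference, a minimal sketch of how the new toggle is expected to behave when the patched tokenizer is used directly (assuming the patched file is importable as model.tokenization; the example strings are illustrative only). The same switch is exposed on the dataset-building scripts via --do-strip-accents / --no-strip-accents, with stripping remaining the default.

    from model import tokenization

    # Default keeps the previous behaviour: lower-case and strip accents.
    tok = tokenization.BasicTokenizer(do_lower_case=True, strip_accents=True)
    print(tok.tokenize("Café Zürich"))   # expected: ['cafe', 'zurich']

    # With strip_accents=False (--no-strip-accents on the scripts),
    # accents are preserved while lower-casing still applies.
    tok = tokenization.BasicTokenizer(do_lower_case=True, strip_accents=False)
    print(tok.tokenize("Café Zürich"))   # expected: ['café', 'zürich']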