Merge pull request #88 from PhilipMay/no_strip_accents
Add toggle to turn off `strip_accents`.
clarkkev authored Mar 31, 2021
2 parents f93f3f8 + ce4b869 commit 8a46635
Showing 3 changed files with 45 additions and 8 deletions.
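For orientation, a minimal sketch of what the new toggle controls at the tokenizer level. It assumes the repository root is on PYTHONPATH and that a BERT-style vocab.txt exists; both are assumptions for illustration, not part of this diff.

```python
from model import tokenization

# strip_accents is the keyword this PR introduces; it defaults to True,
# which preserves the previous behavior.
tokenizer = tokenization.FullTokenizer(
    vocab_file="vocab.txt",  # placeholder path (assumption)
    do_lower_case=True,
    strip_accents=False)     # keep accents: "Müller" is not collapsed to "Muller"
print(tokenizer.tokenize("Müller"))
```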
build_openwebtext_pretraining_dataset.py (14 additions, 1 deletion)
```diff
@@ -44,7 +44,8 @@ def log(*args):
       max_seq_length=args.max_seq_length,
       num_jobs=args.num_processes,
       blanks_separate_docs=False,
-      do_lower_case=args.do_lower_case
+      do_lower_case=args.do_lower_case,
+      strip_accents=args.strip_accents,
   )
   log("Writing tf examples")
   fnames = sorted(tf.io.gfile.listdir(owt_dir))
@@ -79,11 +80,23 @@ def main():
                       help="Number of tokens per example.")
   parser.add_argument("--num-processes", default=1, type=int,
                       help="Parallelize across multiple processes.")
+
+  # toggle lower-case
   parser.add_argument("--do-lower-case", dest='do_lower_case',
                       action='store_true', help="Lower case input text.")
   parser.add_argument("--no-lower-case", dest='do_lower_case',
                       action='store_false', help="Don't lower case input text.")
+
+  # toggle strip-accents
+  parser.add_argument("--do-strip-accents", dest='strip_accents',
+                      action='store_true', help="Strip accents (default).")
+  parser.add_argument("--no-strip-accents", dest='strip_accents',
+                      action='store_false', help="Don't strip accents.")
+
+  # set defaults for toggles
   parser.set_defaults(do_lower_case=True)
+  parser.set_defaults(strip_accents=True)
+
   args = parser.parse_args()
 
   utils.rmkdir(os.path.join(args.data_dir, "pretrain_tfrecords"))
```
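The paired flags above share a single dest, so each one simply overwrites the default installed by parser.set_defaults. A self-contained sketch of the pattern as used here:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--do-strip-accents", dest="strip_accents",
                    action="store_true", help="Strip accents (default).")
parser.add_argument("--no-strip-accents", dest="strip_accents",
                    action="store_false", help="Don't strip accents.")
parser.set_defaults(strip_accents=True)

print(parser.parse_args([]).strip_accents)                      # True
print(parser.parse_args(["--no-strip-accents"]).strip_accents)  # False
```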
build_pretraining_dataset.py (17 additions, 3 deletions)
```diff
@@ -119,11 +119,12 @@ class ExampleWriter(object):
 
   def __init__(self, job_id, vocab_file, output_dir, max_seq_length,
                num_jobs, blanks_separate_docs, do_lower_case,
-               num_out_files=1000):
+               num_out_files=1000, strip_accents=True):
     self._blanks_separate_docs = blanks_separate_docs
     tokenizer = tokenization.FullTokenizer(
         vocab_file=vocab_file,
-        do_lower_case=do_lower_case)
+        do_lower_case=do_lower_case,
+        strip_accents=strip_accents)
     self._example_builder = ExampleBuilder(tokenizer, max_seq_length)
     self._writers = []
     for i in range(num_out_files):
@@ -171,7 +172,8 @@ def log(*args):
       max_seq_length=args.max_seq_length,
       num_jobs=args.num_processes,
       blanks_separate_docs=args.blanks_separate_docs,
-      do_lower_case=args.do_lower_case
+      do_lower_case=args.do_lower_case,
+      strip_accents=args.strip_accents,
   )
   log("Writing tf examples")
   fnames = sorted(tf.io.gfile.listdir(args.corpus_dir))
@@ -206,11 +208,23 @@ def main():
                       help="Parallelize across multiple processes.")
   parser.add_argument("--blanks-separate-docs", default=True, type=bool,
                       help="Whether blank lines indicate document boundaries.")
+
+  # toggle lower-case
   parser.add_argument("--do-lower-case", dest='do_lower_case',
                       action='store_true', help="Lower case input text.")
   parser.add_argument("--no-lower-case", dest='do_lower_case',
                       action='store_false', help="Don't lower case input text.")
+
+  # toggle strip-accents
+  parser.add_argument("--do-strip-accents", dest='strip_accents',
+                      action='store_true', help="Strip accents (default).")
+  parser.add_argument("--no-strip-accents", dest='strip_accents',
+                      action='store_false', help="Don't strip accents.")
+
+  # set defaults for toggles
   parser.set_defaults(do_lower_case=True)
+  parser.set_defaults(strip_accents=True)
+
   args = parser.parse_args()
 
   utils.rmkdir(args.output_dir)
```
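With the keyword threaded through ExampleWriter, accent stripping can be disabled when building examples. A hedged sketch, assuming the ELECTRA repo is importable, TensorFlow is installed, and the vocab/output paths exist (all values are illustrative):

```python
import build_pretraining_dataset

writer = build_pretraining_dataset.ExampleWriter(
    job_id=0,
    vocab_file="vocab.txt",           # assumption: BERT-style vocab file
    output_dir="pretrain_tfrecords",  # assumption: writable directory
    max_seq_length=128,
    num_jobs=1,
    blanks_separate_docs=True,
    do_lower_case=True,
    num_out_files=1,                  # kept small for the sketch
    strip_accents=False)              # new keyword; defaults to True
```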
model/tokenization.py (14 additions, 4 deletions)
```diff
@@ -112,10 +112,17 @@ def whitespace_tokenize(text):
 class FullTokenizer(object):
   """Runs end-to-end tokenziation."""
 
-  def __init__(self, vocab_file, do_lower_case=True):
+  def __init__(self, vocab_file, do_lower_case=True, strip_accents=True):
+    """Constructs a FullTokenizer.
+
+    Args:
+      vocab_file: The vocabulary file.
+      do_lower_case: Whether to lower case the input.
+      strip_accents: Whether to strip the accents.
+    """
     self.vocab = load_vocab(vocab_file)
     self.inv_vocab = {v: k for k, v in self.vocab.items()}
-    self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
+    self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case, strip_accents=strip_accents)
     self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
 
   def tokenize(self, text):
@@ -136,13 +143,15 @@ def convert_ids_to_tokens(self, ids):
 class BasicTokenizer(object):
   """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
 
-  def __init__(self, do_lower_case=True):
+  def __init__(self, do_lower_case=True, strip_accents=True):
     """Constructs a BasicTokenizer.
 
     Args:
       do_lower_case: Whether to lower case the input.
+      strip_accents: Whether to strip the accents.
     """
     self.do_lower_case = do_lower_case
+    self.strip_accents = strip_accents
 
   def tokenize(self, text):
     """Tokenizes a piece of text."""
@@ -162,7 +171,8 @@ def tokenize(self, text):
     for token in orig_tokens:
       if self.do_lower_case:
         token = token.lower()
-        token = self._run_strip_accents(token)
+      if self.strip_accents:
+        token = self._run_strip_accents(token)
       split_tokens.extend(self._run_split_on_punc(token))
 
     output_tokens = whitespace_tokenize(" ".join(split_tokens))
```
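_run_strip_accents itself is untouched by this PR and does not appear in the diff; in BERT-style tokenizers it works by NFD-normalizing the text and dropping combining marks. A standalone sketch of that behavior (the helper name mirrors the method but is written here from scratch), which shows why accent-sensitive languages such as German may want --no-strip-accents:

```python
import unicodedata

def run_strip_accents(text):
  # Decompose characters ("ü" -> "u" + combining diaeresis), then drop
  # the combining marks (Unicode category "Mn").
  text = unicodedata.normalize("NFD", text)
  return "".join(ch for ch in text if unicodedata.category(ch) != "Mn")

print(run_strip_accents("Müller"))  # -> "Muller"
```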
