From e73a97a2b338fd4bf3d97034b37dfcb29de0cb25 Mon Sep 17 00:00:00 2001 From: Locke Date: Tue, 25 Jun 2024 19:35:50 +0800 Subject: [PATCH] add preprocessing_num_workers to run_classification.py (#31586) Add a preprocessing_num_workers option to speed up preprocessing. --- examples/pytorch/text-classification/run_classification.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/examples/pytorch/text-classification/run_classification.py b/examples/pytorch/text-classification/run_classification.py index 5decef3656b9a4..ff05b78cb538ec 100755 --- a/examples/pytorch/text-classification/run_classification.py +++ b/examples/pytorch/text-classification/run_classification.py @@ -133,6 +133,10 @@ class DataTrainingArguments: ) }, ) + preprocessing_num_workers: Optional[int] = field( + default=None, + metadata={"help": "The number of processes to use for the preprocessing."}, + ) overwrite_cache: bool = field( default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."} ) @@ -573,6 +577,7 @@ def preprocess_function(examples): raw_datasets = raw_datasets.map( preprocess_function, batched=True, + num_proc=data_args.preprocessing_num_workers, load_from_cache_file=not data_args.overwrite_cache, desc="Running tokenizer on dataset", )