From a6435fade11117fd0ef16ee645f16c9440cb9167 Mon Sep 17 00:00:00 2001
From: root
Date: Mon, 19 Aug 2024 23:42:50 +0000
Subject: [PATCH] simple change to enable mapping functions for ft constructor

---
 llmfoundry/data/finetuning/tasks.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py
index 23929f9b85..754b203f34 100644
--- a/llmfoundry/data/finetuning/tasks.py
+++ b/llmfoundry/data/finetuning/tasks.py
@@ -801,6 +801,7 @@ def build_from_hf(
         split: str,
         safe_load: bool = False,
         max_seq_len: int = 2048,
+        mapping_fn: Callable = tokenize_formatted_example,
         preprocessing_fn: Optional[Callable[[dict[str, Any]], Example]] = None,
         tokenizer: Optional[PreTrainedTokenizerBase] = None,
         target_prompts: str = DEFAULT_TARGET_PROMPTS,
@@ -930,11 +931,11 @@ def build_from_hf(
 
         def dataset_mapper(example: dict):
             if preprocessing_fn is not None:
-                return tokenize_formatted_example(
+                return mapping_fn(
                     preprocessing_fn(example),
                     tokenizer,
                 )
-            return tokenize_formatted_example(example, tokenizer)
+            return mapping_fn(example, tokenizer)
 
         detected_cpu_count = os.cpu_count() or 1
         detected_cpus_with_margin = detected_cpu_count - 8