Skip to content

Commit

Permalink
⛓️‍💥 Don't use eval_dataset in scripts when no eval strategy (#2270)
Browse files Browse the repository at this point in the history
  • Loading branch information
qgallouedec authored Oct 28, 2024
1 parent ea7a1be commit e155cb8
Show file tree
Hide file tree
Showing 15 changed files with 19 additions and 17 deletions.
2 changes: 1 addition & 1 deletion examples/scripts/bco.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ def mean_pooling(model_output, attention_mask):
ref_model,
args=training_args,
train_dataset=dataset[script_args.dataset_train_split],
eval_dataset=dataset[script_args.dataset_test_split],
eval_dataset=dataset[script_args.dataset_test_split] if training_args.eval_strategy != "no" else None,
processing_class=tokenizer,
peft_config=get_peft_config(model_args),
embedding_func=embedding_func,
Expand Down
2 changes: 1 addition & 1 deletion examples/scripts/cpo.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@
model,
args=training_args,
train_dataset=dataset[script_args.dataset_train_split],
eval_dataset=dataset[script_args.dataset_test_split],
eval_dataset=dataset[script_args.dataset_test_split] if training_args.eval_strategy != "no" else None,
processing_class=tokenizer,
peft_config=get_peft_config(model_config),
)
Expand Down
2 changes: 1 addition & 1 deletion examples/scripts/dpo.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@
ref_model,
args=training_args,
train_dataset=dataset[script_args.dataset_train_split],
eval_dataset=dataset[script_args.dataset_test_split],
eval_dataset=dataset[script_args.dataset_test_split] if training_args.eval_strategy != "no" else None,
processing_class=tokenizer,
peft_config=peft_config,
)
Expand Down
2 changes: 1 addition & 1 deletion examples/scripts/dpo_online.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@
judge=judge,
args=training_args,
train_dataset=dataset[script_args.dataset_train_split],
eval_dataset=dataset[script_args.dataset_test_split],
eval_dataset=dataset[script_args.dataset_test_split] if training_args.eval_strategy != "no" else None,
processing_class=tokenizer,
peft_config=get_peft_config(model_config),
)
Expand Down
2 changes: 1 addition & 1 deletion examples/scripts/dpo_vlm.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@
ref_model,
args=training_args,
train_dataset=dataset[script_args.dataset_train_split],
eval_dataset=dataset[script_args.dataset_test_split],
eval_dataset=dataset[script_args.dataset_test_split] if training_args.eval_strategy != "no" else None,
processing_class=processor,
peft_config=peft_config,
)
Expand Down
2 changes: 1 addition & 1 deletion examples/scripts/gkd.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@
teacher_model=training_args.teacher_model_name_or_path,
args=training_args,
train_dataset=dataset[script_args.dataset_train_split],
eval_dataset=dataset[script_args.dataset_test_split],
eval_dataset=dataset[script_args.dataset_test_split] if training_args.eval_strategy != "no" else None,
processing_class=tokenizer,
peft_config=get_peft_config(model_config),
)
Expand Down
2 changes: 1 addition & 1 deletion examples/scripts/kto.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@
ref_model,
args=training_args,
train_dataset=dataset[script_args.dataset_train_split],
eval_dataset=dataset[script_args.dataset_test_split],
eval_dataset=dataset[script_args.dataset_test_split] if training_args.eval_strategy != "no" else None,
processing_class=tokenizer,
peft_config=get_peft_config(model_args),
)
Expand Down
2 changes: 1 addition & 1 deletion examples/scripts/nash_md.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@
judge=judge,
args=training_args,
train_dataset=dataset[script_args.dataset_train_split],
eval_dataset=dataset[script_args.dataset_test_split],
eval_dataset=dataset[script_args.dataset_test_split] if training_args.eval_strategy != "no" else None,
processing_class=tokenizer,
)
generation_config = GenerationConfig(
Expand Down
2 changes: 1 addition & 1 deletion examples/scripts/orpo.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@
model,
args=training_args,
train_dataset=dataset[script_args.dataset_train_split],
eval_dataset=dataset[script_args.dataset_test_split],
eval_dataset=dataset[script_args.dataset_test_split] if training_args.eval_strategy != "no" else None,
processing_class=tokenizer,
peft_config=get_peft_config(model_config),
)
Expand Down
8 changes: 5 additions & 3 deletions examples/scripts/ppo/ppo_tldr.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@
################
dataset = load_dataset(script_args.dataset_name)
train_dataset = dataset[script_args.dataset_train_split]
eval_dataset = dataset[script_args.dataset_test_split]
eval_dataset = dataset[script_args.dataset_test_split] if training_args.eval_strategy != "no" else None

def prepare_dataset(dataset, tokenizer):
"""pre-tokenize the dataset before training; only collate during training"""
Expand All @@ -118,10 +118,12 @@ def tokenize(element):
# see: https://github.com/huggingface/trl/pull/1255
with PartialState().local_main_process_first():
train_dataset = prepare_dataset(train_dataset, tokenizer)
eval_dataset = prepare_dataset(eval_dataset, tokenizer)
if eval_dataset is not None:
eval_dataset = prepare_dataset(eval_dataset, tokenizer)
# filtering
train_dataset = train_dataset.filter(lambda x: x["lengths"] <= 512, num_proc=training_args.dataset_num_proc)
eval_dataset = eval_dataset.filter(lambda x: x["lengths"] <= 512, num_proc=training_args.dataset_num_proc)
if eval_dataset is not None:
eval_dataset = eval_dataset.filter(lambda x: x["lengths"] <= 512, num_proc=training_args.dataset_num_proc)

assert train_dataset[0]["input_ids"][-1] != tokenizer.eos_token_id, "The last token should not be an EOS token"
################
Expand Down
2 changes: 1 addition & 1 deletion examples/scripts/reward_modeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@
processing_class=tokenizer,
args=training_args,
train_dataset=dataset[script_args.dataset_train_split],
eval_dataset=dataset[script_args.dataset_test_split],
eval_dataset=dataset[script_args.dataset_test_split] if training_args.eval_strategy != "no" else None,
peft_config=get_peft_config(model_config),
)
trainer.train()
Expand Down
2 changes: 1 addition & 1 deletion examples/scripts/rloo/rloo_tldr.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@
################
dataset = load_dataset(script_args.dataset_name)
train_dataset = dataset[script_args.dataset_train_split]
eval_dataset = dataset[script_args.dataset_test_split]
eval_dataset = dataset[script_args.dataset_test_split] if training_args.eval_strategy != "no" else None

def prepare_dataset(dataset, tokenizer):
"""pre-tokenize the dataset before training; only collate during training"""
Expand Down
2 changes: 1 addition & 1 deletion examples/scripts/sft.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@
model=model_config.model_name_or_path,
args=training_args,
train_dataset=dataset[script_args.dataset_train_split],
eval_dataset=dataset[script_args.dataset_test_split],
eval_dataset=dataset[script_args.dataset_test_split] if training_args.eval_strategy != "no" else None,
processing_class=tokenizer,
peft_config=get_peft_config(model_config),
)
Expand Down
2 changes: 1 addition & 1 deletion examples/scripts/sft_vlm.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ def collate_fn(examples):
args=training_args,
data_collator=collate_fn,
train_dataset=dataset[script_args.dataset_train_split],
eval_dataset=dataset[script_args.dataset_test_split],
eval_dataset=dataset[script_args.dataset_test_split] if training_args.eval_strategy != "no" else None,
processing_class=processor.tokenizer,
peft_config=get_peft_config(model_config),
)
Expand Down
2 changes: 1 addition & 1 deletion examples/scripts/xpo.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@
judge=judge,
args=training_args,
train_dataset=dataset[script_args.dataset_train_split],
eval_dataset=dataset[script_args.dataset_test_split],
eval_dataset=dataset[script_args.dataset_test_split] if training_args.eval_strategy != "no" else None,
processing_class=tokenizer,
)
generation_config = GenerationConfig(
Expand Down

0 comments on commit e155cb8

Please sign in to comment.