diff --git a/tests/test_ppo_trainer.py b/tests/test_ppo_trainer.py
index ad9b19a7b9..5560aa80a9 100644
--- a/tests/test_ppo_trainer.py
+++ b/tests/test_ppo_trainer.py
@@ -12,94 +12,213 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import platform
-import subprocess
+import tempfile
+import unittest
 
+import torch
+from datasets import load_dataset
+from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer
 from transformers.testing_utils import require_peft
 
+from trl import PPOConfig, PPOTrainer
+from trl.trainer.utils import SIMPLE_CHAT_TEMPLATE
 
 
-def test():
-    command = """\
-python examples/scripts/ppo/ppo.py \
-    --dataset_name trl-internal-testing/descriptiveness-sentiment-trl-style \
-    --dataset_train_split descriptiveness \
-    --learning_rate 3e-6 \
-    --output_dir models/minimal/ppo \
-    --per_device_train_batch_size 4 \
-    --gradient_accumulation_steps 1 \
-    --total_episodes 10 \
-    --model_name_or_path trl-internal-testing/tiny-Qwen2ForCausalLM-2.5 \
-    --reward_model_path trl-internal-testing/tiny-Qwen2ForCausalLM-2.5 \
-    --sft_model_path trl-internal-testing/tiny-Qwen2ForCausalLM-2.5 \
-    --missing_eos_penalty 1.0 \
-    --save_strategy no \
-    --stop_token eos
-"""
-    if platform.system() == "Windows":
-        # windows CI does not work with subprocesses for some reason
-        # e.g., https://github.com/huggingface/trl/actions/runs/9600036224/job/26475286210?pr=1743
-        return
-    subprocess.run(
-        command,
-        shell=True,
-        check=True,
-    )
-
-
-def test_num_train_epochs():
-    command = """\
-python examples/scripts/ppo/ppo.py \
-    --dataset_name trl-internal-testing/descriptiveness-sentiment-trl-style \
-    --dataset_train_split descriptiveness \
-    --learning_rate 3e-6 \
-    --output_dir models/minimal/ppo \
-    --per_device_train_batch_size 4 \
-    --gradient_accumulation_steps 1 \
-    --num_train_epochs 0.003 \
-    --model_name_or_path trl-internal-testing/tiny-Qwen2ForCausalLM-2.5 \
-    --reward_model_path trl-internal-testing/tiny-Qwen2ForCausalLM-2.5 \
-    --sft_model_path trl-internal-testing/tiny-Qwen2ForCausalLM-2.5 \
-    --missing_eos_penalty 1.0 \
-    --save_strategy no \
-    --stop_token eos
-"""
-    if platform.system() == "Windows":
-        # windows CI does not work with subprocesses for some reason
-        # e.g., https://github.com/huggingface/trl/actions/runs/9600036224/job/26475286210?pr=1743
-        return
-    subprocess.run(
-        command,
-        shell=True,
-        check=True,
-    )
-
-
-@require_peft
-def test_peft_support():
-    command = """\
-python examples/scripts/ppo/ppo.py \
-    --dataset_name trl-internal-testing/descriptiveness-sentiment-trl-style \
-    --dataset_train_split descriptiveness \
-    --learning_rate 3e-6 \
-    --output_dir models/minimal/ppo \
-    --per_device_train_batch_size 4 \
-    --gradient_accumulation_steps 1 \
-    --total_episodes 10 \
-    --model_name_or_path EleutherAI/pythia-14m \
-    --missing_eos_penalty 1.0 \
-    --save_strategy no \
-    --stop_token eos \
-    --use_peft \
-    --lora_r 32 \
-    --lora_alpha 16 \
-    --lora_target_modules query_key_value dense
-"""
-    if platform.system() == "Windows":
-        # windows CI does not work with subprocesses for some reason
-        # e.g., https://github.com/huggingface/trl/actions/runs/9600036224/job/26475286210?pr=1743
-        return
-    subprocess.run(
-        command,
-        shell=True,
-        check=True,
-    )
+
+class TestPPOTrainer(unittest.TestCase):
+    def setUp(self):
+        # Set up the models and tokenizer using the test model
+        self.model_id = "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5"
+        self.model = AutoModelForCausalLM.from_pretrained(self.model_id)
+        self.ref_model = AutoModelForCausalLM.from_pretrained(self.model_id)
+        self.tokenizer = AutoTokenizer.from_pretrained(self.model_id, padding_side="left")
+        self.tokenizer.add_special_tokens({"pad_token": "[PAD]"})
+
+        if self.tokenizer.chat_template is None:
+            self.tokenizer.chat_template = SIMPLE_CHAT_TEMPLATE
+
+        # Add reward and value models as in ppo.py
+        self.value_model = AutoModelForSequenceClassification.from_pretrained(
+            self.model_id, trust_remote_code=True, num_labels=1
+        )
+        self.reward_model = AutoModelForSequenceClassification.from_pretrained(
+            self.model_id, trust_remote_code=True, num_labels=1
+        )
+
+        # Load dataset
+        raw_dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only")
+        self.raw_dataset = raw_dataset.map(lambda x: self.tokenizer(x["prompt"]), remove_columns=["prompt"])
+
+    def test_basic_training(self):
+        """Test basic PPO training configuration and verify model updates."""
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            # Capture initial weights
+            initial_critic_weights = {}
+            initial_policy_weights = {}
+            for name, param in self.value_model.named_parameters():
+                initial_critic_weights[name] = param.clone().detach()
+            for name, param in self.model.named_parameters():
+                initial_policy_weights[name] = param.clone().detach()
+
+            # Configure training args similar to example script
+            training_args = PPOConfig(
+                output_dir=tmp_dir,
+                per_device_train_batch_size=4,
+                per_device_eval_batch_size=2,
+                report_to="none",
+                missing_eos_penalty=1.0,
+                vf_coef=1.0,  # Increase value function coefficient
+                num_ppo_epochs=4,  # Increase number of PPO epochs
+            )
+
+            # Create trainer
+            trainer = PPOTrainer(
+                args=training_args,
+                processing_class=self.tokenizer,
+                model=self.model,
+                ref_model=self.ref_model,
+                reward_model=self.reward_model,
+                value_model=self.value_model,
+                train_dataset=self.raw_dataset["train"],
+                eval_dataset=self.raw_dataset["test"],
+            )
+
+            # Train
+            trainer.train()
+
+            # Check if critic weights have been updated
+            critic_weights_updated = False
+            for name, param in trainer.model.value_model.named_parameters():
+                if not torch.allclose(initial_critic_weights[name], param.to("cpu")):
+                    critic_weights_updated = True
+                    break
+
+            # Check if policy weights have been updated
+            policy_weights_updated = False
+            for name, param in trainer.model.policy.named_parameters():
+                if not torch.allclose(initial_policy_weights[name], param.to("cpu")):
+                    policy_weights_updated = True
+                    break
+
+            self.assertTrue(critic_weights_updated, "Critic weights were not updated during training")
+            self.assertTrue(policy_weights_updated, "Policy weights were not updated during training")
+
+    @require_peft
+    def test_peft_training(self):
+        """Test PPO training with PEFT configuration and verify model updates."""
+        from peft import LoraConfig
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            # Capture initial weights
+            initial_critic_weights = {}
+            initial_policy_weights = {}
+            for name, param in self.value_model.named_parameters():
+                initial_critic_weights[name] = param.clone().detach()
+            for name, param in self.model.named_parameters():
+                initial_policy_weights[name] = param.clone().detach()
+
+            # Configure training args
+            training_args = PPOConfig(
+                output_dir=tmp_dir,
+                per_device_train_batch_size=4,
+                per_device_eval_batch_size=2,
+                report_to="none",
+                missing_eos_penalty=1.0,
+            )
+
+            # Configure PEFT
+            peft_config = LoraConfig(
+                r=32,
+                lora_alpha=16,
+                lora_dropout=0.05,
+                bias="none",
+                task_type="CAUSAL_LM",
+            )
+
+            # Create trainer with PEFT
+            trainer = PPOTrainer(
+                args=training_args,
+                processing_class=self.tokenizer,
+                model=self.model,
+                ref_model=None,
+                reward_model=self.reward_model,
+                value_model=self.value_model,
+                train_dataset=self.raw_dataset["train"],
+                eval_dataset=self.raw_dataset["test"],
+                peft_config=peft_config,
+            )
+
+            # Train
+            trainer.train()
+
+            # Check if critic weights have been updated
+            critic_weights_updated = False
+            for name, param in trainer.model.value_model.named_parameters():
+                if name in initial_critic_weights and not torch.allclose(
+                    initial_critic_weights[name], param.to("cpu")
+                ):
+                    critic_weights_updated = True
+                    break
+
+            # Check if policy weights have been updated - for PEFT we check the LoRA weights
+            policy_weights_updated = False
+            for name, param in trainer.model.policy.named_parameters():
+                if "lora" in name.lower() and param.requires_grad:  # Only check LoRA weights
+                    # New weights should be non-zero if they've been updated
+                    if not torch.allclose(param, torch.zeros_like(param)):
+                        policy_weights_updated = True
+                        break
+
+            self.assertTrue(critic_weights_updated, "Critic weights were not updated during training")
+            self.assertTrue(policy_weights_updated, "Policy LoRA weights were not updated during training")
+
+    def test_with_num_train_epochs(self):
+        """Test PPO training with num_train_epochs configuration."""
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            # Capture initial weights
+            initial_critic_weights = {}
+            initial_policy_weights = {}
+            for name, param in self.value_model.named_parameters():
+                initial_critic_weights[name] = param.clone().detach()
+            for name, param in self.model.named_parameters():
+                initial_policy_weights[name] = param.clone().detach()
+
+            # Configure training args
+            training_args = PPOConfig(
+                output_dir=tmp_dir,
+                per_device_train_batch_size=4,
+                per_device_eval_batch_size=2,
+                report_to="none",
+                missing_eos_penalty=1.0,
+            )
+
+            # Create trainer
+            trainer = PPOTrainer(
+                args=training_args,
+                processing_class=self.tokenizer,
+                model=self.model,
+                ref_model=self.ref_model,
+                reward_model=self.reward_model,
+                value_model=self.value_model,
+                train_dataset=self.raw_dataset["train"],
+                eval_dataset=self.raw_dataset["test"],
+            )
+
+            # Train and verify no exceptions are raised
+            trainer.train()
+
+            # Check if critic weights have been updated
+            critic_weights_updated = False
+            for name, param in trainer.model.value_model.named_parameters():
+                if not torch.allclose(initial_critic_weights[name], param.to("cpu")):
+                    critic_weights_updated = True
+                    break
+
+            # Check if policy weights have been updated
+            policy_weights_updated = False
+            for name, param in trainer.model.policy.named_parameters():
+                if not torch.allclose(initial_policy_weights[name], param.to("cpu")):
+                    policy_weights_updated = True
+                    break
+
+            self.assertTrue(critic_weights_updated, "Critic weights were not updated during training")
+            self.assertTrue(policy_weights_updated, "Policy weights were not updated during training")
diff --git a/trl/trainer/ppo_trainer.py b/trl/trainer/ppo_trainer.py
index b51c29c014..3827caae30 100644
--- a/trl/trainer/ppo_trainer.py
+++ b/trl/trainer/ppo_trainer.py
@@ -130,16 +130,16 @@ def __init__(
         self.args = args
         self.processing_class = processing_class
-        self.model = model
+        self.policy_model = model
 
         # Define the collator if not provided
         if data_collator is None:
             data_collator = DataCollatorWithPadding(self.processing_class)
 
-        self.model.generation_config.eos_token_id = (
+        self.policy_model.generation_config.eos_token_id = (
             None  # disable `pad_token_id` and `eos_token_id` because we just want to
         )
-        self.model.generation_config.pad_token_id = None  # generate tokens without truncation / padding
+        self.policy_model.generation_config.pad_token_id = None  # generate tokens without truncation / padding
 
         # peft support
         if not is_peft_available() and peft_config is not None:
@@ -148,15 +148,15 @@ def __init__(
             )
         elif is_peft_available() and peft_config is not None:
             # if model is a peft model and we have a peft_confg, we merge and unload it first
-            if isinstance(self.model, PeftModel):
-                self.model = self.model.merge_and_unload()
+            if isinstance(self.policy_model, PeftModel):
+                self.policy_model = self.policy_model.merge_and_unload()
 
             # get peft model with the given config
-            self.model = get_peft_model(self.model, peft_config)
-            if args.bf16 and getattr(self.model, "is_loaded_in_4bit", False):
-                peft_module_casting_to_bf16(self.model)
+            self.policy_model = get_peft_model(self.policy_model, peft_config)
+            if args.bf16 and getattr(self.policy_model, "is_loaded_in_4bit", False):
+                peft_module_casting_to_bf16(self.policy_model)
 
-        self.is_peft_model = is_peft_available() and isinstance(self.model, PeftModel)
+        self.is_peft_model = is_peft_available() and isinstance(self.policy_model, PeftModel)
         self.model_adapter_name = args.model_adapter_name
         self.ref_adapter_name = args.ref_adapter_name
@@ -165,7 +165,7 @@ def __init__(
         elif self.is_peft_model:
             self.ref_model = None
         else:
-            self.ref_model = create_reference_model(self.model)
+            self.ref_model = create_reference_model(self.policy_model)
 
         self.reward_model = reward_model
         self.train_dataset = train_dataset
@@ -215,13 +215,13 @@ def __init__(
         #########
         # setup model, optimizer, and others
         #########
-        for module in [self.model, self.ref_model, self.value_model, self.reward_model]:
+        for module in [self.policy_model, self.ref_model, self.value_model, self.reward_model]:
             if module is not None:
                 disable_dropout_in_model(module)
         if args.stop_token and args.stop_token == "eos":
             args.stop_token_id = processing_class.eos_token_id
-        self.policy_and_value = PolicyAndValueWrapper(self.model, self.value_model)
-        self.policy_and_value.config = self.model.config  # needed for pushing to hub
+        self.model = PolicyAndValueWrapper(self.policy_model, self.value_model)
+        self.model.config = self.policy_model.config  # needed for pushing to hub
         self.create_optimizer_and_scheduler(
             num_training_steps=args.num_total_batches
         )  # note that we are calling `self.lr_scheduler.step()` manually only at the batch level
@@ -232,7 +232,7 @@ def __init__(
         default_callbacks = DEFAULT_CALLBACKS + get_reporting_integration_callbacks(self.args.report_to)
         self.callbacks = default_callbacks if callbacks is None else default_callbacks + callbacks
         self.callback_handler = CallbackHandler(
-            self.callbacks, self.policy_and_value, self.processing_class, self.optimizer, self.lr_scheduler
+            self.callbacks, self.model, self.processing_class, self.optimizer, self.lr_scheduler
         )
         self.add_callback(PrinterCallback if self.args.disable_tqdm else DEFAULT_PROGRESS_CALLBACK)
         self.control = TrainerControl()
@@ -255,8 +255,8 @@ def __init__(
             os.makedirs(self.args.output_dir, exist_ok=True)
 
         # Add tags for models that have been loaded with the correct transformers version
-        if hasattr(self.policy_and_value, "add_model_tags"):
-            self.policy_and_value.add_model_tags(self._tag_names)
+        if hasattr(self.model, "add_model_tags"):
+            self.model.add_model_tags(self._tag_names)
 
         #########
         ### setup dataloader
@@ -271,9 +271,7 @@ def __init__(
         # sync random states for DataLoader(shuffle=True) before `accelerator.prepare`
         # see https://gist.github.com/vwxyzjn/2581bff1e48e185e0b85b6dfe1def79c
         torch.manual_seed(args.seed)
-        self.policy_and_value, self.optimizer, self.dataloader = accelerator.prepare(
-            self.policy_and_value, self.optimizer, self.dataloader
-        )
+        self.model, self.optimizer, self.dataloader = accelerator.prepare(self.model, self.optimizer, self.dataloader)
         torch.manual_seed(self.local_seed)  # reset the local seed again
 
         self.eval_dataloader = DataLoader(
@@ -314,25 +312,25 @@ def get_eval_dataloader(self) -> DataLoader:
     def null_ref_context(self):
         """Context manager for handling null reference model (that is, peft adapter manipulation)."""
         with self.accelerator.unwrap_model(
-            self.policy_and_value.policy
+            self.model.policy
         ).disable_adapter() if self.is_peft_model and not self.ref_adapter_name else nullcontext():
             if self.ref_adapter_name:
-                self.policy_and_value.policy.set_adapter(self.ref_adapter_name)
+                self.model.policy.set_adapter(self.ref_adapter_name)
             yield
             if self.ref_adapter_name:
-                self.policy_and_value.policy.set_adapter(self.model_adapter_name or "default")
+                self.model.policy.set_adapter(self.model_adapter_name or "default")
 
     def save_model(self, output_dir: Optional[str] = None, _internal_call: bool = False):
-        backup_model = self.policy_and_value
-        self.policy_and_value = self.policy_and_value.policy  # save only the policy
+        backup_model = self.model
+        self.model = self.model.policy  # save only the policy
 
         if self.is_deepspeed_enabled:
             backup_deepspeed = self.deepspeed
-            self.deepspeed = self.policy_and_value
+            self.deepspeed = self.model
 
         super().save_model(output_dir, _internal_call)
 
-        self.policy_and_value = backup_model
+        self.model = backup_model
 
         if self.is_deepspeed_enabled:
             self.deepspeed = backup_deepspeed
@@ -341,7 +339,7 @@ def train(self):
         args = self.args
         accelerator = self.accelerator
         optimizer = self.optimizer
-        model = self.policy_and_value
+        model = self.model
         ref_policy = self.ref_model
         reward_model = self.reward_model
         processing_class = self.processing_class
@@ -398,8 +396,8 @@ def repeat_generator():
         # backward compatibility
         if self.is_deepspeed_enabled:
-            self.deepspeed = self.policy_and_value
-            self.model_wrapped = self.policy_and_value
+            self.deepspeed = self.model
+            self.model_wrapped = self.model
 
         for update in range(1, args.num_total_batches + 1):
             self.state.episode += 1 * args.batch_size
@@ -686,7 +684,7 @@ def generate_completions(self, sampling: bool = False):
         )
 
         table = defaultdict(list)
-        with unwrap_model_for_generation(self.policy_and_value, self.accelerator) as unwrapped_model:
+        with unwrap_model_for_generation(self.model, self.accelerator) as unwrapped_model:
             for batch in self.eval_dataloader:
                 query = batch["input_ids"]
                 with torch.no_grad():
@@ -749,10 +747,8 @@ def create_model_card(
         if not self.is_world_process_zero():
             return
 
-        if hasattr(self.policy_and_value.config, "_name_or_path") and not os.path.isdir(
-            self.policy_and_value.config._name_or_path
-        ):
-            base_model = self.policy_and_value.config._name_or_path
+        if hasattr(self.model.config, "_name_or_path") and not os.path.isdir(self.model.config._name_or_path):
+            base_model = self.model.config._name_or_path
         else:
             base_model = None
@@ -760,7 +756,7 @@ def create_model_card(
         if isinstance(tags, str):
             tags = [tags]
 
-        if hasattr(self.policy_and_value.config, "unsloth_version"):
+        if hasattr(self.model.config, "unsloth_version"):
             tags.append("unsloth")
 
         citation = textwrap.dedent("""\