Merged
29 changes: 29 additions & 0 deletions tests/test_grpo_trainer.py
@@ -703,6 +703,35 @@ def test_training_beta_non_zero(self):
new_param = trainer.model.get_parameter(n)
assert not torch.equal(param, new_param), f"Parameter {n} has not changed."

def test_training_with_cast_lm_head_to_fp32(self):
dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="train")
training_args = GRPOConfig(
output_dir=self.tmp_dir,
learning_rate=0.1,
per_device_train_batch_size=3,
num_generations=3,
max_completion_length=8,
report_to="none",
cast_lm_head_to_fp32=True,
)
trainer = GRPOTrainer(
model="trl-internal-testing/tiny-Qwen2ForCausalLM-2.5",
reward_funcs="trl-internal-testing/tiny-Qwen2ForSequenceClassification-2.5",
args=training_args,
train_dataset=dataset,
)
previous_trainable_params = {n: param.clone() for n, param in trainer.model.named_parameters()}

trainer.train()

Member:

maybe here it would make sense to assert that trainer.model.lm_head is indeed in fp32

Collaborator Author:

Good call.

assert trainer.state.log_history[-1]["train_loss"] is not None
assert trainer.model.lm_head.weight.dtype == torch.float32

# Check that the params have changed
for n, param in previous_trainable_params.items():
new_param = trainer.model.get_parameter(n)
assert not torch.equal(param, new_param), f"Parameter {n} has not changed."

def test_training_with_entropy_filter(self):
dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="train")
training_args = GRPOConfig(
13 changes: 12 additions & 1 deletion trl/trainer/grpo_config.py
@@ -41,9 +41,12 @@ class GRPOConfig(TrainingArguments):
disable_dropout (`bool`, *optional*, defaults to `False`):
Whether to disable dropout in the model. This is useful for training with a reference model, as it prevents
the model from generating different logprobs for the same input.
cast_lm_head_to_fp32 (`bool`, *optional*, defaults to `False`):
Whether to cast the language modeling head of the policy and reference models to float32, as recommended
by the [ScaleRL](https://huggingface.co/papers/2510.13786) recipe. This flag is only supported when the
model has untied word embedding and language modeling head layers, i.e. `tie_word_embeddings` in the
model config is `False`.

> Parameters that control the data preprocessing

remove_unused_columns (`bool`, *optional*, defaults to `False`):
Whether to only keep the column `"prompt"` in the dataset. If you use a custom reward function that
requires any column other than `"prompts"` and `"completions"`, you should keep this to `False`.
@@ -297,6 +300,14 @@ class GRPOConfig(TrainingArguments):
"it prevents the model from generating different logprobs for the same input."
},
)
cast_lm_head_to_fp32: bool = field(
default=False,
Member:

When/if we confirm the results from the paper, this could be changed to True in my opinion.

Collaborator Author:

Lemme take a look at the paper that Lewis linked, and get back to you on whether I can do something like that with the limited GPU resources I have.

Collaborator Author:

Seems fairly easy to implement! All I have to do is take a dataset with a thousand prompts, load the model in bf16 mode, cast its lm_head to fp32, and store the logits of the tokens in the completions, and then do the same for a model loaded in vLLM.

I can then plot something similar to the Minimax paper. Will have this done in the next few hours.
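
A minimal sketch of the Hugging Face side of that comparison (illustrative only, not the actual script behind the plots below; the model ID and prompt are assumptions, the lm_head cast and pre-hook mirror this PR, and the vLLM half is omitted):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "Qwen/Qwen2.5-7B-Instruct"  # assumption: a model with tie_word_embeddings=False
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto")

prompt = "The capital of France is"
completion = " Paris."
prompt_ids = tokenizer(prompt, return_tensors="pt").input_ids
completion_ids = tokenizer(completion, add_special_tokens=False, return_tensors="pt").input_ids
full_ids = torch.cat([prompt_ids, completion_ids], dim=1).to(model.device)
completion_ids = completion_ids.to(model.device)
prompt_len = prompt_ids.shape[1]


def completion_logprobs(m):
    # Logits at position i predict token i + 1, so shift by one to score the completion tokens
    with torch.no_grad():
        logits = m(full_ids).logits[:, prompt_len - 1 : -1]
    logps = torch.log_softmax(logits.float(), dim=-1)
    return logps.gather(-1, completion_ids.unsqueeze(-1)).squeeze(-1)


logps_bf16 = completion_logprobs(model)

# Same approach as this PR: cast only the lm_head to fp32 and upcast its inputs via a pre-hook
model.lm_head = model.lm_head.float()
model.lm_head.register_forward_pre_hook(lambda module, args: (args[0].float(),))
logps_fp32 = completion_logprobs(model)

abs_diff = (logps_bf16.exp() - logps_fp32.exp()).abs().sum().item()
print(f"Summed absolute difference in completion-token probs: {abs_diff:.6f}")
```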

Member:

Wow, so cool! If you have any trouble reproducing, I think it's perfectly fine to open a subsequent PR to change the default.

Collaborator Author:

Realized that there were a few edge cases; going to try to wrap it up as soon as possible. If I run into issues reproducing in the next couple of days, I'll go ahead and merge this.

Member:

Interesting! Do you have 1-2 models as examples?

Collaborator Author (@pramodith, Oct 22, 2025):

Any of the Qwen-3 (0.6B, 4B, 8B) or Qwen-2.5 (7.5B) models. It seems like the smaller models have tie_word_embeddings set to True and the larger models (like 32B) do not.
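
A quick way to check this from the model configs (a minimal snippet; the model IDs are just the ones mentioned above):

```python
from transformers import AutoConfig

for model_id in ["Qwen/Qwen3-0.6B", "Qwen/Qwen3-32B"]:
    config = AutoConfig.from_pretrained(model_id)
    print(model_id, "tie_word_embeddings =", config.tie_word_embeddings)
```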

Collaborator Author (@pramodith, Oct 23, 2025):

Here are some results from Qwen3-32B (had similar results for the smaller models: 4B, 8B, 14B). We see marginal improvements in correlation and summed absolute differences in probs.

[Plots: qwen32_bf16, qwen32_fp32]

Collaborator Author:

Similar results for Qwen3-30B-A3B-Thinking.

[Plots: qwen_moe_bf16, qwen_moe_fp32]

Collaborator Author:

@qgallouedec I'm GPU resource constrained, so if you want to run it on larger MoE models, this is the code; you might need to tinker with the `tensor_parallel_size` and `enable_expert_parallel`.
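
The attached script itself isn't captured above; the following is only a rough sketch of what the vLLM side might look like (the model ID and parallelism values are assumptions; `tensor_parallel_size` and `enable_expert_parallel` are the knobs mentioned in the comment):

```python
from vllm import LLM, SamplingParams

# Assumed checkpoint; swap in whichever large MoE model is being tested
llm = LLM(
    model="Qwen/Qwen3-30B-A3B-Thinking-2507",
    dtype="bfloat16",
    tensor_parallel_size=8,       # tune to the number of available GPUs
    enable_expert_parallel=True,  # shard MoE experts across GPUs
)

sampling_params = SamplingParams(max_tokens=64, temperature=1.0, logprobs=1)
outputs = llm.generate(["Prove that the square root of 2 is irrational."], sampling_params)
# outputs[0].outputs[0].logprobs holds per-token logprobs to compare against the HF-side values
```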

metadata={
"help": "Whether to cast the language modeling head of the policy and reference models to float32, "
"as recommended by the [ScaleRL](https://huggingface.co/papers/2510.13786) recipe. This flag is only "
"supported when the model has untied word embedding and language modeling head layers, i.e. "
"`tie_word_embeddings` in the model config is False."
},
)

# Parameters that control the data preprocessing
# The default value remove_unused_columns is overwritten from the parent class, because in GRPO we usually rely on
Expand Down
21 changes: 20 additions & 1 deletion trl/trainer/grpo_trainer.py
@@ -477,6 +477,24 @@ def __init__(
if self.ref_model is not None:
disable_dropout_in_model(self.ref_model)

# Cast LM Head To FP32
if args.cast_lm_head_to_fp32:
if not model.config.tie_word_embeddings:
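# The backbone may still run in bf16/fp16, so a forward pre-hook upcasts the lm_head inputs to fp32 and the logits are computed in full precision.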

def cast_inputs_to_fp32(module, input):
return (input[0].float(),)

model.lm_head = model.lm_head.float()
model.lm_head.register_forward_pre_hook(cast_inputs_to_fp32)
if self.ref_model is not None:
self.ref_model.lm_head = self.ref_model.lm_head.float()
self.ref_model.lm_head.register_forward_pre_hook(cast_inputs_to_fp32)
else:
raise NotImplementedError(
"`cast_lm_head_to_fp32=True` is only supported when the model has untied word embedding and "
"language modeling head layers, i.e. `tie_word_embeddings` in the model config is False."
)

# Liger loss
if self.use_liger_kernel:
if not is_liger_kernel_available():
@@ -876,7 +894,6 @@ def _get_per_token_logps_and_entropies(
# Divide logits by sampling temperature.
# See https://huggingface.co/blog/the_n_implementation_details_of_rlhf_with_ppo#policy-training-implementation-details
logits = logits / self.temperature

completion_ids = input_ids_batch[:, -logits_to_keep:]
logps = selective_log_softmax(logits, completion_ids) # compute logprobs
all_logps.append(logps)
@@ -1300,6 +1317,8 @@ def _generate_single_turn(self, prompts: list):
unwrapped_model.to(torch.bfloat16)
elif self.args.fp16:
unwrapped_model.to(torch.float16)
if self.args.cast_lm_head_to_fp32:
unwrapped_model.lm_head.to(torch.float32)
Collaborator Author:

I don't think I need to register the pre-hook here again, let me know if I'm wrong.

with torch.inference_mode():
# Continuous batching API expects 'inputs' arg only
all_outputs = unwrapped_model.generate_batch(