From eba86b826b0071d229586885f7f9dca5b4201cae Mon Sep 17 00:00:00 2001
From: Daniel Han <danielhanchen@gmail.com>
Date: Fri, 12 Jun 2026 13:58:32 +0000
Subject: [PATCH] Fix transformers 5.x drift in FP8Linear construction and the
 Gemma3 processor patch

Two independent breakages surfaced by running the GRPO notebooks on
transformers 5.11:

1. convert_vllm_to_huggingface still passed dtype= to FP8Linear, whose
   5.x signature dropped it (weight dtype is forced to the FP8 dtype
   internally). Replace the version branch with signature filtering so
   4.x (bias/dtype/device) and 5.x (has_bias) both get exactly the
   kwargs they accept, and future signature drift drops unsupported
   kwargs instead of raising TypeError.

2. patch_Gemma3Processor stripped the double BOS after the tokenizer
   had already padded. Under left padding only the longest row still
   starts with [bos, bos], so that row alone got shorter, making the
   batch ragged again (ValueError: expected sequence of length N at
   dim 1) and leaving attention_mask out of sync with input_ids. Now
   tokenize unpadded, strip the extra BOS together with its
   attention_mask entry, then pad once via tokenizer.pad, passing only
   the kwargs its installed signature accepts.

Verified: FP8Linear constructs through the filter on 4.57.6 and 5.11;
Gemma3 4B vision GRPO goes from failing in 3 min to passing 4 GRPO
steps in 12 min; both FP8 GRPO notebooks pass with this on top of the
size-node fix from #695.
---
 unsloth_zoo/temporary_patches/gemma.py | 18 +++++++++++++++---
 unsloth_zoo/vllm_utils.py              | 13 +++++++------
 2 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/unsloth_zoo/temporary_patches/gemma.py b/unsloth_zoo/temporary_patches/gemma.py
index 89001795e..51d7a3d81 100644
--- a/unsloth_zoo/temporary_patches/gemma.py
+++ b/unsloth_zoo/temporary_patches/gemma.py
@@ -189,11 +189,23 @@ def _gemma3_call_impl(
         # text_inputs = self.tokenizer(text=text, **output_kwargs["text_kwargs"], return_tensors="np")
         return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", True)
 
-        text_inputs = self.tokenizer(text=text, **output_kwargs["text_kwargs"])
-        # Fix double BOS tokens
+        # Tokenize WITHOUT padding: stripping a double BOS after padding only
+        # shortens rows that still start with [bos, bos] (under left padding,
+        # just the longest row), re-ragging the batch and desyncing
+        # attention_mask. Strip first, then pad once.
+        text_kwargs = dict(output_kwargs["text_kwargs"])
+        pad_kwargs = {k: text_kwargs.pop(k) for k in ("padding", "max_length", "pad_to_multiple_of", "padding_side") if k in text_kwargs}
+        text_inputs = self.tokenizer(text=text, **text_kwargs)
+        # Fix double BOS tokens, keeping attention_mask in sync
         double_bos_token_id = [self.tokenizer.bos_token_id]*2
         input_ids = text_inputs["input_ids"]
-        text_inputs["input_ids"] = [x[1:] if x[:2] == double_bos_token_id else x for x in input_ids]
+        stripped = [x[1:] if x[:2] == double_bos_token_id else x for x in input_ids]
+        if "attention_mask" in text_inputs:
+            text_inputs["attention_mask"] = [m[1:] if len(k) != len(x) else m for x, k, m in zip(input_ids, stripped, text_inputs["attention_mask"])]
+        text_inputs["input_ids"] = stripped
+        if pad_kwargs.get("padding", False) not in (False, None, "do_not_pad"):
+            pad_params = inspect.signature(self.tokenizer.pad).parameters
+            text_inputs = self.tokenizer.pad(text_inputs, **{k: v for k, v in pad_kwargs.items() if k in pad_params})
 
         # Add token type ids manually, as tokenizer can't do arbitrary position token types
         # [TODO] FAILS for batched tokens since text_inputs["input_ids"] is a list of lists, so np.array creates an object!
diff --git a/unsloth_zoo/vllm_utils.py b/unsloth_zoo/vllm_utils.py
index 2550719c8..f4cd8deff 100644
--- a/unsloth_zoo/vllm_utils.py
+++ b/unsloth_zoo/vllm_utils.py
@@ -1426,12 +1426,13 @@ def _override_to(self, *args, **kwargs):
                     layer.weight.input_scale_ub = kwargs['input_scale_ub']
                     layer.quant_method = "fbgemm_fp8"
                 elif fp8_weight_scale.ndim == 2:
-                    # FP8 dynamic quantized. transformers 5.0+ renamed
-                    # bias -> has_bias and removed device.
-                    if Version("transformers") < Version("5.0.0"):
-                        fp8_kwargs = dict(in_features=0, out_features=0, bias=has_bias, dtype=dtype, block_size=kwargs['block_size'], activation_scheme=kwargs['activation_scheme'], device=get_target_device())
-                    else:
-                        fp8_kwargs = dict(in_features=0, out_features=0, has_bias=has_bias, dtype=dtype, block_size=kwargs['block_size'], activation_scheme=kwargs['activation_scheme'])
+                    # FP8 dynamic quantized. FP8Linear's signature drifts across
+                    # transformers versions (4.x: bias/dtype/device; 5.x:
+                    # has_bias, no dtype/device), so keep only accepted kwargs.
+                    fp8_kwargs = dict(in_features=0, out_features=0, bias=has_bias, has_bias=has_bias, dtype=dtype, block_size=kwargs['block_size'], activation_scheme=kwargs['activation_scheme'], device=get_target_device())
+                    fp8_params = inspect.signature(FP8Linear.__init__).parameters
+                    if not any(p.kind is p.VAR_KEYWORD for p in fp8_params.values()):
+                        fp8_kwargs = {k: v for k, v in fp8_kwargs.items() if k in fp8_params}
                     layer = FP8Linear(**fp8_kwargs)
                     layer.in_features = weight.shape[1]
                     layer.out_features = weight.shape[0]