From eba86b826b0071d229586885f7f9dca5b4201cae Mon Sep 17 00:00:00 2001 From: Daniel Han Date: Fri, 12 Jun 2026 13:58:32 +0000 Subject: [PATCH] Fix transformers 5.x drift in FP8Linear construction and the Gemma3 processor patch Two independent breakages surfaced by running the GRPO notebooks on transformers 5.11: 1. convert_vllm_to_huggingface still passed dtype= to FP8Linear, whose 5.x signature dropped it (weight dtype is forced to the FP8 dtype internally). Replace the version branch with signature filtering so 4.x (bias/dtype/device) and 5.x (has_bias) both get exactly the kwargs they accept, and future drift degrades gracefully. 2. patch_Gemma3Processor stripped the double BOS after the tokenizer padded. Under left padding only the longest row still starts with [bos, bos], so that row alone got shorter, re-ragging the batch (ValueError: expected sequence of length N at dim 1) and leaving attention_mask out of sync. Tokenize unpadded, strip with attention_mask kept in line, then pad once via tokenizer.pad with only the kwargs its installed signature accepts. Verified: FP8Linear constructs through the filter on 4.57.6 and 5.11; Gemma3 4B vision GRPO goes from failing in 3 min to passing 4 GRPO steps in 12 min; both FP8 GRPO notebooks pass with this on top of the size-node fix from #695. --- unsloth_zoo/temporary_patches/gemma.py | 18 +++++++++++++++--- unsloth_zoo/vllm_utils.py | 13 +++++++------ 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/unsloth_zoo/temporary_patches/gemma.py b/unsloth_zoo/temporary_patches/gemma.py index 89001795e..51d7a3d81 100644 --- a/unsloth_zoo/temporary_patches/gemma.py +++ b/unsloth_zoo/temporary_patches/gemma.py @@ -189,11 +189,23 @@ def _gemma3_call_impl( # text_inputs = self.tokenizer(text=text, **output_kwargs["text_kwargs"], return_tensors="np") return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", True) - text_inputs = self.tokenizer(text=text, **output_kwargs["text_kwargs"]) - # Fix double BOS tokens + # Tokenize WITHOUT padding: stripping a double BOS after padding only + # shortens rows that still start with [bos, bos] (under left padding, + # just the longest row), re-ragging the batch and desyncing + # attention_mask. Strip first, then pad once. + text_kwargs = dict(output_kwargs["text_kwargs"]) + pad_kwargs = {k: text_kwargs.pop(k) for k in ("padding", "max_length", "pad_to_multiple_of", "padding_side") if k in text_kwargs} + text_inputs = self.tokenizer(text=text, **text_kwargs) + # Fix double BOS tokens, keeping attention_mask in sync double_bos_token_id = [self.tokenizer.bos_token_id]*2 input_ids = text_inputs["input_ids"] - text_inputs["input_ids"] = [x[1:] if x[:2] == double_bos_token_id else x for x in input_ids] + stripped = [x[1:] if x[:2] == double_bos_token_id else x for x in input_ids] + if "attention_mask" in text_inputs: + text_inputs["attention_mask"] = [m[1:] if len(k) != len(x) else m for x, k, m in zip(input_ids, stripped, text_inputs["attention_mask"])] + text_inputs["input_ids"] = stripped + if pad_kwargs.get("padding", False) not in (False, None, "do_not_pad"): + pad_params = inspect.signature(self.tokenizer.pad).parameters + text_inputs = self.tokenizer.pad(text_inputs, **{k: v for k, v in pad_kwargs.items() if k in pad_params}) # Add token type ids manually, as tokenizer can't do arbitrary position token types # [TODO] FAILS for batched tokens since text_inputs["input_ids"] is a list of lists, so np.array creates an object! diff --git a/unsloth_zoo/vllm_utils.py b/unsloth_zoo/vllm_utils.py index 2550719c8..f4cd8deff 100644 --- a/unsloth_zoo/vllm_utils.py +++ b/unsloth_zoo/vllm_utils.py @@ -1426,12 +1426,13 @@ def _override_to(self, *args, **kwargs): layer.weight.input_scale_ub = kwargs['input_scale_ub'] layer.quant_method = "fbgemm_fp8" elif fp8_weight_scale.ndim == 2: - # FP8 dynamic quantized. transformers 5.0+ renamed - # bias -> has_bias and removed device. - if Version("transformers") < Version("5.0.0"): - fp8_kwargs = dict(in_features=0, out_features=0, bias=has_bias, dtype=dtype, block_size=kwargs['block_size'], activation_scheme=kwargs['activation_scheme'], device=get_target_device()) - else: - fp8_kwargs = dict(in_features=0, out_features=0, has_bias=has_bias, dtype=dtype, block_size=kwargs['block_size'], activation_scheme=kwargs['activation_scheme']) + # FP8 dynamic quantized. FP8Linear's signature drifts across + # transformers versions (4.x: bias/dtype/device; 5.x: + # has_bias, no dtype/device), so keep only accepted kwargs. + fp8_kwargs = dict(in_features=0, out_features=0, bias=has_bias, has_bias=has_bias, dtype=dtype, block_size=kwargs['block_size'], activation_scheme=kwargs['activation_scheme'], device=get_target_device()) + fp8_params = inspect.signature(FP8Linear.__init__).parameters + if not any(p.kind is p.VAR_KEYWORD for p in fp8_params.values()): + fp8_kwargs = {k: v for k, v in fp8_kwargs.items() if k in fp8_params} layer = FP8Linear(**fp8_kwargs) layer.in_features = weight.shape[1] layer.out_features = weight.shape[0]