diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 5002918175d..9eddbc1d068 100755 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -44,6 +44,7 @@ Changelog - Fix ``ShapeInferenceError`` during ONNX INT8 + FP16 quantization (``--high_precision_dtype fp16``) of weakly-typed models (e.g. TensorFlow exports) that carry stale rank-0 ``graph.output`` shapes or ops such as ``TopK`` that ONNX's static shape inference cannot resolve. ``clear_stale_value_info`` now reconciles stale output shapes via symbolic shape inference (keeping every output's shape field populated), and AutoCast runs ONNX shape inference in strict mode and falls back to schema-based standalone type inference when it fails, so unresolved ops no longer leave tensors untyped. - Fused MoE expert auto-detection (``register_fused_experts_on_the_fly``) no longer requires an ``act_fn`` attribute. Some fused-expert modules (e.g. ``MiniMaxM3VLExperts``) apply a custom gated activation between the two ``F.linear`` calls instead of exposing ``act_fn``; they were silently skipped, leaving routed experts unquantized (an experts-only recipe matched nothing) and failing HF export with ``NotImplementedError``. ``_QuantFusedExperts`` is activation-agnostic (it only intercepts the two ``F.linear`` calls), so the requirement was unnecessary. This enables NVFP4/FP8 quantization and export for MiniMax-M2 / MiniMax-M3. +- Fix unified HF export emitting transformers' *in-memory* (post-``conversion_mapping``) tensor names instead of the original model-hub names, breaking the unified-checkpoint contract (observed on MiniMax-M3: exported ``model.language_model.*`` / ``mlp.experts.*.gate_proj`` instead of hub ``language_model.model.*`` / ``block_sparse_moe.experts.*.w{1,2,3}``). 
transformers' own save-side ``revert_weight_conversion`` is disabled by ModelOpt because it raises ``RuntimeError`` on 0-d scalar scale tensors, so a new quant-aware reverse conversion (``modelopt/torch/export/quant_aware_conversion.py``) derives rename/split rules from the model's conversion mapping via transformers' ``reverse_transform()`` and carries each weight's companion scale tensors (``weight_scale``, ``weight_scale_2``, ``input_scale``, ``weight_scale_inv``, ``bias``) through the renames and un-fusions, so quantized exports round-trip to the hub names. Any mapping op that cannot be reversed quant-aware yet (e.g. still-stacked fused experts) falls back to the previous in-memory names instead of aborting the export. 0.45 (2026-07-02) ^^^^^^^^^^^^^^^^^ diff --git a/examples/hf_ptq/example_utils.py b/examples/hf_ptq/example_utils.py index 9e8dea5f107..8b118916601 100755 --- a/examples/hf_ptq/example_utils.py +++ b/examples/hf_ptq/example_utils.py @@ -912,19 +912,47 @@ def _resolve_model_path(model_name_or_path: str, trust_remote_code: bool = False def copy_custom_model_files(source_path: str, export_path: str, trust_remote_code: bool = False): - """Copy custom model files (configuration_*.py, modeling_*.py, *.json, etc.) from source to export directory. - - This function copies custom Python files and JSON configuration files that are needed for - models with custom code. It excludes config.json and model.safetensors.index.json as these - are typically handled separately by the model export process. + """Copy processor/tokenizer artifacts (and, with trust_remote_code, custom code) to export. + + Processor and tokenizer *data* artifacts -- e.g. a VLM's ``preprocessor_config.json``, + ``merges.txt``/``vocab.json``, and the processor helper modules -- are needed by the + deployment stack (vLLM/SGLang) even when the model itself runs on native (non-remote) + transformers code. 
transformers 5.x restructured many VLM configs and no longer + re-saves these on ``save_pretrained`` for models loaded natively, so without copying + them a native-path export is missing e.g. ``preprocessor_config.json`` and fails to + load (``Can't load image processor``). These are copied regardless of + ``trust_remote_code``. Executable model/config code (``modeling*.py``, + ``configuration_*.py``, ``tokenization_*.py``, and other custom JSON) is only meaningful + with ``trust_remote_code`` and is copied only then. ``config.json`` and + ``model.safetensors.index.json`` are always skipped (handled by the export itself). Args: source_path: Path to the original model directory or HuggingFace model ID export_path: Path to the exported model directory - trust_remote_code: Whether trust_remote_code was used (only copy files if True) + trust_remote_code: Whether trust_remote_code was used (gates the executable code files) """ - if not trust_remote_code: - return + # Deployment-critical processor/tokenizer artifacts: safe to copy regardless of + # trust_remote_code (data + processor helpers, not model code). + always_copy_patterns = [ + "preprocessor_config.json", + "processor_config.json", + "image_processing*.py", + "processing_*.py", + "video_processing*.py", + "feature_extraction_*.py", + "added_tokens.json", + "special_tokens_map.json", + "vocab.json", + "merges.txt", + "tokenizer.model", + ] + # Executable custom model/config code + other custom JSON: only used with trust_remote_code. 
+ code_patterns = [ + "configuration_*.py", + "modeling*.py", + "tokenization_*.py", + "*.json", + ] # Resolve the source path (handles both local paths and HF model IDs) resolved_source_path = _resolve_model_path(source_path, trust_remote_code) @@ -946,24 +974,17 @@ def copy_custom_model_files(source_path: str, export_path: str, trust_remote_cod print(f"Warning: Export directory {export_path} does not exist") return - # Common patterns for custom model files that need to be copied - custom_file_patterns = [ - "configuration_*.py", - "modeling*.py", - "tokenization_*.py", - "processing_*.py", - "image_processing*.py", - "feature_extraction_*.py", - "*.json", - ] + patterns = [*always_copy_patterns, *(code_patterns if trust_remote_code else [])] - copied_files = [] - for pattern in custom_file_patterns: + copied_files: list[str] = [] + for pattern in patterns: for file_path in source_dir.glob(pattern): if file_path.is_file(): # Skip config.json and model.safetensors.index.json as they're handled separately if file_path.name in ["config.json", "model.safetensors.index.json"]: continue + if file_path.name in copied_files: # e.g. matched by both pattern lists + continue dest_path = export_dir / file_path.name try: shutil.copy2(file_path, dest_path) diff --git a/modelopt/torch/export/quant_aware_conversion.py b/modelopt/torch/export/quant_aware_conversion.py new file mode 100644 index 00000000000..516c0e90851 --- /dev/null +++ b/modelopt/torch/export/quant_aware_conversion.py @@ -0,0 +1,430 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Quantization-aware reverse weight conversion for unified HF export. + +Background +---------- +``transformers`` may apply a ``conversion_mapping`` when loading a model, so the +in-memory parameter names differ from the original model-hub checkpoint (e.g. fused +``mlp.gate_up_proj``, renamed MoE leaves, reordered ``model``/``language_model`` +prefix). On save, ``transformers`` reverses this via ``revert_weight_conversion`` so +the on-disk names match the hub checkpoint again. + +ModelOpt's unified export disables that reverse (it raises ``RuntimeError`` on 0-d +scalar scale tensors such as ``weight_scale_2``/``input_scale``), so a quantized +export emits the *in-memory* (post-conversion) names — violating the unified +checkpoint contract that names stay aligned with the original hub checkpoint. + +This module performs the reverse in a quantization-aware way: it carries each +weight's companion scale tensors (``weight_scale``, ``weight_scale_2``, +``input_scale``, ``weight_scale_inv``, ``bias``) through the rename and un-fuse +operations. + +Scope +----- +Two reverse primitives cover the conversion_mapping cases: + +* **Rename** — a key-level string substitution. Because a quantized linear stores + every tensor under ``{module}.{leaf}``, renaming the module substring rewrites the + weight and all its scale siblings together with no tensor manipulation. +* **Split** — un-fuse an output-dim concatenation (e.g. dense ``gate_up_proj`` -> + ``gate_proj`` + ``up_proj``). 
``weight``/``weight_scale``/``weight_scale_inv``/ + ``bias`` are chunked along the fused (output) dim; 0-d scalar ``weight_scale_2``/ + ``input_scale`` are duplicated to each part (they are per-tensor and shared). + +MoE experts need only **Rename**: ModelOpt's export already expands the fused, +stacked in-memory experts (``experts.gate_up_proj`` of shape ``[E, 2F, H]``) into +per-expert 2-D linears (``experts.N.gate_proj`` / ``up_proj`` / ``down_proj``) +before save, so the reverse just maps those per-expert leaf names back to the hub +leaves (e.g. ``gate_proj`` -> ``w1``, ``up_proj`` -> ``w3``, ``down_proj`` -> ``w2``). + +Reverse rules are derived from the model's conversion mapping via transformers' +``reverse_transform()``. Any op shape not covered raises +:class:`QuantConversionUnsupportedError` so the caller falls back to the legacy +(in-memory-name) behavior rather than emit a silently-wrong checkpoint. +""" + +import re +from dataclasses import dataclass + +import torch + +__all__ = [ + "QuantConversionUnsupportedError", + "RenameRule", + "SplitRule", + "apply_reverse_rules", + "build_reverse_name_mapper", + "revert_quant_config_names", + "revert_weight_conversion_quant_aware", +] + +# Tensor leaves that belong to a single quantized linear module. A rename of the +# parent module path applies uniformly to all of these. +_LEAF_SUFFIXES = ( + ".weight", + ".weight_scale", + ".weight_scale_2", + ".weight_scale_inv", + ".input_scale", + ".bias", +) + +# Leaves that are per-tensor scalars (0-d) and must be *duplicated*, not split, when +# a fused module is un-fused. 
+_SCALAR_LEAF_SUFFIXES = (".weight_scale_2", ".input_scale") + + +class QuantConversionUnsupportedError(Exception): + """Raised when a conversion op cannot be reversed quant-aware (caller falls back).""" + + +@dataclass(frozen=True) +class RenameRule: + """Reverse of a ``WeightRenaming``: ``re.sub(pattern, repl, key)`` on every key.""" + + pattern: str + repl: str + + +@dataclass(frozen=True) +class SplitRule: + """Reverse of an output-dim ``Concatenate``: un-fuse one module into ``parts``. + + Args: + fused_suffix: module suffix of the fused tensor, e.g. ``".gate_up_proj"``. + part_suffixes: ordered replacements, e.g. ``(".gate_proj", ".up_proj")``. + dim: the fused (output) dim along which ``weight``/``weight_scale``/``bias`` + are chunked. NVFP4 ``weight`` is ``[out, in//2]`` and ``weight_scale`` is + ``[out, in//block]`` so the output dim is ``0`` for both. + """ + + fused_suffix: str + part_suffixes: tuple[str, ...] + dim: int = 0 + + +def _split_leaf_tensor(leaf: str, tensor: torch.Tensor, n: int, idx: int, dim: int): + """Return the ``idx``-th of ``n`` parts of ``tensor`` for tensor leaf ``leaf``.""" + if leaf in _SCALAR_LEAF_SUFFIXES or tensor.dim() == 0: + # Per-tensor scalar shared across the fused parts -> duplicate. + return tensor.clone() + size = tensor.size(dim) + if size % n != 0: + raise QuantConversionUnsupportedError( + f"cannot split leaf '{leaf}' of size {size} along dim {dim} into {n} parts" + ) + return tensor.chunk(n, dim=dim)[idx].clone() + + +def _apply_split_rule(state_dict: dict[str, torch.Tensor], rule: SplitRule) -> None: + """Un-fuse all modules matching ``rule.fused_suffix`` in place.""" + n = len(rule.part_suffixes) + # Collect (module_path, leaf, key) for every tensor under a fused module. 
+ fused_keys: list[tuple[str, str, str]] = [] + for key in state_dict: + for leaf in _LEAF_SUFFIXES: + if key.endswith(rule.fused_suffix + leaf): + module = key[: -len(leaf)][: -len(rule.fused_suffix)] + fused_keys.append((module, leaf, key)) + break + + for module, leaf, key in fused_keys: + tensor = state_dict.pop(key) + # A 3-D expert tensor here means stacked experts (MergeModulelist) — out of scope. + if leaf == ".weight" and tensor.dim() >= 3: + raise QuantConversionUnsupportedError( + f"stacked 3-D expert tensor '{key}' (ndim={tensor.dim()}) is not supported; " + "un-stacking experts + their scales is a follow-up" + ) + for idx, part in enumerate(rule.part_suffixes): + target_key = module + part + leaf + if target_key in state_dict: + raise QuantConversionUnsupportedError(f"split collision on '{target_key}'") + state_dict[target_key] = _split_leaf_tensor(leaf, tensor, n, idx, rule.dim) + + +def apply_reverse_rules( + state_dict: dict[str, torch.Tensor], + split_rules: list[SplitRule], + rename_rules: list[RenameRule], +) -> dict[str, torch.Tensor]: + """Apply quant-aware reverse conversion: splits first, then renames. + + Splits run on the in-memory (post-conversion) names; renames then map the + resulting keys back to the original hub names. Renames are applied in order. + """ + out = dict(state_dict) + for rule in split_rules: + _apply_split_rule(out, rule) + + compiled = [(re.compile(r.pattern), r.repl) for r in rename_rules] + renamed: dict[str, torch.Tensor] = {} + for key, value in out.items(): + new_key = key + for pattern, repl in compiled: + new_key = pattern.sub(repl, new_key) + if new_key in renamed: + raise QuantConversionUnsupportedError(f"rename collision on '{new_key}'") + renamed[new_key] = value + return renamed + + +def revert_weight_conversion_quant_aware(model, state_dict: dict[str, torch.Tensor]): + """Reverse a transformers conversion_mapping on a quantized state dict. 
+ + Builds reverse rules from the model's conversion mapping and applies them + carrying companion scale tensors. Raises :class:`QuantConversionUnsupportedError` + when the mapping uses an op that cannot be reversed quant-aware yet, so the + caller can fall back to the legacy behavior. + """ + split_rules, rename_rules, expert_fused_leaves = _build_reverse_rules(model) + if not split_rules and not rename_rules: + return state_dict + _assert_experts_pre_expanded(state_dict, expert_fused_leaves) + return apply_reverse_rules(state_dict, split_rules, rename_rules) + + +def build_reverse_name_mapper(model): + """Build a ``str -> str`` mapper that applies the quant-aware reverse *rename* rules. + + The exported weight tensors are reverted to the original hub names by + :func:`revert_weight_conversion_quant_aware`, but the quantization config's module + references (``exclude_modules`` and, for mixed precision, ``quantized_layers`` keys) + are built from the in-memory module names and would otherwise stay in the + post-conversion namespace -- so a deployment loader matching those patterns against + the (reverted) hub-named modules finds no match, silently loads an excluded BF16 + layer as quantized, and fails. Applying the same rename rules to those name strings + keeps them aligned with the weights. Only the rename rules apply (splits act on + tensors, not names). + + Returns ``None`` when no renaming applies. Raises + :class:`QuantConversionUnsupportedError` when the mapping can't be reversed, so the + caller can keep the in-memory names for BOTH weights and config (mutually consistent). + """ + _, rename_rules, _ = _build_reverse_rules(model) + if not rename_rules: + return None + compiled = [(re.compile(r.pattern), r.repl) for r in rename_rules] + # The rename patterns are anchored on full weight keys and use ``.`` (any char) as a + # path separator, so a trailing glob wildcard in an exclude pattern would be consumed + # (e.g. 
``...mlp.shared_experts.`` -> ``...`` would eat the ``*``). Append a sentinel + # path segment so container renames whose pattern ends in ``.`` match the sentinel's + # separator, then strip it and restore the wildcard. + _sentinel = ".\x00modelopt_name_sentinel" + + def _apply(text: str) -> str: + for pattern, repl in compiled: + text = pattern.sub(repl, text) + return text + + def _map(name: str) -> str: + base, suffix = name, "" + if name.endswith(".*"): + base, suffix = name[:-2], ".*" + elif name.endswith("*"): + base, suffix = name[:-1], "*" + mapped = _apply(base + _sentinel) + mapped = mapped.removesuffix(_sentinel) + return mapped + suffix + + return _map + + +def revert_quant_config_names(quantization: dict, mapper) -> None: + """Revert ``exclude_modules`` / ``quantized_layers`` keys to hub names, in place. + + ``mapper`` is the callable from :func:`build_reverse_name_mapper` (a no-op when + ``None``). Applies to the ModelOpt ``{"quantization": {...}}`` sub-dict before it is + written / format-converted, so both ``hf_quant_config.json`` and the embedded + ``config.json`` ``quantization_config`` inherit the reverted names. + """ + if mapper is None or not isinstance(quantization, dict): + return + exclude = quantization.get("exclude_modules") + if exclude: + quantization["exclude_modules"] = [mapper(e) for e in exclude] + quantized_layers = quantization.get("quantized_layers") + if isinstance(quantized_layers, dict) and quantized_layers: + quantization["quantized_layers"] = {mapper(k): v for k, v in quantized_layers.items()} + + +def _assert_experts_pre_expanded( + state_dict: dict[str, torch.Tensor], expert_fused_leaves: list[str] +) -> None: + """Guard the expert rename path against experts that were not pre-expanded. + + The expert reverse is emitted as key renames anchored on the per-expert index + (``.experts..``). 
If ModelOpt did not expand the fused/stacked experts, + a key like ``.experts.gate_up_proj`` (a 3-D ``[E, ...]`` tensor) survives: no + per-expert rename matches it, so it would ship unrenamed under the wrong name. + Mirror the split path's 3-D guard and raise so the caller falls back to legacy + (in-memory-name) export instead of emitting a silently mis-named checkpoint. + """ + if not expert_fused_leaves: + return + fused = re.compile( + r"\.experts\.(?:" + "|".join(re.escape(leaf) for leaf in expert_fused_leaves) + r")(?:\.|$)" + ) + for key, tensor in state_dict.items(): + if fused.search(key) or (".experts." in key and getattr(tensor, "ndim", 0) >= 3): + raise QuantConversionUnsupportedError( + f"experts not pre-expanded (stacked/fused expert tensor '{key}'); " + "quant-aware reverse conversion cannot rename it" + ) + + +def _build_reverse_rules(model) -> tuple[list[SplitRule], list[RenameRule], list[str]]: + """Derive reverse rules from the model's transformers conversion mapping. + + Returns ``(split_rules, rename_rules, expert_fused_leaves)``; the last is the set + of in-memory fused expert leaf names, used to guard against experts that were not + pre-expanded. Returns empty lists when no mapping applies (export unchanged). Uses + transformers' own ``reverse_transform()`` to get correctly-reversed name patterns + (so anchored regex renamings reverse properly), then translates them: + + * ``WeightRenaming`` -> :class:`RenameRule` (carries scale siblings for free). + * Expert ``WeightConverter`` (reverse contains ``SplitModulelist``): ModelOpt's + export already expands fused experts into per-expert 2-D linears, so only the + per-expert leaf names need mapping back (e.g. ``gate_proj`` -> ``w1``). Emitted + as rename rules -- no tensor manipulation. + * Dense fusing ``WeightConverter`` (reverse is ``Chunk`` only): the fused tensor + survives in the state dict, so it is un-fused via a :class:`SplitRule`. 
+ + Raises :class:`QuantConversionUnsupportedError` for any op shape not covered, so + the caller falls back to the legacy (in-memory-name) behavior. + """ + try: + conversions = getattr(model, "_weight_conversions", None) + if conversions is None: + from transformers.conversion_mapping import get_model_conversion_mapping + + conversions = get_model_conversion_mapping(model, add_legacy=False) + except Exception as exc: # transformers without conversion_mapping, or API drift + raise QuantConversionUnsupportedError(f"could not read conversion mapping: {exc}") from exc + + if not conversions: + return [], [], [] + + try: + from transformers.core_model_loading import ( + Chunk, + SplitModulelist, + WeightConverter, + WeightRenaming, + ) + except Exception as exc: # transformers too old / API drift -> fall back to legacy names + raise QuantConversionUnsupportedError( + f"transformers.core_model_loading unavailable: {exc}" + ) from exc + + split_rules: list[SplitRule] = [] + # WeightRenamings and expert-leaf (converter-derived) renames are collected + # separately so they can be ordered correctly on the save path -- see the + # ``rename_rules`` assembly below. + weight_renamings: list[RenameRule] = [] + leaf_renamings: list[RenameRule] = [] + # In-memory fused expert leaf names (e.g. ``gate_up_proj``, ``down_proj``). Used by + # the caller to detect experts that were NOT pre-expanded (stacked 3-D tensors), + # which the per-expert-index leaf renames cannot rewrite. 
+ expert_fused_leaves: list[str] = [] + for conv in conversions: + rev = conv.reverse_transform() # hub<-in-memory; reversed name patterns + ops + if isinstance(rev, WeightRenaming): + for pattern, repl in zip(_as_list(rev.source_patterns), _as_list(rev.target_patterns)): + weight_renamings.append(RenameRule(pattern=pattern, repl=repl)) + elif isinstance(rev, WeightConverter): + ops = list(rev.operations) + if any(isinstance(op, SplitModulelist) for op in ops): + # Expert converter: ModelOpt already un-stacked/un-fused experts to + # per-expert 2-D linears, so only per-expert leaf names remain to map. + leaf_renamings.extend(_expert_leaf_renames(rev)) + expert_fused_leaves.append(_leaf(_as_list(rev.source_patterns)[0])) + elif ops and all(isinstance(op, Chunk) for op in ops): + # Dense fused linear survives in the state dict -> un-fuse (split). + split_rules.append(_dense_split_rule(rev, ops)) + else: + raise QuantConversionUnsupportedError( + f"unsupported reverse ops: {[type(o).__name__ for o in ops]}" + ) + else: + raise QuantConversionUnsupportedError(f"unsupported conversion: {type(rev).__name__}") + + # Save-path order mirrors transformers' ``rename_source_key``: converters act + # first, then WeightRenamings. Crucially, transformers *loads* by chaining the + # renamings in list order -- a component-reordering rename (e.g. + # ``language_model.model`` -> ``model.language_model``) fires before a rename that + # anchors on the resulting adjacency (e.g. + # ``.language_model.layers.N.mlp.experts.`` -> ``.block_sparse_moe.experts.``). + # The reverse must therefore apply WeightRenamings in *reverse* list order so the + # reorder rename runs last and does not destroy the anchor the MoE container/gate + # renames rely on. Expert leaf renames act on disjoint ``.experts..`` + # substrings and are applied first. 
+ rename_rules = leaf_renamings + list(reversed(weight_renamings)) + return split_rules, rename_rules, expert_fused_leaves + + +# ModelOpt's export splits a fused ``gate_up_proj`` into these per-expert linears, +# in this order (see modelopt.torch.export.layer_utils.get_expert_linear_names). +_FUSED_EXPERT_PART_NAMES = {"gate_up_proj": ["gate_proj", "up_proj"]} + + +def _expert_leaf_renames(rev) -> list[RenameRule]: + """Per-expert leaf renames for an expert converter (ModelOpt pre-expands experts). + + ``rev`` reverses hub<-in-memory, so ``rev.source_patterns`` is the fused in-memory + leaf (e.g. ``.experts.gate_up_proj``) and ``rev.target_patterns`` the hub leaves + (e.g. ``.experts.*.w1.weight``, ``.experts.*.w3.weight``). ModelOpt exports the + fused leaf as per-expert parts, mapped back to the hub leaves positionally. + """ + src_leaf = _leaf(_as_list(rev.source_patterns)[0]) + hub_leaves = [_leaf(t) for t in _as_list(rev.target_patterns)] + part_leaves = _FUSED_EXPERT_PART_NAMES.get(src_leaf, [src_leaf]) + if len(part_leaves) != len(hub_leaves): + raise QuantConversionUnsupportedError( + f"expert converter arity mismatch: {part_leaves} vs {hub_leaves}" + ) + return [ + RenameRule(rf"(\.experts\.\d+\.){re.escape(part)}\b", rf"\g<1>{hub}") + for part, hub in zip(part_leaves, hub_leaves) + ] + + +def _dense_split_rule(rev, ops) -> SplitRule: + """Un-fuse a dense (non-expert) fused linear that survives in the state dict.""" + fused = _leaf_suffix(_as_list(rev.source_patterns)[0]) + parts = tuple(_leaf_suffix(t) for t in _as_list(rev.target_patterns)) + dim = next((op.dim for op in ops if hasattr(op, "dim")), 0) + return SplitRule(fused_suffix=fused, part_suffixes=parts, dim=dim) + + +def _as_list(x) -> list: + return list(x) if isinstance(x, (list, tuple)) else [x] + + +def _leaf(pattern: str) -> str: + """Bare leaf name from a conversion pattern, e.g. 
``.experts.*.w1.weight`` -> ``w1``.""" + p = pattern + for suffix in _LEAF_SUFFIXES: + if p.endswith(suffix): + p = p[: -len(suffix)] + break + return p.rstrip(".*").rsplit(".", 1)[-1] + + +def _leaf_suffix(pattern: str) -> str: + """Leaf name as a module suffix, e.g. ``.gate_proj``.""" + return "." + _leaf(pattern) diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py index 8bc92ed5eb9..64dfb5e12c2 100644 --- a/modelopt/torch/export/unified_export_hf.py +++ b/modelopt/torch/export/unified_export_hf.py @@ -91,6 +91,11 @@ from .model_utils import _reorder_canonical_first, get_language_model_from_vl, is_multimodal_model from .moe_utils import _export_fused_experts from .plugins import SpeculativeDecodingExporter, has_spec_opt, sanitize_hf_config_for_deployment +from .quant_aware_conversion import ( + build_reverse_name_mapper, + revert_quant_config_names, + revert_weight_conversion_quant_aware, +) from .quant_utils import ( fuse_prequant_layernorm, fuse_prequant_to_linear, @@ -1333,9 +1338,9 @@ def _export_diffusers_checkpoint( # TODO: Remove this workaround once HuggingFace fixes revert_weight_conversion to handle -# scalar (0-d) tensors. The bug is in transformers' Chunk.convert() which calls -# tensor.size(self.dim) on quantization scale buffers that are 0-d scalars, causing -# IndexError. Confirmed still present in transformers 5.2.0. +# scalar (0-d) tensors. transformers' Chunk.convert() calls torch.chunk() on quantization +# scale buffers that are 0-d scalars, raising RuntimeError ("chunk expects at least a +# 1-dimensional tensor"). Confirmed in transformers 5.12.0. 
# See: transformers/core_model_loading.py, Chunk.convert() def _revert_weight_conversion_noop(model: Any, state_dict: dict) -> dict: """No-op replacement for transformers' revert_weight_conversion.""" @@ -1358,7 +1363,7 @@ def _try_patch_module(mod_path: str) -> tuple[Any, Any] | None: def _patch_revert_weight_conversion() -> list[tuple[Any, Any]]: - """Patch revert_weight_conversion in transformers to avoid IndexError on scalar tensors.""" + """Patch revert_weight_conversion in transformers to avoid RuntimeError on scalar tensors.""" patches: list[tuple[Any, Any]] = [] for mod_path in [ "transformers.core_model_loading", @@ -1452,6 +1457,34 @@ def export_hf_checkpoint( try: post_state_dict, hf_quant_config = _export_transformers_checkpoint(model, dtype, **kwargs) + # Remove hf_quantizer from model so post_state_dict can be exported. + if getattr(model, "hf_quantizer", None) is not None: + model.hf_quantizer = None + + export_state_dict = {**post_state_dict, **(extra_state_dict or {})} + + # transformers may have applied a load-time conversion_mapping (fused gate_up_proj, + # renamed MoE leaves, reordered model/language_model prefix), so the in-memory names + # differ from the original hub checkpoint. Reverse it quantization-aware so exported + # tensor names stay aligned with the hub checkpoint (the unified-checkpoint contract). + # transformers' own revert_weight_conversion errors on 0-d scalar scale tensors, so we + # do it here. The same rename is applied to the quant-config module references + # (exclude_modules / quantized_layers keys) so a deployment loader matches them against + # the reverted hub-named modules (otherwise an excluded BF16 layer is loaded as quantized + # and fails). Best-effort and atomic: any failure (an op we cannot reverse yet, + # transformers API drift, unexpected shapes) falls back to the in-memory names for BOTH + # weights and config so they stay mutually consistent. 
+ try: + name_mapper = build_reverse_name_mapper(model) + export_state_dict = revert_weight_conversion_quant_aware(model, export_state_dict) + if name_mapper is not None and hf_quant_config: + revert_quant_config_names(hf_quant_config.get("quantization", {}), name_mapper) + except Exception as exc: + warnings.warn( + f"Quant-aware reverse weight conversion skipped ({exc}); exported tensor " + "names may not match the original HF hub checkpoint." + ) + # Only treat the export as quantized when at least one quant_algo field is set. # get_quant_config always returns a dict (even for sparsity-only or unmodified models), # so emitting hf_quant_config.json unconditionally produces a file with @@ -1472,15 +1505,10 @@ def export_hf_checkpoint( else: hf_quant_config = None - # Remove hf_quantizer from model so post_state_dict can be exported. - if getattr(model, "hf_quantizer", None) is not None: - model.hf_quantizer = None - - # Save model - # Temporarily disable revert_weight_conversion if available — it doesn't handle - # quantized state dicts (scalar scale tensors have 0 dimensions, causing IndexError). - # We must patch both the source module and the importing module since - # modeling_utils does `from core_model_loading import revert_weight_conversion`. + # Keep transformers' own revert_weight_conversion disabled (the quant-aware reverse + # above replaces it): it can't handle quantized state dicts (RuntimeError on 0-d scalar + # scale tensors). Patch both the source and importing module since modeling_utils does + # `from core_model_loading import revert_weight_conversion`. 
_patches = _patch_revert_weight_conversion() _sanitize_generation_config_for_save(model) @@ -1488,7 +1516,7 @@ def export_hf_checkpoint( try: model.save_pretrained( export_dir, - state_dict={**post_state_dict, **(extra_state_dict or {})}, + state_dict=export_state_dict, save_modelopt_state=save_modelopt_state, max_shard_size=max_shard_size, ) diff --git a/tests/gpu/torch/export/test_quant_aware_conversion_gpu.py b/tests/gpu/torch/export/test_quant_aware_conversion_gpu.py new file mode 100644 index 00000000000..85b71d3f588 --- /dev/null +++ b/tests/gpu/torch/export/test_quant_aware_conversion_gpu.py @@ -0,0 +1,105 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""End-to-end GPU test: unified HF export produces original-hub-aligned tensor names. + +Uses a tiny Mixtral, whose transformers ``conversion_mapping`` fuses/renames MoE +experts (``block_sparse_moe.experts.*.w{1,2,3}`` <-> in-memory +``mlp.experts.gate_up_proj``) — the same machinery larger MoE VLMs (e.g. MiniMax-M3) +use. The exported quantized checkpoint's tensor names must match the canonical hub +names obtained from transformers' own ``revert_weight_conversion`` on the reference +(unquantized) model. 
+""" + +import glob +import os +import tempfile + +import pytest +import torch + +pytestmark = pytest.mark.skipif(not torch.cuda.is_available(), reason="requires a GPU") + +_SCALE_SUFFIXES = (".weight_scale", ".weight_scale_2", ".weight_scale_inv", ".input_scale") + + +def _tiny_mixtral_config(): + from transformers import MixtralConfig + + cfg = MixtralConfig( + hidden_size=64, + intermediate_size=128, + num_hidden_layers=2, + num_attention_heads=4, + num_key_value_heads=2, + num_local_experts=4, + num_experts_per_tok=2, + vocab_size=320, + max_position_embeddings=128, + ) + cfg.architectures = ["MixtralForCausalLM"] + return cfg + + +def test_export_tensor_names_match_hub_after_conversion_reverse(): + pytest.importorskip("transformers") + from transformers import MixtralForCausalLM + + try: + from transformers.conversion_mapping import get_checkpoint_conversion_mapping + from transformers.core_model_loading import revert_weight_conversion + except ImportError: + pytest.skip("transformers build has no conversion_mapping API") + if not get_checkpoint_conversion_mapping("mixtral"): + pytest.skip("transformers build has no mixtral conversion_mapping") + + import modelopt.torch.quantization as mtq + from modelopt.torch.export import export_hf_checkpoint + + cfg = _tiny_mixtral_config() + + # Canonical hub names: transformers' own reverse on the unquantized reference. 
+ ref = MixtralForCausalLM(cfg) + hub_names = set(revert_weight_conversion(ref, ref.state_dict()).keys()) + # sanity: reference really is fused/renamed in memory + assert any(".block_sparse_moe.experts.0.w1.weight" in n for n in hub_names) + + model = MixtralForCausalLM(cfg).to("cuda", torch.bfloat16).eval() + ids = torch.randint(0, cfg.vocab_size, (2, 16), device="cuda") + + def forward_loop(m): + for _ in range(4): + m(ids) + + model = mtq.quantize(model, mtq.NVFP4_DEFAULT_CFG, forward_loop) + + with tempfile.TemporaryDirectory() as export_dir: + with torch.inference_mode(): + export_hf_checkpoint(model, export_dir=export_dir) + exported = set() + for f in glob.glob(os.path.join(export_dir, "*.safetensors")): + from safetensors import safe_open + + with safe_open(f, framework="pt") as sf: + exported.update(sf.keys()) + + non_scale = {k for k in exported if not any(k.endswith(s) for s in _SCALE_SUFFIXES)} + # Every exported weight carries its original hub name; nothing renamed/left in-memory. + assert non_scale == hub_names, ( + f"missing={sorted(hub_names - non_scale)[:5]} extra={sorted(non_scale - hub_names)[:5]}" + ) + # Experts specifically use the hub layout, not the fused in-memory names. + assert any(".block_sparse_moe.experts.0.w1.weight" in k for k in non_scale) + assert not any(".mlp.experts.gate_up_proj" in k for k in exported) diff --git a/tests/unit/torch/export/test_quant_aware_conversion.py b/tests/unit/torch/export/test_quant_aware_conversion.py new file mode 100644 index 00000000000..ac6df25e6dc --- /dev/null +++ b/tests/unit/torch/export/test_quant_aware_conversion.py @@ -0,0 +1,343 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Unit tests for quant-aware reverse weight conversion (CPU, no GPU needed). + +Tensor shapes mirror a real NVFP4 linear from the MiniMax-M3 checkpoint: ``weight`` +uint8 ``[out, in//2]``, ``weight_scale`` ``[out, in//16]``, ``weight_scale_2`` / +``input_scale`` 0-d scalars. The reverse logic is dtype-agnostic, so ``weight_scale`` +uses float32 here (real checkpoints use float8_e4m3, whose CPU ops are not portable +across platforms) — only shapes and the scalar-vs-blocked distinction matter. +""" + +import types + +import pytest +import torch + +from modelopt.torch.export.quant_aware_conversion import ( + QuantConversionUnsupportedError, + RenameRule, + SplitRule, + _assert_experts_pre_expanded, + apply_reverse_rules, + build_reverse_name_mapper, + revert_quant_config_names, + revert_weight_conversion_quant_aware, +) + +BLOCK = 16 + + +def _nvfp4_linear(module: str, out: int, in_features: int) -> dict[str, torch.Tensor]: + """Synthetic NVFP4 quantized-linear tensor group keyed under ``module``.""" + return { + f"{module}.weight": torch.randint(0, 255, (out, in_features // 2), dtype=torch.uint8), + f"{module}.weight_scale": torch.randn(out, in_features // BLOCK), + f"{module}.weight_scale_2": torch.tensor(0.037, dtype=torch.float32), # 0-d + f"{module}.input_scale": torch.tensor(1.0, dtype=torch.float32), # 0-d + } + + +def test_rename_carries_scale_siblings(): + """A module rename rewrites weight + all scale siblings with identical values.""" + sd = _nvfp4_linear("model.language_model.layers.10.mlp.experts.40.gate_proj", 8, 16) + rules = [ + 
RenameRule(r"\.mlp\.experts\.", ".block_sparse_moe.experts."), + RenameRule(r"(\.block_sparse_moe\.experts\.\d+\.)gate_proj", r"\1w1"), + RenameRule(r"^model\.language_model\.", "language_model.model."), + ] + out = apply_reverse_rules(sd, [], rules) + + base = "language_model.model.layers.10.block_sparse_moe.experts.40.w1" + assert set(out) == { + f"{base}.weight", + f"{base}.weight_scale", + f"{base}.weight_scale_2", + f"{base}.input_scale", + } + # values untouched: a rename rebinds the same tensor object (no copy) + for leaf in (".weight", ".weight_scale", ".weight_scale_2", ".input_scale"): + old = sd[f"model.language_model.layers.10.mlp.experts.40.gate_proj{leaf}"] + assert out[base + leaf] is old + + +def test_split_unfuses_dense_gate_up_with_scales(): + """gate_up_proj -> gate_proj + up_proj: weight/scale split on dim 0, scalars duplicated.""" + out_dim, in_dim = 8, 32 # fused output dim = 8 -> 4 per part + sd = _nvfp4_linear("m.layers.0.mlp.gate_up_proj", out_dim, in_dim) + rule = SplitRule(".gate_up_proj", (".gate_proj", ".up_proj"), dim=0) + + out = apply_reverse_rules(sd, [rule], []) + + g, u = "m.layers.0.mlp.gate_proj", "m.layers.0.mlp.up_proj" + assert set(out) == { + f"{g}.weight", + f"{g}.weight_scale", + f"{g}.weight_scale_2", + f"{g}.input_scale", + f"{u}.weight", + f"{u}.weight_scale", + f"{u}.weight_scale_2", + f"{u}.input_scale", + } + # weight/scale halved on dim 0; concatenating the parts reconstructs the original + assert out[f"{g}.weight"].shape == (out_dim // 2, in_dim // 2) + assert out[f"{g}.weight_scale"].shape == (out_dim // 2, in_dim // BLOCK) + assert torch.equal( + torch.cat([out[f"{g}.weight"], out[f"{u}.weight"]], dim=0), + sd["m.layers.0.mlp.gate_up_proj.weight"], + ) + # 0-d scalars duplicated to both parts + for part in (g, u): + assert out[f"{part}.weight_scale_2"].dim() == 0 + assert torch.equal( + out[f"{part}.weight_scale_2"], sd["m.layers.0.mlp.gate_up_proj.weight_scale_2"] + ) + + +def 
test_stacked_3d_expert_raises_unsupported(): + """A stacked [num_experts, out, in] weight must trigger the safe fallback path.""" + sd = { + "m.layers.0.mlp.experts.gate_up_proj.weight": torch.zeros(4, 8, 16, dtype=torch.uint8), + } + rule = SplitRule(".gate_up_proj", (".gate_proj", ".up_proj"), dim=0) + with pytest.raises(QuantConversionUnsupportedError): + apply_reverse_rules(sd, [rule], []) + + +def test_non_divisible_split_raises(): + sd = {"m.mlp.gate_up_proj.weight": torch.zeros(7, 8, dtype=torch.uint8)} + rule = SplitRule(".gate_up_proj", (".gate_proj", ".up_proj"), dim=0) + with pytest.raises(QuantConversionUnsupportedError): + apply_reverse_rules(sd, [rule], []) + + +def test_end_to_end_minimax_m3_like_reversal(): + """Reverse a v1-style (post-conversion) M3 state dict back to hub names.""" + sd = {} + # dense MLP layer 0: fused gate_up + separate down + sd.update(_nvfp4_linear("model.language_model.layers.0.mlp.gate_up_proj", 8, 16)) + sd.update(_nvfp4_linear("model.language_model.layers.0.mlp.down_proj", 16, 8)) + # MoE layer 10: per-expert (already unfused) + router + sd.update(_nvfp4_linear("model.language_model.layers.10.mlp.experts.0.gate_proj", 8, 16)) + sd.update(_nvfp4_linear("model.language_model.layers.10.mlp.experts.0.up_proj", 8, 16)) + sd.update(_nvfp4_linear("model.language_model.layers.10.mlp.experts.0.down_proj", 16, 8)) + sd["model.language_model.layers.10.mlp.gate.weight"] = torch.randn(128, 6144) + sd["lm_head.weight"] = torch.randn(32, 16) + + split_rules = [SplitRule(".gate_up_proj", (".gate_proj", ".up_proj"), dim=0)] + rename_rules = [ + RenameRule(r"(\.experts\.\d+\.)gate_proj", r"\1w1"), + RenameRule(r"(\.experts\.\d+\.)up_proj", r"\1w3"), + RenameRule(r"(\.experts\.\d+\.)down_proj", r"\1w2"), + RenameRule(r"\.mlp\.experts\.", ".block_sparse_moe.experts."), + RenameRule(r"\.mlp\.gate\.", ".block_sparse_moe.gate."), + RenameRule(r"^model\.language_model\.", "language_model.model."), + RenameRule(r"^lm_head\.", 
"language_model.lm_head."), + ] + out = apply_reverse_rules(sd, split_rules, rename_rules) + + expected = { + # dense un-fused, still under mlp + "language_model.model.layers.0.mlp.gate_proj", + "language_model.model.layers.0.mlp.up_proj", + "language_model.model.layers.0.mlp.down_proj", + # experts renamed to block_sparse_moe + w1/w3/w2 + "language_model.model.layers.10.block_sparse_moe.experts.0.w1", + "language_model.model.layers.10.block_sparse_moe.experts.0.w3", + "language_model.model.layers.10.block_sparse_moe.experts.0.w2", + } + got_modules = {k.rsplit(".", 1)[0] for k in out if ".experts." in k or ".mlp." in k} + assert expected <= got_modules + assert "language_model.model.layers.10.block_sparse_moe.gate.weight" in out + assert "language_model.lm_head.weight" in out + # no leftover in-memory names + assert not any(k.startswith("model.language_model") for k in out) + assert not any(".gate_up_proj" in k for k in out) + + +def test_build_reverse_rules_from_mixtral_conversion_mapping_cpu(): + """Derive rules from a real transformers conversion mapping (CPU, no quantize). + + Exercises ``revert_weight_conversion_quant_aware`` / ``_build_reverse_rules``: + a ModelOpt-expanded per-expert state dict (in-memory ``mlp.experts.N.*`` names) + must revert to the hub layout (``block_sparse_moe.experts.N.w{1,2,3}``).
+ """ + pytest.importorskip("transformers") + from transformers import MixtralConfig, MixtralForCausalLM + + try: + from transformers.conversion_mapping import get_checkpoint_conversion_mapping + except ImportError: + pytest.skip("transformers build has no conversion_mapping API") + if not get_checkpoint_conversion_mapping("mixtral"): + pytest.skip("transformers build has no mixtral conversion_mapping") + + cfg = MixtralConfig( + hidden_size=32, + intermediate_size=64, + num_hidden_layers=1, + num_attention_heads=4, + num_key_value_heads=2, + num_local_experts=2, + num_experts_per_tok=2, + vocab_size=64, + max_position_embeddings=64, + ) + model = MixtralForCausalLM(cfg) + + p = "model.layers.0" + sd = {f"{p}.mlp.gate.weight": torch.randn(2, 32)} + for e in range(2): + sd.update(_nvfp4_linear(f"{p}.mlp.experts.{e}.gate_proj", 64, 32)) + sd.update(_nvfp4_linear(f"{p}.mlp.experts.{e}.up_proj", 64, 32)) + sd.update(_nvfp4_linear(f"{p}.mlp.experts.{e}.down_proj", 32, 64)) + + out = revert_weight_conversion_quant_aware(model, sd) + + # experts mapped to hub layout, with scale siblings carried along + for e in range(2): + base = f"{p}.block_sparse_moe.experts.{e}" + assert f"{base}.w1.weight" in out # gate_proj -> w1 + assert f"{base}.w3.weight" in out # up_proj -> w3 + assert f"{base}.w2.weight" in out # down_proj -> w2 + assert f"{base}.w1.weight_scale" in out + assert f"{base}.w1.weight_scale_2" in out + assert f"{p}.block_sparse_moe.gate.weight" in out + assert not any(".mlp.experts." in k for k in out) + + +def test_build_reverse_rules_orders_prefix_reorder_after_container(): + """WeightRenamings must reverse in reverse list order (M3 prefix-reorder bug). 
+ + transformers *loads* by chaining renamings in list order: a component-reordering + rename (``language_model.model`` -> ``model.language_model``) fires first, making + ``language_model`` adjacent to ``layers`` so a later container rename anchored on + that adjacency (``.language_model.layers.N.mlp.experts.`` -> + ``.block_sparse_moe.experts.``) can match. On the save path the reorder must run + *last*, else it moves ``language_model`` away from ``layers`` and the container + rename silently no-ops -- exporting MiniMax-M3 experts as ``mlp.experts.*`` instead + of the hub ``block_sparse_moe.experts.*``. Mixtral does not exercise this (no + prefix reorder), so this reproduces it with a minimal two-renaming mapping. + """ + pytest.importorskip("transformers.core_model_loading") + from transformers.core_model_loading import WeightRenaming + + # Forward (hub -> in-memory) renamings; ``reverse_transform`` flips them on save. + # Order matters: reorder is listed BEFORE the adjacency-anchored container rename, + # exactly as a real M3 conversion mapping lists them. + conversions = [ + WeightRenaming("^language_model.model.", "model.language_model."), + WeightRenaming( + ".language_model.layers.(\\d+).block_sparse_moe.experts.", + ".language_model.layers.\\1.mlp.experts.", + ), + ] + model = types.SimpleNamespace(_weight_conversions=conversions) + + # In-memory expert key (leaf already at ``w1``; isolates the container/prefix order). + sd = _nvfp4_linear("model.language_model.layers.10.mlp.experts.0.w1", 8, 16) + out = revert_weight_conversion_quant_aware(model, sd) + + base = "language_model.model.layers.10.block_sparse_moe.experts.0.w1" + assert set(out) == { + f"{base}.weight", + f"{base}.weight_scale", + f"{base}.weight_scale_2", + f"{base}.input_scale", + } + # Regression guard: the buggy reorder-first order leaves these in-memory fragments. + assert not any(k.startswith("model.language_model") for k in out) + assert not any(".mlp.experts." 
in k for k in out) + + +def test_split_collision_raises(): + """A split whose target key already exists must fail instead of overwriting.""" + sd = _nvfp4_linear("m.gate_up_proj", 8, 16) + sd["m.gate_proj.weight"] = torch.zeros(4, 16) # pre-existing split target + rule = SplitRule(".gate_up_proj", (".gate_proj", ".up_proj"), dim=0) + with pytest.raises(QuantConversionUnsupportedError, match="split collision"): + apply_reverse_rules(sd, [rule], []) + + +def test_stacked_experts_guard(): + """Experts not pre-expanded (stacked/fused 3-D leaf) must trigger the fallback. + + The per-expert-index leaf renames cannot rewrite a still-fused + ``.experts.gate_up_proj`` tensor, so it would ship mis-named; guard by raising. + """ + fused_leaves = ["gate_up_proj", "down_proj"] + + # Pre-expanded 2-D experts: no fused leaf present -> no raise. + ok = _nvfp4_linear("model.language_model.layers.10.mlp.experts.0.gate_proj", 8, 16) + _assert_experts_pre_expanded(ok, fused_leaves) + + # Still-fused stacked expert leaf (3-D) -> raise. + bad = {"model.language_model.layers.10.mlp.experts.gate_up_proj.weight": torch.zeros(2, 8, 16)} + with pytest.raises(QuantConversionUnsupportedError, match="not pre-expanded"): + _assert_experts_pre_expanded(bad, fused_leaves) + + # No expert converters in the mapping -> guard is a no-op even for 3-D tensors. + _assert_experts_pre_expanded(bad, []) + + +def test_revert_quant_config_names_mapper(): + """exclude_modules / quantized_layers keys revert to hub names, preserving wildcards. + + Regression for the bug where the reverse conversion renamed weight tensors to hub + names but left the quant-config module references in the in-memory namespace, so a + deployment loader matched none of the excludes and loaded an excluded BF16 layer as + quantized. Uses Mixtral's real mapping (``mlp.experts`` <-> ``block_sparse_moe.experts``). 
+ """ + pytest.importorskip("transformers.core_model_loading") + from transformers import MixtralConfig, MixtralForCausalLM + + model = MixtralForCausalLM( + MixtralConfig( + hidden_size=32, + intermediate_size=64, + num_hidden_layers=1, + num_attention_heads=4, + num_key_value_heads=2, + num_local_experts=2, + num_experts_per_tok=2, + vocab_size=64, + max_position_embeddings=64, + ) + ) + mapper = build_reverse_name_mapper(model) + assert mapper is not None + + quant = { + "quant_algo": "NVFP4", + "exclude_modules": [ + "model.layers.0.self_attn*", # no container rename -> unchanged, wildcard kept + "model.layers.0.mlp.experts.0*", # in-memory -> block_sparse_moe.experts, wildcard kept + "lm_head", + ], + "quantized_layers": {"model.layers.0.mlp.experts.0.w1": {"quant_algo": "NVFP4"}}, + } + revert_quant_config_names(quant, mapper) + assert quant["exclude_modules"] == [ + "model.layers.0.self_attn*", + "model.layers.0.block_sparse_moe.experts.0*", + "lm_head", + ] + assert "model.layers.0.block_sparse_moe.experts.0.w1" in quant["quantized_layers"] + # mapper(None) is a no-op + q2 = {"exclude_modules": ["x*"]} + revert_quant_config_names(q2, None) + assert q2["exclude_modules"] == ["x*"]