diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 5002918175d..9eddbc1d068 100755
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -44,6 +44,7 @@ Changelog
 
 - Fix ``ShapeInferenceError`` during ONNX INT8 + FP16 quantization (``--high_precision_dtype fp16``) of weakly-typed models (e.g. TensorFlow exports) that carry stale rank-0 ``graph.output`` shapes or ops such as ``TopK`` that ONNX's static shape inference cannot resolve. ``clear_stale_value_info`` now reconciles stale output shapes via symbolic shape inference (keeping every output's shape field populated), and AutoCast runs ONNX shape inference in strict mode and falls back to schema-based standalone type inference when it fails, so unresolved ops no longer leave tensors untyped.
 - Fused MoE expert auto-detection (``register_fused_experts_on_the_fly``) no longer requires an ``act_fn`` attribute. Some fused-expert modules (e.g. ``MiniMaxM3VLExperts``) apply a custom gated activation between the two ``F.linear`` calls instead of exposing ``act_fn``; they were silently skipped, leaving routed experts unquantized (an experts-only recipe matched nothing) and failing HF export with ``NotImplementedError``. ``_QuantFusedExperts`` is activation-agnostic (it only intercepts the two ``F.linear`` calls), so the requirement was unnecessary. This enables NVFP4/FP8 quantization and export for MiniMax-M2 / MiniMax-M3.
+- Fix unified HF export emitting transformers' *in-memory* (post-``conversion_mapping``) tensor names instead of the original model-hub names, breaking the unified-checkpoint contract (observed on MiniMax-M3: exported ``model.language_model.*`` / ``mlp.experts.*.gate_proj`` instead of hub ``language_model.model.*`` / ``block_sparse_moe.experts.*.w{1,2,3}``). transformers' own save-side ``revert_weight_conversion`` is disabled by ModelOpt because it raises ``RuntimeError`` on 0-d scalar scale tensors, so a new quant-aware reverse conversion (``modelopt/torch/export/quant_aware_conversion.py``) derives rename/split rules from the model's conversion mapping via transformers' ``reverse_transform()`` and carries each weight's companion scale tensors (``weight_scale``, ``weight_scale_2``, ``input_scale``, ``weight_scale_inv``, ``bias``) through the renames and un-fusions, so quantized exports round-trip to the hub names. Any mapping op that cannot be reversed quant-aware yet (e.g. still-stacked fused experts) falls back to the previous in-memory names instead of aborting the export.
 
 0.45 (2026-07-02)
 ^^^^^^^^^^^^^^^^^
diff --git a/examples/hf_ptq/example_utils.py b/examples/hf_ptq/example_utils.py
index 9e8dea5f107..8b118916601 100755
--- a/examples/hf_ptq/example_utils.py
+++ b/examples/hf_ptq/example_utils.py
@@ -912,19 +912,47 @@ def _resolve_model_path(model_name_or_path: str, trust_remote_code: bool = False
 
 
 def copy_custom_model_files(source_path: str, export_path: str, trust_remote_code: bool = False):
-    """Copy custom model files (configuration_*.py, modeling_*.py, *.json, etc.) from source to export directory.
-
-    This function copies custom Python files and JSON configuration files that are needed for
-    models with custom code. It excludes config.json and model.safetensors.index.json as these
-    are typically handled separately by the model export process.
+    """Copy processor/tokenizer artifacts (and, with trust_remote_code, custom code) to export.
+
+    Processor and tokenizer *data* artifacts -- e.g. a VLM's ``preprocessor_config.json``,
+    ``merges.txt``/``vocab.json``, and the processor helper modules -- are needed by the
+    deployment stack (vLLM/SGLang) even when the model itself runs on native (non-remote)
+    transformers code. transformers 5.x restructured many VLM configs and no longer
+    re-saves these on ``save_pretrained`` for models loaded natively, so without copying
+    them a native-path export is missing e.g. ``preprocessor_config.json`` and fails to
+    load (``Can't load image processor``). These are copied regardless of
+    ``trust_remote_code``. Executable model/config code (``modeling*.py``,
+    ``configuration_*.py``, ``tokenization_*.py``, and other custom JSON) is only meaningful
+    with ``trust_remote_code`` and is copied only then. ``config.json`` and
+    ``model.safetensors.index.json`` are always skipped (handled by the export itself).
 
     Args:
         source_path: Path to the original model directory or HuggingFace model ID
         export_path: Path to the exported model directory
-        trust_remote_code: Whether trust_remote_code was used (only copy files if True)
+        trust_remote_code: Whether trust_remote_code was used (gates the executable code files)
     """
-    if not trust_remote_code:
-        return
+    # Deployment-critical processor/tokenizer artifacts: safe to copy regardless of
+    # trust_remote_code (data + processor helpers, not model code).
+    always_copy_patterns = [
+        "preprocessor_config.json",
+        "processor_config.json",
+        "image_processing*.py",
+        "processing_*.py",
+        "video_processing*.py",
+        "feature_extraction_*.py",
+        "added_tokens.json",
+        "special_tokens_map.json",
+        "vocab.json",
+        "merges.txt",
+        "tokenizer.model",
+    ]
+    # Executable custom model/config code + other custom JSON: only used with trust_remote_code.
+    code_patterns = [
+        "configuration_*.py",
+        "modeling*.py",
+        "tokenization_*.py",
+        "*.json",
+    ]
 
     # Resolve the source path (handles both local paths and HF model IDs)
     resolved_source_path = _resolve_model_path(source_path, trust_remote_code)
@@ -946,24 +974,17 @@ def copy_custom_model_files(source_path: str, export_path: str, trust_remote_cod
         print(f"Warning: Export directory {export_path} does not exist")
         return
 
-    # Common patterns for custom model files that need to be copied
-    custom_file_patterns = [
-        "configuration_*.py",
-        "modeling*.py",
-        "tokenization_*.py",
-        "processing_*.py",
-        "image_processing*.py",
-        "feature_extraction_*.py",
-        "*.json",
-    ]
+    patterns = [*always_copy_patterns, *(code_patterns if trust_remote_code else [])]
 
-    copied_files = []
-    for pattern in custom_file_patterns:
+    copied_files: list[str] = []
+    for pattern in patterns:
         for file_path in source_dir.glob(pattern):
             if file_path.is_file():
                 # Skip config.json and model.safetensors.index.json as they're handled separately
                 if file_path.name in ["config.json", "model.safetensors.index.json"]:
                     continue
+                if file_path.name in copied_files:  # e.g. matched by both pattern lists
+                    continue
                 dest_path = export_dir / file_path.name
                 try:
                     shutil.copy2(file_path, dest_path)
diff --git a/modelopt/torch/export/quant_aware_conversion.py b/modelopt/torch/export/quant_aware_conversion.py
new file mode 100644
index 00000000000..516c0e90851
--- /dev/null
+++ b/modelopt/torch/export/quant_aware_conversion.py
@@ -0,0 +1,430 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Quantization-aware reverse weight conversion for unified HF export.
+
+Background
+----------
+``transformers`` may apply a ``conversion_mapping`` when loading a model, so the
+in-memory parameter names differ from the original model-hub checkpoint (e.g. fused
+``mlp.gate_up_proj``, renamed MoE leaves, reordered ``model``/``language_model``
+prefix). On save, ``transformers`` reverses this via ``revert_weight_conversion`` so
+the on-disk names match the hub checkpoint again.
+
+ModelOpt's unified export disables that reverse (it raises ``RuntimeError`` on 0-d
+scalar scale tensors such as ``weight_scale_2``/``input_scale``), so a quantized
+export emits the *in-memory* (post-conversion) names — violating the unified
+checkpoint contract that names stay aligned with the original hub checkpoint.
+
+This module performs the reverse in a quantization-aware way: it carries each
+weight's companion scale tensors (``weight_scale``, ``weight_scale_2``,
+``input_scale``, ``weight_scale_inv``, ``bias``) through the rename and un-fuse
+operations.
+
+Scope
+-----
+Two reverse primitives cover the conversion_mapping cases:
+
+* **Rename** — a key-level string substitution. Because a quantized linear stores
+  every tensor under ``<module>.<leaf>``, renaming the module substring rewrites the
+  weight and all its scale siblings together with no tensor manipulation.
+* **Split** — un-fuse an output-dim concatenation (e.g. dense ``gate_up_proj`` ->
+  ``gate_proj`` + ``up_proj``). ``weight``/``weight_scale``/``weight_scale_inv``/
+  ``bias`` are chunked along the fused (output) dim; 0-d scalar ``weight_scale_2``/
+  ``input_scale`` are duplicated to each part (they are per-tensor and shared).
+
+MoE experts need only **Rename**: ModelOpt's export already expands the fused,
+stacked in-memory experts (``experts.gate_up_proj`` of shape ``[E, 2F, H]``) into
+per-expert 2-D linears (``experts.<i>.gate_proj`` / ``up_proj`` / ``down_proj``)
+before save, so the reverse just maps those per-expert leaf names back to the hub
+leaves (e.g. ``gate_proj`` -> ``w1``, ``up_proj`` -> ``w3``, ``down_proj`` -> ``w2``).
+
+Reverse rules are derived from the model's conversion mapping via transformers'
+``reverse_transform()``. Any op shape not covered raises
+:class:`QuantConversionUnsupportedError` so the caller falls back to the legacy
+(in-memory-name) behavior rather than emit a silently-wrong checkpoint.
+"""
+
+import re
+from dataclasses import dataclass
+
+import torch
+
+__all__ = [
+    "QuantConversionUnsupportedError",
+    "RenameRule",
+    "SplitRule",
+    "apply_reverse_rules",
+    "build_reverse_name_mapper",
+    "revert_quant_config_names",
+    "revert_weight_conversion_quant_aware",
+]
+
+# Tensor leaves that belong to a single quantized linear module. A rename of the
+# parent module path applies uniformly to all of these.
+_LEAF_SUFFIXES = (
+    ".weight",
+    ".weight_scale",
+    ".weight_scale_2",
+    ".weight_scale_inv",
+    ".input_scale",
+    ".bias",
+)
+
+# Leaves that are per-tensor scalars (0-d) and must be *duplicated*, not split, when
+# a fused module is un-fused.
+_SCALAR_LEAF_SUFFIXES = (".weight_scale_2", ".input_scale")
+
+
+class QuantConversionUnsupportedError(Exception):
+    """Raised when a conversion op cannot be reversed quant-aware (caller falls back)."""
+
+
+@dataclass(frozen=True)
+class RenameRule:
+    """Reverse of a ``WeightRenaming``: ``re.sub(pattern, repl, key)`` on every key."""
+
+    pattern: str
+    repl: str
+
+
+@dataclass(frozen=True)
+class SplitRule:
+    """Reverse of an output-dim ``Concatenate``: un-fuse one module into ``parts``.
+
+    Args:
+        fused_suffix: module suffix of the fused tensor, e.g. ``".gate_up_proj"``.
+        part_suffixes: ordered replacements, e.g. ``(".gate_proj", ".up_proj")``.
+        dim: the fused (output) dim along which ``weight``/``weight_scale``/
+            ``weight_scale_inv``/``bias`` are chunked. NVFP4 ``weight`` is ``[out, in//2]``
+            and ``weight_scale`` is ``[out, in//block]`` so the output dim is ``0`` for both.
+    """
+
+    fused_suffix: str
+    part_suffixes: tuple[str, ...]
+    dim: int = 0
+
+
+def _split_leaf_tensor(leaf: str, tensor: torch.Tensor, n: int, idx: int, dim: int):
+    """Return the ``idx``-th of ``n`` parts of ``tensor`` for tensor leaf ``leaf``."""
+    if leaf in _SCALAR_LEAF_SUFFIXES or tensor.dim() == 0:
+        # Per-tensor scalar shared across the fused parts -> duplicate.
+        return tensor.clone()
+    size = tensor.size(dim)
+    if size % n != 0:
+        raise QuantConversionUnsupportedError(
+            f"cannot split leaf '{leaf}' of size {size} along dim {dim} into {n} parts"
+        )
+    return tensor.chunk(n, dim=dim)[idx].clone()
+
+
+def _apply_split_rule(state_dict: dict[str, torch.Tensor], rule: SplitRule) -> None:
+    """Un-fuse all modules matching ``rule.fused_suffix`` in place."""
+    n = len(rule.part_suffixes)
+    # Collect (module_path, leaf, key) for every tensor under a fused module.
+    fused_keys: list[tuple[str, str, str]] = []
+    for key in state_dict:
+        for leaf in _LEAF_SUFFIXES:
+            if key.endswith(rule.fused_suffix + leaf):
+                module = key[: -len(leaf)][: -len(rule.fused_suffix)]
+                fused_keys.append((module, leaf, key))
+                break
+
+    for module, leaf, key in fused_keys:
+        tensor = state_dict.pop(key)
+        # A 3-D expert tensor here means stacked experts (MergeModulelist) — out of scope.
+        if leaf == ".weight" and tensor.dim() >= 3:
+            raise QuantConversionUnsupportedError(
+                f"stacked 3-D expert tensor '{key}' (ndim={tensor.dim()}) is not supported; "
+                "un-stacking experts + their scales is a follow-up"
+            )
+        for idx, part in enumerate(rule.part_suffixes):
+            target_key = module + part + leaf
+            if target_key in state_dict:
+                raise QuantConversionUnsupportedError(f"split collision on '{target_key}'")
+            state_dict[target_key] = _split_leaf_tensor(leaf, tensor, n, idx, rule.dim)
+
+
+def apply_reverse_rules(
+    state_dict: dict[str, torch.Tensor],
+    split_rules: list[SplitRule],
+    rename_rules: list[RenameRule],
+) -> dict[str, torch.Tensor]:
+    """Apply quant-aware reverse conversion: splits first, then renames.
+
+    Splits run on the in-memory (post-conversion) names; renames then map the
+    resulting keys back to the original hub names. Renames are applied in order.
+    """
+    out = dict(state_dict)
+    for rule in split_rules:
+        _apply_split_rule(out, rule)
+
+    compiled = [(re.compile(r.pattern), r.repl) for r in rename_rules]
+    renamed: dict[str, torch.Tensor] = {}
+    for key, value in out.items():
+        new_key = key
+        for pattern, repl in compiled:
+            new_key = pattern.sub(repl, new_key)
+        if new_key in renamed:
+            raise QuantConversionUnsupportedError(f"rename collision on '{new_key}'")
+        renamed[new_key] = value
+    return renamed
+
+
+def revert_weight_conversion_quant_aware(model, state_dict: dict[str, torch.Tensor]):
+    """Reverse a transformers conversion_mapping on a quantized state dict.
+
+    Builds reverse rules from the model's conversion mapping and applies them
+    carrying companion scale tensors. Raises :class:`QuantConversionUnsupportedError`
+    when the mapping uses an op that cannot be reversed quant-aware yet, so the
+    caller can fall back to the legacy behavior.
+    """
+    split_rules, rename_rules, expert_fused_leaves = _build_reverse_rules(model)
+    if not split_rules and not rename_rules:
+        return state_dict
+    _assert_experts_pre_expanded(state_dict, expert_fused_leaves)
+    return apply_reverse_rules(state_dict, split_rules, rename_rules)
+
+
+def build_reverse_name_mapper(model):
+    """Build a ``str -> str`` mapper that applies the quant-aware reverse *rename* rules.
+
+    The exported weight tensors are reverted to the original hub names by
+    :func:`revert_weight_conversion_quant_aware`, but the quantization config's module
+    references (``exclude_modules`` and, for mixed precision, ``quantized_layers`` keys)
+    are built from the in-memory module names and would otherwise stay in the
+    post-conversion namespace -- so a deployment loader matching those patterns against
+    the (reverted) hub-named modules finds no match, silently loads an excluded BF16
+    layer as quantized, and fails. Applying the same rename rules to those name strings
+    keeps them aligned with the weights. Only the rename rules apply (splits act on
+    tensors, not names).
+
+    Returns ``None`` when no renaming applies. Raises
+    :class:`QuantConversionUnsupportedError` when the mapping can't be reversed, so the
+    caller can keep the in-memory names for BOTH weights and config (mutually consistent).
+    """
+    _, rename_rules, _ = _build_reverse_rules(model)
+    if not rename_rules:
+        return None
+    compiled = [(re.compile(r.pattern), r.repl) for r in rename_rules]
+    # The rename patterns are anchored on full weight keys and use ``.`` (any char) as a
+    # path separator, so a trailing glob wildcard in an exclude pattern would be consumed
+    # (e.g. ``...mlp.shared_experts.`` -> ``...`` would eat the ``*``). So strip the wildcard,
+    # append a sentinel path segment so container renames whose pattern ends in ``.`` match
+    # the sentinel's separator, then drop the sentinel and re-append the wildcard.
+    _sentinel = ".\x00modelopt_name_sentinel"
+
+    def _apply(text: str) -> str:
+        for pattern, repl in compiled:
+            text = pattern.sub(repl, text)
+        return text
+
+    def _map(name: str) -> str:
+        base, suffix = name, ""
+        if name.endswith(".*"):
+            base, suffix = name[:-2], ".*"
+        elif name.endswith("*"):
+            base, suffix = name[:-1], "*"
+        mapped = _apply(base + _sentinel)
+        mapped = mapped.removesuffix(_sentinel)
+        return mapped + suffix
+
+    return _map
+
+
+def revert_quant_config_names(quantization: dict, mapper) -> None:
+    """Revert ``exclude_modules`` / ``quantized_layers`` keys to hub names, in place.
+
+    ``mapper`` is the callable from :func:`build_reverse_name_mapper` (a no-op when
+    ``None``). Applies to the ModelOpt ``{"quantization": {...}}`` sub-dict before it is
+    written / format-converted, so both ``hf_quant_config.json`` and the embedded
+    ``config.json`` ``quantization_config`` inherit the reverted names.
+    """
+    if mapper is None or not isinstance(quantization, dict):
+        return
+    exclude = quantization.get("exclude_modules")
+    if exclude:
+        quantization["exclude_modules"] = [mapper(e) for e in exclude]
+    quantized_layers = quantization.get("quantized_layers")
+    if isinstance(quantized_layers, dict) and quantized_layers:
+        quantization["quantized_layers"] = {mapper(k): v for k, v in quantized_layers.items()}
+
+
+def _assert_experts_pre_expanded(
+    state_dict: dict[str, torch.Tensor], expert_fused_leaves: list[str]
+) -> None:
+    """Guard the expert rename path against experts that were not pre-expanded.
+
+    The expert reverse is emitted as key renames anchored on the per-expert index
+    (``.experts.<i>.<leaf>``). If ModelOpt did not expand the fused/stacked experts,
+    a key like ``.experts.gate_up_proj`` (a 3-D ``[E, ...]`` tensor) survives: no
+    per-expert rename matches it, so it would ship unrenamed under the wrong name.
+    Mirror the split path's 3-D guard and raise so the caller falls back to legacy
+    (in-memory-name) export instead of emitting a silently mis-named checkpoint.
+    """
+    if not expert_fused_leaves:
+        return
+    fused = re.compile(
+        r"\.experts\.(?:" + "|".join(re.escape(leaf) for leaf in expert_fused_leaves) + r")(?:\.|$)"
+    )
+    for key, tensor in state_dict.items():
+        if fused.search(key) or (".experts." in key and getattr(tensor, "ndim", 0) >= 3):
+            raise QuantConversionUnsupportedError(
+                f"experts not pre-expanded (stacked/fused expert tensor '{key}'); "
+                "quant-aware reverse conversion cannot rename it"
+            )
+
+
+def _build_reverse_rules(model) -> tuple[list[SplitRule], list[RenameRule], list[str]]:
+    """Derive reverse rules from the model's transformers conversion mapping.
+
+    Returns ``(split_rules, rename_rules, expert_fused_leaves)``; the last is the list
+    of in-memory fused expert leaf names, used to guard against experts that were not
+    pre-expanded. Returns empty lists when no mapping applies (export unchanged). Uses
+    transformers' own ``reverse_transform()`` to get correctly-reversed name patterns
+    (so anchored regex renamings reverse properly), then translates them:
+
+    * ``WeightRenaming`` -> :class:`RenameRule` (carries scale siblings for free).
+    * Expert ``WeightConverter`` (reverse contains ``SplitModulelist``): ModelOpt's
+      export already expands fused experts into per-expert 2-D linears, so only the
+      per-expert leaf names need mapping back (e.g. ``gate_proj`` -> ``w1``). Emitted
+      as rename rules -- no tensor manipulation.
+    * Dense fusing ``WeightConverter`` (reverse is ``Chunk`` only): the fused tensor
+      survives in the state dict, so it is un-fused via a :class:`SplitRule`.
+
+    Raises :class:`QuantConversionUnsupportedError` for any op shape not covered, so
+    the caller falls back to the legacy (in-memory-name) behavior.
+    """
+    try:
+        conversions = getattr(model, "_weight_conversions", None)
+        if conversions is None:
+            from transformers.conversion_mapping import get_model_conversion_mapping
+
+            conversions = get_model_conversion_mapping(model, add_legacy=False)
+    except Exception as exc:  # transformers without conversion_mapping, or API drift
+        raise QuantConversionUnsupportedError(f"could not read conversion mapping: {exc}") from exc
+
+    if not conversions:
+        return [], [], []
+
+    try:
+        from transformers.core_model_loading import (
+            Chunk,
+            SplitModulelist,
+            WeightConverter,
+            WeightRenaming,
+        )
+    except Exception as exc:  # transformers too old / API drift -> fall back to legacy names
+        raise QuantConversionUnsupportedError(
+            f"transformers.core_model_loading unavailable: {exc}"
+        ) from exc
+
+    split_rules: list[SplitRule] = []
+    # WeightRenamings and expert-leaf (converter-derived) renames are collected
+    # separately so they can be ordered correctly on the save path -- see the
+    # ``rename_rules`` assembly below.
+    weight_renamings: list[RenameRule] = []
+    leaf_renamings: list[RenameRule] = []
+    # In-memory fused expert leaf names (e.g. ``gate_up_proj``, ``down_proj``). Used by
+    # the caller to detect experts that were NOT pre-expanded (stacked 3-D tensors),
+    # which the per-expert-index leaf renames cannot rewrite.
+    expert_fused_leaves: list[str] = []
+    for conv in conversions:
+        rev = conv.reverse_transform()  # hub<-in-memory; reversed name patterns + ops
+        if isinstance(rev, WeightRenaming):
+            for pattern, repl in zip(_as_list(rev.source_patterns), _as_list(rev.target_patterns)):
+                weight_renamings.append(RenameRule(pattern=pattern, repl=repl))
+        elif isinstance(rev, WeightConverter):
+            ops = list(rev.operations)
+            if any(isinstance(op, SplitModulelist) for op in ops):
+                # Expert converter: ModelOpt already un-stacked/un-fused experts to
+                # per-expert 2-D linears, so only per-expert leaf names remain to map.
+                leaf_renamings.extend(_expert_leaf_renames(rev))
+                expert_fused_leaves.append(_leaf(_as_list(rev.source_patterns)[0]))
+            elif ops and all(isinstance(op, Chunk) for op in ops):
+                # Dense fused linear survives in the state dict -> un-fuse (split).
+                split_rules.append(_dense_split_rule(rev, ops))
+            else:
+                raise QuantConversionUnsupportedError(
+                    f"unsupported reverse ops: {[type(o).__name__ for o in ops]}"
+                )
+        else:
+            raise QuantConversionUnsupportedError(f"unsupported conversion: {type(rev).__name__}")
+
+    # Save-path order mirrors transformers' ``rename_source_key``: converters act
+    # first, then WeightRenamings. Crucially, transformers *loads* by chaining the
+    # renamings in list order -- a component-reordering rename (e.g.
+    # ``language_model.model`` -> ``model.language_model``) fires before a rename that
+    # anchors on the resulting adjacency (e.g.
+    # ``.language_model.layers.N.mlp.experts.`` -> ``.block_sparse_moe.experts.``).
+    # The reverse must therefore apply WeightRenamings in *reverse* list order so the
+    # reorder rename runs last and does not destroy the anchor the MoE container/gate
+    # renames rely on. Expert leaf renames act on disjoint ``.experts.<i>.<leaf>``
+    # substrings and are applied first.
+    rename_rules = leaf_renamings + list(reversed(weight_renamings))
+    return split_rules, rename_rules, expert_fused_leaves
+
+
+# ModelOpt's export splits a fused ``gate_up_proj`` into these per-expert linears,
+# in this order (see modelopt.torch.export.layer_utils.get_expert_linear_names).
+_FUSED_EXPERT_PART_NAMES = {"gate_up_proj": ["gate_proj", "up_proj"]}
+
+
+def _expert_leaf_renames(rev) -> list[RenameRule]:
+    """Per-expert leaf renames for an expert converter (ModelOpt pre-expands experts).
+
+    ``rev`` reverses hub<-in-memory, so ``rev.source_patterns`` is the fused in-memory
+    leaf (e.g. ``.experts.gate_up_proj``) and ``rev.target_patterns`` the hub leaves
+    (e.g. ``.experts.*.w1.weight``, ``.experts.*.w3.weight``). ModelOpt exports the
+    fused leaf as per-expert parts, mapped back to the hub leaves positionally.
+    """
+    src_leaf = _leaf(_as_list(rev.source_patterns)[0])
+    hub_leaves = [_leaf(t) for t in _as_list(rev.target_patterns)]
+    part_leaves = _FUSED_EXPERT_PART_NAMES.get(src_leaf, [src_leaf])
+    if len(part_leaves) != len(hub_leaves):
+        raise QuantConversionUnsupportedError(
+            f"expert converter arity mismatch: {part_leaves} vs {hub_leaves}"
+        )
+    return [
+        RenameRule(rf"(\.experts\.\d+\.){re.escape(part)}\b", rf"\g<1>{hub}")
+        for part, hub in zip(part_leaves, hub_leaves)
+    ]
+
+
+def _dense_split_rule(rev, ops) -> SplitRule:
+    """Un-fuse a dense (non-expert) fused linear that survives in the state dict."""
+    fused = _leaf_suffix(_as_list(rev.source_patterns)[0])
+    parts = tuple(_leaf_suffix(t) for t in _as_list(rev.target_patterns))
+    dim = next((op.dim for op in ops if hasattr(op, "dim")), 0)
+    return SplitRule(fused_suffix=fused, part_suffixes=parts, dim=dim)
+
+
+def _as_list(x) -> list:
+    return list(x) if isinstance(x, (list, tuple)) else [x]
+
+
+def _leaf(pattern: str) -> str:
+    """Bare leaf name from a conversion pattern, e.g. ``.experts.*.w1.weight`` -> ``w1``."""
+    p = pattern
+    for suffix in _LEAF_SUFFIXES:
+        if p.endswith(suffix):
+            p = p[: -len(suffix)]
+            break
+    return p.rstrip(".*").rsplit(".", 1)[-1]
+
+
+def _leaf_suffix(pattern: str) -> str:
+    """Leaf name as a module suffix, e.g. ``.gate_proj``."""
+    return "." + _leaf(pattern)
diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py
index 8bc92ed5eb9..64dfb5e12c2 100644
--- a/modelopt/torch/export/unified_export_hf.py
+++ b/modelopt/torch/export/unified_export_hf.py
@@ -91,6 +91,11 @@
 from .model_utils import _reorder_canonical_first, get_language_model_from_vl, is_multimodal_model
 from .moe_utils import _export_fused_experts
 from .plugins import SpeculativeDecodingExporter, has_spec_opt, sanitize_hf_config_for_deployment
+from .quant_aware_conversion import (
+    build_reverse_name_mapper,
+    revert_quant_config_names,
+    revert_weight_conversion_quant_aware,
+)
 from .quant_utils import (
     fuse_prequant_layernorm,
     fuse_prequant_to_linear,
@@ -1333,9 +1338,9 @@ def _export_diffusers_checkpoint(
 
 
 # TODO: Remove this workaround once HuggingFace fixes revert_weight_conversion to handle
-# scalar (0-d) tensors. The bug is in transformers' Chunk.convert() which calls
-# tensor.size(self.dim) on quantization scale buffers that are 0-d scalars, causing
-# IndexError. Confirmed still present in transformers 5.2.0.
+# scalar (0-d) tensors. transformers' Chunk.convert() calls torch.chunk() on quantization
+# scale buffers that are 0-d scalars, raising RuntimeError ("chunk expects at least a
+# 1-dimensional tensor"). Confirmed in transformers 5.12.0.
 # See: transformers/core_model_loading.py, Chunk.convert()
 def _revert_weight_conversion_noop(model: Any, state_dict: dict) -> dict:
     """No-op replacement for transformers' revert_weight_conversion."""
@@ -1358,7 +1363,7 @@ def _try_patch_module(mod_path: str) -> tuple[Any, Any] | None:
 
 
 def _patch_revert_weight_conversion() -> list[tuple[Any, Any]]:
-    """Patch revert_weight_conversion in transformers to avoid IndexError on scalar tensors."""
+    """Patch revert_weight_conversion in transformers to avoid RuntimeError on scalar tensors."""
     patches: list[tuple[Any, Any]] = []
     for mod_path in [
         "transformers.core_model_loading",
@@ -1452,6 +1457,34 @@ def export_hf_checkpoint(
     try:
         post_state_dict, hf_quant_config = _export_transformers_checkpoint(model, dtype, **kwargs)
 
+        # Remove hf_quantizer from model so post_state_dict can be exported.
+        if getattr(model, "hf_quantizer", None) is not None:
+            model.hf_quantizer = None
+
+        export_state_dict = {**post_state_dict, **(extra_state_dict or {})}
+
+        # transformers may have applied a load-time conversion_mapping (fused gate_up_proj,
+        # renamed MoE leaves, reordered model/language_model prefix), so the in-memory names
+        # differ from the original hub checkpoint. Reverse it quantization-aware so exported
+        # tensor names stay aligned with the hub checkpoint (the unified-checkpoint contract).
+        # transformers' own revert_weight_conversion errors on 0-d scalar scale tensors, so we
+        # do it here. The same rename is applied to the quant-config module references
+        # (exclude_modules / quantized_layers keys) so a deployment loader matches them against
+        # the reverted hub-named modules (otherwise an excluded BF16 layer is loaded as quantized
+        # and fails). Best-effort and atomic: any failure (an op we cannot reverse yet,
+        # transformers API drift, unexpected shapes) falls back to the in-memory names for BOTH
+        # weights and config so they stay mutually consistent.
+        try:
+            name_mapper = build_reverse_name_mapper(model)
+            export_state_dict = revert_weight_conversion_quant_aware(model, export_state_dict)
+            if name_mapper is not None and hf_quant_config:
+                revert_quant_config_names(hf_quant_config.get("quantization", {}), name_mapper)
+        except Exception as exc:
+            warnings.warn(
+                f"Quant-aware reverse weight conversion skipped ({exc}); exported tensor "
+                "names may not match the original HF hub checkpoint."
+            )
+
         # Only treat the export as quantized when at least one quant_algo field is set.
         # get_quant_config always returns a dict (even for sparsity-only or unmodified models),
         # so emitting hf_quant_config.json unconditionally produces a file with
@@ -1472,15 +1505,10 @@ def export_hf_checkpoint(
         else:
             hf_quant_config = None
 
-        # Remove hf_quantizer from model so post_state_dict can be exported.
-        if getattr(model, "hf_quantizer", None) is not None:
-            model.hf_quantizer = None
-
-        # Save model
-        # Temporarily disable revert_weight_conversion if available — it doesn't handle
-        # quantized state dicts (scalar scale tensors have 0 dimensions, causing IndexError).
-        # We must patch both the source module and the importing module since
-        # modeling_utils does `from core_model_loading import revert_weight_conversion`.
+        # Keep transformers' own revert_weight_conversion disabled (the quant-aware reverse
+        # above replaces it): it can't handle quantized state dicts (RuntimeError on 0-d scalar
+        # scale tensors). Patch both the source and importing module since modeling_utils does
+        # `from core_model_loading import revert_weight_conversion`.
         _patches = _patch_revert_weight_conversion()
 
         _sanitize_generation_config_for_save(model)
@@ -1488,7 +1516,7 @@ def export_hf_checkpoint(
         try:
             model.save_pretrained(
                 export_dir,
-                state_dict={**post_state_dict, **(extra_state_dict or {})},
+                state_dict=export_state_dict,
                 save_modelopt_state=save_modelopt_state,
                 max_shard_size=max_shard_size,
             )
diff --git a/tests/gpu/torch/export/test_quant_aware_conversion_gpu.py b/tests/gpu/torch/export/test_quant_aware_conversion_gpu.py
new file mode 100644
index 00000000000..85b71d3f588
--- /dev/null
+++ b/tests/gpu/torch/export/test_quant_aware_conversion_gpu.py
@@ -0,0 +1,105 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""End-to-end GPU test: unified HF export produces original-hub-aligned tensor names.
+
+Uses a tiny Mixtral, whose transformers ``conversion_mapping`` fuses/renames MoE
+experts (``block_sparse_moe.experts.*.w{1,2,3}`` <-> in-memory
+``mlp.experts.gate_up_proj``) — the same machinery larger MoE VLMs (e.g. MiniMax-M3)
+use. The exported quantized checkpoint's tensor names must match the canonical hub
+names obtained from transformers' own ``revert_weight_conversion`` on the reference
+(unquantized) model.
+"""
+
+import glob
+import os
+import tempfile
+
+import pytest
+import torch
+
+pytestmark = pytest.mark.skipif(not torch.cuda.is_available(), reason="requires a GPU")
+
+_SCALE_SUFFIXES = (".weight_scale", ".weight_scale_2", ".weight_scale_inv", ".input_scale")
+
+
+def _tiny_mixtral_config():  # small Mixtral config used by the export test in this file
+    from transformers import MixtralConfig  # lazy: the caller runs pytest.importorskip first
+
+    cfg = MixtralConfig(
+        hidden_size=64,
+        intermediate_size=128,
+        num_hidden_layers=2,
+        num_attention_heads=4,
+        num_key_value_heads=2,
+        num_local_experts=4,
+        num_experts_per_tok=2,
+        vocab_size=320,
+        max_position_embeddings=128,
+    )
+    cfg.architectures = ["MixtralForCausalLM"]  # presumably read at export time; TODO confirm
+    return cfg
+
+
+def test_export_tensor_names_match_hub_after_conversion_reverse():
+    """Quantize a tiny Mixtral to NVFP4, export it, and require hub-aligned tensor names."""
+    pytest.importorskip("transformers")
+    from safetensors import safe_open
+    from transformers import MixtralForCausalLM
+
+    try:
+        from transformers.conversion_mapping import get_checkpoint_conversion_mapping
+        from transformers.core_model_loading import revert_weight_conversion
+    except ImportError:
+        pytest.skip("transformers build has no conversion_mapping API")
+    if not get_checkpoint_conversion_mapping("mixtral"):
+        pytest.skip("transformers build has no mixtral conversion_mapping")
+
+    import modelopt.torch.quantization as mtq
+    from modelopt.torch.export import export_hf_checkpoint
+
+    cfg = _tiny_mixtral_config()
+
+    # Canonical hub names: transformers' own reverse on the unquantized reference.
+    ref = MixtralForCausalLM(cfg)
+    hub_names = set(revert_weight_conversion(ref, ref.state_dict()).keys())
+    # sanity: the reverted reference really uses the per-expert hub layout
+    assert any(".block_sparse_moe.experts.0.w1.weight" in n for n in hub_names)
+
+    model = MixtralForCausalLM(cfg).to("cuda", torch.bfloat16).eval()
+    ids = torch.randint(0, cfg.vocab_size, (2, 16), device="cuda")
+
+    def forward_loop(m):
+        for _ in range(4):
+            m(ids)
+
+    model = mtq.quantize(model, mtq.NVFP4_DEFAULT_CFG, forward_loop)
+
+    with tempfile.TemporaryDirectory() as export_dir:
+        with torch.inference_mode():
+            export_hf_checkpoint(model, export_dir=export_dir)
+        exported = set()
+        for shard in glob.glob(os.path.join(export_dir, "*.safetensors")):
+            with safe_open(shard, framework="pt") as sf:
+                exported.update(sf.keys())
+
+    non_scale = {k for k in exported if not k.endswith(_SCALE_SUFFIXES)}
+    # Every exported weight carries its original hub name; nothing renamed/left in-memory.
+    assert non_scale == hub_names, (
+        f"missing={sorted(hub_names - non_scale)[:5]} extra={sorted(non_scale - hub_names)[:5]}"
+    )
+    # Experts specifically use the hub layout, not the fused in-memory names.
+    assert any(".block_sparse_moe.experts.0.w1.weight" in k for k in non_scale)
+    assert not any(".mlp.experts.gate_up_proj" in k for k in exported)
diff --git a/tests/unit/torch/export/test_quant_aware_conversion.py b/tests/unit/torch/export/test_quant_aware_conversion.py
new file mode 100644
index 00000000000..ac6df25e6dc
--- /dev/null
+++ b/tests/unit/torch/export/test_quant_aware_conversion.py
@@ -0,0 +1,343 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Unit tests for quant-aware reverse weight conversion (CPU, no GPU needed).
+
+Tensor shapes mirror a real NVFP4 linear from the MiniMax-M3 checkpoint: ``weight``
+uint8 ``[out, in//2]``, ``weight_scale`` ``[out, in//16]``, ``weight_scale_2`` /
+``input_scale`` 0-d scalars. The reverse logic is dtype-agnostic, so ``weight_scale``
+uses float32 here (real checkpoints use float8_e4m3, whose CPU ops are not portable
+across platforms) — only shapes and the scalar-vs-blocked distinction matter.
+"""
+
+import types
+
+import pytest
+import torch
+
+from modelopt.torch.export.quant_aware_conversion import (
+    QuantConversionUnsupportedError,
+    RenameRule,
+    SplitRule,
+    _assert_experts_pre_expanded,
+    apply_reverse_rules,
+    build_reverse_name_mapper,
+    revert_quant_config_names,
+    revert_weight_conversion_quant_aware,
+)
+
+BLOCK = 16  # _nvfp4_linear gives weight_scale ``in_features // BLOCK`` columns
+
+
+def _nvfp4_linear(module: str, out: int, in_features: int) -> dict[str, torch.Tensor]:
+    """Synthetic NVFP4 linear under ``module``; weight bytes cover the full uint8 range."""
+    return {
+        f"{module}.weight": torch.randint(0, 256, (out, in_features // 2), dtype=torch.uint8),
+        f"{module}.weight_scale": torch.randn(out, in_features // BLOCK),
+        f"{module}.weight_scale_2": torch.tensor(0.037, dtype=torch.float32),  # 0-d
+        f"{module}.input_scale": torch.tensor(1.0, dtype=torch.float32),  # 0-d
+    }
+
+
+def test_rename_carries_scale_siblings():
+    """Renaming a module moves its weight and every scale sibling without copying them."""
+    old_module = "model.language_model.layers.10.mlp.experts.40.gate_proj"
+    new_module = "language_model.model.layers.10.block_sparse_moe.experts.40.w1"
+    leaves = (".weight", ".weight_scale", ".weight_scale_2", ".input_scale")
+    state = _nvfp4_linear(old_module, 8, 16)
+
+    # Listed as: container rename, expert-leaf rename, then the prefix reorder.
+    renames = [
+        RenameRule(r"\.mlp\.experts\.", ".block_sparse_moe.experts."),
+        RenameRule(r"(\.block_sparse_moe\.experts\.\d+\.)gate_proj", r"\1w1"),
+        RenameRule(r"^model\.language_model\.", "language_model.model."),
+    ]
+    reverted = apply_reverse_rules(state, [], renames)
+
+    # Exactly the four leaves survive, all under the hub module name.
+    assert set(reverted) == {new_module + leaf for leaf in leaves}
+
+    # Identity, not equality: a rename rebinds the same tensor object (no copy).
+    for leaf in leaves:
+        assert reverted[new_module + leaf] is state[old_module + leaf]
+
+
+def test_split_unfuses_dense_gate_up_with_scales():
+    """Un-fusing gate_up_proj gives gate_proj/up_proj halves on dim 0 with scalars copied."""
+    fused_rows, in_dim = 8, 32  # 8 fused output rows -> 4 per un-fused part
+    fused = "m.layers.0.mlp.gate_up_proj"
+    gate, up = "m.layers.0.mlp.gate_proj", "m.layers.0.mlp.up_proj"
+    leaves = ("weight", "weight_scale", "weight_scale_2", "input_scale")
+
+    # Fused in-memory projection, then the single un-fusion rule under test.
+    state = _nvfp4_linear(fused, fused_rows, in_dim)
+    unfuse = SplitRule(".gate_up_proj", (".gate_proj", ".up_proj"), dim=0)
+    result = apply_reverse_rules(state, [unfuse], [])
+
+    # Both parts carry all four leaves and no fused key is left behind. For
+    # input_scale this key check is the only assertion made in this test.
+    assert set(result) == {f"{part}.{leaf}" for part in (gate, up) for leaf in leaves}
+
+    # The blocked tensors of the gate part have half the fused rows.
+    half_rows = fused_rows // 2
+    assert result[f"{gate}.weight"].shape == (half_rows, in_dim // 2)
+    assert result[f"{gate}.weight_scale"].shape == (half_rows, in_dim // BLOCK)
+
+    # Concatenating the two weight halves on dim 0 reproduces the fused weight.
+    rejoined = torch.cat([result[f"{gate}.weight"], result[f"{up}.weight"]], dim=0)
+    assert torch.equal(rejoined, state[f"{fused}.weight"])
+
+    # The 0-d weight_scale_2 reaches both parts unchanged: still a scalar and
+    # equal in value to the fused module's scalar.
+    fused_scalar = state[f"{fused}.weight_scale_2"]
+    for part in (gate, up):
+        part_scalar = result[f"{part}.weight_scale_2"]
+        assert part_scalar.dim() == 0
+        assert torch.equal(part_scalar, fused_scalar)
+
+
+def test_stacked_3d_expert_raises_unsupported():
+    """Splitting a stacked 3-D ``[4, 8, 16]`` expert weight raises the unsupported error."""
+    stacked = torch.zeros(4, 8, 16, dtype=torch.uint8)  # 3-D: [num_experts, out, in]
+    state = {"m.layers.0.mlp.experts.gate_up_proj.weight": stacked}
+    unfuse = SplitRule(".gate_up_proj", (".gate_proj", ".up_proj"), dim=0)
+
+    with pytest.raises(QuantConversionUnsupportedError):
+        apply_reverse_rules(state, [unfuse], [])
+
+
+def test_non_divisible_split_raises():
+    odd_rows = {"m.mlp.gate_up_proj.weight": torch.zeros(7, 8, dtype=torch.uint8)}  # dim 0 == 7
+    halve = SplitRule(".gate_up_proj", (".gate_proj", ".up_proj"), dim=0)  # two targets
+    with pytest.raises(QuantConversionUnsupportedError):
+        apply_reverse_rules(odd_rows, [halve], [])
+
+
+def test_end_to_end_minimax_m3_like_reversal():
+    """Un-fuse and rename an M3-like in-memory state dict all the way back to hub names."""
+    mem, hub = "model.language_model.layers", "language_model.model.layers"
+    # Layer 0: dense MLP (fused gate_up + down). Layer 10: one already-unfused expert + router.
+    linears = [
+        (f"{mem}.0.mlp.gate_up_proj", 8, 16),
+        (f"{mem}.0.mlp.down_proj", 16, 8),
+        (f"{mem}.10.mlp.experts.0.gate_proj", 8, 16),
+        (f"{mem}.10.mlp.experts.0.up_proj", 8, 16),
+        (f"{mem}.10.mlp.experts.0.down_proj", 16, 8),
+    ]
+    state = {key: t for spec in linears for key, t in _nvfp4_linear(*spec).items()}
+    state[f"{mem}.10.mlp.gate.weight"] = torch.randn(128, 6144)
+    state["lm_head.weight"] = torch.randn(32, 16)
+
+    unfuse = [SplitRule(".gate_up_proj", (".gate_proj", ".up_proj"), dim=0)]
+    renames = [
+        RenameRule(r"(\.experts\.\d+\.)gate_proj", r"\1w1"),
+        RenameRule(r"(\.experts\.\d+\.)up_proj", r"\1w3"),
+        RenameRule(r"(\.experts\.\d+\.)down_proj", r"\1w2"),
+        RenameRule(r"\.mlp\.experts\.", ".block_sparse_moe.experts."),
+        RenameRule(r"\.mlp\.gate\.", ".block_sparse_moe.gate."),
+        RenameRule(r"^model\.language_model\.", "language_model.model."),
+        RenameRule(r"^lm_head\.", "language_model.lm_head."),
+    ]
+    reverted = apply_reverse_rules(state, unfuse, renames)
+    # Dense projections stay under ``mlp``; experts move to block_sparse_moe as w1/w3/w2.
+    wanted = {
+        f"{hub}.0.mlp.gate_proj",
+        f"{hub}.0.mlp.up_proj",
+        f"{hub}.0.mlp.down_proj",
+        f"{hub}.10.block_sparse_moe.experts.0.w1",
+        f"{hub}.10.block_sparse_moe.experts.0.w3",
+        f"{hub}.10.block_sparse_moe.experts.0.w2",
+    }
+    seen = {k.rsplit(".", 1)[0] for k in reverted if ".experts." in k or ".mlp." in k}
+    assert wanted <= seen
+    assert f"{hub}.10.block_sparse_moe.gate.weight" in reverted
+    assert "language_model.lm_head.weight" in reverted
+    # No in-memory name may survive the reversal.
+    assert not any(k.startswith("model.language_model") for k in reverted)
+    assert not any(".gate_up_proj" in k for k in reverted)
+
+
+def test_build_reverse_rules_from_mixtral_conversion_mapping_cpu():
+    """Revert ModelOpt-expanded Mixtral experts using the real conversion mapping (CPU only).
+
+    Feeds ``revert_weight_conversion_quant_aware`` a synthetic per-expert state dict keyed
+    by the in-memory ``mlp.experts.<i>.*`` names and requires the hub layout
+    (``block_sparse_moe.experts.<i>.w{1,2,3}``) in the result.
+    """
+    pytest.importorskip("transformers")
+    from transformers import MixtralConfig, MixtralForCausalLM
+
+    try:
+        from transformers.conversion_mapping import get_checkpoint_conversion_mapping
+    except ImportError:
+        pytest.skip("transformers build has no conversion_mapping API")
+    if not get_checkpoint_conversion_mapping("mixtral"):
+        pytest.skip("transformers build has no mixtral conversion_mapping")
+
+    tiny_cfg = MixtralConfig(
+        hidden_size=32,
+        intermediate_size=64,
+        num_hidden_layers=1,
+        num_attention_heads=4,
+        num_key_value_heads=2,
+        num_local_experts=2,
+        num_experts_per_tok=2,
+        vocab_size=64,
+        max_position_embeddings=64,
+    )
+    model = MixtralForCausalLM(tiny_cfg)
+
+    layer = "model.layers.0"
+    # (in-memory leaf, hub leaf, out_features, in_features)
+    leaves = [("gate_proj", "w1", 64, 32), ("up_proj", "w3", 64, 32), ("down_proj", "w2", 32, 64)]
+    state = {f"{layer}.mlp.gate.weight": torch.randn(2, 32)}
+    for idx in range(2):
+        for mem_leaf, _, rows, cols in leaves:
+            state.update(_nvfp4_linear(f"{layer}.mlp.experts.{idx}.{mem_leaf}", rows, cols))
+
+    reverted = revert_weight_conversion_quant_aware(model, state)
+
+    # Experts land in the hub layout and w1 keeps its scale siblings.
+    for idx in range(2):
+        hub_expert = f"{layer}.block_sparse_moe.experts.{idx}"
+        for _, hub_leaf, _, _ in leaves:
+            assert f"{hub_expert}.{hub_leaf}.weight" in reverted
+        assert f"{hub_expert}.w1.weight_scale" in reverted
+        assert f"{hub_expert}.w1.weight_scale_2" in reverted
+    assert f"{layer}.block_sparse_moe.gate.weight" in reverted
+    assert not any(".mlp.experts." in k for k in reverted)
+
+
+def test_build_reverse_rules_orders_prefix_reorder_after_container():
+    """Saving must undo the WeightRenamings in reverse list order (M3 prefix-reorder bug).
+
+    On load, transformers chains the renamings in list order: the component reorder
+    (``language_model.model`` -> ``model.language_model``) fires first and puts
+    ``language_model`` next to ``layers``, which lets the later container rename that is
+    anchored on that adjacency (``.language_model.layers.N.block_sparse_moe.experts.`` ->
+    ``.mlp.experts.``) match. On save the reorder therefore has to run *last*; run first,
+    it pulls ``language_model`` away from ``layers`` and the container rename silently
+    no-ops, so MiniMax-M3 experts ship as ``mlp.experts.*`` instead of the hub
+    ``block_sparse_moe.experts.*``. Mixtral has no prefix reorder, hence this minimal
+    two-renaming mapping.
+    """
+    pytest.importorskip("transformers.core_model_loading")
+    from transformers.core_model_loading import WeightRenaming
+
+    # Forward (hub -> in-memory) renamings, which ``reverse_transform`` flips on save. The
+    # reorder is deliberately listed BEFORE the adjacency-anchored container rename, the
+    # same order a real M3 conversion mapping uses.
+    prefix_reorder = WeightRenaming("^language_model.model.", "model.language_model.")
+    container_rename = WeightRenaming(
+        ".language_model.layers.(\\d+).block_sparse_moe.experts.",
+        ".language_model.layers.\\1.mlp.experts.",
+    )
+    model = types.SimpleNamespace(_weight_conversions=[prefix_reorder, container_rename])
+
+    # The leaf is already ``w1`` so only the container/prefix ordering is under test.
+    state = _nvfp4_linear("model.language_model.layers.10.mlp.experts.0.w1", 8, 16)
+    reverted = revert_weight_conversion_quant_aware(model, state)
+
+    hub_module = "language_model.model.layers.10.block_sparse_moe.experts.0.w1"
+    # Weight and all three scale siblings must sit under the hub module name.
+    assert set(reverted) == {
+        f"{hub_module}.weight",
+        f"{hub_module}.weight_scale",
+        f"{hub_module}.weight_scale_2",
+        f"{hub_module}.input_scale",
+    }
+
+    # Regression guard: the buggy reorder-first order leaves these in-memory fragments.
+    assert not any(k.startswith("model.language_model") for k in reverted)
+    assert not any(".mlp.experts." in k for k in reverted)
+
+
+def test_split_collision_raises():
+    """Un-fusing must refuse to overwrite a split target that is already in the state dict."""
+    state = _nvfp4_linear("m.gate_up_proj", 8, 16)
+    state.update({"m.gate_proj.weight": torch.zeros(4, 16)})  # occupies a split target
+    unfuse = SplitRule(".gate_up_proj", (".gate_proj", ".up_proj"), dim=0)
+    with pytest.raises(QuantConversionUnsupportedError, match="split collision"):
+        apply_reverse_rules(state, [unfuse], [])
+
+
+def test_stacked_experts_guard():
+    """The pre-expansion guard raises only for a still-fused expert leaf in the mapping.
+
+    Per-expert-index leaf renames cannot rewrite a still-fused ``.experts.gate_up_proj``
+    tensor, which would then ship mis-named; the guard raises so the caller falls back.
+    """
+    mapped_leaves = ["gate_up_proj", "down_proj"]
+    expanded = _nvfp4_linear("model.language_model.layers.10.mlp.experts.0.gate_proj", 8, 16)
+    fused_key = "model.language_model.layers.10.mlp.experts.gate_up_proj.weight"
+    stacked = {fused_key: torch.zeros(2, 8, 16)}
+    # Pre-expanded 2-D experts carry no fused leaf, so the guard returns quietly.
+    _assert_experts_pre_expanded(expanded, mapped_leaves)
+
+    # A still-fused, stacked 3-D expert leaf must raise.
+    with pytest.raises(QuantConversionUnsupportedError, match="not pre-expanded"):
+        _assert_experts_pre_expanded(stacked, mapped_leaves)
+
+    # With no expert leaves in the mapping the guard ignores even the 3-D tensor.
+    _assert_experts_pre_expanded(stacked, [])
+
+
+def test_revert_quant_config_names_mapper():
+    """Quant-config module references follow the weights to hub names, wildcards intact.
+
+    Guards against the regression where the reverse conversion renamed the weight tensors
+    but left ``exclude_modules`` / ``quantized_layers`` in the in-memory namespace, so a
+    deployment loader matched no exclude and loaded an excluded BF16 layer as quantized.
+    Exercised with Mixtral's real mapping (``mlp.experts`` <-> ``block_sparse_moe.experts``).
+    """
+    pytest.importorskip("transformers.core_model_loading")
+    from transformers import MixtralConfig, MixtralForCausalLM
+
+    tiny_cfg = MixtralConfig(
+        hidden_size=32,
+        intermediate_size=64,
+        num_hidden_layers=1,
+        num_attention_heads=4,
+        num_key_value_heads=2,
+        num_local_experts=2,
+        num_experts_per_tok=2,
+        vocab_size=64,
+        max_position_embeddings=64,
+    )
+    name_mapper = build_reverse_name_mapper(MixtralForCausalLM(tiny_cfg))
+    assert name_mapper is not None
+
+    quant_section = {
+        "quant_algo": "NVFP4",
+        "exclude_modules": [
+            "model.layers.0.self_attn*",  # no container rename applies -> left unchanged
+            "model.layers.0.mlp.experts.0*",  # in-memory container -> block_sparse_moe.experts
+            "lm_head",
+        ],
+        "quantized_layers": {"model.layers.0.mlp.experts.0.w1": {"quant_algo": "NVFP4"}},
+    }
+    revert_quant_config_names(quant_section, name_mapper)
+    # The dict is rewritten in place; list order and trailing ``*`` wildcards are kept.
+    assert quant_section["exclude_modules"] == [
+        "model.layers.0.self_attn*",
+        "model.layers.0.block_sparse_moe.experts.0*",
+        "lm_head",
+    ]
+    assert "model.layers.0.block_sparse_moe.experts.0.w1" in quant_section["quantized_layers"]
+
+    # Passing ``None`` as the mapper leaves the config untouched.
+    untouched = {"exclude_modules": ["x*"]}
+    revert_quant_config_names(untouched, None)
+    assert untouched["exclude_modules"] == ["x*"]