intel · chensuyue · Jun 25, 2026 · Jun 15, 2026 · Jun 16, 2026 · Jun 16, 2026
diff --git a/examples/README.md b/examples/README.md
@@ -15,6 +15,12 @@ Intel® Neural Compressor validated examples with multiple compression technique
   </tr>
 </thead>
 <tbody>
+<tr>
+    <td>deepseek-ai/DeepSeek-V4</td>
+    <td>Natural Language Processing</td>
+    <td>Quantization (MXFP8/MXFP4)</td>
+    <td><a href="./pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4">link</a></td>
+</tr>
 <tr>
     <td>deepseek-ai/DeepSeek-R1</td>
     <td>Natural Language Processing</td>

diff --git a/...ggingface_models/language-modeling/quantization/auto_round/deepseekv4/README.md b/...ggingface_models/language-modeling/quantization/auto_round/deepseekv4/README.md
@@ -0,0 +1,112 @@
+# DeepSeek V4 AutoRound (INC prepare/convert)
+
+This example demonstrates model-free quantization through the INC API:
+
+```python
+from neural_compressor.torch.quantization import AutoRoundConfig, prepare, convert
+
+config = AutoRoundConfig(
+    model_free=True,
+    scheme="MXFP4",
+    ignore_layers="compressor,indexer.weights_proj",
+    export_format="llm_compressor",
+    output_dir="/path/to/output",
+)
+model = "/path/or/hf_model_name"
+model = prepare(model, config)
+model = convert(model)
+```
+
+## Requirements
+
+Install dependencies before running quantization or evaluation:
+
+```bash
+uv pip install -U pip
+uv pip install -U "git+https://github.com/intel/auto-round.git@main"
+uv pip install -U evalscope lm_eval transformers datasets
+uv pip install compressed-tensors --no-deps
+bash <(curl -fsSL https://raw.githubusercontent.com/vllm-project/vllm/main/tools/install_deepgemm.sh)
+uv pip install setuptools_rust setuptools_scm
+VLLM_USE_PRECOMPILED=1 uv pip install git+https://github.com/xin3he/vllm-fork.git@support_deepseekv4_mxfp --no-build-isolation
+```
+
+## Quick Start
+
+```bash
+cd examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4
+bash run_quant.sh \
+  --dtype=mxfp4_mixed \
+  --input_model=/workspace/models/deepseek-ai/DeepSeek-V4-Flash \
+  --output_model=/workspace/models/deepseek-ai/DeepSeek-V4-Flash-MXFP4-Mixed
+```
+
+Then run serving + evaluation in one command:
+
+```bash
+CUDA_VISIBLE_DEVICES=0,1 bash run_evalscope.sh \
+  --model /workspace/models/deepseek-ai/DeepSeek-V4-Flash-MXFP4-Mixed \
+  --tp 2 \
+  --port 8009 \
+  --tasks piqa,hellaswag,gsm8k,mmlu_pro,math_500,mmlu,aime26,gpqa_diamond,ruler_qa_squad \
+  --temp 1.0
+```
+
+Equivalent vLLM defaults inside `run_evalscope.sh`:
+
+```bash
+SAFETENSORS_FAST_GPU=1 CUDA_VISIBLE_DEVICES=0,1 vllm serve <model> \
+  --trust-remote-code \
+  --kv-cache-dtype fp8 \
+  --block-size 256 \
+  --tensor-parallel-size 2 \
+  --attention_config.use_fp4_indexer_cache=True \
+  --port 8009 \
+  --no-enable-flashinfer-autotune
+```
+
+If the model basename is exactly `DeepSeek-V4-Flash` or `DeepSeek-V4-Pro` (with no extra suffix),
+`run_evalscope.sh` also adds the following flags automatically:
+
+```bash
+--enable-expert-parallel --moe-backend deep_gemm_mega_moe
+```
+
+Mixed preset example:
+
+```bash
+bash run_quant.sh \
+  --dtype=mxfp4_mixed \
+  --input_model=/workspace/models/deepseek-ai/DeepSeek-V4-Flash \
+  --output_model=/workspace/models/deepseek-ai/DeepSeek-V4-Flash-MXFP4-Mixed
+```
+
+## CLI Arguments
+
+- `--dtype`: quantization preset.
+  - `mxfp4`: `scheme=MXFP4`
+  - `mxfp4_mixed`: `scheme=MXFP8` + `layer_config={"ffn.experts": {"bits": 4, "data_type": "mx_fp"}}`
+  - `mxfp8`: `scheme=MXFP8`
+  - `w4a16`: `scheme=W4A16` + `layer_config={"wo_a": {"bits": 16}}`
+- `--input_model`: HF model name or local model path.
+- `--output_model`: output directory.
+- `--format`: `auto_round` or `llm_compressor` (default: `llm_compressor`).
+- `--ignore_layers`: comma-separated layer patterns (default: `compressor,indexer.weights_proj`).
+
+`run_evalscope.sh` arguments:
+
+- `--model`: model path for vLLM and evalscope.
+- `--port`: vLLM API port (default: `8009`).
+- `--temp`: generation temperature used by evalscope (default: `0`).
+- `--skip_serve`: skip starting vLLM (use existing endpoint on the same `--port`).
+- `--tp`: tensor parallel size for vLLM (default: `2`).
+- `--kv-cache-dtype`: kv cache dtype for vLLM (default: `fp8`).
+- `--block-size`: vLLM block size (default: `256`).
+
+## Notes
+
+- This flow is enabled only when:
+  - `config` is `AutoRoundConfig`
+  - `config.model_free=True`
+  - `model` passed to `prepare/convert` is a `str` (model path or model name)
+- The example uses `reloading=False` by default and saves quantized artifacts to `--output_model`.
diff --git a/...h/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/quantize.py b/...h/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/quantize.py
@@ -0,0 +1,116 @@
+# Copyright (c) 2026 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import logging
+
+from neural_compressor.torch.quantization import AutoRoundConfig, convert, prepare
+
+# Module-level logger; the root logger is configured at INFO so the final
+# "Quantized model saved to ..." message emitted by main() is shown.
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO)
+
+
+# Maps each supported `--dtype` preset name to the AutoRound `scheme` and the
+# optional per-layer `layer_config` override that `build_config` forwards to
+# `AutoRoundConfig`. A `layer_config` of None means the preset defines no
+# per-layer override.
+_PRESET_CONFIG = {
+    "mxfp4": {"scheme": "MXFP4", "layer_config": None},
+    # MXFP8 scheme with a 4-bit mx_fp override for "ffn.experts" (mixed setup).
+    "mxfp4_mixed": {
+        "scheme": "MXFP8",
+        "layer_config": {"ffn.experts": {"bits": 4, "data_type": "mx_fp"}},
+    },
+    "mxfp8": {"scheme": "MXFP8", "layer_config": None},
+    # W4A16 scheme with a 16-bit override for "wo_a".
+    "w4a16": {"scheme": "W4A16", "layer_config": {"wo_a": {"bits": 16}}},
+}
+
+
+def build_config(args: argparse.Namespace) -> AutoRoundConfig:
+    """Translate parsed CLI arguments into a model-free ``AutoRoundConfig``.
+
+    Args:
+        args: Namespace providing ``dtype``, ``disable_preset_layer_config``,
+            ``ignore_layers``, ``format`` and ``output_model``.
+
+    Returns:
+        An ``AutoRoundConfig`` built with ``model_free=True`` and
+        ``reloading=False``, using the scheme of the selected preset.
+
+    Raises:
+        ValueError: If the lower-cased ``args.dtype`` is not a key of
+            ``_PRESET_CONFIG``.
+    """
+    # Preset names are matched case-insensitively.
+    preset_name = args.dtype.lower()
+    selected = _PRESET_CONFIG.get(preset_name)
+    if selected is None:
+        raise ValueError(f"Unsupported dtype: {args.dtype}. Supported: {', '.join(_PRESET_CONFIG.keys())}")
+
+    # The preset's per-layer override can be switched off from the command line.
+    per_layer_override = None if args.disable_preset_layer_config else selected["layer_config"]
+
+    return AutoRoundConfig(
+        model_free=True,
+        scheme=selected["scheme"],
+        ignore_layers=args.ignore_layers,
+        layer_config=per_layer_override,
+        export_format=args.format,
+        output_dir=args.output_model,
+        reloading=False,
+    )
+
+
+def main() -> None:
+    """Command-line entry point: parse arguments and quantize the model.
+
+    Builds the quantization config from the parsed arguments, then passes the
+    input model (a model name or local path string) through ``prepare`` and
+    ``convert``. The output location is taken from ``--output_model``.
+    """
+    parser = argparse.ArgumentParser(description="DeepSeek V4 model-free quantization via INC AutoRound prepare/convert.")
+    parser.add_argument(
+        "--dtype",
+        # argparse applies `type` before checking `choices`, so lower-casing here
+        # lets e.g. `--dtype MXFP4` through, matching the case-insensitive
+        # lookup that build_config() performs.
+        type=str.lower,
+        required=True,
+        choices=sorted(_PRESET_CONFIG.keys()),
+        help="Quantization preset. e.g. mxfp4 or mxfp4_mixed",
+    )
+    parser.add_argument(
+        "--input_model",
+        type=str,
+        required=True,
+        help="Model name or local path.",
+    )
+    parser.add_argument(
+        "--output_model",
+        type=str,
+        required=True,
+        help="Output directory for quantized model.",
+    )
+    parser.add_argument(
+        "--ignore_layers",
+        type=str,
+        default="compressor,indexer.weights_proj",
+        help="Comma-separated layer name patterns to skip.",
+    )
+    parser.add_argument(
+        "--format",
+        type=str,
+        default="llm_compressor",
+        choices=["auto_round", "llm_compressor"],
+        help="Export format.",
+    )
+    parser.add_argument(
+        "--disable_preset_layer_config",
+        action="store_true",
+        help="Disable preset layer_config for the selected dtype.",
+    )
+    args = parser.parse_args()
+
+    quant_config = build_config(args)
+
+    # The model is handed over as a string (name or path), not a loaded module.
+    prepared = prepare(args.input_model, quant_config)
+    # The return value of convert() is not used by this script.
+    convert(prepared)
+    logger.info("Quantized model saved to %s", args.output_model)
+
+
+# Run the CLI only when this file is executed as a script.
+if __name__ == "__main__":
+    main()