diff --git a/examples/README.md b/examples/README.md
index 26e4a5792d7..9d590b1b7f5 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -15,6 +15,12 @@ Intel® Neural Compressor validated examples with multiple compression technique
+
+ | deepseek-ai/DeepSeek-V4 |
+ Natural Language Processing |
+ Quantization (MXFP8/MXFP4) |
+ link |
+
| deepseek-ai/DeepSeek-R1 |
Natural Language Processing |
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/README.md
new file mode 100644
index 00000000000..5ab41000942
--- /dev/null
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/README.md
@@ -0,0 +1,112 @@
+# DeepSeek V4 AutoRound (INC prepare/convert)
+
+This example demonstrates model-free quantization through the INC API:
+
+```python
+from neural_compressor.torch.quantization import AutoRoundConfig, prepare, convert
+
+config = AutoRoundConfig(
+ model_free=True,
+ scheme="MXFP4",
+ ignore_layers="compressor,indexer.weights_proj",
+ export_format="llm_compressor",
+ output_dir="/path/to/output",
+)
+model = "/path/or/hf_model_name"
+model = prepare(model, config)
+model = convert(model)
+```
+
+## Requirements
+
+Install dependencies before running quantization or evaluation:
+
+```bash
+uv pip install -U pip
+uv pip install -U "git+https://github.com/intel/auto-round.git@main"
+uv pip install -U evalscope lm_eval transformers datasets
+uv pip install compressed-tensors --no-deps
+bash <(curl -fsSL https://raw.githubusercontent.com/vllm-project/vllm/main/tools/install_deepgemm.sh)
+uv pip install setuptools_rust setuptools_scm
+VLLM_USE_PRECOMPILED=1 uv pip install git+https://github.com/xin3he/vllm-fork.git@support_deepseekv4_mxfp --no-build-isolation
+```
+
+## Quick Start
+
+```bash
+cd examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4
+bash run_quant.sh \
+ --dtype=mxfp4_mixed \
+ --input_model=/workspace/models/deepseek-ai/DeepSeek-V4-Flash \
+ --output_model=/workspace/models/deepseek-ai/DeepSeek-V4-Flash-MXFP4-Mixed
+```
+
+Then run serving + evaluation in one command:
+
+```bash
+CUDA_VISIBLE_DEVICES=0,1 bash run_evalscope.sh \
+ --model /workspace/models/deepseek-ai/DeepSeek-V4-Flash-MXFP4-Mixed \
+ --tp 2 \
+ --port 8009 \
+ --tasks piqa,hellaswag,gsm8k,mmlu_pro,math_500,mmlu,aime26,gpqa_diamond,ruler_qa_squad \
+ --temp 1.0
+```
+
+Equivalent vLLM defaults inside `run_evalscope.sh`:
+
+```bash
+SAFETENSORS_FAST_GPU=1 CUDA_VISIBLE_DEVICES=0,1 vllm serve /workspace/models/deepseek-ai/DeepSeek-V4-Flash-MXFP4-Mixed \
+ --trust-remote-code \
+ --kv-cache-dtype fp8 \
+ --block-size 256 \
+ --tensor-parallel-size 2 \
+ --attention_config.use_fp4_indexer_cache=True \
+ --port 8009 \
+ --no-enable-flashinfer-autotune
+```
+
+If the model basename is exactly `DeepSeek-V4-Flash` or `DeepSeek-V4-Pro` (with no extra suffix),
+`run_evalscope.sh` automatically appends:
+
+```bash
+--enable-expert-parallel --moe-backend deep_gemm_mega_moe
+```
+
+Mixed preset example:
+
+```bash
+bash run_quant.sh \
+ --dtype=mxfp4_mixed \
+ --input_model=/workspace/models/deepseek-ai/DeepSeek-V4-Flash \
+ --output_model=/workspace/models/deepseek-ai/DeepSeek-V4-Flash-MXFP4-Mixed
+```
+
+## CLI Arguments
+
+- `--dtype`: quantization preset.
+ - `mxfp4`: `scheme=MXFP4`
+ - `mxfp4_mixed`: `scheme=MXFP8` + `layer_config={"ffn.experts": {"bits": 4, "data_type": "mx_fp"}}`
+ - `mxfp8`: `scheme=MXFP8`
+ - `w4a16`: `scheme=W4A16` + `layer_config={"wo_a": {"bits": 16}}`
+- `--input_model`: HF model name or local model path.
+- `--output_model`: output directory.
+- `--format`: `auto_round` or `llm_compressor` (default: `llm_compressor`).
+- `--ignore_layers`: comma-separated layer patterns (default: `compressor,indexer.weights_proj`).
+
+`run_evalscope.sh` arguments:
+
+- `--model`: model path for vLLM and evalscope.
+- `--port`: vLLM API port (default: `8009`).
+- `--temp`: generation temperature used by evalscope (default: `0`).
+- `--skip_serve`: skip starting vLLM (use existing endpoint on the same `--port`).
+- `--tp`: tensor parallel size for vLLM (default: `2`).
+- `--kv-cache-dtype`: kv cache dtype for vLLM (default: `fp8`).
+- `--block-size`: vLLM block size (default: `256`).
+
+## Notes
+
+- This flow is enabled only when:
+ - `config` is `AutoRoundConfig`
+ - `config.model_free=True`
+ - `model` passed to `prepare/convert` is a `str` (model path or model name)
+- The example script sets `reloading=False` and saves quantized artifacts to `--output_model`.
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/quantize.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/quantize.py
new file mode 100644
index 00000000000..d64fd3a480d
--- /dev/null
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/quantize.py
@@ -0,0 +1,116 @@
+# Copyright (c) 2026 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import logging
+
+from neural_compressor.torch.quantization import AutoRoundConfig, convert, prepare
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO)
+
+
+_PRESET_CONFIG = {  # maps each --dtype preset to the AutoRoundConfig scheme and optional layer_config it expands to
+ "mxfp4": {
+ "scheme": "MXFP4",
+ "layer_config": None,
+ },
+ # Mixed preset: MXFP8 scheme plus a layer_config entry setting "ffn.experts" to 4-bit mx_fp.
+ "mxfp4_mixed": {
+ "scheme": "MXFP8",
+ "layer_config": {"ffn.experts": {"bits": 4, "data_type": "mx_fp"}},
+ },
+ "mxfp8": {
+ "scheme": "MXFP8",
+ "layer_config": None,
+ },
+ "w4a16": {
+ "scheme": "W4A16",
+ "layer_config": {"wo_a": {"bits": 16}},  # NOTE(review): presumably keeps "wo_a" at 16 bits (unquantized) - confirm
+ },
+}
+
+
+def build_config(args: argparse.Namespace) -> AutoRoundConfig:  # build the model-free AutoRoundConfig for the preset named by args.dtype
+ dtype_key = args.dtype.lower()
+ if dtype_key not in _PRESET_CONFIG:
+ raise ValueError(f"Unsupported dtype: {args.dtype}. Supported: {', '.join(_PRESET_CONFIG.keys())}")
+
+ preset = _PRESET_CONFIG[dtype_key]
+ layer_config = preset["layer_config"]
+ if args.disable_preset_layer_config:
+ layer_config = None  # --disable_preset_layer_config drops the preset's layer_config; the scheme is kept
+
+ return AutoRoundConfig(
+ model_free=True,
+ scheme=preset["scheme"],
+ ignore_layers=args.ignore_layers,
+ layer_config=layer_config,
+ export_format=args.format,
+ output_dir=args.output_model,
+ reloading=False,  # hard-coded here, not exposed as a CLI option
+ )
+
+
+def main() -> None:  # CLI entry point: parse arguments, build the config, then run prepare() and convert()
+ parser = argparse.ArgumentParser(description="DeepSeek V4 model-free quantization via INC AutoRound prepare/convert.")
+ parser.add_argument(
+ "--dtype",
+ type=str,
+ required=True,
+ choices=sorted(_PRESET_CONFIG.keys()),
+ help="Quantization preset. e.g. mxfp4 or mxfp4_mixed",
+ )
+ parser.add_argument(
+ "--input_model",
+ type=str,
+ required=True,
+ help="Model name or local path.",
+ )
+ parser.add_argument(
+ "--output_model",
+ type=str,
+ required=True,
+ help="Output directory for quantized model.",
+ )
+ parser.add_argument(
+ "--ignore_layers",
+ type=str,
+ default="compressor,indexer.weights_proj",
+ help="Comma-separated layer name patterns to skip.",
+ )
+ parser.add_argument(
+ "--format",
+ type=str,
+ default="llm_compressor",
+ choices=["auto_round", "llm_compressor"],
+ help="Export format.",
+ )
+ parser.add_argument(
+ "--disable_preset_layer_config",
+ action="store_true",
+ help="Disable preset layer_config for the selected dtype.",
+ )
+ args = parser.parse_args()
+
+ quant_config = build_config(args)
+
+ model = args.input_model  # passed on as a plain string; this script never builds a model object itself
+ model = prepare(model, quant_config)
+ _ = convert(model)  # return value is unused; the script only reports the output directory
+ logger.info("Quantized model saved to %s", args.output_model)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_evalscope.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_evalscope.sh
new file mode 100644
index 00000000000..6c19df57576
--- /dev/null
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_evalscope.sh
@@ -0,0 +1,354 @@
+#!/bin/bash
+
+set -euo pipefail
+
+# Usage:
+# bash run_evalscope.sh --model MODEL_PATH [--port PORT] [--temp TEMPERATURE] [--tasks TASK1,TASK2] [--tp N] [--kv-cache-dtype DTYPE] [--block-size N] [--skip_serve]
+#
+# This script can start vLLM serve and then run evalscope automatically.
+
+PORT=8009  # overridden by --port
+MODEL=/workspace/models/deepseek-ai/DeepSeek-V4-Flash  # overridden by --model
+TEMPERATURE=0  # overridden by --temp
+KV_CACHE_DTYPE="fp8"
+BLOCK_SIZE=256
+TENSOR_PARALLEL_SIZE=2
+SAFETENSORS_FAST_GPU="1"  # exported to the vLLM process only (see the launch command)
+TRUST_REMOTE_CODE="true"  # no CLI switch; edit here to drop --trust-remote-code
+NO_ENABLE_FLASHINFER_AUTOTUNE="true"  # no CLI switch; edit here to drop --no-enable-flashinfer-autotune
+TASKS=""  # empty means: run every default task
+SKIP_SERVE="${SKIP_SERVE:-false}"  # may be preset through the environment; --skip_serve sets it to true
+VLLM_PID=""  # filled in once vLLM has been started in the background
+LOG_TAIL_PID=""  # filled in while the startup log is being streamed
+
+SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)
+cd "${SCRIPT_DIR}"  # relative paths used later (logs/...) resolve next to this script
+
+cleanup() {  # EXIT trap: stop the log tail and, unless --skip_serve is set, the vLLM server on ${PORT}
+    if [[ -n "${LOG_TAIL_PID}" ]] && kill -0 "${LOG_TAIL_PID}" 2>/dev/null; then
+        kill "${LOG_TAIL_PID}" 2>/dev/null || true
+    fi
+
+    if [[ "${SKIP_SERVE}" == "true" ]]; then
+        return  # the server was not started by this script, so it is left running
+    fi
+
+    if [[ -n "${VLLM_PID}" ]] && kill -0 "${VLLM_PID}" 2>/dev/null; then
+        CHILDREN=$(pgrep -P "${VLLM_PID}" || true)
+        if [[ -n "${CHILDREN}" ]]; then
+            kill -9 ${CHILDREN} 2>/dev/null || true  # unquoted on purpose: one PID per word
+        fi
+        kill -9 "${VLLM_PID}" 2>/dev/null || true
+        return
+    fi
+
+    # Fallback: kill any "vllm serve" still using this port; '|| true' keeps a no-match grep from failing the trap under 'set -euo pipefail'.
+    VLLM_PIDS=$(ps aux | grep -- "vllm serve" | grep -- "--port[ =]${PORT}" | grep -v grep | awk '{print $2}' || true)
+    if [[ -n "${VLLM_PIDS}" ]]; then
+        for PID in ${VLLM_PIDS}; do
+            CHILDREN=$(pgrep -P "${PID}" || true)
+            if [[ -n "${CHILDREN}" ]]; then
+                kill -9 ${CHILDREN} 2>/dev/null || true
+            fi
+            kill -9 "${PID}" 2>/dev/null || true
+        done
+    fi
+}
+
+trap cleanup EXIT
+
+stop_log_tail() {  # stop the background 'tail -f' of the vLLM log, if it is still running
+ if [[ -n "${LOG_TAIL_PID}" ]] && kill -0 "${LOG_TAIL_PID}" 2>/dev/null; then
+ kill "${LOG_TAIL_PID}" 2>/dev/null || true
+ LOG_TAIL_PID=""  # cleared so later calls (and cleanup) do not signal the PID again
+ fi
+}
+
+trim_task_name() {  # print $1 with leading and trailing whitespace removed
+ local task_name="$1"
+ task_name="${task_name#${task_name%%[![:space:]]*}}"  # inner expansion yields the leading whitespace run, which is then stripped as a prefix
+ task_name="${task_name%${task_name##*[![:space:]]}}"  # same idea for the trailing whitespace run, stripped as a suffix
+ echo "${task_name}"
+}
+
+task_in_list() {  # return 0 if $1 equals one of the remaining arguments, 1 otherwise
+ local target_task="$1"
+ shift
+ local task_name
+ for task_name in "$@"; do
+ if [[ "${task_name}" == "${target_task}" ]]; then
+ return 0
+ fi
+ done
+ return 1
+}
+
+print_section_header() {  # print a "[step/total] title" banner to stdout and the log file, then advance the step counter
+ echo "=== [${STEP_INDEX}/${TOTAL_STEPS}] $1 ===" | tee -a "$OUTPUT_FILE"
+ STEP_INDEX=$((STEP_INDEX + 1))  # STEP_INDEX, TOTAL_STEPS and OUTPUT_FILE are globals set before the first call
+}
+
+while [[ $# -gt 0 ]]; do  # options take their value as the next word ("--port 8009"); "--port=8009" is rejected as unknown
+ case "$1" in
+ --port)
+ PORT="$2"; shift 2 ;;
+ --model)
+ MODEL="$2"; shift 2 ;;
+ --temp)
+ TEMPERATURE="$2"; shift 2 ;;
+ --tasks)
+ TASKS="$2"; shift 2 ;;
+ --skip_serve)
+ SKIP_SERVE="true"; shift 1 ;;
+ --skip-serve)
+ SKIP_SERVE="true"; shift 1 ;;  # hyphenated alias of --skip_serve
+ --tp)
+ TENSOR_PARALLEL_SIZE="$2"; shift 2 ;;
+ --kv-cache-dtype)
+ KV_CACHE_DTYPE="$2"; shift 2 ;;
+ --block-size)
+ BLOCK_SIZE="$2"; shift 2 ;;
+ *)
+ echo "Unknown option: $1"; exit 1 ;;
+ esac
+done
+
+SKIP_SERVE="$(echo "${SKIP_SERVE}" | tr '[:upper:]' '[:lower:]')"  # lets an environment value such as SKIP_SERVE=True match the "true" comparisons
+
+API_URL="http://127.0.0.1:${PORT}/v1"
+
+if [[ "${SKIP_SERVE}" != "true" ]]; then  # start vLLM in the background and stream its log until the API is up
+    echo "Starting vLLM serve on port ${PORT} ..."
+    MODEL_NORMALIZED="${MODEL%/}"
+    MODEL_NAME="${MODEL_NORMALIZED##*/}"
+    EXTRA_ARGS=()
+    # Only for base DeepSeek-V4-Flash/Pro model names without quantized suffixes.
+    if [[ "${MODEL_NAME}" == "DeepSeek-V4-Flash" || "${MODEL_NAME}" == "DeepSeek-V4-Pro" ]]; then
+        EXTRA_ARGS+=(--enable-expert-parallel)
+        EXTRA_ARGS+=(--moe-backend deep_gemm_mega_moe)
+    fi
+
+    VLLM_CMD=(
+        vllm serve "${MODEL}"
+        --kv-cache-dtype "${KV_CACHE_DTYPE}"
+        --block-size "${BLOCK_SIZE}"
+        --tensor-parallel-size "${TENSOR_PARALLEL_SIZE}"
+        --attention_config.use_fp4_indexer_cache=True
+        --port "${PORT}"
+    )
+    if [[ "${MODEL_NAME}" == *"DeepSeek-V4-Pro"* ]]; then  # substring match: also applies to quantized Pro checkpoints
+        VLLM_CMD+=(--max-model-len 1048576)
+    fi
+    if [[ "${TRUST_REMOTE_CODE}" == "true" ]]; then
+        VLLM_CMD+=(--trust-remote-code)
+    fi
+    if [[ "${NO_ENABLE_FLASHINFER_AUTOTUNE}" == "true" ]]; then
+        VLLM_CMD+=(--no-enable-flashinfer-autotune)
+    fi
+    VLLM_CMD+=(${EXTRA_ARGS[@]+"${EXTRA_ARGS[@]}"})  # guarded expansion: an empty array is an "unbound variable" error under 'set -u' on bash < 4.4
+
+    SAFETENSORS_FAST_GPU="${SAFETENSORS_FAST_GPU}" "${VLLM_CMD[@]}" >/tmp/vllm_${PORT}.log 2>&1 &
+    VLLM_PID=$!
+    echo "vLLM launched. Log: /tmp/vllm_${PORT}.log"
+    echo "vLLM PID: ${VLLM_PID}"
+    echo "=== vLLM startup log (will stop after API wait ends) ==="
+    tail -n +1 -f "/tmp/vllm_${PORT}.log" &
+    LOG_TAIL_PID=$!
+fi
+
+# Wait until the API is ready: poll up to 90 times, 20s apart (about 30 minutes), and stop early if vLLM dies.
+echo "Waiting for API at ${API_URL} ..."
+for _ in $(seq 1 90); do
+ if curl -sf "${API_URL}/models" -o /dev/null; then
+ break
+ fi
+ if [[ "${SKIP_SERVE}" != "true" ]] && [[ -n "${VLLM_PID}" ]] && ! kill -0 "${VLLM_PID}" 2>/dev/null; then
+ stop_log_tail
+ echo "[$(date '+%Y-%m-%d %H:%M:%S')] vLLM exited before API became ready."
+ echo "----- Last 80 lines of /tmp/vllm_${PORT}.log -----"
+ tail -n 80 "/tmp/vllm_${PORT}.log" || true
+ exit 1
+ fi
+ echo "[$(date '+%Y-%m-%d %H:%M:%S')] Port ${PORT} not ready, retrying in 20s..."
+ sleep 20
+done
+
+stop_log_tail
+
+if ! curl -sf "${API_URL}/models" -o /dev/null; then  # the loop ends the same way on success and on timeout, so probe once more
+ echo "[$(date '+%Y-%m-%d %H:%M:%S')] Timeout waiting for API at ${API_URL}."
+ echo "----- Last 80 lines of /tmp/vllm_${PORT}.log -----"
+ tail -n 80 "/tmp/vllm_${PORT}.log" || true
+ exit 1
+fi
+
+echo "[$(date '+%Y-%m-%d %H:%M:%S')] API is ready, starting evaluation."
+MODEL_NORMALIZED="${MODEL%/}"  # recomputed here because the launch block is skipped with --skip_serve
+MODEL_NAME="${MODEL_NORMALIZED##*/}"
+LOG_DIR="logs/${MODEL_NAME}"
+mkdir -p "$LOG_DIR"
+OUTPUT_FILE="${LOG_DIR}/eval_results_$(date +%Y%m%d_%H%M%S)_port${PORT}_temp${TEMPERATURE}.log"
+
+DEFAULT_STANDARD_TASKS=(piqa hellaswag gsm8k mmlu_pro math_500 mmlu)  # run together in a single evalscope call
+SUPPORTED_TASKS=(aime26 gpqa_diamond ruler_qa_squad "${DEFAULT_STANDARD_TASKS[@]}")
+SELECTED_STANDARD_TASKS=()
+RUN_AIME26="true"
+RUN_GPQA_DIAMOND="true"
+RUN_STANDARD_TASKS="true"
+RUN_RULER_QA_SQUAD="true"
+
+if [[ -n "${TASKS}" ]]; then  # explicit --tasks: switch everything off, then enable only what was requested
+    RUN_AIME26="false"
+    RUN_GPQA_DIAMOND="false"
+    RUN_STANDARD_TASKS="false"
+    RUN_RULER_QA_SQUAD="false"
+
+    IFS=',' read -r -a REQUESTED_TASKS <<< "${TASKS}"
+    for raw_task in "${REQUESTED_TASKS[@]}"; do
+        task_name="$(trim_task_name "${raw_task}")"
+        if [[ -z "${task_name}" ]]; then
+            continue  # tolerate empty entries such as a trailing comma
+        fi
+        if ! task_in_list "${task_name}" "${SUPPORTED_TASKS[@]}"; then
+            echo "Unsupported task: ${task_name}"
+            echo "Supported tasks: ${SUPPORTED_TASKS[*]}"
+            exit 1
+        fi
+
+        case "${task_name}" in
+            aime26)
+                RUN_AIME26="true"
+                ;;
+            gpqa_diamond)
+                RUN_GPQA_DIAMOND="true"
+                ;;
+            ruler_qa_squad)
+                RUN_RULER_QA_SQUAD="true"
+                ;;
+            *)
+                if ! task_in_list "${task_name}" ${SELECTED_STANDARD_TASKS[@]+"${SELECTED_STANDARD_TASKS[@]}"}; then  # guarded: the array is still empty for the first task ('set -u', bash < 4.4)
+                    SELECTED_STANDARD_TASKS+=("${task_name}")
+                    RUN_STANDARD_TASKS="true"
+                fi
+                ;;
+        esac
+    done
+
+    if [[ "${RUN_AIME26}" != "true" ]] && [[ "${RUN_GPQA_DIAMOND}" != "true" ]] \
+        && [[ "${RUN_STANDARD_TASKS}" != "true" ]] && [[ "${RUN_RULER_QA_SQUAD}" != "true" ]]; then
+        echo "No valid tasks selected from --tasks '${TASKS}'."
+        exit 1
+    fi
+else
+    SELECTED_STANDARD_TASKS=("${DEFAULT_STANDARD_TASKS[@]}")
+fi
+
+TOTAL_STEPS=0  # number of enabled evaluation sections; shown by print_section_header
+if [[ "${RUN_AIME26}" == "true" ]]; then
+ TOTAL_STEPS=$((TOTAL_STEPS + 1))
+fi
+if [[ "${RUN_GPQA_DIAMOND}" == "true" ]]; then
+ TOTAL_STEPS=$((TOTAL_STEPS + 1))
+fi
+if [[ "${RUN_STANDARD_TASKS}" == "true" ]]; then
+ TOTAL_STEPS=$((TOTAL_STEPS + 1))
+fi
+if [[ "${RUN_RULER_QA_SQUAD}" == "true" ]]; then
+ TOTAL_STEPS=$((TOTAL_STEPS + 1))
+fi
+STEP_INDEX=1
+
+echo "=== Evaluation started at $(date) ===" | tee "$OUTPUT_FILE"  # first write has no -a: it creates the log, everything after appends
+echo "Model: $MODEL" | tee -a "$OUTPUT_FILE"
+echo "API URL: $API_URL" | tee -a "$OUTPUT_FILE"
+echo "Temperature: $TEMPERATURE" | tee -a "$OUTPUT_FILE"
+if [[ -n "${TASKS}" ]]; then
+ echo "Tasks: ${TASKS}" | tee -a "$OUTPUT_FILE"
+else
+ echo "Tasks: all default tasks" | tee -a "$OUTPUT_FILE"
+fi
+echo "" | tee -a "$OUTPUT_FILE"
+
+
+if [[ "${RUN_AIME26}" == "true" ]]; then  # aime26 through evalscope, 10 samples per request (n=10)
+ echo "" | tee -a "$OUTPUT_FILE"
+ print_section_header "aime26 (n=10)"
+ evalscope eval \
+ --model "$MODEL" \
+ --eval-type openai_api \
+ --api-key EMPTY \
+ --datasets aime26 \
+ --generation-config "{\"temperature\": ${TEMPERATURE}, \"n\": 10}" \
+ --eval-batch-size 10 --timeout 3000 \
+ --api-url "$API_URL" 2>&1 | tee -a "$OUTPUT_FILE"
+fi
+
+if [[ "${RUN_GPQA_DIAMOND}" == "true" ]]; then  # gpqa_diamond through evalscope, 5 samples per request (n=5)
+ echo "" | tee -a "$OUTPUT_FILE"
+ print_section_header "gpqa_diamond (n=5)"
+ evalscope eval \
+ --model "$MODEL" \
+ --eval-type openai_api \
+ --api-key EMPTY \
+ --datasets gpqa_diamond \
+ --generation-config "{\"temperature\": ${TEMPERATURE}, \"n\": 5}" \
+ --eval-batch-size 10 --timeout 3000 \
+ --api-url "$API_URL" 2>&1 | tee -a "$OUTPUT_FILE"
+fi
+
+if [[ "${RUN_STANDARD_TASKS}" == "true" ]]; then  # all selected standard tasks in one evalscope call; no --generation-config is passed, so --temp is not applied here
+ echo "" | tee -a "$OUTPUT_FILE"
+ print_section_header "${SELECTED_STANDARD_TASKS[*]}"
+ evalscope eval \
+ --model "$MODEL" \
+ --eval-type openai_api \
+ --api-key EMPTY \
+ --datasets "${SELECTED_STANDARD_TASKS[@]}" \
+ --eval-batch-size 10 --timeout 3000 \
+ --api-url "$API_URL" 2>&1 | tee -a "$OUTPUT_FILE"
+fi
+
+if [[ "${RUN_RULER_QA_SQUAD}" == "true" ]]; then  # long-context test through lm_eval; the header is printed even when the model is skipped
+ echo "" | tee -a "$OUTPUT_FILE"
+ print_section_header "ruler_qa_squad (lm_eval, 1M)"
+ if [[ "${MODEL_NAME}" == *"DeepSeek-V4-Pro"* ]]; then
+ LMEVAL_OUTPUT_DIR="${LOG_DIR}/lm_eval_ruler_1M_qa"
+ mkdir -p "${LMEVAL_OUTPUT_DIR}"
+ LMEVAL_METADATA=$(printf '{"max_seq_lengths":[1000000],"pretrained":"%s/","use_fast":false}' "${MODEL_NORMALIZED}")
+ lm_eval \
+ --model local-completions \
+ --tasks ruler_qa_squad \
+ --model_args "model=${MODEL_NORMALIZED},base_url=${API_URL}/completions,num_concurrent=1,max_retries=3,max_length=1048576" \
+ --gen_kwargs "temperature=${TEMPERATURE},do_sample=False,max_tokens=128" \
+ --metadata "${LMEVAL_METADATA}" \
+ --batch_size 1 \
+ --log_samples \
+ --output_path "${LMEVAL_OUTPUT_DIR}" 2>&1 | tee -a "$OUTPUT_FILE"
+ else
+ echo "Skip ruler_qa_squad: only DeepSeek-V4-Pro is supported for this test." | tee -a "$OUTPUT_FILE"
+ fi
+fi
+
+
+echo "" | tee -a "$OUTPUT_FILE"
+echo "=== Evaluation finished at $(date) ===" | tee -a "$OUTPUT_FILE"
+echo "Results saved to: $OUTPUT_FILE"
+
+[[ "${SKIP_SERVE}" != "true" ]] || exit 0  # --skip_serve: the server is externally managed, so leave it running as cleanup() does (this is the script's last step)
+echo "Stopping process on port ${PORT} to free GPU..." | tee -a "$OUTPUT_FILE"
+VLLM_PIDS=$(ps aux | grep -- "vllm serve" | grep -- "--port[ =]${PORT}" | grep -v grep | awk '{print $2}' || true)  # '|| true': a no-match grep must not abort under 'set -euo pipefail'
+if [[ -n "$VLLM_PIDS" ]]; then
+    echo "Found vllm serve process(es) with --port ${PORT}: $VLLM_PIDS" | tee -a "$OUTPUT_FILE"
+    for PID in $VLLM_PIDS; do
+        # Kill all child processes (including GPU processes); pgrep exits 1 when there are none, hence '|| true'.
+        CHILDREN=$(pgrep -P "$PID" || true)
+        if [[ -n "$CHILDREN" ]]; then
+            echo "Killing child processes of $PID: $CHILDREN" | tee -a "$OUTPUT_FILE"
+            kill -9 $CHILDREN 2>/dev/null || true
+        fi
+        kill -9 "$PID" 2>/dev/null || true
+        echo "Killed vllm serve process and its children: $PID $CHILDREN" | tee -a "$OUTPUT_FILE"
+    done
+else
+    echo "No vllm serve process found with --port ${PORT}." | tee -a "$OUTPUT_FILE"
+fi
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_quant.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_quant.sh
new file mode 100644
index 00000000000..305c2cd266d
--- /dev/null
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_quant.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+set -e
+
+SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)  # directory of this script; quantize.py is run from there
+
+DTYPE=""  # required
+INPUT_MODEL=""  # required
+OUTPUT_MODEL=""  # required
+FORMAT="llm_compressor"
+IGNORE_LAYERS="compressor,indexer.weights_proj"
+
+usage() {  # print the option summary and exit with status 1 (also when reached via -h/--help)
+ echo "Usage: bash run_quant.sh --dtype= --input_model= --output_model="
+ echo "Optional: --format= --ignore_layers="
+ exit 1
+}
+
+for arg in "$@"; do  # only the "--key=value" form is recognised; "--key value" ends in "Unknown option"
+ case $arg in
+ --dtype=*)
+ DTYPE="${arg#*=}"
+ ;;
+ --input_model=*)
+ INPUT_MODEL="${arg#*=}"
+ ;;
+ --output_model=*)
+ OUTPUT_MODEL="${arg#*=}"
+ ;;
+ --format=*)
+ FORMAT="${arg#*=}"
+ ;;
+ --ignore_layers=*)
+ IGNORE_LAYERS="${arg#*=}"
+ ;;
+ -h|--help)
+ usage
+ ;;
+ *)
+ echo "Unknown option: $arg"
+ usage
+ ;;
+ esac
+done
+
+[[ -z "$DTYPE" ]] && echo "Error: --dtype is required" && usage
+[[ -z "$INPUT_MODEL" ]] && echo "Error: --input_model is required" && usage
+[[ -z "$OUTPUT_MODEL" ]] && echo "Error: --output_model is required" && usage
+
+cd "$SCRIPT_DIR"
+python quantize.py \
+ --dtype "$DTYPE" \
+ --input_model "$INPUT_MODEL" \
+ --output_model "$OUTPUT_MODEL" \
+ --format "$FORMAT" \
+ --ignore_layers "$IGNORE_LAYERS"
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/setup.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/setup.sh
new file mode 100644
index 00000000000..0b1215ebfba
--- /dev/null
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/setup.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+set -e  # stop at the first failing install step
+
+uv pip install -U pip setuptools_rust setuptools_scm
+uv pip install -U evalscope lm_eval transformers datasets
+uv pip install git+https://github.com/intel/auto-round.git@main  # tracks the main branch, not a pinned release
+uv pip install compressed-tensors --no-deps
+bash <(curl -fsSL https://raw.githubusercontent.com/vllm-project/vllm/main/tools/install_deepgemm.sh)  # NOTE: downloads and executes a remote script; review it first
+VLLM_USE_PRECOMPILED=1 uv pip install git+https://github.com/xin3he/vllm-fork.git@support_deepseekv4_mxfp --no-build-isolation
\ No newline at end of file
diff --git a/neural_compressor/torch/algorithms/autoround/autoround.py b/neural_compressor/torch/algorithms/autoround/autoround.py
index 94e012b0298..c4601528c96 100644
--- a/neural_compressor/torch/algorithms/autoround/autoround.py
+++ b/neural_compressor/torch/algorithms/autoround/autoround.py
@@ -158,6 +158,8 @@ def prepare(self, model: torch.nn.Module, *args, **kwargs):
Returns:
A prepared model.
"""
+ if isinstance(model, str) and bool(getattr(self, "model_free", False)):
+ return model
prepare_model = InputCaptureModule(model)
return prepare_model
@@ -171,20 +173,26 @@ def convert(self, model: torch.nn.Module, *args, **kwargs):
The quantized model.
"""
pipe = kwargs.pop("pipeline", None)
- tokenizer = getattr(model.orig_model, "tokenizer", None)
- if tokenizer is not None:
- delattr(model.orig_model, "tokenizer")
- elif pipe is None:
- tokenizer = "Placeholder"
- self.dataset = CapturedDataloader(model.args_list, model.kwargs_list)
- # Retrieve processor/image_processor/template from model if they were attached there
- # (moved from quant_config to model to avoid duplicating large objects in per-layer configs)
- for _attr in ("processor", "image_processor", "template"):
- _val = getattr(model.orig_model, _attr, None)
- if _val is not None:
- setattr(self, _attr, _val)
- delattr(model.orig_model, _attr)
- model = model.orig_model
+ is_model_reference = isinstance(model, str)
+ if is_model_reference:
+ tokenizer = getattr(self, "tokenizer", None)
+ if tokenizer is None and pipe is None:
+ tokenizer = "Placeholder"
+ else:
+ tokenizer = getattr(model.orig_model, "tokenizer", None)
+ if tokenizer is not None:
+ delattr(model.orig_model, "tokenizer")
+ elif pipe is None:
+ tokenizer = "Placeholder"
+ self.dataset = CapturedDataloader(model.args_list, model.kwargs_list)
+ # Retrieve processor/image_processor/template from model if they were attached there
+ # (moved from quant_config to model to avoid duplicating large objects in per-layer configs)
+ for _attr in ("processor", "image_processor", "template"):
+ _val = getattr(model.orig_model, _attr, None)
+ if _val is not None:
+ setattr(self, _attr, _val)
+ delattr(model.orig_model, _attr)
+ model = model.orig_model
if pipe is not None:
model = pipe
# Remove AutoRound specific args before passing to AutoRound constructor
@@ -221,7 +229,8 @@ def convert(self, model: torch.nn.Module, *args, **kwargs):
if self._is_w4afp8():
model, weight_config = rounder.quantize()
- model.autoround_config = weight_config
+ if hasattr(model, "__dict__"):
+ model.autoround_config = weight_config
return rounder.save_quantized(output_dir=self.output_dir, inplace=True)
else: # pragma: no cover
_, quantized_model_path = rounder.quantize_and_save(
@@ -229,10 +238,12 @@ def convert(self, model: torch.nn.Module, *args, **kwargs):
)
self.output_dir = quantized_model_path
model = rounder.model
- model.autoround_config = rounder.layer_config
+ if hasattr(model, "__dict__"):
+ model.autoround_config = rounder.layer_config
self.accelerator.empty_cache()
- dump_model_op_stats(rounder.layer_config)
+ if not bool(getattr(self, "model_free", False)):
+ dump_model_op_stats(rounder.layer_config)
reloading = self.__dict__.get("reloading", True)
if self.export_format in ["auto_round", "llm_compressor"] and reloading:
@@ -248,7 +259,9 @@ def convert(self, model: torch.nn.Module, *args, **kwargs):
except Exception as e:
logger.error(f"Error reloading model: {e}")
- setattr(model, "name_or_path", self.output_dir) # model is saved in a subfolder of output_dir based on scheme
+ if hasattr(model, "__dict__"):
+ # model is saved in a subfolder of output_dir based on scheme
+ setattr(model, "name_or_path", self.output_dir)
return model
diff --git a/neural_compressor/torch/quantization/algorithm_entry.py b/neural_compressor/torch/quantization/algorithm_entry.py
index e15d19fba96..95651695030 100644
--- a/neural_compressor/torch/quantization/algorithm_entry.py
+++ b/neural_compressor/torch/quantization/algorithm_entry.py
@@ -560,12 +560,12 @@ def teq_quantize_entry(
###################### AUTOROUND Algo Entry ##################################
@register_algo(name=AUTOROUND)
def autoround_quantize_entry(
- model: torch.nn.Module,
+ model,
configs_mapping: Dict[Tuple[str, callable], AutoRoundConfig],
mode: Mode = Mode.QUANTIZE,
*args,
**kwargs,
-) -> torch.nn.Module:
+):
"""The main entry to apply AutoRound quantization.
Args:
@@ -630,8 +630,10 @@ def autoround_quantize_entry(
kwargs.pop("example_inputs")
quantizer = get_quantizer(model, quantizer_cls=AutoRoundQuantizer, quant_config=quant_config, **params_dict)
model = quantizer.execute(model=model, mode=mode, *args, **kwargs)
- model.qconfig = configs_mapping
- model.save = MethodType(save, model)
+ if hasattr(model, "__dict__"):
+ model.qconfig = configs_mapping
+ if isinstance(model, torch.nn.Module):
+ model.save = MethodType(save, model)
postprocess_model(model, mode, quantizer)
return model
diff --git a/neural_compressor/torch/quantization/quantize.py b/neural_compressor/torch/quantization/quantize.py
index 17641d81f2f..3bd7729b2b3 100644
--- a/neural_compressor/torch/quantization/quantize.py
+++ b/neural_compressor/torch/quantization/quantize.py
@@ -33,6 +33,25 @@
FRAMEWORK_NAME = "torch"
+class _AutoRoundModelReference:
+ """Placeholder returned by prepare() in the model-free AutoRound flow; convert() unwraps it again."""
+
+ def __init__(self, model_reference: str, quant_config: BaseConfig, example_inputs: Any = None):
+ self.model_reference = model_reference  # model path or name, kept as a plain string
+ self.quant_config = quant_config  # reused by convert() when no quant_config is passed there
+ self.example_inputs = example_inputs
+ self.is_prepared = True  # NOTE(review): presumably mirrors the flag carried by prepared modules - confirm
+
+
+def _is_autoround_model_free_string_case(model: Any, quant_config: BaseConfig) -> bool:
+ """Return True only if the config is an AutoRoundConfig with a truthy model_free and the model is a str."""
+ return (
+ isinstance(quant_config, AutoRoundConfig)
+ and bool(getattr(quant_config, "model_free", False))  # getattr: a config without the attribute counts as False
+ and isinstance(model, str)
+ )
+
+
def need_apply(configs_mapping: Dict[Tuple[str, callable], BaseConfig], algo_name):
"""Check whether to apply this algorithm according to configs_mapping.
@@ -89,12 +108,16 @@ def preprocess_quant_config(model, quant_config, mode="prepare", example_inputs=
)
model_info = quant_config.get_model_info(model, example_inputs)
elif isinstance(quant_config, AutoRoundConfig):
- for _attr in ("tokenizer", "processor", "image_processor", "template"):
- _backup = getattr(quant_config, _attr, None)
- if _backup is not None:
- setattr(model, _attr, _backup)
- delattr(quant_config, _attr)
- model_info = quant_config.get_model_info(model=model)
+ if _is_autoround_model_free_string_case(model, quant_config):
+ # Keep optional large objects on config when model is a string reference.
+ model_info = quant_config.get_model_info(model=None)
+ else:
+ for _attr in ("tokenizer", "processor", "image_processor", "template"):
+ _backup = getattr(quant_config, _attr, None)
+ if _backup is not None:
+ setattr(model, _attr, _backup)
+ delattr(quant_config, _attr)
+ model_info = quant_config.get_model_info(model=model)
else:
model_info = quant_config.get_model_info(model=model)
@@ -172,6 +195,9 @@ def prepare(
Returns:
prepared and calibrated module.
"""
+ if _is_autoround_model_free_string_case(model, quant_config):
+ return _AutoRoundModelReference(model_reference=model, quant_config=quant_config, example_inputs=example_inputs)
+
prepared_model = model if inplace else copy.deepcopy(model)
prepared_model, configs_mapping = preprocess_quant_config(
prepared_model, quant_config, mode="prepare", example_inputs=example_inputs
@@ -240,6 +266,13 @@ def convert(
Returns:
The quantized model.
"""
+ if isinstance(model, _AutoRoundModelReference):
+ if quant_config is None:
+ quant_config = model.quant_config
+ else:
+ logger.warning("quant_config will be ignored since the model has been prepared.")
+ model = model.model_reference
+
q_model = model if inplace else copy.deepcopy(model)
assert (
@@ -287,7 +320,8 @@ def convert(
mode=Mode.CONVERT,
**kwargs,
)
- setattr(q_model, "is_quantized", True)
+ if hasattr(q_model, "__dict__"):
+ setattr(q_model, "is_quantized", True)
return q_model
diff --git a/test/torch/quantization/test_autoround_cpu.py b/test/torch/quantization/test_autoround_cpu.py
index d25fc9bb1e6..f70e95248d7 100644
--- a/test/torch/quantization/test_autoround_cpu.py
+++ b/test/torch/quantization/test_autoround_cpu.py
@@ -16,6 +16,10 @@
prepare,
quantize,
)
+from neural_compressor.torch.quantization.quantize import (
+ _AutoRoundModelReference,
+ _is_autoround_model_free_string_case,
+)
from neural_compressor.torch.utils import logger
torch.backends.__allow_nonbracketed_mutation_flag = True
@@ -638,3 +642,82 @@ def test_fp8_kv_attn(self, scheme, static_kv_dtype, static_attention_dtype, tmp_
assert (
getattr(attn, "q_scale", None) is not None
), f"Missing q_scale in attention for scheme={scheme}, static_attention_dtype={static_attention_dtype}"
+
+ def test_is_autoround_model_free_string_case_true(self):
+ """Test detection when model is string and config has model_free=True."""
+ config = AutoRoundConfig(model_free=True, scheme="MXFP4")
+ model = "/path/to/model"  # placeholder; the helper only checks the argument's type
+ assert _is_autoround_model_free_string_case(model, config) is True
+
+ def test_is_autoround_model_free_string_case_false_not_string(self):
+ """Test detection returns False when model is not a string."""
+ config = AutoRoundConfig(model_free=True, scheme="MXFP4")
+ model = torch.nn.Linear(10, 10)  # a real module: model_free alone must not trigger the string path
+ assert _is_autoround_model_free_string_case(model, config) is False
+
+ def test_is_autoround_model_free_string_case_false_no_flag(self):
+ """Test detection returns False when model_free is not set."""
+ config = AutoRoundConfig(scheme="MXFP4")  # model_free left at its default
+ model = "/path/to/model"
+ assert _is_autoround_model_free_string_case(model, config) is False
+
+ def test_is_autoround_model_free_string_case_false_model_free_false(self):
+ """Test detection returns False when model_free is explicitly False."""
+ config = AutoRoundConfig(model_free=False, scheme="MXFP4")
+ model = "/path/to/model"  # a string model alone must not trigger the model-free path
+ assert _is_autoround_model_free_string_case(model, config) is False
+
+ def test_autoround_model_reference_creation(self):
+ """Test that _AutoRoundModelReference stores its constructor arguments and sets is_prepared."""
+ model_ref = "/path/to/deepseek-v4"
+ config = AutoRoundConfig(model_free=True, scheme="MXFP4")
+ example_inputs = {"input_ids": torch.ones(1, 10, dtype=torch.long)}
+
+ ref = _AutoRoundModelReference(model_reference=model_ref, quant_config=config, example_inputs=example_inputs)
+
+ assert ref.model_reference == model_ref
+ assert ref.quant_config is config  # identity: the config is stored, not copied
+ assert ref.example_inputs == example_inputs
+ assert ref.is_prepared is True
+
+ def test_prepare_with_string_model_and_model_free_returns_reference(self):
+ """Test that prepare() returns _AutoRoundModelReference when called with string model and model_free=True."""
+ model = "/path/to/model"
+ config = AutoRoundConfig(
+ model_free=True,
+ scheme="MXFP4",
+ ignore_layers="compressor",
+ output_dir="/tmp/test_output",
+ )
+
+ result = prepare(model, config)  # returns the reference immediately, before any copy or config preprocessing
+
+ assert isinstance(result, _AutoRoundModelReference)
+ assert result.model_reference == model
+ assert result.quant_config is config
+
+ def test_model_free_with_string_model(self):
+ """Test that prepare() preserves all config attributes in _AutoRoundModelReference."""
+ model = "facebook/opt-125m"
+ layer_config = {"fc2": {"bits": 4, "data_type": "mx_fp"}}
+ config = AutoRoundConfig(
+ model_free=True,
+ scheme="MXFP8",
+ ignore_layers="self_attn",
+ layer_config=layer_config,
+ export_format="llm_compressor",
+ output_dir="/tmp/quantized_model",
+ )
+
+ result = prepare(model, config)
+
+ assert isinstance(result, _AutoRoundModelReference)
+ assert result.quant_config.scheme == "MXFP8"
+ assert result.quant_config.ignore_layers == "self_attn"
+ assert result.quant_config.layer_config == layer_config
+ assert result.quant_config.export_format == "llm_compressor"
+
+ result = convert(result)
+ assert not hasattr(result.model.decoder.layers[0].self_attn.k_proj, "quantization_scheme"), "Ignored layers were not preserved during conversion."
+ assert result.model.decoder.layers[0].fc1.quantization_scheme.format.value == 'mxfp8-quantized', "Model conversion did not preserve the quantization scheme format."
+ assert result.model.decoder.layers[0].fc2.quantization_scheme.format.value == 'mxfp4-pack-quantized', "Model conversion did not preserve the quantization scheme format for layer_config."