From 6140eab0c1ff9504dbbe2f265940b54acb3301d1 Mon Sep 17 00:00:00 2001
From: Xin He <xin3.he@intel.com>
Date: Mon, 15 Jun 2026 16:23:46 +0800
Subject: [PATCH 01/12] Add DeepSeek V4 AutoRound example and scripts for
 model-free quantization

Signed-off-by: Xin He <xin3.he@intel.com>
---
 .../auto_round/deepseekv4/README.md           |  97 ++++++++++
 .../auto_round/deepseekv4/quantize.py         | 116 ++++++++++++
 .../auto_round/deepseekv4/run_evalscope.sh    | 171 ++++++++++++++++++
 .../auto_round/deepseekv4/run_quant.sh        |  55 ++++++
 .../torch/algorithms/autoround/autoround.py   |  46 +++--
 .../torch/quantization/algorithm_entry.py     |  10 +-
 .../torch/quantization/quantize.py            |  48 ++++-
 7 files changed, 515 insertions(+), 28 deletions(-)
 create mode 100644 examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/README.md
 create mode 100644 examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/quantize.py
 create mode 100644 examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_evalscope.sh
 create mode 100644 examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_quant.sh
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/README.md
new file mode 100644
index 00000000000..6930104d889
--- /dev/null
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/README.md
@@ -0,0 +1,97 @@
+# DeepSeek V4 AutoRound (INC prepare/convert)
+
+This example demonstrates model-free quantization through the INC API:
+
+```python
+from neural_compressor.torch.quantization import AutoRoundConfig, prepare, convert
+
+config = AutoRoundConfig(
+    model_free=True,
+    scheme="MXFP4",
+    ignore_layers="compressor,indexer.weights_proj",
+    export_format="llm_compressor",
+    output_dir="/path/to/output",
+)
+model = "/path/or/hf_model_name"
+model = prepare(model, config)
+model = convert(model)
+```
+
+## Quick Start
+
+```bash
+cd examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4
+bash run_quant.sh \
+  --dtype=mxfp4_mixed \
+  --input_model=/workspace/models/deepseek-ai/DeepSeek-V4-Flash \
+  --output_model=/workspace/models/deepseek-ai/DeepSeek-V4-Flash-MXFP4-Mixed
+```
+
+Then run serving + evaluation in one command:
+
+```bash
+CUDA_VISIBLE_DEVICES=0,1 bash run_evalscope.sh \
+  --model /workspace/models/deepseek-ai/DeepSeek-V4-Flash-MXFP4-Mixed \
+  --tp 2 \
+  --port 8009 \
+  --temp 1.0
+```
+
+Equivalent vLLM defaults inside `run_evalscope.sh`:
+
+```bash
+SAFETENSORS_FAST_GPU=1 CUDA_VISIBLE_DEVICES=0,1 vllm serve <model> \
+  --trust-remote-code \
+  --kv-cache-dtype fp8 \
+  --block-size 256 \
+  --tensor-parallel-size 2 \
+  --attention_config.use_fp4_indexer_cache=True \
+  --port 8009 \
+  --no-enable-flashinfer-autotune
+```
+
+If model basename is exactly `DeepSeek-V4-Flash` or `DeepSeek-V4-Pro` (without extra suffix),
+`run_evalscope.sh` will also add:
+
+```bash
+--enable-expert-parallel --moe-backend deep_gemm_mega_moe
+```
+
+Mixed preset example:
+
+```bash
+bash run_quant.sh \
+  --dtype=mxfp4_mixed \
+  --input_model=/workspace/models/deepseek-ai/DeepSeek-V4-Flash \
+  --output_model=/workspace/models/deepseek-ai/DeepSeek-V4-Flash-MXFP4-Mixed
+```
+
+## CLI Arguments
+
+- `--dtype`: quantization preset.
+  - `mxfp4`: `scheme=MXFP4`
+  - `mxfp4_mixed`: `scheme=MXFP8` + `layer_config={"ffn.experts": {"bits": 4, "data_type": "mx_fp"}}`
+  - `mxfp8`: `scheme=MXFP8`
+  - `w4a16`: `scheme=W4A16` + `layer_config={"wo_a": {"bits": 16}}`
+- `--input_model`: HF model name or local model path.
+- `--output_model`: output directory.
+- `--format`: `auto_round` or `llm_compressor` (default: `llm_compressor`).
+- `--ignore_layers`: comma-separated layer patterns (default: `compressor,indexer.weights_proj`).
+
+`run_evalscope.sh` arguments:
+
+- `--model`: model path for vLLM and evalscope.
+- `--port`: vLLM API port (default: `8009`).
+- `--temp`: generation temperature used by evalscope (default: `0`).
+- `--skip_serve`: skip starting vLLM (use existing endpoint on the same `--port`).
+- `--tp`: tensor parallel size for vLLM (default: `2`).
+- `--kv-cache-dtype`: kv cache dtype for vLLM (default: `fp8`).
+- `--block-size`: vLLM block size (default: `256`).
+
+## Notes
+
+- This flow is enabled only when:
+  - `config` is `AutoRoundConfig`
+  - `config.model_free=True`
+  - `model` passed to `prepare/convert` is a `str` (model path or model name)
+- The example uses `reloading=False` by default and saves quantized artifacts to `--output_model`.
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/quantize.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/quantize.py
new file mode 100644
index 00000000000..d64fd3a480d
--- /dev/null
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/quantize.py
@@ -0,0 +1,116 @@
+# Copyright (c) 2026 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import logging
+
+from neural_compressor.torch.quantization import AutoRoundConfig, convert, prepare
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO)
+
+
+_PRESET_CONFIG = {
+    "mxfp4": {
+        "scheme": "MXFP4",
+        "layer_config": None,
+    },
+    # MXFP8 + experts FP4 mixed setup.
+    "mxfp4_mixed": {
+        "scheme": "MXFP8",
+        "layer_config": {"ffn.experts": {"bits": 4, "data_type": "mx_fp"}},
+    },
+    "mxfp8": {
+        "scheme": "MXFP8",
+        "layer_config": None,
+    },
+    "w4a16": {
+        "scheme": "W4A16",
+        "layer_config": {"wo_a": {"bits": 16}},
+    },
+}
+
+
+def build_config(args: argparse.Namespace) -> AutoRoundConfig:
+    dtype_key = args.dtype.lower()
+    if dtype_key not in _PRESET_CONFIG:
+        raise ValueError(f"Unsupported dtype: {args.dtype}. Supported: {', '.join(_PRESET_CONFIG.keys())}")
+
+    preset = _PRESET_CONFIG[dtype_key]
+    layer_config = preset["layer_config"]
+    if args.disable_preset_layer_config:
+        layer_config = None
+
+    return AutoRoundConfig(
+        model_free=True,
+        scheme=preset["scheme"],
+        ignore_layers=args.ignore_layers,
+        layer_config=layer_config,
+        export_format=args.format,
+        output_dir=args.output_model,
+        reloading=False,
+    )
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description="DeepSeek V4 model-free quantization via INC AutoRound prepare/convert.")
+    parser.add_argument(
+        "--dtype",
+        type=str,
+        required=True,
+        choices=sorted(_PRESET_CONFIG.keys()),
+        help="Quantization preset. e.g. mxfp4 or mxfp4_mixed",
+    )
+    parser.add_argument(
+        "--input_model",
+        type=str,
+        required=True,
+        help="Model name or local path.",
+    )
+    parser.add_argument(
+        "--output_model",
+        type=str,
+        required=True,
+        help="Output directory for quantized model.",
+    )
+    parser.add_argument(
+        "--ignore_layers",
+        type=str,
+        default="compressor,indexer.weights_proj",
+        help="Comma-separated layer name patterns to skip.",
+    )
+    parser.add_argument(
+        "--format",
+        type=str,
+        default="llm_compressor",
+        choices=["auto_round", "llm_compressor"],
+        help="Export format.",
+    )
+    parser.add_argument(
+        "--disable_preset_layer_config",
+        action="store_true",
+        help="Disable preset layer_config for the selected dtype.",
+    )
+    args = parser.parse_args()
+
+    quant_config = build_config(args)
+
+    model = args.input_model
+    model = prepare(model, quant_config)
+    _ = convert(model)
+    logger.info("Quantized model saved to %s", args.output_model)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_evalscope.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_evalscope.sh
new file mode 100644
index 00000000000..7f1f1f97196
--- /dev/null
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_evalscope.sh
@@ -0,0 +1,171 @@
+#!/bin/bash
+
+set -euo pipefail
+
+# Usage:
+#   bash run_evalscope.sh --model MODEL_PATH [--port PORT] [--temp TEMPERATURE]
+#
+# This script can start vLLM serve and then run evalscope automatically.
+
+PORT=8009
+MODEL=/workspace/models/deepseek-ai/DeepSeek-V4-Flash
+TEMPERATURE=0
+KV_CACHE_DTYPE="fp8"
+BLOCK_SIZE=256
+TENSOR_PARALLEL_SIZE=2
+SAFETENSORS_FAST_GPU="1"
+TRUST_REMOTE_CODE="true"
+NO_ENABLE_FLASHINFER_AUTOTUNE="true"
+SKIP_SERVE="false"
+
+SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)
+cd "${SCRIPT_DIR}"
+
+cleanup() {
+  if [[ "${SKIP_SERVE}" == "true" ]]; then
+    return
+  fi
+  # Kill the process listening on the specified port to free GPU.
+  VLLM_PIDS=$(ps aux | grep -- "vllm serve" | grep -- "--port[ =]${PORT}" | grep -v grep | awk '{print $2}')
+  if [[ -n "${VLLM_PIDS}" ]]; then
+    for PID in ${VLLM_PIDS}; do
+      CHILDREN=$(pgrep -P "${PID}" || true)
+      if [[ -n "${CHILDREN}" ]]; then
+        kill -9 ${CHILDREN} 2>/dev/null || true
+      fi
+      kill -9 "${PID}" 2>/dev/null || true
+    done
+  fi
+}
+
+trap cleanup EXIT
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --port)
+      PORT="$2"; shift 2 ;;
+    --model)
+      MODEL="$2"; shift 2 ;;
+    --temp)
+      TEMPERATURE="$2"; shift 2 ;;
+    --skip_serve)
+      SKIP_SERVE="true"; shift 1 ;;
+    --tp)
+      TENSOR_PARALLEL_SIZE="$2"; shift 2 ;;
+    --kv-cache-dtype)
+      KV_CACHE_DTYPE="$2"; shift 2 ;;
+    --block-size)
+      BLOCK_SIZE="$2"; shift 2 ;;
+    *)
+      echo "Unknown option: $1"; exit 1 ;;
+  esac
+done
+
+API_URL="http://127.0.0.1:${PORT}/v1"
+
+if [[ "${SKIP_SERVE}" != "true" ]]; then
+  echo "Starting vLLM serve on port ${PORT} ..."
+  MODEL_NORMALIZED="${MODEL%/}"
+  MODEL_NAME="${MODEL_NORMALIZED##*/}"
+  EXTRA_ARGS=()
+  # Only for base DeepSeek-V4-Flash/Pro model names without quantized suffixes.
+  if [[ "${MODEL_NAME}" == "DeepSeek-V4-Flash" || "${MODEL_NAME}" == "DeepSeek-V4-Pro" ]]; then
+    EXTRA_ARGS+=(--enable-expert-parallel)
+    EXTRA_ARGS+=(--moe-backend deep_gemm_mega_moe)
+  fi
+
+  VLLM_CMD=(
+    vllm serve "${MODEL}"
+    --kv-cache-dtype "${KV_CACHE_DTYPE}"
+    --block-size "${BLOCK_SIZE}"
+    --tensor-parallel-size "${TENSOR_PARALLEL_SIZE}"
+    --attention_config.use_fp4_indexer_cache=True
+    --port "${PORT}"
+  )
+  if [[ "${TRUST_REMOTE_CODE}" == "true" ]]; then
+    VLLM_CMD+=(--trust-remote-code)
+  fi
+  if [[ "${NO_ENABLE_FLASHINFER_AUTOTUNE}" == "true" ]]; then
+    VLLM_CMD+=(--no-enable-flashinfer-autotune)
+  fi
+  VLLM_CMD+=("${EXTRA_ARGS[@]}")
+
+  SAFETENSORS_FAST_GPU="${SAFETENSORS_FAST_GPU}" "${VLLM_CMD[@]}" >/tmp/vllm_${PORT}.log 2>&1 &
+  echo "vLLM launched. Log: /tmp/vllm_${PORT}.log"
+fi
+
+# Wait until the API is ready
+echo "Waiting for API at ${API_URL} ..."
+until curl -sf "${API_URL}/models" -o /dev/null; do
+  echo "[$(date '+%Y-%m-%d %H:%M:%S')] Port ${PORT} not ready, retrying in 20s..."
+  sleep 20
+done
+echo "[$(date '+%Y-%m-%d %H:%M:%S')] API is ready, starting evaluation."
+
+MODEL_NORMALIZED="${MODEL%/}"
+MODEL_NAME="${MODEL_NORMALIZED##*/}"
+LOG_DIR="logs/${MODEL_NAME}"
+mkdir -p "$LOG_DIR"
+OUTPUT_FILE="${LOG_DIR}/eval_results_$(date +%Y%m%d_%H%M%S)_port${PORT}_temp${TEMPERATURE}.log"
+
+echo "=== Evaluation started at $(date) ===" | tee "$OUTPUT_FILE"
+echo "Model: $MODEL" | tee -a "$OUTPUT_FILE"
+echo "API URL: $API_URL" | tee -a "$OUTPUT_FILE"
+echo "Temperature: $TEMPERATURE" | tee -a "$OUTPUT_FILE"
+echo "" | tee -a "$OUTPUT_FILE"
+  
+
+echo "" | tee -a "$OUTPUT_FILE"
+echo "=== [1/3] aime26 (n=10) ===" | tee -a "$OUTPUT_FILE"
+evalscope eval \
+  --model "$MODEL" \
+  --eval-type openai_api \
+  --api-key EMPTY \
+  --datasets aime26 \
+  --generation-config "{\"temperature\": ${TEMPERATURE}, \"n\": 10}" \
+  --eval-batch-size 10  --timeout 3000 \
+  --api-url "$API_URL" 2>&1 | tee -a "$OUTPUT_FILE"
+echo "" | tee -a "$OUTPUT_FILE"
+
+echo "=== [2/3] gpqa_diamond (n=5) ===" | tee -a "$OUTPUT_FILE"
+evalscope eval \
+  --model "$MODEL" \
+  --eval-type openai_api \
+  --api-key EMPTY \
+  --datasets gpqa_diamond \
+  --generation-config "{\"temperature\": ${TEMPERATURE}, \"n\": 5}" \
+  --eval-batch-size 10  --timeout 3000 \
+  --api-url "$API_URL" 2>&1 | tee -a "$OUTPUT_FILE"
+
+echo "=== [3/3] piqa hellaswag gsm8k mmlu_pro math_500 mmlu ===" | tee -a "$OUTPUT_FILE"
+evalscope eval \
+  --model "$MODEL" \
+  --eval-type openai_api \
+  --api-key EMPTY \
+  --datasets piqa hellaswag gsm8k mmlu_pro math_500 mmlu \
+  --eval-batch-size 10 --timeout 3000 \
+  --api-url "$API_URL" 2>&1 | tee -a "$OUTPUT_FILE"
+
+
+echo "" | tee -a "$OUTPUT_FILE"
+echo "=== Evaluation finished at $(date) ===" | tee -a "$OUTPUT_FILE"
+echo "Results saved to: $OUTPUT_FILE"
+
+# Stop the server we launched; "|| true" keeps a no-match grep/pgrep or a dead PID from aborting under "set -e".
+if [[ "${SKIP_SERVE}" == "true" ]]; then
+  echo "--skip_serve is set: leaving the server on port ${PORT} running." | tee -a "$OUTPUT_FILE"
+else
+  echo "Stopping process on port ${PORT} to free GPU..." | tee -a "$OUTPUT_FILE"
+  VLLM_PIDS=$(ps aux | grep -- "vllm serve" | grep -- "--port[ =]${PORT}" | grep -v grep | awk '{print $2}' || true)
+  if [[ -n "$VLLM_PIDS" ]]; then
+    echo "Found vllm serve process(es) with --port ${PORT}: $VLLM_PIDS" | tee -a "$OUTPUT_FILE"
+  else
+    echo "No vllm serve process found with --port ${PORT}." | tee -a "$OUTPUT_FILE"
+  fi
+  for PID in $VLLM_PIDS; do
+    CHILDREN=$(pgrep -P "$PID" || true)
+    [[ -z "$CHILDREN" ]] || kill -9 $CHILDREN 2>/dev/null || true
+    kill -9 "$PID" 2>/dev/null || true
+    echo "Killed vllm serve process and its children: $PID $CHILDREN" | tee -a "$OUTPUT_FILE"
+  done
+fi
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_quant.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_quant.sh
new file mode 100644
index 00000000000..305c2cd266d
--- /dev/null
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_quant.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+set -e
+
+SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)
+
+DTYPE=""
+INPUT_MODEL=""
+OUTPUT_MODEL=""
+FORMAT="llm_compressor"
+IGNORE_LAYERS="compressor,indexer.weights_proj"
+
+usage() {
+  echo "Usage: bash run_quant.sh --dtype=<mxfp4|mxfp4_mixed|mxfp8|w4a16> --input_model=<path_or_name> --output_model=<output_dir>"
+  echo "Optional: --format=<auto_round|llm_compressor> --ignore_layers=<comma_separated_patterns>"
+  exit 1
+}
+
+for arg in "$@"; do
+  case $arg in
+    --dtype=*)
+      DTYPE="${arg#*=}"
+      ;;
+    --input_model=*)
+      INPUT_MODEL="${arg#*=}"
+      ;;
+    --output_model=*)
+      OUTPUT_MODEL="${arg#*=}"
+      ;;
+    --format=*)
+      FORMAT="${arg#*=}"
+      ;;
+    --ignore_layers=*)
+      IGNORE_LAYERS="${arg#*=}"
+      ;;
+    -h|--help)
+      usage
+      ;;
+    *)
+      echo "Unknown option: $arg"
+      usage
+      ;;
+  esac
+done
+
+[[ -z "$DTYPE" ]] && echo "Error: --dtype is required" && usage
+[[ -z "$INPUT_MODEL" ]] && echo "Error: --input_model is required" && usage
+[[ -z "$OUTPUT_MODEL" ]] && echo "Error: --output_model is required" && usage
+
+cd "$SCRIPT_DIR"
+python quantize.py \
+  --dtype "$DTYPE" \
+  --input_model "$INPUT_MODEL" \
+  --output_model "$OUTPUT_MODEL" \
+  --format "$FORMAT" \
+  --ignore_layers "$IGNORE_LAYERS"
diff --git a/neural_compressor/torch/algorithms/autoround/autoround.py b/neural_compressor/torch/algorithms/autoround/autoround.py
index 5b440a57c0b..bf820b083ab 100644
--- a/neural_compressor/torch/algorithms/autoround/autoround.py
+++ b/neural_compressor/torch/algorithms/autoround/autoround.py
@@ -158,6 +158,8 @@ def prepare(self, model: torch.nn.Module, *args, **kwargs):
         Returns:
             A prepared model.
         """
+        if isinstance(model, str) and bool(getattr(self, "model_free", False)):
+            return model
         prepare_model = InputCaptureModule(model)
         return prepare_model
 
@@ -171,20 +173,26 @@ def convert(self, model: torch.nn.Module, *args, **kwargs):
             The quantized model.
         """
         pipe = kwargs.pop("pipeline", None)
-        tokenizer = getattr(model.orig_model, "tokenizer", None)
-        if tokenizer is not None:
-            delattr(model.orig_model, "tokenizer")
-        elif pipe is None:
-            tokenizer = "Placeholder"
-            self.dataset = CapturedDataloader(model.args_list, model.kwargs_list)
-        # Retrieve processor/image_processor/template from model if they were attached there
-        # (moved from quant_config to model to avoid duplicating large objects in per-layer configs)
-        for _attr in ("processor", "image_processor", "template"):
-            _val = getattr(model.orig_model, _attr, None)
-            if _val is not None:
-                setattr(self, _attr, _val)
-                delattr(model.orig_model, _attr)
-        model = model.orig_model
+        is_model_reference = isinstance(model, str)
+        if is_model_reference:
+            tokenizer = getattr(self, "tokenizer", None)
+            if tokenizer is None and pipe is None:
+                tokenizer = "Placeholder"
+        else:
+            tokenizer = getattr(model.orig_model, "tokenizer", None)
+            if tokenizer is not None:
+                delattr(model.orig_model, "tokenizer")
+            elif pipe is None:
+                tokenizer = "Placeholder"
+                self.dataset = CapturedDataloader(model.args_list, model.kwargs_list)
+            # Retrieve processor/image_processor/template from model if they were attached there
+            # (moved from quant_config to model to avoid duplicating large objects in per-layer configs)
+            for _attr in ("processor", "image_processor", "template"):
+                _val = getattr(model.orig_model, _attr, None)
+                if _val is not None:
+                    setattr(self, _attr, _val)
+                    delattr(model.orig_model, _attr)
+            model = model.orig_model
         if pipe is not None:
             model = pipe
         # Remove AutoRound specific args before passing to AutoRound constructor
@@ -221,7 +229,8 @@ def convert(self, model: torch.nn.Module, *args, **kwargs):
 
         if self._is_w4afp8():
             model, weight_config = rounder.quantize()
-            model.autoround_config = weight_config
+            if hasattr(model, "__dict__"):
+                model.autoround_config = weight_config
             return rounder.save_quantized(output_dir=self.output_dir, inplace=True)
         else:  # pragma: no cover
             _, quantized_model_path = rounder.quantize_and_save(
@@ -229,7 +238,8 @@ def convert(self, model: torch.nn.Module, *args, **kwargs):
             )
             self.output_dir = quantized_model_path
             model = rounder.model
-            model.autoround_config = rounder.layer_config
+            if hasattr(model, "__dict__"):
+                model.autoround_config = rounder.layer_config
 
         self.accelerator.empty_cache()
         dump_model_op_stats(rounder.layer_config)
@@ -248,7 +258,9 @@ def convert(self, model: torch.nn.Module, *args, **kwargs):
             except Exception as e:
                 logger.error(f"Error reloading model: {e}")
 
-        setattr(model, "name_or_path", self.output_dir)  # model is saved in a subfolder of output_dir based on scheme
+        if hasattr(model, "__dict__"):
+            # model is saved in a subfolder of output_dir based on scheme
+            setattr(model, "name_or_path", self.output_dir)
         return model
 
 
diff --git a/neural_compressor/torch/quantization/algorithm_entry.py b/neural_compressor/torch/quantization/algorithm_entry.py
index e15d19fba96..95651695030 100644
--- a/neural_compressor/torch/quantization/algorithm_entry.py
+++ b/neural_compressor/torch/quantization/algorithm_entry.py
@@ -560,12 +560,12 @@ def teq_quantize_entry(
 ###################### AUTOROUND Algo Entry ##################################
 @register_algo(name=AUTOROUND)
 def autoround_quantize_entry(
-    model: torch.nn.Module,
+    model,
     configs_mapping: Dict[Tuple[str, callable], AutoRoundConfig],
     mode: Mode = Mode.QUANTIZE,
     *args,
     **kwargs,
-) -> torch.nn.Module:
+):
     """The main entry to apply AutoRound quantization.
 
     Args:
@@ -630,8 +630,10 @@ def autoround_quantize_entry(
     kwargs.pop("example_inputs")
     quantizer = get_quantizer(model, quantizer_cls=AutoRoundQuantizer, quant_config=quant_config, **params_dict)
     model = quantizer.execute(model=model, mode=mode, *args, **kwargs)
-    model.qconfig = configs_mapping
-    model.save = MethodType(save, model)
+    if hasattr(model, "__dict__"):
+        model.qconfig = configs_mapping
+        if isinstance(model, torch.nn.Module):
+            model.save = MethodType(save, model)
     postprocess_model(model, mode, quantizer)
     return model
 
diff --git a/neural_compressor/torch/quantization/quantize.py b/neural_compressor/torch/quantization/quantize.py
index 17641d81f2f..3bd7729b2b3 100644
--- a/neural_compressor/torch/quantization/quantize.py
+++ b/neural_compressor/torch/quantization/quantize.py
@@ -33,6 +33,25 @@
 FRAMEWORK_NAME = "torch"
 
 
+class _AutoRoundModelReference:
+    """A lightweight container for model-free AutoRound prepare/convert flow."""
+
+    def __init__(self, model_reference: str, quant_config: BaseConfig, example_inputs: Any = None):
+        self.model_reference = model_reference
+        self.quant_config = quant_config
+        self.example_inputs = example_inputs
+        self.is_prepared = True
+
+
+def _is_autoround_model_free_string_case(model: Any, quant_config: BaseConfig) -> bool:
+    """Return True when model-free AutoRound is called with a string model reference."""
+    return (
+        isinstance(quant_config, AutoRoundConfig)
+        and bool(getattr(quant_config, "model_free", False))
+        and isinstance(model, str)
+    )
+
+
 def need_apply(configs_mapping: Dict[Tuple[str, callable], BaseConfig], algo_name):
     """Check whether to apply this algorithm according to configs_mapping.
 
@@ -89,12 +108,16 @@ def preprocess_quant_config(model, quant_config, mode="prepare", example_inputs=
                 )
         model_info = quant_config.get_model_info(model, example_inputs)
     elif isinstance(quant_config, AutoRoundConfig):
-        for _attr in ("tokenizer", "processor", "image_processor", "template"):
-            _backup = getattr(quant_config, _attr, None)
-            if _backup is not None:
-                setattr(model, _attr, _backup)
-                delattr(quant_config, _attr)
-        model_info = quant_config.get_model_info(model=model)
+        if _is_autoround_model_free_string_case(model, quant_config):
+            # Keep optional large objects on config when model is a string reference.
+            model_info = quant_config.get_model_info(model=None)
+        else:
+            for _attr in ("tokenizer", "processor", "image_processor", "template"):
+                _backup = getattr(quant_config, _attr, None)
+                if _backup is not None:
+                    setattr(model, _attr, _backup)
+                    delattr(quant_config, _attr)
+            model_info = quant_config.get_model_info(model=model)
     else:
         model_info = quant_config.get_model_info(model=model)
 
@@ -172,6 +195,9 @@ def prepare(
     Returns:
         prepared and calibrated module.
     """
+    if _is_autoround_model_free_string_case(model, quant_config):
+        return _AutoRoundModelReference(model_reference=model, quant_config=quant_config, example_inputs=example_inputs)
+
     prepared_model = model if inplace else copy.deepcopy(model)
     prepared_model, configs_mapping = preprocess_quant_config(
         prepared_model, quant_config, mode="prepare", example_inputs=example_inputs
@@ -240,6 +266,13 @@ def convert(
     Returns:
         The quantized model.
     """
+    if isinstance(model, _AutoRoundModelReference):
+        if quant_config is None:
+            quant_config = model.quant_config
+        else:
+            logger.warning("quant_config will be ignored since the model has been prepared.")
+        model = model.model_reference
+
     q_model = model if inplace else copy.deepcopy(model)
 
     assert (
@@ -287,7 +320,8 @@ def convert(
                 mode=Mode.CONVERT,
                 **kwargs,
             )
-    setattr(q_model, "is_quantized", True)
+    if hasattr(q_model, "__dict__"):
+        setattr(q_model, "is_quantized", True)
     return q_model
 
 

From a893a69369bdd8f9d33b69f5d40b68eeba2c34f7 Mon Sep 17 00:00:00 2001
From: Xin He <xin3.he@intel.com>
Date: Tue, 16 Jun 2026 10:33:44 +0800
Subject: [PATCH 02/12] Refactor AutoRoundQuantizer to conditionally dump model
 operation stats and add tests for model-free string case detection

Signed-off-by: Xin He <xin3.he@intel.com>
---
 .../torch/algorithms/autoround/autoround.py   |  3 +-
 test/torch/quantization/test_autoround_cpu.py | 83 +++++++++++++++++++
 2 files changed, 85 insertions(+), 1 deletion(-)

diff --git a/neural_compressor/torch/algorithms/autoround/autoround.py b/neural_compressor/torch/algorithms/autoround/autoround.py
index bf820b083ab..654b58d9501 100644
--- a/neural_compressor/torch/algorithms/autoround/autoround.py
+++ b/neural_compressor/torch/algorithms/autoround/autoround.py
@@ -242,7 +242,8 @@ def convert(self, model: torch.nn.Module, *args, **kwargs):
                 model.autoround_config = rounder.layer_config
 
         self.accelerator.empty_cache()
-        dump_model_op_stats(rounder.layer_config)
+        if not bool(getattr(self, "model_free", False)):
+            dump_model_op_stats(rounder.layer_config)
 
         reloading = self.__dict__.get("reloading", True)
         if self.export_format in ["auto_round", "llm_compressor"] and reloading:
diff --git a/test/torch/quantization/test_autoround_cpu.py b/test/torch/quantization/test_autoround_cpu.py
index d25fc9bb1e6..f70e95248d7 100644
--- a/test/torch/quantization/test_autoround_cpu.py
+++ b/test/torch/quantization/test_autoround_cpu.py
@@ -16,6 +16,10 @@
     prepare,
     quantize,
 )
+from neural_compressor.torch.quantization.quantize import (
+    _AutoRoundModelReference,
+    _is_autoround_model_free_string_case,
+)
 from neural_compressor.torch.utils import logger
 
 torch.backends.__allow_nonbracketed_mutation_flag = True
@@ -638,3 +642,82 @@ def test_fp8_kv_attn(self, scheme, static_kv_dtype, static_attention_dtype, tmp_
             assert (
                 getattr(attn, "q_scale", None) is not None
             ), f"Missing q_scale in attention for scheme={scheme}, static_attention_dtype={static_attention_dtype}"
+
+    def test_is_autoround_model_free_string_case_true(self):
+        """Test detection when model is string and config has model_free=True."""
+        config = AutoRoundConfig(model_free=True, scheme="MXFP4")
+        model = "/path/to/model"
+        assert _is_autoround_model_free_string_case(model, config) is True
+
+    def test_is_autoround_model_free_string_case_false_not_string(self):
+        """Test detection returns False when model is not a string."""
+        config = AutoRoundConfig(model_free=True, scheme="MXFP4")
+        model = torch.nn.Linear(10, 10)
+        assert _is_autoround_model_free_string_case(model, config) is False
+
+    def test_is_autoround_model_free_string_case_false_no_flag(self):
+        """Test detection returns False when model_free is not set."""
+        config = AutoRoundConfig(scheme="MXFP4")
+        model = "/path/to/model"
+        assert _is_autoround_model_free_string_case(model, config) is False
+
+    def test_is_autoround_model_free_string_case_false_model_free_false(self):
+        """Test detection returns False when model_free is explicitly False."""
+        config = AutoRoundConfig(model_free=False, scheme="MXFP4")
+        model = "/path/to/model"
+        assert _is_autoround_model_free_string_case(model, config) is False
+
+    def test_autoround_model_reference_creation(self):
+        """Test _AutoRoundModelReference wrapper creation."""
+        model_ref = "/path/to/deepseek-v4"
+        config = AutoRoundConfig(model_free=True, scheme="MXFP4")
+        example_inputs = {"input_ids": torch.ones(1, 10, dtype=torch.long)}
+
+        ref = _AutoRoundModelReference(model_reference=model_ref, quant_config=config, example_inputs=example_inputs)
+
+        assert ref.model_reference == model_ref
+        assert ref.quant_config is config
+        assert ref.example_inputs == example_inputs
+        assert ref.is_prepared is True
+
+    def test_prepare_with_string_model_and_model_free_returns_reference(self):
+        """Test that prepare() returns _AutoRoundModelReference when called with string model and model_free=True."""
+        model = "/path/to/model"
+        config = AutoRoundConfig(
+            model_free=True,
+            scheme="MXFP4",
+            ignore_layers="compressor",
+            output_dir="/tmp/test_output",
+        )
+
+        result = prepare(model, config)
+
+        assert isinstance(result, _AutoRoundModelReference)
+        assert result.model_reference == model
+        assert result.quant_config is config
+
+    def test_model_free_with_string_model(self, tmp_path):
+        """Test the model-free flow end to end: prepare() wraps the string model, convert() quantizes it."""
+        model = "facebook/opt-125m"
+        layer_config = {"fc2": {"bits": 4, "data_type": "mx_fp"}}
+        config = AutoRoundConfig(
+            model_free=True,
+            scheme="MXFP8",
+            ignore_layers="self_attn",
+            layer_config=layer_config,
+            export_format="llm_compressor",
+            output_dir=str(tmp_path),  # per-test directory: no shared /tmp state between runs
+        )
+
+        result = prepare(model, config)
+
+        assert isinstance(result, _AutoRoundModelReference)
+        assert result.quant_config.scheme == "MXFP8"
+        assert result.quant_config.ignore_layers == "self_attn"
+        assert result.quant_config.layer_config == layer_config
+        assert result.quant_config.export_format == "llm_compressor"
+
+        result = convert(result)
+        assert not hasattr(result.model.decoder.layers[0].self_attn.k_proj, "quantization_scheme"), "Ignored layers were not preserved during conversion."
+        assert result.model.decoder.layers[0].fc1.quantization_scheme.format.value == 'mxfp8-quantized', "Model conversion did not preserve the quantization scheme format."
+        assert result.model.decoder.layers[0].fc2.quantization_scheme.format.value == 'mxfp4-pack-quantized', "Model conversion did not preserve the quantization scheme format for layer_config."

From 214328a3c40f3ed3d61244d725d51aed9e84d7f8 Mon Sep 17 00:00:00 2001
From: Xin He <xin3.he@intel.com>
Date: Tue, 16 Jun 2026 10:50:07 +0800
Subject: [PATCH 03/12] update run_evalscope

Signed-off-by: Xin He <xin3.he@intel.com>
---
 .../auto_round/deepseekv4/README.md           |  2 +-
 .../auto_round/deepseekv4/run_evalscope.sh    | 56 ++++++++++++++++++-
 2 files changed, 55 insertions(+), 3 deletions(-)

diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/README.md
index 6930104d889..9c11772c83f 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/README.md
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/README.md
@@ -51,7 +51,7 @@ SAFETENSORS_FAST_GPU=1 CUDA_VISIBLE_DEVICES=0,1 vllm serve <model> \
 ```
 
 If model basename is exactly `DeepSeek-V4-Flash` or `DeepSeek-V4-Pro` (without extra suffix),
-`run_evalscope.sh` will also add:
+`run_evalscope.sh` will also automatically add:
 
 ```bash
 --enable-expert-parallel --moe-backend deep_gemm_mega_moe
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_evalscope.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_evalscope.sh
index 7f1f1f97196..bccb7911341 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_evalscope.sh
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_evalscope.sh
@@ -16,15 +16,31 @@ TENSOR_PARALLEL_SIZE=2
 SAFETENSORS_FAST_GPU="1"
 TRUST_REMOTE_CODE="true"
 NO_ENABLE_FLASHINFER_AUTOTUNE="true"
-SKIP_SERVE="false"
+SKIP_SERVE="${SKIP_SERVE:-false}"
+VLLM_PID=""
+LOG_TAIL_PID=""
 
 SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)
 cd "${SCRIPT_DIR}"
 
 cleanup() {
+  if [[ -n "${LOG_TAIL_PID}" ]] && kill -0 "${LOG_TAIL_PID}" 2>/dev/null; then
+    kill "${LOG_TAIL_PID}" 2>/dev/null || true
+  fi
+
   if [[ "${SKIP_SERVE}" == "true" ]]; then
     return
   fi
+
+  if [[ -n "${VLLM_PID}" ]] && kill -0 "${VLLM_PID}" 2>/dev/null; then
+    CHILDREN=$(pgrep -P "${VLLM_PID}" || true)
+    if [[ -n "${CHILDREN}" ]]; then
+      kill -9 ${CHILDREN} 2>/dev/null || true
+    fi
+    kill -9 "${VLLM_PID}" 2>/dev/null || true
+    return
+  fi
+
   # Kill the process listening on the specified port to free GPU.
   VLLM_PIDS=$(ps aux | grep -- "vllm serve" | grep -- "--port[ =]${PORT}" | grep -v grep | awk '{print $2}')
   if [[ -n "${VLLM_PIDS}" ]]; then
@@ -40,6 +56,13 @@ cleanup() {
 
 trap cleanup EXIT
 
+stop_log_tail() {
+  if [[ -n "${LOG_TAIL_PID}" ]] && kill -0 "${LOG_TAIL_PID}" 2>/dev/null; then
+    kill "${LOG_TAIL_PID}" 2>/dev/null || true
+    LOG_TAIL_PID=""
+  fi
+}
+
 while [[ $# -gt 0 ]]; do
   case "$1" in
     --port)
@@ -50,6 +73,8 @@ while [[ $# -gt 0 ]]; do
       TEMPERATURE="$2"; shift 2 ;;
     --skip_serve)
       SKIP_SERVE="true"; shift 1 ;;
+    --skip-serve)
+      SKIP_SERVE="true"; shift 1 ;;
     --tp)
       TENSOR_PARALLEL_SIZE="$2"; shift 2 ;;
     --kv-cache-dtype)
@@ -61,6 +86,8 @@ while [[ $# -gt 0 ]]; do
   esac
 done
 
+SKIP_SERVE="$(echo "${SKIP_SERVE}" | tr '[:upper:]' '[:lower:]')"
+
 API_URL="http://127.0.0.1:${PORT}/v1"
 
 if [[ "${SKIP_SERVE}" != "true" ]]; then
@@ -91,15 +118,40 @@ if [[ "${SKIP_SERVE}" != "true" ]]; then
   VLLM_CMD+=("${EXTRA_ARGS[@]}")
 
   SAFETENSORS_FAST_GPU="${SAFETENSORS_FAST_GPU}" "${VLLM_CMD[@]}" >/tmp/vllm_${PORT}.log 2>&1 &
+  VLLM_PID=$!
   echo "vLLM launched. Log: /tmp/vllm_${PORT}.log"
+  echo "vLLM PID: ${VLLM_PID}"
+  echo "=== vLLM startup log (will stop after API wait ends) ==="
+  tail -n +1 -f "/tmp/vllm_${PORT}.log" &
+  LOG_TAIL_PID=$!
 fi
 
 # Wait until the API is ready
 echo "Waiting for API at ${API_URL} ..."
-until curl -sf "${API_URL}/models" -o /dev/null; do
+for _ in $(seq 1 90); do
+  if curl -sf "${API_URL}/models" -o /dev/null; then
+    break
+  fi
+  if [[ "${SKIP_SERVE}" != "true" ]] && [[ -n "${VLLM_PID}" ]] && ! kill -0 "${VLLM_PID}" 2>/dev/null; then
+    stop_log_tail
+    echo "[$(date '+%Y-%m-%d %H:%M:%S')] vLLM exited before API became ready."
+    echo "----- Last 80 lines of /tmp/vllm_${PORT}.log -----"
+    tail -n 80 "/tmp/vllm_${PORT}.log" || true
+    exit 1
+  fi
   echo "[$(date '+%Y-%m-%d %H:%M:%S')] Port ${PORT} not ready, retrying in 20s..."
   sleep 20
 done
+
+stop_log_tail
+
+if ! curl -sf "${API_URL}/models" -o /dev/null; then
+  echo "[$(date '+%Y-%m-%d %H:%M:%S')] Timeout waiting for API at ${API_URL}."
+  echo "----- Last 80 lines of /tmp/vllm_${PORT}.log -----"
+  tail -n 80 "/tmp/vllm_${PORT}.log" || true
+  exit 1
+fi
+
 echo "[$(date '+%Y-%m-%d %H:%M:%S')] API is ready, starting evaluation."
 
 MODEL_NORMALIZED="${MODEL%/}"

From 09559c90c1d583570d49b4bd5c1a28947ce85aaa Mon Sep 17 00:00:00 2001
From: Xin He <xin3.he@intel.com>
Date: Tue, 16 Jun 2026 10:51:53 +0800
Subject: [PATCH 04/12] update readme

Signed-off-by: Xin He <xin3.he@intel.com>
---
 examples/README.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/examples/README.md b/examples/README.md
index 26e4a5792d7..9d590b1b7f5 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -15,6 +15,12 @@ Intel® Neural Compressor validated examples with multiple compression technique
   </tr>
 </thead>
 <tbody>
+<tr>
+    <td>deepseek-ai/DeepSeek-V4</td>
+    <td>Natural Language Processing</td>
+    <td>Quantization (MXFP8/MXFP4)</td>
+    <td><a href="./pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4">link</a></td>
+</tr>
 <tr>
     <td>deepseek-ai/DeepSeek-R1</td>
     <td>Natural Language Processing</td>

From f156b16604960259edd517e6a355ca8edabb2731 Mon Sep 17 00:00:00 2001
From: changwangss <chang1.wang@intel.com>
Date: Wed, 17 Jun 2026 08:07:48 +0000
Subject: [PATCH 05/12] add lm_eval ruler_qa_squad

Signed-off-by: changwangss <chang1.wang@intel.com>
---
 .../auto_round/deepseekv4/run_evalscope.sh    | 27 ++++++++++++++++---
 1 file changed, 24 insertions(+), 3 deletions(-)

diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_evalscope.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_evalscope.sh
index bccb7911341..dcfb75466b7 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_evalscope.sh
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_evalscope.sh
@@ -109,6 +109,9 @@ if [[ "${SKIP_SERVE}" != "true" ]]; then
     --attention_config.use_fp4_indexer_cache=True
     --port "${PORT}"
   )
+  if [[ "${MODEL_NAME}" == "DeepSeek-V4-Pro" ]]; then
+    VLLM_CMD+=(--max-model-len 1048576)
+  fi
   if [[ "${TRUST_REMOTE_CODE}" == "true" ]]; then
     VLLM_CMD+=(--trust-remote-code)
   fi
@@ -168,7 +171,7 @@ echo "" | tee -a "$OUTPUT_FILE"
   
 
 echo "" | tee -a "$OUTPUT_FILE"
-echo "=== [1/3] aime26 (n=10) ===" | tee -a "$OUTPUT_FILE"
+echo "=== [1/4] aime26 (n=10) ===" | tee -a "$OUTPUT_FILE"
 evalscope eval \
   --model "$MODEL" \
   --eval-type openai_api \
@@ -179,7 +182,7 @@ evalscope eval \
   --api-url "$API_URL" 2>&1 | tee -a "$OUTPUT_FILE"
 echo "" | tee -a "$OUTPUT_FILE"
 
-echo "=== [2/3] gpqa_diamond (n=5) ===" | tee -a "$OUTPUT_FILE"
+echo "=== [2/4] gpqa_diamond (n=5) ===" | tee -a "$OUTPUT_FILE"
 evalscope eval \
   --model "$MODEL" \
   --eval-type openai_api \
@@ -189,7 +192,7 @@ evalscope eval \
   --eval-batch-size 10  --timeout 3000 \
   --api-url "$API_URL" 2>&1 | tee -a "$OUTPUT_FILE"
 
-echo "=== [3/3] piqa hellaswag gsm8k mmlu_pro math_500 mmlu ===" | tee -a "$OUTPUT_FILE"
+echo "=== [3/4] piqa hellaswag gsm8k mmlu_pro math_500 mmlu ===" | tee -a "$OUTPUT_FILE"
 evalscope eval \
   --model "$MODEL" \
   --eval-type openai_api \
@@ -198,6 +201,24 @@ evalscope eval \
   --eval-batch-size 10 --timeout 3000 \
   --api-url "$API_URL" 2>&1 | tee -a "$OUTPUT_FILE"
 
+echo "=== [4/4] ruler_qa_squad (lm_eval, 1M) ===" | tee -a "$OUTPUT_FILE"
+if [[ "${MODEL_NAME}" == "DeepSeek-V4-Pro" ]]; then
+  LMEVAL_OUTPUT_DIR="${LOG_DIR}/lm_eval_ruler_1M_qa"
+  mkdir -p "${LMEVAL_OUTPUT_DIR}"
+  LMEVAL_METADATA=$(printf '{"max_seq_lengths":[1000000],"pretrained":"%s/","use_fast":false}' "${MODEL_NORMALIZED}")
+  lm_eval \
+    --model local-completions \
+    --tasks ruler_qa_squad \
+    --model_args "model=${MODEL_NORMALIZED},base_url=${API_URL}/completions,num_concurrent=1,max_retries=3,max_length=1048576" \
+    --gen_kwargs "temperature=${TEMPERATURE},do_sample=False,max_tokens=128" \
+    --metadata "${LMEVAL_METADATA}" \
+    --batch_size 1 \
+    --log_samples \
+    --output_path "${LMEVAL_OUTPUT_DIR}" 2>&1 | tee -a "$OUTPUT_FILE"
+else
+  echo "Skip ruler_qa_squad: only DeepSeek-V4-Pro is supported for this test." | tee -a "$OUTPUT_FILE"
+fi
+
 
 echo "" | tee -a "$OUTPUT_FILE"
 echo "=== Evaluation finished at $(date) ===" | tee -a "$OUTPUT_FILE"

From 602c4a0bc37bb6a910ae7e24f912a2a09c741e35 Mon Sep 17 00:00:00 2001
From: Xin He <xin3.he@intel.com>
Date: Wed, 17 Jun 2026 16:25:42 +0800
Subject: [PATCH 06/12] add requirement

Signed-off-by: Xin He <xin3.he@intel.com>
---
 .../quantization/auto_round/deepseekv4/README.md      | 11 +++++++++++
 .../auto_round/deepseekv4/run_evalscope.sh            |  4 ++--
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/README.md
index 9c11772c83f..d0002c6fec2 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/README.md
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/README.md
@@ -17,6 +17,17 @@ model = prepare(model, config)
 model = convert(model)
 ```
 
+## Requirements
+
+Install dependencies before running quantization or evaluation:
+
+```bash
+uv pip install -U pip
+uv pip install -U "git+https://github.com/intel/auto-round.git@main"
+uv pip install -U evalscope vllm lm_eval transformers datasets compressed-tensors
+bash <(curl -fsSL https://raw.githubusercontent.com/vllm-project/vllm/main/tools/install_deepgemm.sh)
+```
+
 ## Quick Start
 
 ```bash
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_evalscope.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_evalscope.sh
index dcfb75466b7..0736d4ff2cb 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_evalscope.sh
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_evalscope.sh
@@ -109,7 +109,7 @@ if [[ "${SKIP_SERVE}" != "true" ]]; then
     --attention_config.use_fp4_indexer_cache=True
     --port "${PORT}"
   )
-  if [[ "${MODEL_NAME}" == "DeepSeek-V4-Pro" ]]; then
+  if [[ "${MODEL_NAME}" == *"DeepSeek-V4-Pro"* ]]; then
     VLLM_CMD+=(--max-model-len 1048576)
   fi
   if [[ "${TRUST_REMOTE_CODE}" == "true" ]]; then
@@ -202,7 +202,7 @@ evalscope eval \
   --api-url "$API_URL" 2>&1 | tee -a "$OUTPUT_FILE"
 
 echo "=== [4/4] ruler_qa_squad (lm_eval, 1M) ===" | tee -a "$OUTPUT_FILE"
-if [[ "${MODEL_NAME}" == "DeepSeek-V4-Pro" ]]; then
+if [[ "${MODEL_NAME}" == *"DeepSeek-V4-Pro"* ]]; then
   LMEVAL_OUTPUT_DIR="${LOG_DIR}/lm_eval_ruler_1M_qa"
   mkdir -p "${LMEVAL_OUTPUT_DIR}"
   LMEVAL_METADATA=$(printf '{"max_seq_lengths":[1000000],"pretrained":"%s/","use_fast":false}' "${MODEL_NORMALIZED}")

From a51e4115e150ac7a24ba851e9326fcd3887cb7ca Mon Sep 17 00:00:00 2001
From: Xin He <xin3.he@intel.com>
Date: Thu, 18 Jun 2026 16:14:18 +0800
Subject: [PATCH 07/12] Update README and run_evalscope.sh to enhance task
 handling and installation instructions

Signed-off-by: Xin He <xin3.he@intel.com>
---
 .../auto_round/deepseekv4/README.md           |   5 +-
 .../auto_round/deepseekv4/run_evalscope.sh    | 206 ++++++++++++++----
 2 files changed, 162 insertions(+), 49 deletions(-)

diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/README.md
index d0002c6fec2..5e9ca74dae2 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/README.md
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/README.md
@@ -24,8 +24,10 @@ Install dependencies before running quantization or evaluation:
 ```bash
 uv pip install -U pip
 uv pip install -U "git+https://github.com/intel/auto-round.git@main"
-uv pip install -U evalscope vllm lm_eval transformers datasets compressed-tensors
+uv pip install -U evalscope lm_eval transformers datasets
+uv pip install compressed-tensors --no-deps
 bash <(curl -fsSL https://raw.githubusercontent.com/vllm-project/vllm/main/tools/install_deepgemm.sh)
+VLLM_USE_PRECOMPILED=1 pip install git+https://github.com/xin3he/vllm-fork.git@support_deepseekv4_mxfp
 ```
 
 ## Quick Start
@@ -45,6 +47,7 @@ CUDA_VISIBLE_DEVICES=0,1 bash run_evalscope.sh \
   --model /workspace/models/deepseek-ai/DeepSeek-V4-Flash-MXFP4-Mixed \
   --tp 2 \
   --port 8009 \
+  --tasks piqa,hellaswag,gsm8k,mmlu_pro,math_500,mmlu,aime26,gpqa_diamond,ruler_qa_squad \
   --temp 1.0
 ```
 
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_evalscope.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_evalscope.sh
index 0736d4ff2cb..6c19df57576 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_evalscope.sh
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_evalscope.sh
@@ -3,7 +3,7 @@
 set -euo pipefail
 
 # Usage:
-#   bash run_evalscope.sh --model MODEL_PATH [--port PORT] [--temp TEMPERATURE]
+#   bash run_evalscope.sh --model MODEL_PATH [--port PORT] [--temp TEMPERATURE] [--tasks TASK1,TASK2]
 #
 # This script can start vLLM serve and then run evalscope automatically.
 
@@ -16,6 +16,7 @@ TENSOR_PARALLEL_SIZE=2
 SAFETENSORS_FAST_GPU="1"
 TRUST_REMOTE_CODE="true"
 NO_ENABLE_FLASHINFER_AUTOTUNE="true"
+TASKS=""
 SKIP_SERVE="${SKIP_SERVE:-false}"
 VLLM_PID=""
 LOG_TAIL_PID=""
@@ -63,6 +64,30 @@ stop_log_tail() {
   fi
 }
 
+trim_task_name() {
+  local task_name="$1"
+  task_name="${task_name#${task_name%%[![:space:]]*}}"
+  task_name="${task_name%${task_name##*[![:space:]]}}"
+  echo "${task_name}"
+}
+
+task_in_list() {
+  local target_task="$1"
+  shift
+  local task_name
+  for task_name in "$@"; do
+    if [[ "${task_name}" == "${target_task}" ]]; then
+      return 0
+    fi
+  done
+  return 1
+}
+
+print_section_header() {
+  echo "=== [${STEP_INDEX}/${TOTAL_STEPS}] $1 ===" | tee -a "$OUTPUT_FILE"
+  STEP_INDEX=$((STEP_INDEX + 1))
+}
+
 while [[ $# -gt 0 ]]; do
   case "$1" in
     --port)
@@ -71,6 +96,8 @@ while [[ $# -gt 0 ]]; do
       MODEL="$2"; shift 2 ;;
     --temp)
       TEMPERATURE="$2"; shift 2 ;;
+    --tasks)
+      TASKS="$2"; shift 2 ;;
     --skip_serve)
       SKIP_SERVE="true"; shift 1 ;;
     --skip-serve)
@@ -156,67 +183,150 @@ if ! curl -sf "${API_URL}/models" -o /dev/null; then
 fi
 
 echo "[$(date '+%Y-%m-%d %H:%M:%S')] API is ready, starting evaluation."
-
 MODEL_NORMALIZED="${MODEL%/}"
 MODEL_NAME="${MODEL_NORMALIZED##*/}"
 LOG_DIR="logs/${MODEL_NAME}"
 mkdir -p "$LOG_DIR"
 OUTPUT_FILE="${LOG_DIR}/eval_results_$(date +%Y%m%d_%H%M%S)_port${PORT}_temp${TEMPERATURE}.log"
 
+DEFAULT_STANDARD_TASKS=(piqa hellaswag gsm8k mmlu_pro math_500 mmlu)
+SUPPORTED_TASKS=(aime26 gpqa_diamond ruler_qa_squad "${DEFAULT_STANDARD_TASKS[@]}")
+SELECTED_STANDARD_TASKS=()
+RUN_AIME26="true"
+RUN_GPQA_DIAMOND="true"
+RUN_STANDARD_TASKS="true"
+RUN_RULER_QA_SQUAD="true"
+
+if [[ -n "${TASKS}" ]]; then
+  RUN_AIME26="false"
+  RUN_GPQA_DIAMOND="false"
+  RUN_STANDARD_TASKS="false"
+  RUN_RULER_QA_SQUAD="false"
+
+  IFS=',' read -r -a REQUESTED_TASKS <<< "${TASKS}"
+  for raw_task in "${REQUESTED_TASKS[@]}"; do
+    task_name="$(trim_task_name "${raw_task}")"
+    if [[ -z "${task_name}" ]]; then
+      continue
+    fi
+    if ! task_in_list "${task_name}" "${SUPPORTED_TASKS[@]}"; then
+      echo "Unsupported task: ${task_name}"
+      echo "Supported tasks: ${SUPPORTED_TASKS[*]}"
+      exit 1
+    fi
+
+    case "${task_name}" in
+      aime26)
+        RUN_AIME26="true"
+        ;;
+      gpqa_diamond)
+        RUN_GPQA_DIAMOND="true"
+        ;;
+      ruler_qa_squad)
+        RUN_RULER_QA_SQUAD="true"
+        ;;
+      *)
+        if ! task_in_list "${task_name}" "${SELECTED_STANDARD_TASKS[@]}"; then
+          SELECTED_STANDARD_TASKS+=("${task_name}")
+          RUN_STANDARD_TASKS="true"
+        fi
+        ;;
+    esac
+  done
+
+  if [[ "${RUN_AIME26}" != "true" ]] && [[ "${RUN_GPQA_DIAMOND}" != "true" ]] \
+    && [[ "${RUN_STANDARD_TASKS}" != "true" ]] && [[ "${RUN_RULER_QA_SQUAD}" != "true" ]]; then
+    echo "No valid tasks selected from --tasks '${TASKS}'."
+    exit 1
+  fi
+else
+  SELECTED_STANDARD_TASKS=("${DEFAULT_STANDARD_TASKS[@]}")
+fi
+
+TOTAL_STEPS=0
+if [[ "${RUN_AIME26}" == "true" ]]; then
+  TOTAL_STEPS=$((TOTAL_STEPS + 1))
+fi
+if [[ "${RUN_GPQA_DIAMOND}" == "true" ]]; then
+  TOTAL_STEPS=$((TOTAL_STEPS + 1))
+fi
+if [[ "${RUN_STANDARD_TASKS}" == "true" ]]; then
+  TOTAL_STEPS=$((TOTAL_STEPS + 1))
+fi
+if [[ "${RUN_RULER_QA_SQUAD}" == "true" ]]; then
+  TOTAL_STEPS=$((TOTAL_STEPS + 1))
+fi
+STEP_INDEX=1
+
 echo "=== Evaluation started at $(date) ===" | tee "$OUTPUT_FILE"
 echo "Model: $MODEL" | tee -a "$OUTPUT_FILE"
 echo "API URL: $API_URL" | tee -a "$OUTPUT_FILE"
 echo "Temperature: $TEMPERATURE" | tee -a "$OUTPUT_FILE"
+if [[ -n "${TASKS}" ]]; then
+  echo "Tasks: ${TASKS}" | tee -a "$OUTPUT_FILE"
+else
+  echo "Tasks: all default tasks" | tee -a "$OUTPUT_FILE"
+fi
 echo "" | tee -a "$OUTPUT_FILE"
   
 
-echo "" | tee -a "$OUTPUT_FILE"
-echo "=== [1/4] aime26 (n=10) ===" | tee -a "$OUTPUT_FILE"
-evalscope eval \
-  --model "$MODEL" \
-  --eval-type openai_api \
-  --api-key EMPTY \
-  --datasets aime26 \
-  --generation-config "{\"temperature\": ${TEMPERATURE}, \"n\": 10}" \
-  --eval-batch-size 10  --timeout 3000 \
-  --api-url "$API_URL" 2>&1 | tee -a "$OUTPUT_FILE"
-echo "" | tee -a "$OUTPUT_FILE"
+if [[ "${RUN_AIME26}" == "true" ]]; then
+  echo "" | tee -a "$OUTPUT_FILE"
+  print_section_header "aime26 (n=10)"
+  evalscope eval \
+    --model "$MODEL" \
+    --eval-type openai_api \
+    --api-key EMPTY \
+    --datasets aime26 \
+    --generation-config "{\"temperature\": ${TEMPERATURE}, \"n\": 10}" \
+    --eval-batch-size 10 --timeout 3000 \
+    --api-url "$API_URL" 2>&1 | tee -a "$OUTPUT_FILE"
+fi
 
-echo "=== [2/4] gpqa_diamond (n=5) ===" | tee -a "$OUTPUT_FILE"
-evalscope eval \
-  --model "$MODEL" \
-  --eval-type openai_api \
-  --api-key EMPTY \
-  --datasets gpqa_diamond \
-  --generation-config "{\"temperature\": ${TEMPERATURE}, \"n\": 5}" \
-  --eval-batch-size 10  --timeout 3000 \
-  --api-url "$API_URL" 2>&1 | tee -a "$OUTPUT_FILE"
-
-echo "=== [3/4] piqa hellaswag gsm8k mmlu_pro math_500 mmlu ===" | tee -a "$OUTPUT_FILE"
-evalscope eval \
-  --model "$MODEL" \
-  --eval-type openai_api \
-  --api-key EMPTY \
-  --datasets piqa hellaswag gsm8k mmlu_pro math_500 mmlu \
-  --eval-batch-size 10 --timeout 3000 \
-  --api-url "$API_URL" 2>&1 | tee -a "$OUTPUT_FILE"
-
-echo "=== [4/4] ruler_qa_squad (lm_eval, 1M) ===" | tee -a "$OUTPUT_FILE"
-if [[ "${MODEL_NAME}" == *"DeepSeek-V4-Pro"* ]]; then
-  LMEVAL_OUTPUT_DIR="${LOG_DIR}/lm_eval_ruler_1M_qa"
-  mkdir -p "${LMEVAL_OUTPUT_DIR}"
-  LMEVAL_METADATA=$(printf '{"max_seq_lengths":[1000000],"pretrained":"%s/","use_fast":false}' "${MODEL_NORMALIZED}")
-  lm_eval \
-    --model local-completions \
-    --tasks ruler_qa_squad \
-    --model_args "model=${MODEL_NORMALIZED},base_url=${API_URL}/completions,num_concurrent=1,max_retries=3,max_length=1048576" \
-    --gen_kwargs "temperature=${TEMPERATURE},do_sample=False,max_tokens=128" \
-    --metadata "${LMEVAL_METADATA}" \
-    --batch_size 1 \
-    --log_samples \
-    --output_path "${LMEVAL_OUTPUT_DIR}" 2>&1 | tee -a "$OUTPUT_FILE"
-else
-  echo "Skip ruler_qa_squad: only DeepSeek-V4-Pro is supported for this test." | tee -a "$OUTPUT_FILE"
+if [[ "${RUN_GPQA_DIAMOND}" == "true" ]]; then
+  echo "" | tee -a "$OUTPUT_FILE"
+  print_section_header "gpqa_diamond (n=5)"
+  evalscope eval \
+    --model "$MODEL" \
+    --eval-type openai_api \
+    --api-key EMPTY \
+    --datasets gpqa_diamond \
+    --generation-config "{\"temperature\": ${TEMPERATURE}, \"n\": 5}" \
+    --eval-batch-size 10 --timeout 3000 \
+    --api-url "$API_URL" 2>&1 | tee -a "$OUTPUT_FILE"
+fi
+
+if [[ "${RUN_STANDARD_TASKS}" == "true" ]]; then
+  echo "" | tee -a "$OUTPUT_FILE"
+  print_section_header "${SELECTED_STANDARD_TASKS[*]}"
+  evalscope eval \
+    --model "$MODEL" \
+    --eval-type openai_api \
+    --api-key EMPTY \
+    --datasets "${SELECTED_STANDARD_TASKS[@]}" \
+    --eval-batch-size 10 --timeout 3000 \
+    --api-url "$API_URL" 2>&1 | tee -a "$OUTPUT_FILE"
+fi
+
+if [[ "${RUN_RULER_QA_SQUAD}" == "true" ]]; then
+  echo "" | tee -a "$OUTPUT_FILE"
+  print_section_header "ruler_qa_squad (lm_eval, 1M)"
+  if [[ "${MODEL_NAME}" == *"DeepSeek-V4-Pro"* ]]; then
+    LMEVAL_OUTPUT_DIR="${LOG_DIR}/lm_eval_ruler_1M_qa"
+    mkdir -p "${LMEVAL_OUTPUT_DIR}"
+    LMEVAL_METADATA=$(printf '{"max_seq_lengths":[1000000],"pretrained":"%s/","use_fast":false}' "${MODEL_NORMALIZED}")
+    lm_eval \
+      --model local-completions \
+      --tasks ruler_qa_squad \
+      --model_args "model=${MODEL_NORMALIZED},base_url=${API_URL}/completions,num_concurrent=1,max_retries=3,max_length=1048576" \
+      --gen_kwargs "temperature=${TEMPERATURE},do_sample=False,max_tokens=128" \
+      --metadata "${LMEVAL_METADATA}" \
+      --batch_size 1 \
+      --log_samples \
+      --output_path "${LMEVAL_OUTPUT_DIR}" 2>&1 | tee -a "$OUTPUT_FILE"
+  else
+    echo "Skip ruler_qa_squad: only DeepSeek-V4-Pro is supported for this test." | tee -a "$OUTPUT_FILE"
+  fi
 fi
 
 

From 88cfd46a816dec4829fa56771a52e3b454160399 Mon Sep 17 00:00:00 2001
From: chensuyue <suyue.chen@intel.com>
Date: Fri, 19 Jun 2026 17:16:30 +0800
Subject: [PATCH 08/12] add setup.sh

Signed-off-by: chensuyue <suyue.chen@intel.com>
---
 .../quantization/auto_round/deepseekv4/setup.sh           | 8 ++++++++
 1 file changed, 8 insertions(+)
 create mode 100644 examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/setup.sh

diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/setup.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/setup.sh
new file mode 100644
index 00000000000..63b4d0ab6e6
--- /dev/null
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/setup.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+set -e
+
+uv pip install -U pip
+uv pip install -U evalscope lm_eval transformers datasets
+uv pip install compressed-tensors --no-deps
+bash <(curl -fsSL https://raw.githubusercontent.com/vllm-project/vllm/main/tools/install_deepgemm.sh)
+VLLM_USE_PRECOMPILED=1 pip install git+https://github.com/xin3he/vllm-fork.git@support_deepseekv4_mxfp
\ No newline at end of file

From 46456a723c704a4186ae747f0f91676f25ca54f3 Mon Sep 17 00:00:00 2001
From: chensuyue <suyue.chen@intel.com>
Date: Fri, 19 Jun 2026 18:15:04 +0800
Subject: [PATCH 09/12] update install

Signed-off-by: chensuyue <suyue.chen@intel.com>
---
 .../quantization/auto_round/deepseekv4/README.md                | 2 +-
 .../quantization/auto_round/deepseekv4/setup.sh                 | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/README.md
index 5e9ca74dae2..03b21cf3023 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/README.md
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/README.md
@@ -27,7 +27,7 @@ uv pip install -U "git+https://github.com/intel/auto-round.git@main"
 uv pip install -U evalscope lm_eval transformers datasets
 uv pip install compressed-tensors --no-deps
 bash <(curl -fsSL https://raw.githubusercontent.com/vllm-project/vllm/main/tools/install_deepgemm.sh)
-VLLM_USE_PRECOMPILED=1 pip install git+https://github.com/xin3he/vllm-fork.git@support_deepseekv4_mxfp
+VLLM_USE_PRECOMPILED=1 uv pip install git+https://github.com/xin3he/vllm-fork.git@support_deepseekv4_mxfp
 ```
 
 ## Quick Start
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/setup.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/setup.sh
index 63b4d0ab6e6..ddfe2ddebfc 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/setup.sh
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/setup.sh
@@ -5,4 +5,4 @@ uv pip install -U pip
 uv pip install -U evalscope lm_eval transformers datasets
 uv pip install compressed-tensors --no-deps
 bash <(curl -fsSL https://raw.githubusercontent.com/vllm-project/vllm/main/tools/install_deepgemm.sh)
-VLLM_USE_PRECOMPILED=1 pip install git+https://github.com/xin3he/vllm-fork.git@support_deepseekv4_mxfp
\ No newline at end of file
+VLLM_USE_PRECOMPILED=1 uv pip install git+https://github.com/xin3he/vllm-fork.git@support_deepseekv4_mxfp
\ No newline at end of file

From 8c98036639b35a75cd65ef500014819a5852d7a5 Mon Sep 17 00:00:00 2001
From: chensuyue <suyue.chen@intel.com>
Date: Mon, 22 Jun 2026 15:25:54 +0800
Subject: [PATCH 10/12] update setup env

Signed-off-by: chensuyue <suyue.chen@intel.com>
---
 .../quantization/auto_round/deepseekv4/setup.sh               | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/setup.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/setup.sh
index ddfe2ddebfc..19685ec1176 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/setup.sh
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/setup.sh
@@ -1,8 +1,8 @@
 #!/bin/bash
 set -e
 
-uv pip install -U pip
+uv pip install -U pip setuptools_rust setuptools_scm
 uv pip install -U evalscope lm_eval transformers datasets
 uv pip install compressed-tensors --no-deps
 bash <(curl -fsSL https://raw.githubusercontent.com/vllm-project/vllm/main/tools/install_deepgemm.sh)
-VLLM_USE_PRECOMPILED=1 uv pip install git+https://github.com/xin3he/vllm-fork.git@support_deepseekv4_mxfp
\ No newline at end of file
+VLLM_USE_PRECOMPILED=1 uv pip install git+https://github.com/xin3he/vllm-fork.git@support_deepseekv4_mxfp --no-build-isolation
\ No newline at end of file

From cd86e70a14c6c7f990501677e60c4a5b01cfe3e9 Mon Sep 17 00:00:00 2001
From: chensuyue <suyue.chen@intel.com>
Date: Mon, 22 Jun 2026 22:21:35 +0800
Subject: [PATCH 11/12] update env setup

Signed-off-by: chensuyue <suyue.chen@intel.com>
---
 .../quantization/auto_round/deepseekv4/setup.sh                  | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/setup.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/setup.sh
index 19685ec1176..0b1215ebfba 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/setup.sh
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/setup.sh
@@ -3,6 +3,7 @@ set -e
 
 uv pip install -U pip setuptools_rust setuptools_scm
 uv pip install -U evalscope lm_eval transformers datasets
+uv pip install git+https://github.com/intel/auto-round.git@main
 uv pip install compressed-tensors --no-deps
 bash <(curl -fsSL https://raw.githubusercontent.com/vllm-project/vllm/main/tools/install_deepgemm.sh)
 VLLM_USE_PRECOMPILED=1 uv pip install git+https://github.com/xin3he/vllm-fork.git@support_deepseekv4_mxfp --no-build-isolation
\ No newline at end of file

From 90f9379c34bfd3caf49fbe795eee8e8bc3eb26be Mon Sep 17 00:00:00 2001
From: Xin He <xin3.he@intel.com>
Date: Wed, 24 Jun 2026 07:30:24 +0000
Subject: [PATCH 12/12] update requirement

Signed-off-by: Xin He <xin3.he@intel.com>
---
 .../quantization/auto_round/deepseekv4/README.md               | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/README.md
index 03b21cf3023..5ab41000942 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/README.md
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/README.md
@@ -27,7 +27,8 @@ uv pip install -U "git+https://github.com/intel/auto-round.git@main"
 uv pip install -U evalscope lm_eval transformers datasets
 uv pip install compressed-tensors --no-deps
 bash <(curl -fsSL https://raw.githubusercontent.com/vllm-project/vllm/main/tools/install_deepgemm.sh)
-VLLM_USE_PRECOMPILED=1 uv pip install git+https://github.com/xin3he/vllm-fork.git@support_deepseekv4_mxfp
+uv pip install setuptools_rust setuptools_scm
+VLLM_USE_PRECOMPILED=1 uv pip install git+https://github.com/xin3he/vllm-fork.git@support_deepseekv4_mxfp --no-build-isolation
 ```
 
 ## Quick Start