diff --git a/examples/README.md b/examples/README.md
index 26e4a5792d7..9d590b1b7f5 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -15,6 +15,12 @@ Intel® Neural Compressor validated examples with multiple compression technique
   </tr>
 </thead>
 <tbody>
+<tr>
+    <td>deepseek-ai/DeepSeek-V4</td>
+    <td>Natural Language Processing</td>
+    <td>Quantization (MXFP8/MXFP4)</td>
+    <td><a href="./pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4">link</a></td>
+</tr>
 <tr>
     <td>deepseek-ai/DeepSeek-R1</td>
     <td>Natural Language Processing</td>
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/README.md
new file mode 100644
index 00000000000..5ab41000942
--- /dev/null
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/README.md
@@ -0,0 +1,112 @@
+# DeepSeek V4 AutoRound (INC prepare/convert)
+
+This example demonstrates model-free quantization through the INC API:
+
+```python
+from neural_compressor.torch.quantization import AutoRoundConfig, prepare, convert
+
+config = AutoRoundConfig(
+    model_free=True,
+    scheme="MXFP4",
+    ignore_layers="compressor,indexer.weights_proj",
+    export_format="llm_compressor",
+    output_dir="/path/to/output",
+)
+model = "/path/or/hf_model_name"
+model = prepare(model, config)
+model = convert(model)
+```
+
+## Requirements
+
+Install dependencies before running quantization or evaluation:
+
+```bash
+uv pip install -U pip
+uv pip install -U "git+https://github.com/intel/auto-round.git@main"
+uv pip install -U evalscope lm_eval transformers datasets
+uv pip install compressed-tensors --no-deps
+bash <(curl -fsSL https://raw.githubusercontent.com/vllm-project/vllm/main/tools/install_deepgemm.sh)
+uv pip install setuptools_rust setuptools_scm
+VLLM_USE_PRECOMPILED=1 uv pip install git+https://github.com/xin3he/vllm-fork.git@support_deepseekv4_mxfp --no-build-isolation
+```
+
+## Quick Start
+
+```bash
+cd examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4
+bash run_quant.sh \
+  --dtype=mxfp4_mixed \
+  --input_model=/workspace/models/deepseek-ai/DeepSeek-V4-Flash \
+  --output_model=/workspace/models/deepseek-ai/DeepSeek-V4-Flash-MXFP4-Mixed
+```
+
+Then run serving + evaluation in one command:
+
+```bash
+CUDA_VISIBLE_DEVICES=0,1 bash run_evalscope.sh \
+  --model /workspace/models/deepseek-ai/DeepSeek-V4-Flash-MXFP4-Mixed \
+  --tp 2 \
+  --port 8009 \
+  --tasks piqa,hellaswag,gsm8k,mmlu_pro,math_500,mmlu,aime26,gpqa_diamond,ruler_qa_squad \
+  --temp 1.0
+```
+
+Equivalent vLLM defaults inside `run_evalscope.sh`:
+
+```bash
+SAFETENSORS_FAST_GPU=1 CUDA_VISIBLE_DEVICES=0,1 vllm serve <model> \
+  --trust-remote-code \
+  --kv-cache-dtype fp8 \
+  --block-size 256 \
+  --tensor-parallel-size 2 \
+  --attention_config.use_fp4_indexer_cache=True \
+  --port 8009 \
+  --no-enable-flashinfer-autotune
+```
+
+If the model basename is exactly `DeepSeek-V4-Flash` or `DeepSeek-V4-Pro` (without any extra suffix),
+`run_evalscope.sh` also adds the following flags automatically:
+
+```bash
+--enable-expert-parallel --moe-backend deep_gemm_mega_moe
+```
+
+Mixed preset example:
+
+```bash
+bash run_quant.sh \
+  --dtype=mxfp4_mixed \
+  --input_model=/workspace/models/deepseek-ai/DeepSeek-V4-Flash \
+  --output_model=/workspace/models/deepseek-ai/DeepSeek-V4-Flash-MXFP4-Mixed
+```
+
+## CLI Arguments
+
+- `--dtype`: quantization preset.
+  - `mxfp4`: `scheme=MXFP4`
+  - `mxfp4_mixed`: `scheme=MXFP8` + `layer_config={"ffn.experts": {"bits": 4, "data_type": "mx_fp"}}`
+  - `mxfp8`: `scheme=MXFP8`
+  - `w4a16`: `scheme=W4A16` + `layer_config={"wo_a": {"bits": 16}}`
+- `--input_model`: HF model name or local model path.
+- `--output_model`: output directory.
+- `--format`: `auto_round` or `llm_compressor` (default: `llm_compressor`).
+- `--ignore_layers`: comma-separated layer patterns (default: `compressor,indexer.weights_proj`).
+
+`run_evalscope.sh` arguments:
+
+- `--model`: model path for vLLM and evalscope.
+- `--port`: vLLM API port (default: `8009`).
+- `--temp`: generation temperature used by evalscope (default: `0`).
+- `--skip_serve`: skip starting vLLM (use existing endpoint on the same `--port`).
+- `--tp`: tensor parallel size for vLLM (default: `2`).
+- `--kv-cache-dtype`: kv cache dtype for vLLM (default: `fp8`).
+- `--block-size`: vLLM block size (default: `256`).
+
+## Notes
+
+- This flow is enabled only when:
+  - `config` is `AutoRoundConfig`
+  - `config.model_free=True`
+  - `model` passed to `prepare/convert` is a `str` (model path or model name)
+- The example uses `reloading=False` by default and saves quantized artifacts to `--output_model`.
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/quantize.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/quantize.py
new file mode 100644
index 00000000000..d64fd3a480d
--- /dev/null
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/quantize.py
@@ -0,0 +1,116 @@
+# Copyright (c) 2026 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import logging
+
+from neural_compressor.torch.quantization import AutoRoundConfig, convert, prepare
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO)
+
+
+_PRESET_CONFIG = {
+    "mxfp4": {
+        "scheme": "MXFP4",
+        "layer_config": None,
+    },
+    # MXFP8 scheme; the "ffn.experts" entry overrides bits/data_type to 4-bit mx_fp.
+    "mxfp4_mixed": {
+        "scheme": "MXFP8",
+        "layer_config": {"ffn.experts": {"bits": 4, "data_type": "mx_fp"}},
+    },
+    "mxfp8": {
+        "scheme": "MXFP8",
+        "layer_config": None,
+    },
+    "w4a16": {
+        "scheme": "W4A16",
+        "layer_config": {"wo_a": {"bits": 16}},  # the "wo_a" entry is set to 16 bits
+    },
+}
+
+
+def build_config(args: argparse.Namespace) -> AutoRoundConfig:
+    dtype_key = args.dtype.lower()
+    if dtype_key not in _PRESET_CONFIG:
+        raise ValueError(f"Unsupported dtype: {args.dtype}. Supported: {', '.join(_PRESET_CONFIG.keys())}")
+
+    preset = _PRESET_CONFIG[dtype_key]
+    layer_config = preset["layer_config"]
+    if args.disable_preset_layer_config:
+        layer_config = None
+
+    return AutoRoundConfig(
+        model_free=True,
+        scheme=preset["scheme"],
+        ignore_layers=args.ignore_layers,
+        layer_config=layer_config,
+        export_format=args.format,
+        output_dir=args.output_model,
+        reloading=False,
+    )
+
+
+def main() -> None:
+    """Parse the CLI arguments and run prepare/convert on the model reference (a name or path string)."""
+    parser = argparse.ArgumentParser(description="DeepSeek V4 model-free quantization via INC AutoRound prepare/convert.")
+    parser.add_argument(
+        "--dtype",
+        type=str,
+        required=True,
+        choices=sorted(_PRESET_CONFIG.keys()),
+        help="Quantization preset. e.g. mxfp4 or mxfp4_mixed",
+    )
+    parser.add_argument(
+        "--input_model",
+        type=str,
+        required=True,
+        help="Model name or local path.",
+    )
+    parser.add_argument(
+        "--output_model",
+        type=str,
+        required=True,
+        help="Output directory for quantized model.",
+    )
+    parser.add_argument(
+        "--ignore_layers",
+        type=str,
+        default="compressor,indexer.weights_proj",
+        help="Comma-separated layer name patterns to skip.",
+    )
+    parser.add_argument(
+        "--format",
+        type=str,
+        default="llm_compressor",
+        choices=["auto_round", "llm_compressor"],
+        help="Export format.",
+    )
+    parser.add_argument(
+        "--disable_preset_layer_config",
+        action="store_true",
+        help="Disable preset layer_config for the selected dtype.",
+    )
+    args = parser.parse_args()
+    quant_config = build_config(args)
+
+    model = args.input_model
+    model = prepare(model, quant_config)
+    _ = convert(model)
+    logger.info("Quantized model saved to %s", args.output_model)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_evalscope.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_evalscope.sh
new file mode 100644
index 00000000000..6c19df57576
--- /dev/null
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_evalscope.sh
@@ -0,0 +1,354 @@
+#!/bin/bash
+
+set -euo pipefail
+
+# Usage:
+#   bash run_evalscope.sh --model MODEL_PATH [--port PORT] [--temp TEMPERATURE] [--tasks TASK1,TASK2]
+#                         [--tp N] [--kv-cache-dtype DTYPE] [--block-size N] [--skip_serve]
+# This script can start vLLM serve and then run evalscope automatically.
+
+PORT=8009
+MODEL=/workspace/models/deepseek-ai/DeepSeek-V4-Flash
+TEMPERATURE=0
+KV_CACHE_DTYPE="fp8"
+BLOCK_SIZE=256
+TENSOR_PARALLEL_SIZE=2
+SAFETENSORS_FAST_GPU="1"
+TRUST_REMOTE_CODE="true"
+NO_ENABLE_FLASHINFER_AUTOTUNE="true"
+TASKS=""
+SKIP_SERVE="${SKIP_SERVE:-false}"
+VLLM_PID=""
+LOG_TAIL_PID=""
+
+SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)
+cd "${SCRIPT_DIR}"
+
+cleanup() {
+  # EXIT-trap handler: stop the log tail and, unless --skip_serve is set, the vLLM server.
+  if [[ -n "${LOG_TAIL_PID}" ]] && kill -0 "${LOG_TAIL_PID}" 2>/dev/null; then
+    kill "${LOG_TAIL_PID}" 2>/dev/null || true
+  fi
+  if [[ "${SKIP_SERVE}" == "true" ]]; then
+    return
+  fi
+
+  if [[ -n "${VLLM_PID}" ]] && kill -0 "${VLLM_PID}" 2>/dev/null; then
+    CHILDREN=$(pgrep -P "${VLLM_PID}" || true)
+    if [[ -n "${CHILDREN}" ]]; then
+      kill -9 ${CHILDREN} 2>/dev/null || true
+    fi
+    kill -9 "${VLLM_PID}" 2>/dev/null || true
+    return
+  fi
+  # Fallback: look the server up by command line. `|| true` matters here: under `set -euo pipefail`
+  # a no-match grep would otherwise abort the trap and make the script exit with a non-zero status.
+  VLLM_PIDS=$(ps aux | grep -- "vllm serve" | grep -- "--port[ =]${PORT}" | grep -v grep | awk '{print $2}' || true)
+  if [[ -n "${VLLM_PIDS}" ]]; then
+    for PID in ${VLLM_PIDS}; do
+      CHILDREN=$(pgrep -P "${PID}" || true)
+      if [[ -n "${CHILDREN}" ]]; then
+        kill -9 ${CHILDREN} 2>/dev/null || true
+      fi
+      kill -9 "${PID}" 2>/dev/null || true
+    done
+  fi
+}
+
+trap cleanup EXIT
+
+stop_log_tail() {
+  if [[ -n "${LOG_TAIL_PID}" ]] && kill -0 "${LOG_TAIL_PID}" 2>/dev/null; then
+    kill "${LOG_TAIL_PID}" 2>/dev/null || true
+    LOG_TAIL_PID=""
+  fi
+}
+
+trim_task_name() {  # Print $1 with leading and trailing whitespace removed.
+  local task_name="$1"
+  task_name="${task_name#${task_name%%[![:space:]]*}}"  # drop the leading whitespace run
+  task_name="${task_name%${task_name##*[![:space:]]}}"  # drop the trailing whitespace run
+  echo "${task_name}"
+}
+
+task_in_list() {
+  # Succeed (return 0) when the first argument equals any of the remaining
+  # arguments; return 1 otherwise.
+  local wanted="$1"
+  local candidate
+  shift
+  for candidate in "$@"; do
+    [[ "${candidate}" != "${wanted}" ]] || return 0
+  done
+  return 1
+}
+
+print_section_header() {  # Print "[STEP_INDEX/TOTAL_STEPS] $1" to stdout and OUTPUT_FILE, then advance STEP_INDEX.
+  echo "=== [${STEP_INDEX}/${TOTAL_STEPS}] $1 ===" | tee -a "$OUTPUT_FILE"
+  STEP_INDEX=$((STEP_INDEX + 1))
+}
+
+# Parse command-line options. Value options consume the following argument;
+# --skip_serve and --skip-serve are equivalent flags.
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --port)
+      PORT="$2"; shift 2 ;;
+    --model)
+      MODEL="$2"; shift 2 ;;
+    --temp)
+      TEMPERATURE="$2"; shift 2 ;;
+    --tasks)
+      TASKS="$2"; shift 2 ;;
+    --skip_serve|--skip-serve)
+      SKIP_SERVE="true"; shift 1 ;;
+    --tp)
+      TENSOR_PARALLEL_SIZE="$2"; shift 2 ;;
+    --kv-cache-dtype)
+      KV_CACHE_DTYPE="$2"; shift 2 ;;
+    --block-size)
+      BLOCK_SIZE="$2"; shift 2 ;;
+    *)
+      echo "Unknown option: $1"; exit 1 ;;
+  esac
+done
+
+SKIP_SERVE="$(echo "${SKIP_SERVE}" | tr '[:upper:]' '[:lower:]')"
+
+API_URL="http://127.0.0.1:${PORT}/v1"
+
+if [[ "${SKIP_SERVE}" != "true" ]]; then
+  echo "Starting vLLM serve on port ${PORT} ..."
+  MODEL_NORMALIZED="${MODEL%/}"
+  MODEL_NAME="${MODEL_NORMALIZED##*/}"
+  EXTRA_ARGS=()
+  # Only for base DeepSeek-V4-Flash/Pro model names without quantized suffixes.
+  if [[ "${MODEL_NAME}" == "DeepSeek-V4-Flash" || "${MODEL_NAME}" == "DeepSeek-V4-Pro" ]]; then
+    EXTRA_ARGS+=(--enable-expert-parallel)
+    EXTRA_ARGS+=(--moe-backend deep_gemm_mega_moe)
+  fi
+
+  VLLM_CMD=(
+    vllm serve "${MODEL}"
+    --kv-cache-dtype "${KV_CACHE_DTYPE}"
+    --block-size "${BLOCK_SIZE}"
+    --tensor-parallel-size "${TENSOR_PARALLEL_SIZE}"
+    --attention_config.use_fp4_indexer_cache=True
+    --port "${PORT}"
+  )
+  if [[ "${MODEL_NAME}" == *"DeepSeek-V4-Pro"* ]]; then
+    VLLM_CMD+=(--max-model-len 1048576)
+  fi
+  if [[ "${TRUST_REMOTE_CODE}" == "true" ]]; then
+    VLLM_CMD+=(--trust-remote-code)
+  fi
+  if [[ "${NO_ENABLE_FLASHINFER_AUTOTUNE}" == "true" ]]; then
+    VLLM_CMD+=(--no-enable-flashinfer-autotune)
+  fi
+  VLLM_CMD+=("${EXTRA_ARGS[@]}")
+
+  SAFETENSORS_FAST_GPU="${SAFETENSORS_FAST_GPU}" "${VLLM_CMD[@]}" >/tmp/vllm_${PORT}.log 2>&1 &
+  VLLM_PID=$!
+  echo "vLLM launched. Log: /tmp/vllm_${PORT}.log"
+  echo "vLLM PID: ${VLLM_PID}"
+  echo "=== vLLM startup log (will stop after API wait ends) ==="
+  tail -n +1 -f "/tmp/vllm_${PORT}.log" &
+  LOG_TAIL_PID=$!
+fi
+
+# Wait until the API is ready (at most 90 polls, 20 s apart).
+echo "Waiting for API at ${API_URL} ..."
+for _ in $(seq 1 90); do
+  if curl -sf "${API_URL}/models" -o /dev/null; then
+    break
+  fi
+  if [[ "${SKIP_SERVE}" != "true" ]] && [[ -n "${VLLM_PID}" ]] && ! kill -0 "${VLLM_PID}" 2>/dev/null; then
+    stop_log_tail
+    echo "[$(date '+%Y-%m-%d %H:%M:%S')] vLLM exited before API became ready."
+    echo "----- Last 80 lines of /tmp/vllm_${PORT}.log -----"
+    tail -n 80 "/tmp/vllm_${PORT}.log" || true
+    exit 1
+  fi
+  echo "[$(date '+%Y-%m-%d %H:%M:%S')] Port ${PORT} not ready, retrying in 20s..."
+  sleep 20
+done
+
+stop_log_tail
+
+if ! curl -sf "${API_URL}/models" -o /dev/null; then
+  echo "[$(date '+%Y-%m-%d %H:%M:%S')] Timeout waiting for API at ${API_URL}."
+  echo "----- Last 80 lines of /tmp/vllm_${PORT}.log -----"
+  tail -n 80 "/tmp/vllm_${PORT}.log" || true
+  exit 1
+fi
+
+echo "[$(date '+%Y-%m-%d %H:%M:%S')] API is ready, starting evaluation."
+MODEL_NORMALIZED="${MODEL%/}"
+MODEL_NAME="${MODEL_NORMALIZED##*/}"
+LOG_DIR="logs/${MODEL_NAME}"
+mkdir -p "$LOG_DIR"
+OUTPUT_FILE="${LOG_DIR}/eval_results_$(date +%Y%m%d_%H%M%S)_port${PORT}_temp${TEMPERATURE}.log"
+
+DEFAULT_STANDARD_TASKS=(piqa hellaswag gsm8k mmlu_pro math_500 mmlu)
+SUPPORTED_TASKS=(aime26 gpqa_diamond ruler_qa_squad "${DEFAULT_STANDARD_TASKS[@]}")
+SELECTED_STANDARD_TASKS=()
+RUN_AIME26="true"
+RUN_GPQA_DIAMOND="true"
+RUN_STANDARD_TASKS="true"
+RUN_RULER_QA_SQUAD="true"
+
+# Translate --tasks into the RUN_* switches and the list of standard evalscope datasets.
+if [[ -n "${TASKS}" ]]; then
+  RUN_AIME26="false"
+  RUN_GPQA_DIAMOND="false"
+  RUN_STANDARD_TASKS="false"
+  RUN_RULER_QA_SQUAD="false"
+  IFS=',' read -r -a REQUESTED_TASKS <<< "${TASKS}"
+  for raw_task in "${REQUESTED_TASKS[@]}"; do
+    task_name="$(trim_task_name "${raw_task}")"
+    if [[ -z "${task_name}" ]]; then
+      continue
+    fi
+    if ! task_in_list "${task_name}" "${SUPPORTED_TASKS[@]}"; then
+      echo "Unsupported task: ${task_name}"
+      echo "Supported tasks: ${SUPPORTED_TASKS[*]}"
+      exit 1
+    fi
+    case "${task_name}" in
+      aime26)
+        RUN_AIME26="true"
+        ;;
+      gpqa_diamond)
+        RUN_GPQA_DIAMOND="true"
+        ;;
+      ruler_qa_squad)
+        RUN_RULER_QA_SQUAD="true"
+        ;;
+      *)
+        # The `+` expansion keeps the still-empty array from tripping `set -u` on bash < 4.4.
+        if ! task_in_list "${task_name}" ${SELECTED_STANDARD_TASKS[@]+"${SELECTED_STANDARD_TASKS[@]}"}; then
+          SELECTED_STANDARD_TASKS+=("${task_name}")
+          RUN_STANDARD_TASKS="true"
+        fi
+        ;;
+    esac
+  done
+
+  if [[ "${RUN_AIME26}" != "true" ]] && [[ "${RUN_GPQA_DIAMOND}" != "true" ]] \
+    && [[ "${RUN_STANDARD_TASKS}" != "true" ]] && [[ "${RUN_RULER_QA_SQUAD}" != "true" ]]; then
+    echo "No valid tasks selected from --tasks '${TASKS}'."
+    exit 1
+  fi
+else
+  SELECTED_STANDARD_TASKS=("${DEFAULT_STANDARD_TASKS[@]}")
+fi
+
+TOTAL_STEPS=0
+if [[ "${RUN_AIME26}" == "true" ]]; then
+  TOTAL_STEPS=$((TOTAL_STEPS + 1))
+fi
+if [[ "${RUN_GPQA_DIAMOND}" == "true" ]]; then
+  TOTAL_STEPS=$((TOTAL_STEPS + 1))
+fi
+if [[ "${RUN_STANDARD_TASKS}" == "true" ]]; then
+  TOTAL_STEPS=$((TOTAL_STEPS + 1))
+fi
+if [[ "${RUN_RULER_QA_SQUAD}" == "true" ]]; then
+  TOTAL_STEPS=$((TOTAL_STEPS + 1))
+fi
+STEP_INDEX=1
+
+echo "=== Evaluation started at $(date) ===" | tee "$OUTPUT_FILE"
+echo "Model: $MODEL" | tee -a "$OUTPUT_FILE"
+echo "API URL: $API_URL" | tee -a "$OUTPUT_FILE"
+echo "Temperature: $TEMPERATURE" | tee -a "$OUTPUT_FILE"
+if [[ -n "${TASKS}" ]]; then
+  echo "Tasks: ${TASKS}" | tee -a "$OUTPUT_FILE"
+else
+  echo "Tasks: all default tasks" | tee -a "$OUTPUT_FILE"
+fi
+echo "" | tee -a "$OUTPUT_FILE"
+  
+
+if [[ "${RUN_AIME26}" == "true" ]]; then
+  echo "" | tee -a "$OUTPUT_FILE"
+  print_section_header "aime26 (n=10)"
+  evalscope eval \
+    --model "$MODEL" \
+    --eval-type openai_api \
+    --api-key EMPTY \
+    --datasets aime26 \
+    --generation-config "{\"temperature\": ${TEMPERATURE}, \"n\": 10}" \
+    --eval-batch-size 10 --timeout 3000 \
+    --api-url "$API_URL" 2>&1 | tee -a "$OUTPUT_FILE"
+fi
+
+if [[ "${RUN_GPQA_DIAMOND}" == "true" ]]; then
+  echo "" | tee -a "$OUTPUT_FILE"
+  print_section_header "gpqa_diamond (n=5)"
+  evalscope eval \
+    --model "$MODEL" \
+    --eval-type openai_api \
+    --api-key EMPTY \
+    --datasets gpqa_diamond \
+    --generation-config "{\"temperature\": ${TEMPERATURE}, \"n\": 5}" \
+    --eval-batch-size 10 --timeout 3000 \
+    --api-url "$API_URL" 2>&1 | tee -a "$OUTPUT_FILE"
+fi
+
+if [[ "${RUN_STANDARD_TASKS}" == "true" ]]; then
+  echo "" | tee -a "$OUTPUT_FILE"
+  print_section_header "${SELECTED_STANDARD_TASKS[*]}"
+  evalscope eval \
+    --model "$MODEL" \
+    --eval-type openai_api \
+    --api-key EMPTY \
+    --datasets "${SELECTED_STANDARD_TASKS[@]}" \
+    --eval-batch-size 10 --timeout 3000 \
+    --api-url "$API_URL" 2>&1 | tee -a "$OUTPUT_FILE"
+fi
+
+if [[ "${RUN_RULER_QA_SQUAD}" == "true" ]]; then
+  echo "" | tee -a "$OUTPUT_FILE"
+  print_section_header "ruler_qa_squad (lm_eval, 1M)"
+  if [[ "${MODEL_NAME}" == *"DeepSeek-V4-Pro"* ]]; then
+    LMEVAL_OUTPUT_DIR="${LOG_DIR}/lm_eval_ruler_1M_qa"
+    mkdir -p "${LMEVAL_OUTPUT_DIR}"
+    LMEVAL_METADATA=$(printf '{"max_seq_lengths":[1000000],"pretrained":"%s/","use_fast":false}' "${MODEL_NORMALIZED}")
+    lm_eval \
+      --model local-completions \
+      --tasks ruler_qa_squad \
+      --model_args "model=${MODEL_NORMALIZED},base_url=${API_URL}/completions,num_concurrent=1,max_retries=3,max_length=1048576" \
+      --gen_kwargs "temperature=${TEMPERATURE},do_sample=False,max_tokens=128" \
+      --metadata "${LMEVAL_METADATA}" \
+      --batch_size 1 \
+      --log_samples \
+      --output_path "${LMEVAL_OUTPUT_DIR}" 2>&1 | tee -a "$OUTPUT_FILE"
+  else
+    echo "Skip ruler_qa_squad: only DeepSeek-V4-Pro is supported for this test." | tee -a "$OUTPUT_FILE"
+  fi
+fi
+
+
+echo "" | tee -a "$OUTPUT_FILE"
+echo "=== Evaluation finished at $(date) ===" | tee -a "$OUTPUT_FILE"
+echo "Results saved to: $OUTPUT_FILE"
+
+# Stop the vLLM server to free GPU; `|| true` keeps no-match grep/pgrep from tripping `set -e`/pipefail.
+[[ "${SKIP_SERVE}" != "true" ]] || exit 0  # externally managed server: leave it running, as cleanup() does
+echo "Stopping process on port ${PORT} to free GPU..." | tee -a "$OUTPUT_FILE"
+VLLM_PIDS=$(ps aux | grep -- "vllm serve" | grep -- "--port[ =]${PORT}" | grep -v grep | awk '{print $2}' || true)
+if [[ -n "$VLLM_PIDS" ]]; then
+  echo "Found vllm serve process(es) with --port ${PORT}: $VLLM_PIDS" | tee -a "$OUTPUT_FILE"
+  for PID in $VLLM_PIDS; do
+    CHILDREN=$(pgrep -P "$PID" || true)  # child processes (including GPU processes)
+    if [[ -n "$CHILDREN" ]]; then
+      echo "Killing child processes of $PID: $CHILDREN" | tee -a "$OUTPUT_FILE"
+      kill -9 $CHILDREN 2>/dev/null || true
+    fi
+    kill -9 "$PID" 2>/dev/null || true
+    echo "Killed vllm serve process and its children: $PID $CHILDREN" | tee -a "$OUTPUT_FILE"
+  done
+else
+  echo "No vllm serve process found with --port ${PORT}." | tee -a "$OUTPUT_FILE"
+fi
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_quant.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_quant.sh
new file mode 100644
index 00000000000..305c2cd266d
--- /dev/null
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_quant.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+set -e
+
+SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)
+
+DTYPE=""
+INPUT_MODEL=""
+OUTPUT_MODEL=""
+FORMAT="llm_compressor"
+IGNORE_LAYERS="compressor,indexer.weights_proj"
+
+usage() {  # Print the help text, then exit with status $1 (default: 1).
+  echo "Usage: bash run_quant.sh --dtype=<mxfp4|mxfp4_mixed|mxfp8|w4a16> --input_model=<path_or_name> --output_model=<output_dir>"
+  echo "Optional: --format=<auto_round|llm_compressor> --ignore_layers=<comma_separated_patterns>"
+  exit "${1:-1}"
+}
+
+for arg in "$@"; do  # only --key=value options (plus -h/--help) are accepted
+  case $arg in
+    --dtype=*)
+      DTYPE="${arg#*=}"
+      ;;
+    --input_model=*)
+      INPUT_MODEL="${arg#*=}"
+      ;;
+    --output_model=*)
+      OUTPUT_MODEL="${arg#*=}"
+      ;;
+    --format=*)
+      FORMAT="${arg#*=}"
+      ;;
+    --ignore_layers=*)
+      IGNORE_LAYERS="${arg#*=}"
+      ;;
+    -h|--help)
+      usage 0  # asking for help is not an error
+      ;;
+    *)
+      echo "Unknown option: $arg"
+      usage
+      ;;
+  esac
+done
+
+[[ -z "$DTYPE" ]] && echo "Error: --dtype is required" && usage
+[[ -z "$INPUT_MODEL" ]] && echo "Error: --input_model is required" && usage
+[[ -z "$OUTPUT_MODEL" ]] && echo "Error: --output_model is required" && usage
+
+cd "$SCRIPT_DIR"
+python quantize.py \
+  --dtype "$DTYPE" \
+  --input_model "$INPUT_MODEL" \
+  --output_model "$OUTPUT_MODEL" \
+  --format "$FORMAT" \
+  --ignore_layers "$IGNORE_LAYERS"
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/setup.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/setup.sh
new file mode 100644
index 00000000000..0b1215ebfba
--- /dev/null
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/setup.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+set -e
+
+uv pip install -U pip setuptools_rust setuptools_scm
+uv pip install -U evalscope lm_eval transformers datasets
+uv pip install git+https://github.com/intel/auto-round.git@main
+uv pip install compressed-tensors --no-deps
+bash <(curl -fsSL https://raw.githubusercontent.com/vllm-project/vllm/main/tools/install_deepgemm.sh)
+VLLM_USE_PRECOMPILED=1 uv pip install git+https://github.com/xin3he/vllm-fork.git@support_deepseekv4_mxfp --no-build-isolation
\ No newline at end of file
diff --git a/neural_compressor/torch/algorithms/autoround/autoround.py b/neural_compressor/torch/algorithms/autoround/autoround.py
index 94e012b0298..c4601528c96 100644
--- a/neural_compressor/torch/algorithms/autoround/autoround.py
+++ b/neural_compressor/torch/algorithms/autoround/autoround.py
@@ -158,6 +158,8 @@ def prepare(self, model: torch.nn.Module, *args, **kwargs):
         Returns:
             A prepared model.
         """
+        if isinstance(model, str) and bool(getattr(self, "model_free", False)):
+            return model
         prepare_model = InputCaptureModule(model)
         return prepare_model
 
@@ -171,20 +173,26 @@ def convert(self, model: torch.nn.Module, *args, **kwargs):
             The quantized model.
         """
         pipe = kwargs.pop("pipeline", None)
-        tokenizer = getattr(model.orig_model, "tokenizer", None)
-        if tokenizer is not None:
-            delattr(model.orig_model, "tokenizer")
-        elif pipe is None:
-            tokenizer = "Placeholder"
-            self.dataset = CapturedDataloader(model.args_list, model.kwargs_list)
-        # Retrieve processor/image_processor/template from model if they were attached there
-        # (moved from quant_config to model to avoid duplicating large objects in per-layer configs)
-        for _attr in ("processor", "image_processor", "template"):
-            _val = getattr(model.orig_model, _attr, None)
-            if _val is not None:
-                setattr(self, _attr, _val)
-                delattr(model.orig_model, _attr)
-        model = model.orig_model
+        is_model_reference = isinstance(model, str)
+        if is_model_reference:
+            tokenizer = getattr(self, "tokenizer", None)
+            if tokenizer is None and pipe is None:
+                tokenizer = "Placeholder"
+        else:
+            tokenizer = getattr(model.orig_model, "tokenizer", None)
+            if tokenizer is not None:
+                delattr(model.orig_model, "tokenizer")
+            elif pipe is None:
+                tokenizer = "Placeholder"
+                self.dataset = CapturedDataloader(model.args_list, model.kwargs_list)
+            # Retrieve processor/image_processor/template from model if they were attached there
+            # (moved from quant_config to model to avoid duplicating large objects in per-layer configs)
+            for _attr in ("processor", "image_processor", "template"):
+                _val = getattr(model.orig_model, _attr, None)
+                if _val is not None:
+                    setattr(self, _attr, _val)
+                    delattr(model.orig_model, _attr)
+            model = model.orig_model
         if pipe is not None:
             model = pipe
         # Remove AutoRound specific args before passing to AutoRound constructor
@@ -221,7 +229,8 @@ def convert(self, model: torch.nn.Module, *args, **kwargs):
 
         if self._is_w4afp8():
             model, weight_config = rounder.quantize()
-            model.autoround_config = weight_config
+            if hasattr(model, "__dict__"):
+                model.autoround_config = weight_config
             return rounder.save_quantized(output_dir=self.output_dir, inplace=True)
         else:  # pragma: no cover
             _, quantized_model_path = rounder.quantize_and_save(
@@ -229,10 +238,12 @@ def convert(self, model: torch.nn.Module, *args, **kwargs):
             )
             self.output_dir = quantized_model_path
             model = rounder.model
-            model.autoround_config = rounder.layer_config
+            if hasattr(model, "__dict__"):
+                model.autoround_config = rounder.layer_config
 
         self.accelerator.empty_cache()
-        dump_model_op_stats(rounder.layer_config)
+        if not bool(getattr(self, "model_free", False)):
+            dump_model_op_stats(rounder.layer_config)
 
         reloading = self.__dict__.get("reloading", True)
         if self.export_format in ["auto_round", "llm_compressor"] and reloading:
@@ -248,7 +259,9 @@ def convert(self, model: torch.nn.Module, *args, **kwargs):
             except Exception as e:
                 logger.error(f"Error reloading model: {e}")
 
-        setattr(model, "name_or_path", self.output_dir)  # model is saved in a subfolder of output_dir based on scheme
+        if hasattr(model, "__dict__"):
+            # model is saved in a subfolder of output_dir based on scheme
+            setattr(model, "name_or_path", self.output_dir)
         return model
 
 
diff --git a/neural_compressor/torch/quantization/algorithm_entry.py b/neural_compressor/torch/quantization/algorithm_entry.py
index e15d19fba96..95651695030 100644
--- a/neural_compressor/torch/quantization/algorithm_entry.py
+++ b/neural_compressor/torch/quantization/algorithm_entry.py
@@ -560,12 +560,12 @@ def teq_quantize_entry(
 ###################### AUTOROUND Algo Entry ##################################
 @register_algo(name=AUTOROUND)
 def autoround_quantize_entry(
-    model: torch.nn.Module,
+    model,
     configs_mapping: Dict[Tuple[str, callable], AutoRoundConfig],
     mode: Mode = Mode.QUANTIZE,
     *args,
     **kwargs,
-) -> torch.nn.Module:
+):
     """The main entry to apply AutoRound quantization.
 
     Args:
@@ -630,8 +630,10 @@ def autoround_quantize_entry(
     kwargs.pop("example_inputs")
     quantizer = get_quantizer(model, quantizer_cls=AutoRoundQuantizer, quant_config=quant_config, **params_dict)
     model = quantizer.execute(model=model, mode=mode, *args, **kwargs)
-    model.qconfig = configs_mapping
-    model.save = MethodType(save, model)
+    if hasattr(model, "__dict__"):
+        model.qconfig = configs_mapping
+        if isinstance(model, torch.nn.Module):
+            model.save = MethodType(save, model)
     postprocess_model(model, mode, quantizer)
     return model
 
diff --git a/neural_compressor/torch/quantization/quantize.py b/neural_compressor/torch/quantization/quantize.py
index 17641d81f2f..3bd7729b2b3 100644
--- a/neural_compressor/torch/quantization/quantize.py
+++ b/neural_compressor/torch/quantization/quantize.py
@@ -33,6 +33,25 @@
 FRAMEWORK_NAME = "torch"
 
 
+class _AutoRoundModelReference:
+    """Stand-in returned by ``prepare`` for model-free AutoRound; ``convert`` unwraps it again."""
+
+    def __init__(self, model_reference: str, quant_config: BaseConfig, example_inputs: Any = None):
+        self.model_reference = model_reference  # model path or name, kept as a plain string
+        self.quant_config = quant_config  # used by ``convert`` when no quant_config is passed
+        self.example_inputs = example_inputs  # stored as passed to ``prepare``
+        self.is_prepared = True  # flags the object as prepared
+
+
+def _is_autoround_model_free_string_case(model: Any, quant_config: BaseConfig) -> bool:
+    """Return True when model-free AutoRound is called with a string model reference."""
+    return (
+        isinstance(quant_config, AutoRoundConfig)
+        and bool(getattr(quant_config, "model_free", False))
+        and isinstance(model, str)
+    )
+
+
 def need_apply(configs_mapping: Dict[Tuple[str, callable], BaseConfig], algo_name):
     """Check whether to apply this algorithm according to configs_mapping.
 
@@ -89,12 +108,16 @@ def preprocess_quant_config(model, quant_config, mode="prepare", example_inputs=
                 )
         model_info = quant_config.get_model_info(model, example_inputs)
     elif isinstance(quant_config, AutoRoundConfig):
-        for _attr in ("tokenizer", "processor", "image_processor", "template"):
-            _backup = getattr(quant_config, _attr, None)
-            if _backup is not None:
-                setattr(model, _attr, _backup)
-                delattr(quant_config, _attr)
-        model_info = quant_config.get_model_info(model=model)
+        if _is_autoround_model_free_string_case(model, quant_config):
+            # Keep optional large objects on config when model is a string reference.
+            model_info = quant_config.get_model_info(model=None)
+        else:
+            for _attr in ("tokenizer", "processor", "image_processor", "template"):
+                _backup = getattr(quant_config, _attr, None)
+                if _backup is not None:
+                    setattr(model, _attr, _backup)
+                    delattr(quant_config, _attr)
+            model_info = quant_config.get_model_info(model=model)
     else:
         model_info = quant_config.get_model_info(model=model)
 
@@ -172,6 +195,9 @@ def prepare(
     Returns:
         prepared and calibrated module.
     """
+    if _is_autoround_model_free_string_case(model, quant_config):
+        return _AutoRoundModelReference(model_reference=model, quant_config=quant_config, example_inputs=example_inputs)
+
     prepared_model = model if inplace else copy.deepcopy(model)
     prepared_model, configs_mapping = preprocess_quant_config(
         prepared_model, quant_config, mode="prepare", example_inputs=example_inputs
@@ -240,6 +266,13 @@ def convert(
     Returns:
         The quantized model.
     """
+    if isinstance(model, _AutoRoundModelReference):
+        if quant_config is None:
+            quant_config = model.quant_config
+        else:
+            logger.warning("quant_config will be ignored since the model has been prepared.")
+        model = model.model_reference
+
     q_model = model if inplace else copy.deepcopy(model)
 
     assert (
@@ -287,7 +320,8 @@ def convert(
                 mode=Mode.CONVERT,
                 **kwargs,
             )
-    setattr(q_model, "is_quantized", True)
+    if hasattr(q_model, "__dict__"):
+        setattr(q_model, "is_quantized", True)
     return q_model
 
 
diff --git a/test/torch/quantization/test_autoround_cpu.py b/test/torch/quantization/test_autoround_cpu.py
index d25fc9bb1e6..f70e95248d7 100644
--- a/test/torch/quantization/test_autoround_cpu.py
+++ b/test/torch/quantization/test_autoround_cpu.py
@@ -16,6 +16,10 @@
     prepare,
     quantize,
 )
+from neural_compressor.torch.quantization.quantize import (
+    _AutoRoundModelReference,
+    _is_autoround_model_free_string_case,
+)
 from neural_compressor.torch.utils import logger
 
 torch.backends.__allow_nonbracketed_mutation_flag = True
@@ -638,3 +642,82 @@ def test_fp8_kv_attn(self, scheme, static_kv_dtype, static_attention_dtype, tmp_
             assert (
                 getattr(attn, "q_scale", None) is not None
             ), f"Missing q_scale in attention for scheme={scheme}, static_attention_dtype={static_attention_dtype}"
+
+    def test_is_autoround_model_free_string_case_true(self):
+        """A string model plus ``model_free=True`` must be detected as the model-free string case."""
+        model_path = "/path/to/model"
+        quant_config = AutoRoundConfig(model_free=True, scheme="MXFP4")
+        assert _is_autoround_model_free_string_case(model_path, quant_config) is True
+
+    def test_is_autoround_model_free_string_case_false_not_string(self):
+        """An ``nn.Module`` model is not the model-free string case, even when ``model_free=True``."""
+        quant_config = AutoRoundConfig(model_free=True, scheme="MXFP4")
+        module = torch.nn.Linear(10, 10)
+        assert _is_autoround_model_free_string_case(module, quant_config) is False
+
+    def test_is_autoround_model_free_string_case_false_no_flag(self):
+        """A string model is not the model-free case when the config is built without ``model_free``."""
+        model_path = "/path/to/model"
+        quant_config = AutoRoundConfig(scheme="MXFP4")
+        assert _is_autoround_model_free_string_case(model_path, quant_config) is False
+
+    def test_is_autoround_model_free_string_case_false_model_free_false(self):
+        """A string model is not the model-free case when the config sets ``model_free=False`` explicitly."""
+        model_path = "/path/to/model"
+        quant_config = AutoRoundConfig(model_free=False, scheme="MXFP4")
+        assert _is_autoround_model_free_string_case(model_path, quant_config) is False
+
+    def test_autoround_model_reference_creation(self):
+        """The reference wrapper stores its constructor arguments and reports itself as prepared."""
+        cfg = AutoRoundConfig(model_free=True, scheme="MXFP4")
+        inputs = {"input_ids": torch.ones(1, 10, dtype=torch.long)}
+        path = "/path/to/deepseek-v4"
+
+        wrapper = _AutoRoundModelReference(model_reference=path, quant_config=cfg, example_inputs=inputs)
+
+        assert wrapper.is_prepared is True
+        assert wrapper.model_reference == path
+        assert wrapper.quant_config is cfg
+        assert wrapper.example_inputs == inputs
+
+    def test_prepare_with_string_model_and_model_free_returns_reference(self):
+        """prepare() on a string model with ``model_free=True`` hands back an ``_AutoRoundModelReference``."""
+        quant_config = AutoRoundConfig(
+            model_free=True,
+            scheme="MXFP4",
+            ignore_layers="compressor",
+            output_dir="/tmp/test_output",
+        )
+        model_path = "/path/to/model"
+
+        prepared = prepare(model_path, quant_config)
+
+        assert isinstance(prepared, _AutoRoundModelReference)
+        assert prepared.quant_config is quant_config
+        assert prepared.model_reference == model_path
+
+    def test_model_free_with_string_model(self):
+        """Run prepare() then convert() on a string model and check scheme, layer_config and ignore_layers."""
+        import tempfile
+
+        layer_config = {"fc2": {"bits": 4, "data_type": "mx_fp"}}
+        with tempfile.TemporaryDirectory() as output_dir:  # per-test dir instead of a shared hard-coded /tmp path
+            config = AutoRoundConfig(
+                model_free=True,
+                scheme="MXFP8",
+                ignore_layers="self_attn",
+                layer_config=layer_config,
+                export_format="llm_compressor",
+                output_dir=output_dir,
+            )
+            result = prepare("facebook/opt-125m", config)
+            assert isinstance(result, _AutoRoundModelReference)
+            assert result.quant_config.scheme == "MXFP8"
+            assert result.quant_config.ignore_layers == "self_attn"
+            assert result.quant_config.layer_config == layer_config
+            assert result.quant_config.export_format == "llm_compressor"
+
+            first_layer = convert(result).model.decoder.layers[0]
+            assert not hasattr(first_layer.self_attn.k_proj, "quantization_scheme"), "Ignored layers were not preserved during conversion."
+            assert first_layer.fc1.quantization_scheme.format.value == "mxfp8-quantized", "Model conversion did not preserve the quantization scheme format."
+            assert first_layer.fc2.quantization_scheme.format.value == "mxfp4-pack-quantized", "Model conversion did not preserve the quantization scheme format for layer_config."