intel · yiliu30 · Jun 5, 2026 · Jun 9, 2026 · Jun 10, 2026 · Jun 10, 2026
diff --git a/...huggingface_models/language-modeling/quantization/auto_round/deepseek/README.md b/...huggingface_models/language-modeling/quantization/auto_round/deepseek/README.md
@@ -3,14 +3,8 @@ This example provides an end-to-end workflow to quantize DeepSeek models to MXFP
 ## Requirement
 ```bash
 pip install neural-compressor-pt
-# auto-round
 pip install auto-round
-# vLLM
-git clone -b fused-moe-ar --single-branch --quiet https://github.com/yiliu30/vllm-fork.git && cd vllm-fork
-VLLM_USE_PRECOMPILED=1 pip install --editable . -vvv
-# other requirements
-pip install -r requirements.txt
-pip uninstall flash_attn
+bash setup.sh
 ```
 
 ### Quantize Model
@@ -65,7 +59,7 @@ bash ./run_generate.sh -s mxfp4 -tp 8 -m /path/to/ds_mxfp4
 ```
 - NVFP4
 ```bash
-bash ./run_generate.sh -s nvfp4 -tp 8 -m /path/to/ds_mxfp4
+bash ./run_generate.sh -s nvfp4 -tp 8 -m /path/to/ds_nvfp4
 ```
 ### Evaluation
 
@@ -75,17 +69,14 @@ Usage:
 bash run_evaluation.sh -m [model_path] -s [mxfp4|mxfp8|nvfp4] -t [task_name] -tp [tensor_parallel_size] -b [batch_size]
 ```
 ```bash
-bash run_evaluation.sh -s mxfp8 -t piqa,hellaswag,mmlu -tp 8 -b 512 -m /path/to/ds_mxfp8
-bash run_evaluation.sh -s mxfp8 -t gsm8k -tp 8 -b 256 -m /path/to/ds_mxfp8
+bash run_evaluation.sh -s mxfp8 -t piqa,hellaswag,mmlu,gsm8k -tp 8 -b 256 -m /path/to/ds_mxfp8
 
 ```
 - MXFP4
 ```bash
-bash run_evaluation.sh -s mxfp4 -t piqa,hellaswag,mmlu -tp 8 -b 512 -m /path/to/ds_mxfp4
-bash run_evaluation.sh -s mxfp4 -t gsm8k -tp 8 -b 256 -m /path/to/ds_mxfp4
+bash run_evaluation.sh -s mxfp4 -t piqa,hellaswag,mmlu,gsm8k -tp 8 -b 256 -m /path/to/ds_mxfp4
 ```
 - NVFP4
 ```bash
-bash run_evaluation.sh -s nvfp4 -t piqa,hellaswag,mmlu -tp 8 -b 512 -m /path/to/ds_nvfp4
-bash run_evaluation.sh -s nvfp4 -t gsm8k -tp 8 -b 256 -m /path/to/ds_nvfp4
+bash run_evaluation.sh -s nvfp4 -t piqa,hellaswag,mmlu,gsm8k -tp 8 -b 256 -m /path/to/ds_nvfp4
 ```
diff --git a/...rch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py b/...rch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py
@@ -82,6 +82,7 @@ def quant_model(args):
         iters=iters,
         ignore_layers=config["fp_layers"],
         export_format=args.export_format,
+        device_map="auto",
         output_dir=output_dir,
         low_gpu_mem_usage=True,
         static_kv_dtype=static_kv_dtype,

diff --git a/...lp/huggingface_models/language-modeling/quantization/auto_round/deepseek/requirements.txt b/...lp/huggingface_models/language-modeling/quantization/auto_round/deepseek/requirements.txt
@@ -1,9 +1,4 @@
-lm-eval==0.4.10
-lm-eval[api]
 loguru
-compressed-tensors==0.12.2
+compressed-tensors==0.15.0.1
 hf_transfer
-transformers==4.57.3
-torch==2.9.0
-# pip install git+https://github.com/yiliu30/long-bench-eval
-long-bench-eval @ git+https://github.com/yiliu30/long-bench-eval
+transformers==4.57.3
diff --git a/...p/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh b/...p/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh
@@ -99,12 +99,19 @@ if [[ "$TASK_NAME" == *"longbench"* ]]; then
 fi
 
 # update max_length based on the task
-if [[ "$TASK_NAME" == *"ruler"* ]]; then
+if [[ "$TASK_NAME" == *"ruler"* ]] || [[ "$TASK_NAME" == *"niah_multiquery"* ]]; then
+    max_gen_toks=128
     MODEL_MAX_POS=${RULER_MAX_POS:-131072}
+    if [[ "$TASK_NAME" == *"ruler_qa_squad"* ]]; then
+        # ruler_qa_squad: keep max_gen_toks of headroom below the context limit, honoring a RULER_MAX_POS override
+        MODEL_MAX_POS=$((MODEL_MAX_POS - max_gen_toks))
+        TASK_NAME="ruler_qa_squad"  # NOTE(review): confirm the RULER dispatch later in this script matches this name (it has no "niah" substring)
+    else
+        # if input task is ruler or niah_multiquery
+        TASK_NAME="niah_multiquery"
+    fi
     max_length=${MODEL_MAX_POS}
-    max_gen_toks=128
-    SEQ_LENGTHS="${MODEL_MAX_POS}"
-    TASK_NAME="niah_multiquery"
+    SEQ_LENGTHS=${MODEL_MAX_POS}
     BATCH_SIZE=32
 fi
 
@@ -205,14 +212,28 @@ run_standard_eval() {
 # Function to start vLLM server
 start_vllm_server() {
     echo "Starting vLLM server on port ${SERVER_PORT}..."
+
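+    # Pick the rope-scaling CLI flag by vLLM version: vLLM >= 0.19 removed --rope-scaling, so --hf-overrides is used instead
+    # NOTE(review): ROPE_FLAG/ROPE_VALUE are only echoed here; the `vllm serve` command below does not pass them - confirm intent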
+    VLLM_VERSION=$(python -c "import vllm; print(vllm.__version__)" 2>/dev/null || echo "0.0.0")
+    VLLM_MAJOR_MINOR=$(echo "$VLLM_VERSION" | awk -F. '{printf "%d%02d", $1, $2}')
+    ROPE_SCALING_JSON='{"rope_type":"yarn","factor":4.0,"original_max_position_embeddings":32768}'
+    if [ "$VLLM_MAJOR_MINOR" -ge 19 ] 2>/dev/null; then
+        ROPE_FLAG="--hf-overrides"
+        ROPE_VALUE="{\"rope_scaling\":${ROPE_SCALING_JSON},\"max_position_embeddings\":131072}"
+    else
+        ROPE_FLAG="--rope-scaling"
+        ROPE_VALUE="${ROPE_SCALING_JSON}"
+    fi
+    echo "vLLM version: ${VLLM_VERSION}, using: ${ROPE_FLAG} '${ROPE_VALUE}'"
+
     vllm serve ${MODEL_PATH} \
         --port ${SERVER_PORT} \
         --tensor-parallel-size ${TP_SIZE} \
         --max-model-len ${max_length} \
         --gpu-memory-utilization 0.8 \
         --dtype bfloat16 \
         --kv-cache-dtype ${KV_CACHE_DTYPE} \
-        --disable-log-requests \
         > ${OUTPUT_DIR}/vllm_server.log 2>&1 &
 
     VLLM_PID=$!

diff --git a/...ch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_quant.sh b/...ch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_quant.sh
@@ -4,7 +4,7 @@ set -e
 MODEL=""
 TARGET=""
 OUTPUT_DIR=""
-EXPORT_FORMAT="auto_round"
+EXPORT_FORMAT="llm_compressor"
 STATIC_KV_DTYPE="None"
 STATIC_ATTENTION_DTYPE="None"
 
@@ -15,7 +15,7 @@ usage() {
   echo "  -kv datatype for kv cache (auto, fp8)"
   echo "  -attn        Data type for static attention cache (default: None)"
   echo "  --output_dir output directory for quantized model"
-  echo "  -f           quantize model export_format (default: auto_round)"
+  echo "  -f           quantize model export_format (default: llm_compressor)"
   exit 1
 }
 

diff --git a/...ytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/setup.sh b/...ytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/setup.sh
@@ -1,12 +1,116 @@
-pip install -r requirements.txt
-pip install setuptools --upgrade
-pip install packaging --upgrade
-pip install transformers==4.57.3
-pip install -U "huggingface_hub[cli]"
-# Install vllm
-git clone -b fused-moe-ar --single-branch --quiet https://github.com/yiliu30/vllm-fork.git && cd vllm-fork
-VLLM_USE_PRECOMPILED=1 uv pip install --prerelease=allow . -v
-cd ..
-# Uninstall flash_attn to avoid conflicts
-pip uninstall flash_attn -y
-pip install lm_eval["ruler"]
+#!/bin/bash
+set -e
+usage() { # Print the supported --key=value options, one per line, to stdout.
+    printf '%s\n' "Usage: $0 --device=[gpu|xpu] --format=[AR|LLMC] --task=[task_list] --bench_tool=[lm_eval|aisbench]" \
+        "  --device    target device for quantization (gpu or xpu)" \
+        "  --format    quantization format (AR for auto_round, LLMC for llm_compressor)" \
+        "  --task      comma-separated list of evaluation tasks (e.g. gsm8k,hellaswag)" \
+        "  --bench_tool benchmarking tool to use (lm_eval or aisbench)"
+}
+
+detect_cuda_version() {
+    # Print the detected CUDA version (digits and dots only) on stdout.
+    # nvidia-smi is consulted first; nvcc is the fallback when that yields nothing usable.
+    # Exits with status 1 and a message on stderr when neither tool produces a version.
+    local probed=""
+
+    if command -v nvidia-smi >/dev/null 2>&1; then
+        probed=$(nvidia-smi 2>/dev/null | sed -n 's/.*CUDA Version: \([^ ]*\).*/\1/p' | head -n 1)
+        if [[ "$probed" =~ ^[0-9.]+$ ]]; then
+            echo "$probed"
+            return 0
+        fi
+    fi
+
+    if command -v nvcc >/dev/null 2>&1; then
+        probed=$(nvcc --version | awk '/release/ {print $6}' | sed 's/^V//; s/,//')
+        if [[ "$probed" =~ ^[0-9.]+$ ]]; then
+            echo "$probed"
+            return 0
+        fi
+    fi
+
+    echo "Unable to detect CUDA version from nvidia-smi or nvcc." >&2
+    exit 1
+}
+
+# Defaults: keep a non-empty value from the environment, otherwise use the fallback given here.
+: "${DEVICE:=gpu}"
+: "${FORMAT:=LLMC}"
+: "${TASKS:=hellaswag,piqa,mmlu,gsm8k,ruler}"
+: "${BENCH_TOOL:=lm_eval}"
+
+# Command-line flags override the defaults; only the --key=value form is accepted.
+while (( $# > 0 )); do
+    case $1 in
+        --device=*)
+            DEVICE="${1#*=}"
+            ;;
+        --format=*)
+            FORMAT="${1#*=}"
+            ;;
+        --task=*)
+            TASKS="${1#*=}"
+            ;;
+        --bench_tool=*)
+            BENCH_TOOL="${1#*=}"
+            ;;
+        *)
+            echo "Unknown parameter: $1"
+            usage
+            exit 1
+            ;;
+    esac
+    # Every recognized flag is a single word, so exactly one argument is consumed per iteration.
+    shift
+done
+
+if [[ "$DEVICE" == "xpu" ]]; then
+    # support quant only on xpu for now
+    uv pip install torch==2.11.0 torchvision==0.26.0 --index-url https://download.pytorch.org/whl/xpu
+    uv pip install -r requirements.txt --extra-index-url https://download.pytorch.org/whl/xpu
+elif [[ "$DEVICE" == "gpu" ]]; then
+    uv pip install -r requirements.txt
+    uv pip install setuptools --upgrade
+    uv pip install packaging --upgrade
+    uv pip install -U "huggingface_hub[cli]"
+    if [[ "$FORMAT" == "LLMC" ]]; then
+        CUDA_VERSION=$(detect_cuda_version)
+        echo "Detected system CUDA version: $CUDA_VERSION"
+        if [[ "$CUDA_VERSION" == "12."* ]]; then
+            uv pip install vllm==0.22.0 --extra-index-url https://wheels.vllm.ai/0.22.0/cu129 --extra-index-url https://download.pytorch.org/whl/cu129 --index-strategy unsafe-best-match
+            uv pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu129 --index-strategy unsafe-best-match
+        elif [[ "$CUDA_VERSION" == "13."* ]]; then
+            uv pip install vllm==0.22.0
+        else
+            echo "Unsupported CUDA version: $CUDA_VERSION. Supported versions are 12.x and 13.x."
+            exit 1
+        fi
+        uv pip install ray
+        # Reuse an existing checkout so re-running this script does not abort on "destination path already exists".
+        [[ -d vllm-qdq-plugin ]] || git clone https://github.com/yiliu30/vllm-qdq-plugin.git
+        uv pip install vllm-qdq-plugin/ -v
+    else
+        # use default setting for AR format, required by fused-moe-ar
+        uv pip install torch==2.9.0
+        # A failing `a` in `a && b` does not trip `set -e`, so the clone is its own step and the cd is confined to a subshell.
+        [[ -d vllm-fork ]] || git clone -b fused-moe-ar --single-branch --quiet https://github.com/yiliu30/vllm-fork.git
+        (cd vllm-fork && VLLM_USE_PRECOMPILED=1 uv pip install --prerelease=allow . -v)
+    fi
+    if [[ "$BENCH_TOOL" == "lm_eval" ]]; then
+        uv pip install lm-eval==0.4.12
+        uv pip install "lm-eval[api]"
+        uv pip install "lm-eval[ruler]"
+        if [[ "$TASKS" == *"longbench"* ]]; then
+            uv pip install "long-bench-eval @ git+https://github.com/yiliu30/long-bench-eval"
+        fi
+    elif [[ "$BENCH_TOOL" == "aisbench" ]]; then
+        echo "Installing aisbench..."
+    fi
+    # Uninstall flash_attn to avoid conflicts
+    uv pip uninstall flash_attn
+else
+    echo "Unsupported device: $DEVICE. Supported devices are gpu and xpu."
+    usage
+    exit 1
+fi
diff --git a/...orch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh b/...orch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh
@@ -5,7 +5,7 @@
 # Parse command line arguments
 KV_CACHE_DTYPE="auto"
 STATIC_ATTENTION_DTYPE="auto"
-EXPORT_FORMAT="auto_round"
+EXPORT_FORMAT="llm_compressor"
 while [[ $# -gt 0 ]]; do
     case $1 in
         --topology=*)

diff --git a/.../pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/setup.sh b/.../pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/setup.sh
@@ -35,7 +35,7 @@ detect_cuda_version() {
 }
 
 DEVICE="${DEVICE:-gpu}"
-FORMAT="${FORMAT:-AR}"
+FORMAT="${FORMAT:-LLMC}"
 TASKS="${TASKS:-hellaswag,piqa,mmlu,gsm8k}"
 BENCH_TOOL="${BENCH_TOOL:-lm_eval}"
 

diff --git a/...nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/README.md b/...nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/README.md
@@ -3,14 +3,8 @@ This example provides an end-to-end workflow to quantize Qwen models to MXFP4/MX
 ## Requirement
 ```bash
 uv pip install neural-compressor-pt
-# auto-round
 uv pip install auto-round
-# vLLM
-git clone -b fused-moe-ar --single-branch --quiet https://github.com/yiliu30/vllm-fork.git && cd vllm-fork
-VLLM_USE_PRECOMPILED=1 uv pip install --editable . -vvv
-# other requirements
-uv pip install -r requirements.txt
-uv pip uninstall flash_attn
+bash setup.sh
 ```
 
 ### Quantize Model
@@ -74,12 +68,10 @@ Usage:
 bash run_evaluation.sh -m [model_path] -s [mxfp4|mxfp8] -t [task_name] -tp [tensor_parallel_size] -b [batch_size]
 ```
 ```bash
-bash run_evaluation.sh -s mxfp8 -t piqa,hellaswag,mmlu -tp 4 -b 512 -m /path/to/qwen_mxfp8
-bash run_evaluation.sh -s mxfp8 -t gsm8k -tp 4 -b 256 -m /path/to/qwen_mxfp8
+bash run_evaluation.sh -s mxfp8 -t piqa,hellaswag,mmlu,gsm8k -tp 4 -b 256 -m /path/to/qwen_mxfp8
 
 ```
 - MXFP4
 ```bash
-bash run_evaluation.sh -s mxfp4 -t piqa,hellaswag,mmlu -tp 4 -b 512 -m /path/to/qwen_mxfp4
-bash run_evaluation.sh -s mxfp4 -t gsm8k -tp 4 -b 256 -m /path/to/qwen_mxfp4
+bash run_evaluation.sh -s mxfp4 -t piqa,hellaswag,mmlu,gsm8k -tp 4 -b 256 -m /path/to/qwen_mxfp4
 ```
diff --git a/...h/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_evaluation.sh b/...h/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_evaluation.sh
@@ -102,12 +102,19 @@ if [[ "$TASK_NAME" == *"longbench"* ]]; then
 fi
 
 # update max_length based on the task
-if [[ "$TASK_NAME" == *"ruler"* ]]; then
+if [[ "$TASK_NAME" == *"ruler"* ]] || [[ "$TASK_NAME" == *"niah_multiquery"* ]]; then
+    max_gen_toks=128
     MODEL_MAX_POS=${RULER_MAX_POS:-131072}
+    if [[ "$TASK_NAME" == *"ruler_qa_squad"* ]]; then
+        # ruler_qa_squad: keep max_gen_toks of headroom below the context limit, honoring a RULER_MAX_POS override
+        MODEL_MAX_POS=$((MODEL_MAX_POS - max_gen_toks))
+        TASK_NAME="ruler_qa_squad"
+    else
+        # if input task is ruler or niah_multiquery
+        TASK_NAME="niah_multiquery"
+    fi
     max_length=${MODEL_MAX_POS}
-    max_gen_toks=128
-    SEQ_LENGTHS="${MODEL_MAX_POS}"
-    TASK_NAME="niah_multiquery"
+    SEQ_LENGTHS=${MODEL_MAX_POS}
     BATCH_SIZE=32
 fi
 
@@ -413,7 +420,7 @@ run_aisbench_eval() {
 if [[ "$TASK_NAME" == *"longbench"* ]]; then
     echo "Running LongBench v2 evaluation..."
     run_longbench_eval
-elif [[ "$TASK_NAME" == *"niah"* ]]; then
+elif [[ "$TASK_NAME" == *"ruler"* ]] || [[ "$TASK_NAME" == *"niah_multiquery"* ]]; then
     echo "Running RULER evaluation..."
     run_ruler_eval
 elif [[ "$TASK_NAME" == *"aisbench"* ]]; then