From ba0c5e37890ae67312219498739b45e6b3a274dd Mon Sep 17 00:00:00 2001
From: chensuyue <suyue.chen@intel.com>
Date: Fri, 5 Jun 2026 11:50:10 +0800
Subject: [PATCH 01/16] for test

Signed-off-by: chensuyue <suyue.chen@intel.com>
---
 .../language-modeling/quantization/auto_round/qwen/run_quant.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_quant.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_quant.sh
index bccc69a712d..ceeba2e843e 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_quant.sh
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_quant.sh
@@ -61,7 +61,7 @@ done
 [ -z "$MODEL" ] && echo "Error: --model is required" && usage
 [ -z "$TARGET" ] && echo "Error: -t is required" && usage
 [ -z "$OUTPUT_DIR" ] && echo "Error: --output_dir is required" && usage
-
+export AR_DYNAMO_CACHE_SIZE_LIMIT=8 # for tuning duration regression test
 python quantize.py \
   --model "$MODEL" \
   -t "$TARGET" \

From c9b26003e55621ab566630f4a838b2e08ac4e015 Mon Sep 17 00:00:00 2001
From: chensuyue <suyue.chen@intel.com>
Date: Tue, 9 Jun 2026 10:32:54 +0800
Subject: [PATCH 02/16] remove test code

Signed-off-by: chensuyue <suyue.chen@intel.com>
---
 .../language-modeling/quantization/auto_round/qwen/run_quant.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_quant.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_quant.sh
index ceeba2e843e..bccc69a712d 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_quant.sh
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_quant.sh
@@ -61,7 +61,7 @@ done
 [ -z "$MODEL" ] && echo "Error: --model is required" && usage
 [ -z "$TARGET" ] && echo "Error: -t is required" && usage
 [ -z "$OUTPUT_DIR" ] && echo "Error: --output_dir is required" && usage
-export AR_DYNAMO_CACHE_SIZE_LIMIT=8 # for tuning duration regression test
+
 python quantize.py \
   --model "$MODEL" \
   -t "$TARGET" \

From c6c8c305d540feb50fb7158de5c87e9eddcec23c Mon Sep 17 00:00:00 2001
From: chensuyue <suyue.chen@intel.com>
Date: Wed, 10 Jun 2026 13:31:58 +0800
Subject: [PATCH 03/16] update ds test deps

Signed-off-by: chensuyue <suyue.chen@intel.com>
---
 .../auto_round/deepseek/requirements.txt      |   9 +-
 .../quantization/auto_round/deepseek/setup.sh | 128 ++++++++++++++++--
 2 files changed, 118 insertions(+), 19 deletions(-)

diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/requirements.txt b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/requirements.txt
index f690755a087..32ed6147a26 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/requirements.txt
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/requirements.txt
@@ -1,9 +1,4 @@
-lm-eval==0.4.10
-lm-eval[api]
 loguru
-compressed-tensors==0.12.2
+compressed-tensors==0.15.0.1
 hf_transfer
-transformers==4.57.3
-torch==2.9.0
-# pip install git+https://github.com/yiliu30/long-bench-eval
-long-bench-eval @ git+https://github.com/yiliu30/long-bench-eval
\ No newline at end of file
+transformers==4.57.3
\ No newline at end of file
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/setup.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/setup.sh
index 0486f99bfdd..67c4675e0b4 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/setup.sh
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/setup.sh
@@ -1,12 +1,116 @@
-pip install -r requirements.txt
-pip install setuptools --upgrade
-pip install packaging --upgrade
-pip install transformers==4.57.3
-pip install -U "huggingface_hub[cli]"
-# Install vllm
-git clone -b fused-moe-ar --single-branch --quiet https://github.com/yiliu30/vllm-fork.git && cd vllm-fork
-VLLM_USE_PRECOMPILED=1 uv pip install --prerelease=allow . -v
-cd ..
-# Uninstall flash_attn to avoid conflicts
-pip uninstall flash_attn -y
-pip install lm_eval["ruler"]
\ No newline at end of file
+#!/bin/bash
+set -e
+usage() {  # Print the supported command-line flags to stdout; it does not exit, callers follow it with `exit 1`.
+    echo "Usage: $0 --device=[gpu|xpu] --format=[AR|LLMC] --task=[task_list] --bench_tool=[lm_eval|aisbench]"
+    echo "  --device    target device for quantization (gpu or xpu)"
+    echo "  --format    quantization format (AR for auto_round, LLMC for llm_compressor)"
+    echo "  --task      comma-separated list of evaluation tasks (e.g. gsm8k,hellaswag)"
+    echo "  --bench_tool benchmarking tool to use (lm_eval or aisbench)"
+}
+
+detect_cuda_version() {  # Echo the detected CUDA version string on stdout, or report on stderr and exit 1.
+    local cuda_version=""
+    local candidate_version=""
+    # First choice: the text after "CUDA Version: " in nvidia-smi output, accepted only if it is digits and dots.
+    if command -v nvidia-smi >/dev/null 2>&1; then
+        candidate_version=$(nvidia-smi 2>/dev/null | sed -n 's/.*CUDA Version: \([^ ]*\).*/\1/p' | head -n 1)
+        if [[ "$candidate_version" =~ ^[0-9.]+$ ]]; then
+            cuda_version="$candidate_version"
+        fi
+    fi
+    # Fallback, only when nvidia-smi gave nothing usable: field 6 of nvcc's "release" line, minus a leading "V" and a comma.
+    if [[ -z "$cuda_version" ]] && command -v nvcc >/dev/null 2>&1; then
+        candidate_version=$(nvcc --version | awk '/release/ {print $6}' | sed 's/^V//; s/,//')
+        if [[ "$candidate_version" =~ ^[0-9.]+$ ]]; then
+            cuda_version="$candidate_version"
+        fi
+    fi
+    # The call site runs this function inside $(...), so this exit ends that subshell; the script then stops via set -e.
+    if [[ -z "$cuda_version" ]]; then
+        echo "Unable to detect CUDA version from nvidia-smi or nvcc." >&2
+        exit 1
+    fi
+    # Only the version is written to stdout so the caller can capture it with command substitution.
+    echo "$cuda_version"
+}
+
+DEVICE="${DEVICE:-gpu}"  # each setting takes the environment value when set, otherwise the default shown
+FORMAT="${FORMAT:-LLMC}"
+TASKS="${TASKS:-hellaswag,piqa,mmlu,gsm8k,ruler}"
+BENCH_TOOL="${BENCH_TOOL:-lm_eval}"
+# Flags given on the command line override the environment/default values above.
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --device=*)
+            DEVICE="${1#*=}"
+            shift
+            ;;
+        --format=*)
+            FORMAT="${1#*=}"
+            shift
+            ;;
+        --task=*)
+            TASKS="${1#*=}"
+            shift
+            ;;
+        --bench_tool=*)
+            BENCH_TOOL="${1#*=}"
+            shift
+            ;;
+        *)
+            echo "Unknown parameter: $1"
+            usage
+            exit 1
+            ;;
+    esac
+done
+# Install the dependency set that matches the selected device, format and benchmark tool.
+if [[ "$DEVICE" == "xpu" ]]; then
+    # support quant only on xpu for now
+    uv pip install torch==2.11.0 torchvision==0.26.0 --index-url https://download.pytorch.org/whl/xpu
+    uv pip install -r requirements.txt --extra-index-url https://download.pytorch.org/whl/xpu
+elif [[ "$DEVICE" == "gpu" ]]; then
+    uv pip install -r requirements.txt
+    uv pip install setuptools --upgrade
+    uv pip install packaging --upgrade
+    uv pip install -U "huggingface_hub[cli]"
+    if [[ "$FORMAT" == "LLMC" ]]; then
+        CUDA_VERSION=$(detect_cuda_version)
+        echo "Detected system CUDA version: $CUDA_VERSION"
+        if [[ "$CUDA_VERSION" == "12."* ]]; then
+            uv pip install vllm==0.22.0 --extra-index-url https://wheels.vllm.ai/0.22.0/cu129 --extra-index-url https://download.pytorch.org/whl/cu129 --index-strategy unsafe-best-match
+            uv pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu129 --index-strategy unsafe-best-match
+        elif [[ "$CUDA_VERSION" == "13."* ]]; then
+            uv pip install vllm==0.22.0
+        else
+            echo "Unsupported CUDA version: $CUDA_VERSION. Supported versions are 12.x and 13.x."
+            exit 1
+        fi
+        # Skip the clone when the checkout already exists, so re-running this script does not abort under set -e.
+        uv pip install ray
+        [[ -d vllm-qdq-plugin ]] || git clone https://github.com/yiliu30/vllm-qdq-plugin.git
+        uv pip install vllm-qdq-plugin/ -v
+    else
+        # use default setting for AR format, required by fused-moe-ar
+        uv pip install torch==2.9.0
+        [[ -d vllm-fork ]] || git clone -b fused-moe-ar --single-branch --quiet https://github.com/yiliu30/vllm-fork.git
+        # Build in a subshell: a failed cd or install now aborts the script, and the working directory is left unchanged.
+        (cd vllm-fork && VLLM_USE_PRECOMPILED=1 uv pip install --prerelease=allow . -v)
+    fi
+    if [[ "$BENCH_TOOL" == "lm_eval" ]]; then
+        uv pip install lm-eval==0.4.12
+        uv pip install "lm-eval[api]"  # quoted so the shell cannot treat the extras brackets as a glob pattern
+        uv pip install "lm-eval[ruler]"
+        if [[ "$TASKS" == *"longbench"* ]]; then
+            uv pip install "long-bench-eval @ git+https://github.com/yiliu30/long-bench-eval"
+        fi
+    elif [[ "$BENCH_TOOL" == "aisbench" ]]; then  # NOTE(review): this branch only prints a message, no install command follows - confirm
+        echo "Installing aisbench..."
+    fi
+    # Uninstall flash_attn to avoid conflicts
+    uv pip uninstall flash_attn
+else
+    echo "Unsupported device: $DEVICE. Supported devices are gpu and xpu."
+    usage
+    exit 1
+fi
\ No newline at end of file

From 4923dede90dcf0314ef2d6ef9a421bdec9c279dd Mon Sep 17 00:00:00 2001
From: chensuyue <suyue.chen@intel.com>
Date: Wed, 10 Jun 2026 13:42:57 +0800
Subject: [PATCH 04/16] update device map for ds

Signed-off-by: chensuyue <suyue.chen@intel.com>
---
 .../quantization/auto_round/deepseek/quantize.py              | 1 +
 .../quantization/auto_round/deepseek/run_quant.sh             | 4 ++--
 .../quantization/auto_round/llama3/run_quant.sh               | 2 +-
 .../language-modeling/quantization/auto_round/llama3/setup.sh | 2 +-
 4 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py
index 48d849556b8..eccc16b8a26 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py
@@ -82,6 +82,7 @@ def quant_model(args):
         iters=iters,
         ignore_layers=config["fp_layers"],
         export_format=args.export_format,
+        device_map="auto",
         output_dir=output_dir,
         low_gpu_mem_usage=True,
         static_kv_dtype=static_kv_dtype,
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_quant.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_quant.sh
index cd9d28e65a0..91639e8c2ad 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_quant.sh
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_quant.sh
@@ -4,7 +4,7 @@ set -e
 MODEL=""
 TARGET=""
 OUTPUT_DIR=""
-EXPORT_FORMAT="auto_round"
+EXPORT_FORMAT="llm_compressor"
 STATIC_KV_DTYPE="None"
 STATIC_ATTENTION_DTYPE="None"
 
@@ -15,7 +15,7 @@ usage() {
   echo "  -kv datatype for kv cache (auto, fp8)"
   echo "  -attn        Data type for static attention cache (default: None)"
   echo "  --output_dir output directory for quantized model"
-  echo "  -f           quantize model export_format (default: auto_round)"
+  echo "  -f           quantize model export_format (default: llm_compressor)"
   exit 1
 }
 
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh
index cd7595e09cf..026ffe7bfe3 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh
@@ -5,7 +5,7 @@
 # Parse command line arguments
 KV_CACHE_DTYPE="auto"
 STATIC_ATTENTION_DTYPE="auto"
-EXPORT_FORMAT="auto_round"
+EXPORT_FORMAT="llm_compressor"
 while [[ $# -gt 0 ]]; do
     case $1 in
         --topology=*)
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/setup.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/setup.sh
index 9e4862e19f1..709b67ed117 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/setup.sh
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/setup.sh
@@ -35,7 +35,7 @@ detect_cuda_version() {
 }
 
 DEVICE="${DEVICE:-gpu}"
-FORMAT="${FORMAT:-AR}"
+FORMAT="${FORMAT:-LLMC}"
 TASKS="${TASKS:-hellaswag,piqa,mmlu,gsm8k}"
 BENCH_TOOL="${BENCH_TOOL:-lm_eval}"
 

From a105748604c9849a1d932b6896f46e8a4fd034dd Mon Sep 17 00:00:00 2001
From: chensuyue <suyue.chen@intel.com>
Date: Fri, 12 Jun 2026 16:37:01 +0800
Subject: [PATCH 05/16] update readme

Signed-off-by: chensuyue <suyue.chen@intel.com>
---
 .../auto_round/deepseek/README.md             | 19 +++++--------------
 .../quantization/auto_round/qwen/README.md    | 14 +++-----------
 2 files changed, 8 insertions(+), 25 deletions(-)

diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/README.md
index 66c2c403b22..5541bd9b483 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/README.md
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/README.md
@@ -3,14 +3,8 @@ This example provides an end-to-end workflow to quantize DeepSeek models to MXFP
 ## Requirement
 ```bash
 pip install neural-compressor-pt
-# auto-round
 pip install auto-round
-# vLLM
-git clone -b fused-moe-ar --single-branch --quiet https://github.com/yiliu30/vllm-fork.git && cd vllm-fork
-VLLM_USE_PRECOMPILED=1 pip install --editable . -vvv
-# other requirements
-pip install -r requirements.txt
-pip uninstall flash_attn
+bash setup.sh
 ```
 
 ### Quantize Model
@@ -65,7 +59,7 @@ bash ./run_generate.sh -s mxfp4 -tp 8 -m /path/to/ds_mxfp4
 ```
 - NVFP4
 ```bash
-bash ./run_generate.sh -s nvfp4 -tp 8 -m /path/to/ds_mxfp4
+bash ./run_generate.sh -s nvfp4 -tp 8 -m /path/to/ds_nvfp4
 ```
 ### Evaluation
 
@@ -75,17 +69,14 @@ Usage:
 bash run_evaluation.sh -m [model_path] -s [mxfp4|mxfp8|nvfp4] -t [task_name] -tp [tensor_parallel_size] -b [batch_size]
 ```
 ```bash
-bash run_evaluation.sh -s mxfp8 -t piqa,hellaswag,mmlu -tp 8 -b 512 -m /path/to/ds_mxfp8
-bash run_evaluation.sh -s mxfp8 -t gsm8k -tp 8 -b 256 -m /path/to/ds_mxfp8
+bash run_evaluation.sh -s mxfp8 -t piqa,hellaswag,mmlu,gsm8k -tp 8 -b 256 -m /path/to/ds_mxfp8
 
 ```
 - MXFP4
 ```bash
-bash run_evaluation.sh -s mxfp4 -t piqa,hellaswag,mmlu -tp 8 -b 512 -m /path/to/ds_mxfp4
-bash run_evaluation.sh -s mxfp4 -t gsm8k -tp 8 -b 256 -m /path/to/ds_mxfp4
+bash run_evaluation.sh -s mxfp4 -t piqa,hellaswag,mmlu,gsm8k -tp 8 -b 256 -m /path/to/ds_mxfp4
 ```
 - NVFP4
 ```bash
-bash run_evaluation.sh -s nvfp4 -t piqa,hellaswag,mmlu -tp 8 -b 512 -m /path/to/ds_nvfp4
-bash run_evaluation.sh -s nvfp4 -t gsm8k -tp 8 -b 256 -m /path/to/ds_nvfp4
+bash run_evaluation.sh -s nvfp4 -t piqa,hellaswag,mmlu,gsm8k -tp 8 -b 256 -m /path/to/ds_nvfp4
 ```
\ No newline at end of file
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/README.md
index f8621d30d63..07ae97da23d 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/README.md
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/README.md
@@ -3,14 +3,8 @@ This example provides an end-to-end workflow to quantize Qwen models to MXFP4/MX
 ## Requirement
 ```bash
 uv pip install neural-compressor-pt
-# auto-round
 uv pip install auto-round
-# vLLM
-git clone -b fused-moe-ar --single-branch --quiet https://github.com/yiliu30/vllm-fork.git && cd vllm-fork
-VLLM_USE_PRECOMPILED=1 uv pip install --editable . -vvv
-# other requirements
-uv pip install -r requirements.txt
-uv pip uninstall flash_attn
+bash setup.sh
 ```
 
 ### Quantize Model
@@ -74,12 +68,10 @@ Usage:
 bash run_evaluation.sh -m [model_path] -s [mxfp4|mxfp8] -t [task_name] -tp [tensor_parallel_size] -b [batch_size]
 ```
 ```bash
-bash run_evaluation.sh -s mxfp8 -t piqa,hellaswag,mmlu -tp 4 -b 512 -m /path/to/qwen_mxfp8
-bash run_evaluation.sh -s mxfp8 -t gsm8k -tp 4 -b 256 -m /path/to/qwen_mxfp8
+bash run_evaluation.sh -s mxfp8 -t piqa,hellaswag,mmlu,gsm8k -tp 4 -b 256 -m /path/to/qwen_mxfp8
 
 ```
 - MXFP4
 ```bash
-bash run_evaluation.sh -s mxfp4 -t piqa,hellaswag,mmlu -tp 4 -b 512 -m /path/to/qwen_mxfp4
-bash run_evaluation.sh -s mxfp4 -t gsm8k -tp 4 -b 256 -m /path/to/qwen_mxfp4
+bash run_evaluation.sh -s mxfp4 -t piqa,hellaswag,mmlu,gsm8k -tp 4 -b 256 -m /path/to/qwen_mxfp4
 ```
\ No newline at end of file

From c1245beb0d1294e2d0949082785f3951f8a42dd5 Mon Sep 17 00:00:00 2001
From: chensuyue <suyue.chen@intel.com>
Date: Fri, 12 Jun 2026 16:41:20 +0800
Subject: [PATCH 06/16] Update ds ruler with ruler_qa_squad task

Signed-off-by: chensuyue <suyue.chen@intel.com>
---
 .../quantization/auto_round/deepseek/run_evaluation.sh          | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh
index 0ce8e608ffc..17fa299ef1a 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh
@@ -104,7 +104,7 @@ if [[ "$TASK_NAME" == *"ruler"* ]]; then
     max_length=${MODEL_MAX_POS}
     max_gen_toks=128
     SEQ_LENGTHS="${MODEL_MAX_POS}"
-    TASK_NAME="niah_multiquery"
+    TASK_NAME="niah_multiquery,ruler_qa_squad"
     BATCH_SIZE=32
 fi
 

From 810cd5eb7a9083fddcec5cde82fcf2a5242860c8 Mon Sep 17 00:00:00 2001
From: chensuyue <suyue.chen@intel.com>
Date: Mon, 15 Jun 2026 20:50:51 +0800
Subject: [PATCH 07/16] run ruler_qa_squad

Signed-off-by: chensuyue <suyue.chen@intel.com>
---
 .../quantization/auto_round/qwen/run_evaluation.sh              | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_evaluation.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_evaluation.sh
index 1811191057b..dc2fa25ec39 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_evaluation.sh
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_evaluation.sh
@@ -107,7 +107,7 @@ if [[ "$TASK_NAME" == *"ruler"* ]]; then
     max_length=${MODEL_MAX_POS}
     max_gen_toks=128
     SEQ_LENGTHS="${MODEL_MAX_POS}"
-    TASK_NAME="niah_multiquery"
+    TASK_NAME="ruler_qa_squad" # niah_multiquery,ruler_qa_squad
     BATCH_SIZE=32
 fi
 

From d09dc05555c38b5359c83cdf697979771b02c355 Mon Sep 17 00:00:00 2001
From: chensuyue <suyue.chen@intel.com>
Date: Tue, 16 Jun 2026 10:24:28 +0800
Subject: [PATCH 08/16] fix ruler test path

Signed-off-by: chensuyue <suyue.chen@intel.com>
---
 .../quantization/auto_round/qwen/run_evaluation.sh              | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_evaluation.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_evaluation.sh
index dc2fa25ec39..29ac3fc7f38 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_evaluation.sh
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_evaluation.sh
@@ -413,7 +413,7 @@ run_aisbench_eval() {
 if [[ "$TASK_NAME" == *"longbench"* ]]; then
     echo "Running LongBench v2 evaluation..."
     run_longbench_eval
-elif [[ "$TASK_NAME" == *"niah"* ]]; then
+elif [[ "$TASK_NAME" == *"ruler"* ]]; then
     echo "Running RULER evaluation..."
     run_ruler_eval
 elif [[ "$TASK_NAME" == *"aisbench"* ]]; then

From 8e544a662a8939fd7a8423dbb72ff07bc1492f3a Mon Sep 17 00:00:00 2001
From: chensuyue <suyue.chen@intel.com>
Date: Tue, 16 Jun 2026 13:35:14 +0800
Subject: [PATCH 09/16] use separate task for ruler

Signed-off-by: chensuyue <suyue.chen@intel.com>
---
 .../auto_round/qwen/run_evaluation.sh             | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_evaluation.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_evaluation.sh
index 29ac3fc7f38..a1aa2ea1679 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_evaluation.sh
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_evaluation.sh
@@ -102,12 +102,19 @@ if [[ "$TASK_NAME" == *"longbench"* ]]; then
 fi
 
 # update max_length based on the task
-if [[ "$TASK_NAME" == *"ruler"* ]]; then
+if [[ "$TASK_NAME" == *"ruler"* ]] || [[ "$TASK_NAME" == *"niah_multiquery"* ]]; then
+    max_gen_toks=128
     MODEL_MAX_POS=${RULER_MAX_POS:-131072}
+    if [[ "$TASK_NAME" == *"ruler_qa_squad"* ]]; then
+        # ruler_qa_squad: shrink the window by max_gen_toks, starting from MODEL_MAX_POS so a RULER_MAX_POS override is honored
+        MODEL_MAX_POS=$((MODEL_MAX_POS - max_gen_toks))
+        TASK_NAME="ruler_qa_squad"
+    else
+        # any other ruler or niah_multiquery request runs the niah_multiquery task at the full window
+        TASK_NAME="niah_multiquery"
+    fi
     max_length=${MODEL_MAX_POS}
-    max_gen_toks=128
     SEQ_LENGTHS="${MODEL_MAX_POS}"
-    TASK_NAME="ruler_qa_squad" # niah_multiquery,ruler_qa_squad
     BATCH_SIZE=32
 fi
 
@@ -413,7 +420,7 @@ run_aisbench_eval() {
 if [[ "$TASK_NAME" == *"longbench"* ]]; then
     echo "Running LongBench v2 evaluation..."
     run_longbench_eval
-elif [[ "$TASK_NAME" == *"ruler"* ]]; then
+elif [[ "$TASK_NAME" == *"ruler"* ]] || [[ "$TASK_NAME" == *"niah_multiquery"* ]]; then
     echo "Running RULER evaluation..."
     run_ruler_eval
 elif [[ "$TASK_NAME" == *"aisbench"* ]]; then

From d451a6040687e0fb073e129f8f55adc371823235 Mon Sep 17 00:00:00 2001
From: chensuyue <suyue.chen@intel.com>
Date: Tue, 16 Jun 2026 23:19:50 +0800
Subject: [PATCH 10/16] update ds ruler params

Signed-off-by: chensuyue <suyue.chen@intel.com>
---
 .../auto_round/deepseek/run_evaluation.sh         | 15 +++++++++++----
 .../auto_round/qwen/run_evaluation.sh             |  2 +-
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh
index 17fa299ef1a..b2662295da9 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh
@@ -99,12 +99,19 @@ if [[ "$TASK_NAME" == *"longbench"* ]]; then
 fi
 
 # update max_length based on the task
-if [[ "$TASK_NAME" == *"ruler"* ]]; then
+if [[ "$TASK_NAME" == *"ruler"* ]] || [[ "$TASK_NAME" == *"niah_multiquery"* ]]; then
+    max_gen_toks=128
     MODEL_MAX_POS=${RULER_MAX_POS:-131072}
+    if [[ "$TASK_NAME" == *"ruler_qa_squad"* ]]; then
+        # ruler_qa_squad: shrink the window by max_gen_toks, starting from MODEL_MAX_POS so a RULER_MAX_POS override is honored
+        MODEL_MAX_POS=$((MODEL_MAX_POS - max_gen_toks))
+        TASK_NAME="ruler_qa_squad"
+    else
+        # any other ruler or niah_multiquery request runs the niah_multiquery task at the full window
+        TASK_NAME="niah_multiquery"
+    fi
     max_length=${MODEL_MAX_POS}
-    max_gen_toks=128
-    SEQ_LENGTHS="${MODEL_MAX_POS}"
-    TASK_NAME="niah_multiquery,ruler_qa_squad"
+    SEQ_LENGTHS=${MODEL_MAX_POS}
     BATCH_SIZE=32
 fi
 
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_evaluation.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_evaluation.sh
index a1aa2ea1679..91f58c37be1 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_evaluation.sh
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_evaluation.sh
@@ -114,7 +114,7 @@ if [[ "$TASK_NAME" == *"ruler"* ]] || [[ "$TASK_NAME" == *"niah_multiquery"* ]];
         TASK_NAME="niah_multiquery"
     fi
     max_length=${MODEL_MAX_POS}
-    SEQ_LENGTHS="${MODEL_MAX_POS}"
+    SEQ_LENGTHS=${MODEL_MAX_POS}
     BATCH_SIZE=32
 fi
 

From d23bdf0b4374e954a2e28918f2a37a186e8733fd Mon Sep 17 00:00:00 2001
From: chensuyue <suyue.chen@intel.com>
Date: Wed, 17 Jun 2026 07:27:03 +0800
Subject: [PATCH 11/16] fix vllm server

Signed-off-by: chensuyue <suyue.chen@intel.com>
---
 .../auto_round/deepseek/run_evaluation.sh         | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh
index b2662295da9..4b53f299dac 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh
@@ -212,6 +212,21 @@ run_standard_eval() {
 # Function to start vLLM server
 start_vllm_server() {
     echo "Starting vLLM server on port ${SERVER_PORT}..."
+
+    # Detect vLLM version for backward-compatible rope scaling
+    # vLLM >= 0.19 removed --rope-scaling; use --hf-overrides instead
+    VLLM_VERSION=$(python -c "import vllm; print(vllm.__version__)" 2>/dev/null || echo "0.0.0")
+    VLLM_MAJOR_MINOR=$(echo "$VLLM_VERSION" | awk -F. '{printf "%d%02d", $1, $2}')
+    ROPE_SCALING_JSON='{"rope_type":"yarn","factor":4.0,"original_max_position_embeddings":32768}'
+    if [ "$VLLM_MAJOR_MINOR" -ge 19 ] 2>/dev/null; then
+        ROPE_FLAG="--hf-overrides"
+        ROPE_VALUE="{\"rope_scaling\":${ROPE_SCALING_JSON},\"max_position_embeddings\":131072}"
+    else
+        ROPE_FLAG="--rope-scaling"
+        ROPE_VALUE="${ROPE_SCALING_JSON}"
+    fi
+    echo "vLLM version: ${VLLM_VERSION}, using: ${ROPE_FLAG} '${ROPE_VALUE}'"
+
     vllm serve ${MODEL_PATH} \
         --port ${SERVER_PORT} \
         --tensor-parallel-size ${TP_SIZE} \

From c1036f48edc2282e2793dcab97d060f27dc8d30c Mon Sep 17 00:00:00 2001
From: chensuyue <suyue.chen@intel.com>
Date: Wed, 17 Jun 2026 07:41:58 +0800
Subject: [PATCH 12/16] bug fix

Signed-off-by: chensuyue <suyue.chen@intel.com>
---
 .../quantization/auto_round/deepseek/run_evaluation.sh           | 1 -
 1 file changed, 1 deletion(-)

diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh
index 4b53f299dac..355c5ed69d7 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh
@@ -234,7 +234,6 @@ start_vllm_server() {
         --gpu-memory-utilization 0.8 \
         --dtype bfloat16 \
         --kv-cache-dtype ${KV_CACHE_DTYPE} \
-        --disable-log-requests \
         > ${OUTPUT_DIR}/vllm_server.log 2>&1 &
     
     VLLM_PID=$!

From 63a63a221ed9068d457162c0c286da9da1702f3c Mon Sep 17 00:00:00 2001
From: yiliu30 <yi4.liu@intel.com>
Date: Wed, 17 Jun 2026 11:23:01 +0000
Subject: [PATCH 13/16] use con > 1

Signed-off-by: yiliu30 <yi4.liu@intel.com>
---
 .../quantization/auto_round/deepseek/run_evaluation.sh          | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh
index 355c5ed69d7..c9a6f375af5 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh
@@ -313,7 +313,7 @@ run_ruler_eval() {
     echo "Running Ruler evaluation against vLLM server..."
     lm_eval \
         --model local-completions \
-        --model_args "model=$MODEL_PATH,base_url=http://localhost:${SERVER_PORT}/v1/completions,num_concurrent=1,max_retries=50,timeout=500,tokenized_requests=False,max_gen_toks=${max_gen_toks}" \
+        --model_args "model=$MODEL_PATH,base_url=http://localhost:${SERVER_PORT}/v1/completions,num_concurrent=16,max_retries=500,timeout=500,tokenized_requests=False,max_gen_toks=${max_gen_toks}" \
         --tasks $TASK_NAME \
         --metadata="{\"max_seq_lengths\":[${SEQ_LENGTHS}],\"tokenizer\":\"${MODEL_PATH}\"}" \
         --gen_kwargs "max_gen_toks=${max_gen_toks}" \

From eb89c72ddf6b4b59511edbbe13d7440d07f6fcca Mon Sep 17 00:00:00 2001
From: Yi Liu <yi4.liu@intel.com>
Date: Wed, 17 Jun 2026 21:09:35 +0800
Subject: [PATCH 14/16] Update run_evaluation.sh

---
 .../quantization/auto_round/deepseek/run_evaluation.sh           | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh
index c9a6f375af5..a5b7ed2af56 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh
@@ -318,6 +318,7 @@ run_ruler_eval() {
         --metadata="{\"max_seq_lengths\":[${SEQ_LENGTHS}],\"tokenizer\":\"${MODEL_PATH}\"}" \
         --gen_kwargs "max_gen_toks=${max_gen_toks}" \
         --batch_size ${BATCH_SIZE} \
+        --limit 32 \
         --output_path "${OUTPUT_DIR}/seq_${SEQ_LENGTHS}" \
         --seed 42
 

From 4fc41091be59e98eed951703852a3c63016b8de6 Mon Sep 17 00:00:00 2001
From: yiliu30 <yi4.liu@intel.com>
Date: Wed, 17 Jun 2026 15:56:25 +0000
Subject: [PATCH 15/16] Revert "Update run_evaluation.sh"

This reverts commit eb89c72ddf6b4b59511edbbe13d7440d07f6fcca.
---
 .../quantization/auto_round/deepseek/run_evaluation.sh           | 1 -
 1 file changed, 1 deletion(-)

diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh
index a5b7ed2af56..c9a6f375af5 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh
@@ -318,7 +318,6 @@ run_ruler_eval() {
         --metadata="{\"max_seq_lengths\":[${SEQ_LENGTHS}],\"tokenizer\":\"${MODEL_PATH}\"}" \
         --gen_kwargs "max_gen_toks=${max_gen_toks}" \
         --batch_size ${BATCH_SIZE} \
-        --limit 32 \
         --output_path "${OUTPUT_DIR}/seq_${SEQ_LENGTHS}" \
         --seed 42
 

From cfac3ad959a219bdbad6d29b2fc49da23d7efbf8 Mon Sep 17 00:00:00 2001
From: yiliu30 <yi4.liu@intel.com>
Date: Wed, 17 Jun 2026 15:56:25 +0000
Subject: [PATCH 16/16] Revert "use con > 1"

This reverts commit 63a63a221ed9068d457162c0c286da9da1702f3c.
---
 .../quantization/auto_round/deepseek/run_evaluation.sh          | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh
index c9a6f375af5..355c5ed69d7 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh
@@ -313,7 +313,7 @@ run_ruler_eval() {
     echo "Running Ruler evaluation against vLLM server..."
     lm_eval \
         --model local-completions \
-        --model_args "model=$MODEL_PATH,base_url=http://localhost:${SERVER_PORT}/v1/completions,num_concurrent=16,max_retries=500,timeout=500,tokenized_requests=False,max_gen_toks=${max_gen_toks}" \
+        --model_args "model=$MODEL_PATH,base_url=http://localhost:${SERVER_PORT}/v1/completions,num_concurrent=1,max_retries=50,timeout=500,tokenized_requests=False,max_gen_toks=${max_gen_toks}" \
         --tasks $TASK_NAME \
         --metadata="{\"max_seq_lengths\":[${SEQ_LENGTHS}],\"tokenizer\":\"${MODEL_PATH}\"}" \
         --gen_kwargs "max_gen_toks=${max_gen_toks}" \