From ba0c5e37890ae67312219498739b45e6b3a274dd Mon Sep 17 00:00:00 2001 From: chensuyue Date: Fri, 5 Jun 2026 11:50:10 +0800 Subject: [PATCH 01/16] for test Signed-off-by: chensuyue --- .../language-modeling/quantization/auto_round/qwen/run_quant.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_quant.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_quant.sh index bccc69a712d..ceeba2e843e 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_quant.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_quant.sh @@ -61,7 +61,7 @@ done [ -z "$MODEL" ] && echo "Error: --model is required" && usage [ -z "$TARGET" ] && echo "Error: -t is required" && usage [ -z "$OUTPUT_DIR" ] && echo "Error: --output_dir is required" && usage - +export AR_DYNAMO_CACHE_SIZE_LIMIT=8 # for tuning duration regression test python quantize.py \ --model "$MODEL" \ -t "$TARGET" \ From c9b26003e55621ab566630f4a838b2e08ac4e015 Mon Sep 17 00:00:00 2001 From: chensuyue Date: Tue, 9 Jun 2026 10:32:54 +0800 Subject: [PATCH 02/16] remove test code Signed-off-by: chensuyue --- .../language-modeling/quantization/auto_round/qwen/run_quant.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_quant.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_quant.sh index ceeba2e843e..bccc69a712d 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_quant.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_quant.sh @@ -61,7 +61,7 @@ done [ -z "$MODEL" ] && echo "Error: --model is required" && usage [ -z "$TARGET" ] && echo "Error: -t is required" && usage [ -z "$OUTPUT_DIR" ] && echo "Error: --output_dir is required" && usage -export AR_DYNAMO_CACHE_SIZE_LIMIT=8 # for tuning duration regression test + python quantize.py \ --model "$MODEL" \ -t "$TARGET" \ From c6c8c305d540feb50fb7158de5c87e9eddcec23c Mon Sep 17 00:00:00 2001 From: chensuyue Date: Wed, 10 Jun 2026 13:31:58 +0800 Subject: [PATCH 03/16] update ds test deps Signed-off-by: chensuyue --- .../auto_round/deepseek/requirements.txt | 9 +- .../quantization/auto_round/deepseek/setup.sh | 128 ++++++++++++++++-- 2 files changed, 118 insertions(+), 19 deletions(-) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/requirements.txt b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/requirements.txt index f690755a087..32ed6147a26 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/requirements.txt +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/requirements.txt @@ -1,9 +1,4 @@ -lm-eval==0.4.10 -lm-eval[api] loguru -compressed-tensors==0.12.2 +compressed-tensors==0.15.0.1 hf_transfer -transformers==4.57.3 -torch==2.9.0 -# pip install git+https://github.com/yiliu30/long-bench-eval -long-bench-eval @ git+https://github.com/yiliu30/long-bench-eval \ No newline at end of file +transformers==4.57.3 \ No newline at end of file diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/setup.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/setup.sh index 0486f99bfdd..67c4675e0b4 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/setup.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/setup.sh @@ -1,12 +1,116 @@ -pip install -r requirements.txt -pip install setuptools --upgrade -pip install packaging --upgrade -pip install transformers==4.57.3 -pip install -U "huggingface_hub[cli]" -# Install vllm -git clone -b fused-moe-ar --single-branch --quiet https://github.com/yiliu30/vllm-fork.git && cd vllm-fork -VLLM_USE_PRECOMPILED=1 uv pip install --prerelease=allow . -v -cd .. -# Uninstall flash_attn to avoid conflicts -pip uninstall flash_attn -y -pip install lm_eval["ruler"] \ No newline at end of file +#!/bin/bash +set -e +usage() { + echo "Usage: $0 --device=[gpu|xpu] --format=[AR|LLMC] --task=[task_list] --bench_tool=[lm_eval|aisbench]" + echo " --device target device for quantization (gpu or xpu)" + echo " --format quantization format (AR for auto_round, LLMC for llm_compressor)" + echo " --task comma-separated list of evaluation tasks (e.g. gsm8k,hellaswag)" + echo " --bench_tool benchmarking tool to use (lm_eval or aisbench)" +} + +detect_cuda_version() { + local cuda_version="" + local candidate_version="" + + if command -v nvidia-smi >/dev/null 2>&1; then + candidate_version=$(nvidia-smi 2>/dev/null | sed -n 's/.*CUDA Version: \([^ ]*\).*/\1/p' | head -n 1) + if [[ "$candidate_version" =~ ^[0-9.]+$ ]]; then + cuda_version="$candidate_version" + fi + fi + + if [[ -z "$cuda_version" ]] && command -v nvcc >/dev/null 2>&1; then + candidate_version=$(nvcc --version | awk '/release/ {print $6}' | sed 's/^V//; s/,//') + if [[ "$candidate_version" =~ ^[0-9.]+$ ]]; then + cuda_version="$candidate_version" + fi + fi + + if [[ -z "$cuda_version" ]]; then + echo "Unable to detect CUDA version from nvidia-smi or nvcc." >&2 + exit 1 + fi + + echo "$cuda_version" +} + +DEVICE="${DEVICE:-gpu}" +FORMAT="${FORMAT:-LLMC}" +TASKS="${TASKS:-hellaswag,piqa,mmlu,gsm8k,ruler}" +BENCH_TOOL="${BENCH_TOOL:-lm_eval}" + +while [[ $# -gt 0 ]]; do + case $1 in + --device=*) + DEVICE="${1#*=}" + shift + ;; + --format=*) + FORMAT="${1#*=}" + shift + ;; + --task=*) + TASKS="${1#*=}" + shift + ;; + --bench_tool=*) + BENCH_TOOL="${1#*=}" + shift + ;; + *) + echo "Unknown parameter: $1" + usage + exit 1 + ;; + esac +done + +if [[ "$DEVICE" == "xpu" ]]; then + # support quant only on xpu for now + uv pip install torch==2.11.0 torchvision==0.26.0 --index-url https://download.pytorch.org/whl/xpu + uv pip install -r requirements.txt --extra-index-url https://download.pytorch.org/whl/xpu +elif [[ "$DEVICE" == "gpu" ]]; then + uv pip install -r requirements.txt + uv pip install setuptools --upgrade + uv pip install packaging --upgrade + uv pip install -U "huggingface_hub[cli]" + if [[ "$FORMAT" == "LLMC" ]]; then + CUDA_VERSION=$(detect_cuda_version) + echo "Detected system CUDA version: $CUDA_VERSION" + if [[ "$CUDA_VERSION" == "12."* ]]; then + uv pip install vllm==0.22.0 --extra-index-url https://wheels.vllm.ai/0.22.0/cu129 --extra-index-url https://download.pytorch.org/whl/cu129 --index-strategy unsafe-best-match + uv pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu129 --index-strategy unsafe-best-match + elif [[ "$CUDA_VERSION" == "13."* ]]; then + uv pip install vllm==0.22.0 + else + echo "Unsupported CUDA version: $CUDA_VERSION. Supported versions are 12.x and 13.x." + exit 1 + fi + + uv pip install ray + git clone https://github.com/yiliu30/vllm-qdq-plugin.git + uv pip install vllm-qdq-plugin/ -v + else + # use default setting for AR format, required by fused-moe-ar + uv pip install torch==2.9.0 + git clone -b fused-moe-ar --single-branch --quiet https://github.com/yiliu30/vllm-fork.git && cd vllm-fork + VLLM_USE_PRECOMPILED=1 uv pip install --prerelease=allow . -v + cd .. + fi + if [[ "$BENCH_TOOL" == "lm_eval" ]]; then + uv pip install lm-eval==0.4.12 + uv pip install lm-eval[api] + uv pip install lm-eval["ruler"] + if [[ "$TASKS" == *"longbench"* ]]; then + uv pip install "long-bench-eval @ git+https://github.com/yiliu30/long-bench-eval" + fi + elif [[ "$BENCH_TOOL" == "aisbench" ]]; then + echo "Installing aisbench..." + fi + # Uninstall flash_attn to avoid conflicts + uv pip uninstall flash_attn +else + echo "Unsupported device: $DEVICE. Supported devices are gpu and xpu." + usage + exit 1 +fi \ No newline at end of file From 4923dede90dcf0314ef2d6ef9a421bdec9c279dd Mon Sep 17 00:00:00 2001 From: chensuyue Date: Wed, 10 Jun 2026 13:42:57 +0800 Subject: [PATCH 04/16] update device map for ds Signed-off-by: chensuyue --- .../quantization/auto_round/deepseek/quantize.py | 1 + .../quantization/auto_round/deepseek/run_quant.sh | 4 ++-- .../quantization/auto_round/llama3/run_quant.sh | 2 +- .../language-modeling/quantization/auto_round/llama3/setup.sh | 2 +- 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py index 48d849556b8..eccc16b8a26 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py @@ -82,6 +82,7 @@ def quant_model(args): iters=iters, ignore_layers=config["fp_layers"], export_format=args.export_format, + device_map="auto", output_dir=output_dir, low_gpu_mem_usage=True, static_kv_dtype=static_kv_dtype, diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_quant.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_quant.sh index cd9d28e65a0..91639e8c2ad 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_quant.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_quant.sh @@ -4,7 +4,7 @@ set -e MODEL="" TARGET="" OUTPUT_DIR="" -EXPORT_FORMAT="auto_round" +EXPORT_FORMAT="llm_compressor" STATIC_KV_DTYPE="None" STATIC_ATTENTION_DTYPE="None" @@ -15,7 +15,7 @@ usage() { echo " -kv datatype for kv cache (auto, fp8)" echo " -attn Data type for static attention cache (default: None)" echo " --output_dir output directory for quantized model" - echo " -f quantize model export_format (default: auto_round)" + echo " -f quantize model export_format (default: llm_compressor)" exit 1 } diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh index cd7595e09cf..026ffe7bfe3 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh @@ -5,7 +5,7 @@ # Parse command line arguments KV_CACHE_DTYPE="auto" STATIC_ATTENTION_DTYPE="auto" -EXPORT_FORMAT="auto_round" +EXPORT_FORMAT="llm_compressor" while [[ $# -gt 0 ]]; do case $1 in --topology=*) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/setup.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/setup.sh index 9e4862e19f1..709b67ed117 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/setup.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/setup.sh @@ -35,7 +35,7 @@ detect_cuda_version() { } DEVICE="${DEVICE:-gpu}" -FORMAT="${FORMAT:-AR}" +FORMAT="${FORMAT:-LLMC}" TASKS="${TASKS:-hellaswag,piqa,mmlu,gsm8k}" BENCH_TOOL="${BENCH_TOOL:-lm_eval}" From a105748604c9849a1d932b6896f46e8a4fd034dd Mon Sep 17 00:00:00 2001 From: chensuyue Date: Fri, 12 Jun 2026 16:37:01 +0800 Subject: [PATCH 05/16] update readme Signed-off-by: chensuyue --- .../auto_round/deepseek/README.md | 19 +++++-------------- .../quantization/auto_round/qwen/README.md | 14 +++----------- 2 files changed, 8 insertions(+), 25 deletions(-) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/README.md index 66c2c403b22..5541bd9b483 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/README.md +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/README.md @@ -3,14 +3,8 @@ This example provides an end-to-end workflow to quantize DeepSeek models to MXFP ## Requirement ```bash pip install neural-compressor-pt -# auto-round pip install auto-round -# vLLM -git clone -b fused-moe-ar --single-branch --quiet https://github.com/yiliu30/vllm-fork.git && cd vllm-fork -VLLM_USE_PRECOMPILED=1 pip install --editable . -vvv -# other requirements -pip install -r requirements.txt -pip uninstall flash_attn +bash setup.sh ``` ### Quantize Model @@ -65,7 +59,7 @@ bash ./run_generate.sh -s mxfp4 -tp 8 -m /path/to/ds_mxfp4 ``` - NVFP4 ```bash -bash ./run_generate.sh -s nvfp4 -tp 8 -m /path/to/ds_mxfp4 +bash ./run_generate.sh -s nvfp4 -tp 8 -m /path/to/ds_nvfp4 ``` ### Evaluation @@ -75,17 +69,14 @@ Usage: bash run_evaluation.sh -m [model_path] -s [mxfp4|mxfp8|nvfp4] -t [task_name] -tp [tensor_parallel_size] -b [batch_size] ``` ```bash -bash run_evaluation.sh -s mxfp8 -t piqa,hellaswag,mmlu -tp 8 -b 512 -m /path/to/ds_mxfp8 -bash run_evaluation.sh -s mxfp8 -t gsm8k -tp 8 -b 256 -m /path/to/ds_mxfp8 +bash run_evaluation.sh -s mxfp8 -t piqa,hellaswag,mmlu,gsm8k -tp 8 -b 256 -m /path/to/ds_mxfp8 ``` - MXFP4 ```bash -bash run_evaluation.sh -s mxfp4 -t piqa,hellaswag,mmlu -tp 8 -b 512 -m /path/to/ds_mxfp4 -bash run_evaluation.sh -s mxfp4 -t gsm8k -tp 8 -b 256 -m /path/to/ds_mxfp4 +bash run_evaluation.sh -s mxfp4 -t piqa,hellaswag,mmlu,gsm8k -tp 8 -b 256 -m /path/to/ds_mxfp4 ``` - NVFP4 ```bash -bash run_evaluation.sh -s nvfp4 -t piqa,hellaswag,mmlu -tp 8 -b 512 -m /path/to/ds_nvfp4 -bash run_evaluation.sh -s nvfp4 -t gsm8k -tp 8 -b 256 -m /path/to/ds_nvfp4 +bash run_evaluation.sh -s nvfp4 -t piqa,hellaswag,mmlu,gsm8k -tp 8 -b 256 -m /path/to/ds_nvfp4 ``` \ No newline at end of file diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/README.md index f8621d30d63..07ae97da23d 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/README.md +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/README.md @@ -3,14 +3,8 @@ This example provides an end-to-end workflow to quantize Qwen models to MXFP4/MX ## Requirement ```bash uv pip install neural-compressor-pt -# auto-round uv pip install auto-round -# vLLM -git clone -b fused-moe-ar --single-branch --quiet https://github.com/yiliu30/vllm-fork.git && cd vllm-fork -VLLM_USE_PRECOMPILED=1 uv pip install --editable . -vvv -# other requirements -uv pip install -r requirements.txt -uv pip uninstall flash_attn +bash setup.sh ``` ### Quantize Model @@ -74,12 +68,10 @@ Usage: bash run_evaluation.sh -m [model_path] -s [mxfp4|mxfp8] -t [task_name] -tp [tensor_parallel_size] -b [batch_size] ``` ```bash -bash run_evaluation.sh -s mxfp8 -t piqa,hellaswag,mmlu -tp 4 -b 512 -m /path/to/qwen_mxfp8 -bash run_evaluation.sh -s mxfp8 -t gsm8k -tp 4 -b 256 -m /path/to/qwen_mxfp8 +bash run_evaluation.sh -s mxfp8 -t piqa,hellaswag,mmlu,gsm8k -tp 4 -b 256 -m /path/to/qwen_mxfp8 ``` - MXFP4 ```bash -bash run_evaluation.sh -s mxfp4 -t piqa,hellaswag,mmlu -tp 4 -b 512 -m /path/to/qwen_mxfp4 -bash run_evaluation.sh -s mxfp4 -t gsm8k -tp 4 -b 256 -m /path/to/qwen_mxfp4 +bash run_evaluation.sh -s mxfp4 -t piqa,hellaswag,mmlu,gsm8k -tp 4 -b 256 -m /path/to/qwen_mxfp4 ``` \ No newline at end of file From c1245beb0d1294e2d0949082785f3951f8a42dd5 Mon Sep 17 00:00:00 2001 From: chensuyue Date: Fri, 12 Jun 2026 16:41:20 +0800 Subject: [PATCH 06/16] Update ds ruler with ruler_qa_squad task Signed-off-by: chensuyue --- .../quantization/auto_round/deepseek/run_evaluation.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh index 0ce8e608ffc..17fa299ef1a 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh @@ -104,7 +104,7 @@ if [[ "$TASK_NAME" == *"ruler"* ]]; then max_length=${MODEL_MAX_POS} max_gen_toks=128 SEQ_LENGTHS="${MODEL_MAX_POS}" - TASK_NAME="niah_multiquery" + TASK_NAME="niah_multiquery,ruler_qa_squad" BATCH_SIZE=32 fi From 810cd5eb7a9083fddcec5cde82fcf2a5242860c8 Mon Sep 17 00:00:00 2001 From: chensuyue Date: Mon, 15 Jun 2026 20:50:51 +0800 Subject: [PATCH 07/16] run ruler_qa_squad Signed-off-by: chensuyue --- .../quantization/auto_round/qwen/run_evaluation.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_evaluation.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_evaluation.sh index 1811191057b..dc2fa25ec39 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_evaluation.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_evaluation.sh @@ -107,7 +107,7 @@ if [[ "$TASK_NAME" == *"ruler"* ]]; then max_length=${MODEL_MAX_POS} max_gen_toks=128 SEQ_LENGTHS="${MODEL_MAX_POS}" - TASK_NAME="niah_multiquery" + TASK_NAME="ruler_qa_squad" # niah_multiquery,ruler_qa_squad BATCH_SIZE=32 fi From d09dc05555c38b5359c83cdf697979771b02c355 Mon Sep 17 00:00:00 2001 From: chensuyue Date: Tue, 16 Jun 2026 10:24:28 +0800 Subject: [PATCH 08/16] fix ruler test path Signed-off-by: chensuyue --- .../quantization/auto_round/qwen/run_evaluation.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_evaluation.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_evaluation.sh index dc2fa25ec39..29ac3fc7f38 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_evaluation.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_evaluation.sh @@ -413,7 +413,7 @@ run_aisbench_eval() { if [[ "$TASK_NAME" == *"longbench"* ]]; then echo "Running LongBench v2 evaluation..." run_longbench_eval -elif [[ "$TASK_NAME" == *"niah"* ]]; then +elif [[ "$TASK_NAME" == *"ruler"* ]]; then echo "Running RULER evaluation..." run_ruler_eval elif [[ "$TASK_NAME" == *"aisbench"* ]]; then From 8e544a662a8939fd7a8423dbb72ff07bc1492f3a Mon Sep 17 00:00:00 2001 From: chensuyue Date: Tue, 16 Jun 2026 13:35:14 +0800 Subject: [PATCH 09/16] use saperate task for ruler Signed-off-by: chensuyue --- .../auto_round/qwen/run_evaluation.sh | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_evaluation.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_evaluation.sh index 29ac3fc7f38..a1aa2ea1679 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_evaluation.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_evaluation.sh @@ -102,12 +102,19 @@ if [[ "$TASK_NAME" == *"longbench"* ]]; then fi # update max_length based on the task -if [[ "$TASK_NAME" == *"ruler"* ]]; then +if [[ "$TASK_NAME" == *"ruler"* ]] || [[ "$TASK_NAME" == *"niah_multiquery"* ]]; then + max_gen_toks=128 MODEL_MAX_POS=${RULER_MAX_POS:-131072} + if [[ "$TASK_NAME" == *"ruler_qa_squad"* ]]; then + # if input task is ruler_qa_squad + MODEL_MAX_POS=$((131072 - max_gen_toks)) + TASK_NAME="ruler_qa_squad" + else + # if input task is ruler or niah_multiquery + TASK_NAME="niah_multiquery" + fi max_length=${MODEL_MAX_POS} - max_gen_toks=128 SEQ_LENGTHS="${MODEL_MAX_POS}" - TASK_NAME="ruler_qa_squad" # niah_multiquery,ruler_qa_squad BATCH_SIZE=32 fi @@ -413,7 +420,7 @@ run_aisbench_eval() { if [[ "$TASK_NAME" == *"longbench"* ]]; then echo "Running LongBench v2 evaluation..." run_longbench_eval -elif [[ "$TASK_NAME" == *"ruler"* ]]; then +elif [[ "$TASK_NAME" == *"ruler"* ]] || [[ "$TASK_NAME" == *"niah_multiquery"* ]]; then echo "Running RULER evaluation..." run_ruler_eval elif [[ "$TASK_NAME" == *"aisbench"* ]]; then From d451a6040687e0fb073e129f8f55adc371823235 Mon Sep 17 00:00:00 2001 From: chensuyue Date: Tue, 16 Jun 2026 23:19:50 +0800 Subject: [PATCH 10/16] update ds ruler params Signed-off-by: chensuyue --- .../auto_round/deepseek/run_evaluation.sh | 15 +++++++++++---- .../auto_round/qwen/run_evaluation.sh | 2 +- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh index 17fa299ef1a..b2662295da9 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh @@ -99,12 +99,19 @@ if [[ "$TASK_NAME" == *"longbench"* ]]; then fi # update max_length based on the task -if [[ "$TASK_NAME" == *"ruler"* ]]; then +if [[ "$TASK_NAME" == *"ruler"* ]] || [[ "$TASK_NAME" == *"niah_multiquery"* ]]; then + max_gen_toks=128 MODEL_MAX_POS=${RULER_MAX_POS:-131072} + if [[ "$TASK_NAME" == *"ruler_qa_squad"* ]]; then + # if input task is ruler_qa_squad + MODEL_MAX_POS=$((131072 - max_gen_toks)) + TASK_NAME="ruler_qa_squad" + else + # if input task is ruler or niah_multiquery + TASK_NAME="niah_multiquery" + fi max_length=${MODEL_MAX_POS} - max_gen_toks=128 - SEQ_LENGTHS="${MODEL_MAX_POS}" - TASK_NAME="niah_multiquery,ruler_qa_squad" + SEQ_LENGTHS=${MODEL_MAX_POS} BATCH_SIZE=32 fi diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_evaluation.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_evaluation.sh index a1aa2ea1679..91f58c37be1 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_evaluation.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_evaluation.sh @@ -114,7 +114,7 @@ if [[ "$TASK_NAME" == *"ruler"* ]] || [[ "$TASK_NAME" == *"niah_multiquery"* ]]; TASK_NAME="niah_multiquery" fi max_length=${MODEL_MAX_POS} - SEQ_LENGTHS="${MODEL_MAX_POS}" + SEQ_LENGTHS=${MODEL_MAX_POS} BATCH_SIZE=32 fi From d23bdf0b4374e954a2e28918f2a37a186e8733fd Mon Sep 17 00:00:00 2001 From: chensuyue Date: Wed, 17 Jun 2026 07:27:03 +0800 Subject: [PATCH 11/16] fix vllm server Signed-off-by: chensuyue --- .../auto_round/deepseek/run_evaluation.sh | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh index b2662295da9..4b53f299dac 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh @@ -212,6 +212,21 @@ run_standard_eval() { # Function to start vLLM server start_vllm_server() { echo "Starting vLLM server on port ${SERVER_PORT}..." + + # Detect vLLM version for backward-compatible rope scaling + # vLLM >= 0.19 removed --rope-scaling; use --hf-overrides instead + VLLM_VERSION=$(python -c "import vllm; print(vllm.__version__)" 2>/dev/null || echo "0.0.0") + VLLM_MAJOR_MINOR=$(echo "$VLLM_VERSION" | awk -F. '{printf "%d%02d", $1, $2}') + ROPE_SCALING_JSON='{"rope_type":"yarn","factor":4.0,"original_max_position_embeddings":32768}' + if [ "$VLLM_MAJOR_MINOR" -ge 19 ] 2>/dev/null; then + ROPE_FLAG="--hf-overrides" + ROPE_VALUE="{\"rope_scaling\":${ROPE_SCALING_JSON},\"max_position_embeddings\":131072}" + else + ROPE_FLAG="--rope-scaling" + ROPE_VALUE="${ROPE_SCALING_JSON}" + fi + echo "vLLM version: ${VLLM_VERSION}, using: ${ROPE_FLAG} '${ROPE_VALUE}'" + vllm serve ${MODEL_PATH} \ --port ${SERVER_PORT} \ --tensor-parallel-size ${TP_SIZE} \ From c1036f48edc2282e2793dcab97d060f27dc8d30c Mon Sep 17 00:00:00 2001 From: chensuyue Date: Wed, 17 Jun 2026 07:41:58 +0800 Subject: [PATCH 12/16] bug fix Signed-off-by: chensuyue --- .../quantization/auto_round/deepseek/run_evaluation.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh index 4b53f299dac..355c5ed69d7 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh @@ -234,7 +234,6 @@ start_vllm_server() { --gpu-memory-utilization 0.8 \ --dtype bfloat16 \ --kv-cache-dtype ${KV_CACHE_DTYPE} \ - --disable-log-requests \ > ${OUTPUT_DIR}/vllm_server.log 2>&1 & VLLM_PID=$! From 63a63a221ed9068d457162c0c286da9da1702f3c Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Wed, 17 Jun 2026 11:23:01 +0000 Subject: [PATCH 13/16] use con > 1 Signed-off-by: yiliu30 --- .../quantization/auto_round/deepseek/run_evaluation.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh index 355c5ed69d7..c9a6f375af5 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh @@ -313,7 +313,7 @@ run_ruler_eval() { echo "Running Ruler evaluation against vLLM server..." lm_eval \ --model local-completions \ - --model_args "model=$MODEL_PATH,base_url=http://localhost:${SERVER_PORT}/v1/completions,num_concurrent=1,max_retries=50,timeout=500,tokenized_requests=False,max_gen_toks=${max_gen_toks}" \ + --model_args "model=$MODEL_PATH,base_url=http://localhost:${SERVER_PORT}/v1/completions,num_concurrent=16,max_retries=500,timeout=500,tokenized_requests=False,max_gen_toks=${max_gen_toks}" \ --tasks $TASK_NAME \ --metadata="{\"max_seq_lengths\":[${SEQ_LENGTHS}],\"tokenizer\":\"${MODEL_PATH}\"}" \ --gen_kwargs "max_gen_toks=${max_gen_toks}" \ From eb89c72ddf6b4b59511edbbe13d7440d07f6fcca Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 17 Jun 2026 21:09:35 +0800 Subject: [PATCH 14/16] Update run_evaluation.sh --- .../quantization/auto_round/deepseek/run_evaluation.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh index c9a6f375af5..a5b7ed2af56 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh @@ -318,6 +318,7 @@ run_ruler_eval() { --metadata="{\"max_seq_lengths\":[${SEQ_LENGTHS}],\"tokenizer\":\"${MODEL_PATH}\"}" \ --gen_kwargs "max_gen_toks=${max_gen_toks}" \ --batch_size ${BATCH_SIZE} \ + --limit 32 \ --output_path "${OUTPUT_DIR}/seq_${SEQ_LENGTHS}" \ --seed 42 From 4fc41091be59e98eed951703852a3c63016b8de6 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Wed, 17 Jun 2026 15:56:25 +0000 Subject: [PATCH 15/16] Revert "Update run_evaluation.sh" This reverts commit eb89c72ddf6b4b59511edbbe13d7440d07f6fcca. --- .../quantization/auto_round/deepseek/run_evaluation.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh index a5b7ed2af56..c9a6f375af5 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh @@ -318,7 +318,6 @@ run_ruler_eval() { --metadata="{\"max_seq_lengths\":[${SEQ_LENGTHS}],\"tokenizer\":\"${MODEL_PATH}\"}" \ --gen_kwargs "max_gen_toks=${max_gen_toks}" \ --batch_size ${BATCH_SIZE} \ - --limit 32 \ --output_path "${OUTPUT_DIR}/seq_${SEQ_LENGTHS}" \ --seed 42 From cfac3ad959a219bdbad6d29b2fc49da23d7efbf8 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Wed, 17 Jun 2026 15:56:25 +0000 Subject: [PATCH 16/16] Revert "use con > 1" This reverts commit 63a63a221ed9068d457162c0c286da9da1702f3c. --- .../quantization/auto_round/deepseek/run_evaluation.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh index c9a6f375af5..355c5ed69d7 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh @@ -313,7 +313,7 @@ run_ruler_eval() { echo "Running Ruler evaluation against vLLM server..." lm_eval \ --model local-completions \ - --model_args "model=$MODEL_PATH,base_url=http://localhost:${SERVER_PORT}/v1/completions,num_concurrent=16,max_retries=500,timeout=500,tokenized_requests=False,max_gen_toks=${max_gen_toks}" \ + --model_args "model=$MODEL_PATH,base_url=http://localhost:${SERVER_PORT}/v1/completions,num_concurrent=1,max_retries=50,timeout=500,tokenized_requests=False,max_gen_toks=${max_gen_toks}" \ --tasks $TASK_NAME \ --metadata="{\"max_seq_lengths\":[${SEQ_LENGTHS}],\"tokenizer\":\"${MODEL_PATH}\"}" \ --gen_kwargs "max_gen_toks=${max_gen_toks}" \