diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/README.md index 66c2c403b22..5541bd9b483 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/README.md +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/README.md @@ -3,14 +3,8 @@ This example provides an end-to-end workflow to quantize DeepSeek models to MXFP ## Requirement ```bash pip install neural-compressor-pt -# auto-round pip install auto-round -# vLLM -git clone -b fused-moe-ar --single-branch --quiet https://github.com/yiliu30/vllm-fork.git && cd vllm-fork -VLLM_USE_PRECOMPILED=1 pip install --editable . -vvv -# other requirements -pip install -r requirements.txt -pip uninstall flash_attn +bash setup.sh ``` ### Quantize Model @@ -65,7 +59,7 @@ bash ./run_generate.sh -s mxfp4 -tp 8 -m /path/to/ds_mxfp4 ``` - NVFP4 ```bash -bash ./run_generate.sh -s nvfp4 -tp 8 -m /path/to/ds_mxfp4 +bash ./run_generate.sh -s nvfp4 -tp 8 -m /path/to/ds_nvfp4 ``` ### Evaluation @@ -75,17 +69,14 @@ Usage: bash run_evaluation.sh -m [model_path] -s [mxfp4|mxfp8|nvfp4] -t [task_name] -tp [tensor_parallel_size] -b [batch_size] ``` ```bash -bash run_evaluation.sh -s mxfp8 -t piqa,hellaswag,mmlu -tp 8 -b 512 -m /path/to/ds_mxfp8 -bash run_evaluation.sh -s mxfp8 -t gsm8k -tp 8 -b 256 -m /path/to/ds_mxfp8 +bash run_evaluation.sh -s mxfp8 -t piqa,hellaswag,mmlu,gsm8k -tp 8 -b 256 -m /path/to/ds_mxfp8 ``` - MXFP4 ```bash -bash run_evaluation.sh -s mxfp4 -t piqa,hellaswag,mmlu -tp 8 -b 512 -m /path/to/ds_mxfp4 -bash run_evaluation.sh -s mxfp4 -t gsm8k -tp 8 -b 256 -m /path/to/ds_mxfp4 +bash run_evaluation.sh -s mxfp4 -t piqa,hellaswag,mmlu,gsm8k -tp 8 -b 256 -m /path/to/ds_mxfp4 ``` - NVFP4 ```bash -bash run_evaluation.sh -s nvfp4 -t piqa,hellaswag,mmlu -tp 8 -b 512 -m /path/to/ds_nvfp4 -bash run_evaluation.sh -s nvfp4 -t gsm8k -tp 8 -b 256 -m /path/to/ds_nvfp4 +bash run_evaluation.sh -s nvfp4 -t piqa,hellaswag,mmlu,gsm8k -tp 8 -b 256 -m /path/to/ds_nvfp4 ``` \ No newline at end of file diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py index 48d849556b8..eccc16b8a26 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/quantize.py @@ -82,6 +82,7 @@ def quant_model(args): iters=iters, ignore_layers=config["fp_layers"], export_format=args.export_format, + device_map="auto", output_dir=output_dir, low_gpu_mem_usage=True, static_kv_dtype=static_kv_dtype, diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/requirements.txt b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/requirements.txt index f690755a087..32ed6147a26 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/requirements.txt +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/requirements.txt @@ -1,9 +1,4 @@ -lm-eval==0.4.10 -lm-eval[api] loguru -compressed-tensors==0.12.2 +compressed-tensors==0.15.0.1 hf_transfer -transformers==4.57.3 -torch==2.9.0 -# pip install git+https://github.com/yiliu30/long-bench-eval -long-bench-eval @ git+https://github.com/yiliu30/long-bench-eval \ No newline at end of file +transformers==4.57.3 \ No newline at end of file diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh index 0ce8e608ffc..355c5ed69d7 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh @@ -99,12 +99,19 @@ if [[ "$TASK_NAME" == *"longbench"* ]]; then fi # update max_length based on the task -if [[ "$TASK_NAME" == *"ruler"* ]]; then +if [[ "$TASK_NAME" == *"ruler"* ]] || [[ "$TASK_NAME" == *"niah_multiquery"* ]]; then + max_gen_toks=128 MODEL_MAX_POS=${RULER_MAX_POS:-131072} + if [[ "$TASK_NAME" == *"ruler_qa_squad"* ]]; then + # if input task is ruler_qa_squad + MODEL_MAX_POS=$((131072 - max_gen_toks)) + TASK_NAME="ruler_qa_squad" + else + # if input task is ruler or niah_multiquery + TASK_NAME="niah_multiquery" + fi max_length=${MODEL_MAX_POS} - max_gen_toks=128 - SEQ_LENGTHS="${MODEL_MAX_POS}" - TASK_NAME="niah_multiquery" + SEQ_LENGTHS=${MODEL_MAX_POS} BATCH_SIZE=32 fi @@ -205,6 +212,21 @@ run_standard_eval() { # Function to start vLLM server start_vllm_server() { echo "Starting vLLM server on port ${SERVER_PORT}..." + + # Detect vLLM version for backward-compatible rope scaling + # vLLM >= 0.19 removed --rope-scaling; use --hf-overrides instead + VLLM_VERSION=$(python -c "import vllm; print(vllm.__version__)" 2>/dev/null || echo "0.0.0") + VLLM_MAJOR_MINOR=$(echo "$VLLM_VERSION" | awk -F. '{printf "%d%02d", $1, $2}') + ROPE_SCALING_JSON='{"rope_type":"yarn","factor":4.0,"original_max_position_embeddings":32768}' + if [ "$VLLM_MAJOR_MINOR" -ge 19 ] 2>/dev/null; then + ROPE_FLAG="--hf-overrides" + ROPE_VALUE="{\"rope_scaling\":${ROPE_SCALING_JSON},\"max_position_embeddings\":131072}" + else + ROPE_FLAG="--rope-scaling" + ROPE_VALUE="${ROPE_SCALING_JSON}" + fi + echo "vLLM version: ${VLLM_VERSION}, using: ${ROPE_FLAG} '${ROPE_VALUE}'" + vllm serve ${MODEL_PATH} \ --port ${SERVER_PORT} \ --tensor-parallel-size ${TP_SIZE} \ @@ -212,7 +234,6 @@ start_vllm_server() { --gpu-memory-utilization 0.8 \ --dtype bfloat16 \ --kv-cache-dtype ${KV_CACHE_DTYPE} \ - --disable-log-requests \ > ${OUTPUT_DIR}/vllm_server.log 2>&1 & VLLM_PID=$! diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_quant.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_quant.sh index cd9d28e65a0..91639e8c2ad 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_quant.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_quant.sh @@ -4,7 +4,7 @@ set -e MODEL="" TARGET="" OUTPUT_DIR="" -EXPORT_FORMAT="auto_round" +EXPORT_FORMAT="llm_compressor" STATIC_KV_DTYPE="None" STATIC_ATTENTION_DTYPE="None" @@ -15,7 +15,7 @@ usage() { echo " -kv datatype for kv cache (auto, fp8)" echo " -attn Data type for static attention cache (default: None)" echo " --output_dir output directory for quantized model" - echo " -f quantize model export_format (default: auto_round)" + echo " -f quantize model export_format (default: llm_compressor)" exit 1 } diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/setup.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/setup.sh index 0486f99bfdd..67c4675e0b4 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/setup.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/setup.sh @@ -1,12 +1,116 @@ -pip install -r requirements.txt -pip install setuptools --upgrade -pip install packaging --upgrade -pip install transformers==4.57.3 -pip install -U "huggingface_hub[cli]" -# Install vllm -git clone -b fused-moe-ar --single-branch --quiet https://github.com/yiliu30/vllm-fork.git && cd vllm-fork -VLLM_USE_PRECOMPILED=1 uv pip install --prerelease=allow . -v -cd .. -# Uninstall flash_attn to avoid conflicts -pip uninstall flash_attn -y -pip install lm_eval["ruler"] \ No newline at end of file +#!/bin/bash +set -e +usage() { + echo "Usage: $0 --device=[gpu|xpu] --format=[AR|LLMC] --task=[task_list] --bench_tool=[lm_eval|aisbench]" + echo " --device target device for quantization (gpu or xpu)" + echo " --format quantization format (AR for auto_round, LLMC for llm_compressor)" + echo " --task comma-separated list of evaluation tasks (e.g. gsm8k,hellaswag)" + echo " --bench_tool benchmarking tool to use (lm_eval or aisbench)" +} + +detect_cuda_version() { + local cuda_version="" + local candidate_version="" + + if command -v nvidia-smi >/dev/null 2>&1; then + candidate_version=$(nvidia-smi 2>/dev/null | sed -n 's/.*CUDA Version: \([^ ]*\).*/\1/p' | head -n 1) + if [[ "$candidate_version" =~ ^[0-9.]+$ ]]; then + cuda_version="$candidate_version" + fi + fi + + if [[ -z "$cuda_version" ]] && command -v nvcc >/dev/null 2>&1; then + candidate_version=$(nvcc --version | awk '/release/ {print $6}' | sed 's/^V//; s/,//') + if [[ "$candidate_version" =~ ^[0-9.]+$ ]]; then + cuda_version="$candidate_version" + fi + fi + + if [[ -z "$cuda_version" ]]; then + echo "Unable to detect CUDA version from nvidia-smi or nvcc." >&2 + exit 1 + fi + + echo "$cuda_version" +} + +DEVICE="${DEVICE:-gpu}" +FORMAT="${FORMAT:-LLMC}" +TASKS="${TASKS:-hellaswag,piqa,mmlu,gsm8k,ruler}" +BENCH_TOOL="${BENCH_TOOL:-lm_eval}" + +while [[ $# -gt 0 ]]; do + case $1 in + --device=*) + DEVICE="${1#*=}" + shift + ;; + --format=*) + FORMAT="${1#*=}" + shift + ;; + --task=*) + TASKS="${1#*=}" + shift + ;; + --bench_tool=*) + BENCH_TOOL="${1#*=}" + shift + ;; + *) + echo "Unknown parameter: $1" + usage + exit 1 + ;; + esac +done + +if [[ "$DEVICE" == "xpu" ]]; then + # support quant only on xpu for now + uv pip install torch==2.11.0 torchvision==0.26.0 --index-url https://download.pytorch.org/whl/xpu + uv pip install -r requirements.txt --extra-index-url https://download.pytorch.org/whl/xpu +elif [[ "$DEVICE" == "gpu" ]]; then + uv pip install -r requirements.txt + uv pip install setuptools --upgrade + uv pip install packaging --upgrade + uv pip install -U "huggingface_hub[cli]" + if [[ "$FORMAT" == "LLMC" ]]; then + CUDA_VERSION=$(detect_cuda_version) + echo "Detected system CUDA version: $CUDA_VERSION" + if [[ "$CUDA_VERSION" == "12."* ]]; then + uv pip install vllm==0.22.0 --extra-index-url https://wheels.vllm.ai/0.22.0/cu129 --extra-index-url https://download.pytorch.org/whl/cu129 --index-strategy unsafe-best-match + uv pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu129 --index-strategy unsafe-best-match + elif [[ "$CUDA_VERSION" == "13."* ]]; then + uv pip install vllm==0.22.0 + else + echo "Unsupported CUDA version: $CUDA_VERSION. Supported versions are 12.x and 13.x." + exit 1 + fi + + uv pip install ray + git clone https://github.com/yiliu30/vllm-qdq-plugin.git + uv pip install vllm-qdq-plugin/ -v + else + # use default setting for AR format, required by fused-moe-ar + uv pip install torch==2.9.0 + git clone -b fused-moe-ar --single-branch --quiet https://github.com/yiliu30/vllm-fork.git && cd vllm-fork + VLLM_USE_PRECOMPILED=1 uv pip install --prerelease=allow . -v + cd .. + fi + if [[ "$BENCH_TOOL" == "lm_eval" ]]; then + uv pip install lm-eval==0.4.12 + uv pip install lm-eval[api] + uv pip install lm-eval["ruler"] + if [[ "$TASKS" == *"longbench"* ]]; then + uv pip install "long-bench-eval @ git+https://github.com/yiliu30/long-bench-eval" + fi + elif [[ "$BENCH_TOOL" == "aisbench" ]]; then + echo "Installing aisbench..." + fi + # Uninstall flash_attn to avoid conflicts + uv pip uninstall flash_attn +else + echo "Unsupported device: $DEVICE. Supported devices are gpu and xpu." + usage + exit 1 +fi \ No newline at end of file diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh index cd7595e09cf..026ffe7bfe3 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh @@ -5,7 +5,7 @@ # Parse command line arguments KV_CACHE_DTYPE="auto" STATIC_ATTENTION_DTYPE="auto" -EXPORT_FORMAT="auto_round" +EXPORT_FORMAT="llm_compressor" while [[ $# -gt 0 ]]; do case $1 in --topology=*) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/setup.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/setup.sh index 9e4862e19f1..709b67ed117 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/setup.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/setup.sh @@ -35,7 +35,7 @@ detect_cuda_version() { } DEVICE="${DEVICE:-gpu}" -FORMAT="${FORMAT:-AR}" +FORMAT="${FORMAT:-LLMC}" TASKS="${TASKS:-hellaswag,piqa,mmlu,gsm8k}" BENCH_TOOL="${BENCH_TOOL:-lm_eval}" diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/README.md index f8621d30d63..07ae97da23d 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/README.md +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/README.md @@ -3,14 +3,8 @@ This example provides an end-to-end workflow to quantize Qwen models to MXFP4/MX ## Requirement ```bash uv pip install neural-compressor-pt -# auto-round uv pip install auto-round -# vLLM -git clone -b fused-moe-ar --single-branch --quiet https://github.com/yiliu30/vllm-fork.git && cd vllm-fork -VLLM_USE_PRECOMPILED=1 uv pip install --editable . -vvv -# other requirements -uv pip install -r requirements.txt -uv pip uninstall flash_attn +bash setup.sh ``` ### Quantize Model @@ -74,12 +68,10 @@ Usage: bash run_evaluation.sh -m [model_path] -s [mxfp4|mxfp8] -t [task_name] -tp [tensor_parallel_size] -b [batch_size] ``` ```bash -bash run_evaluation.sh -s mxfp8 -t piqa,hellaswag,mmlu -tp 4 -b 512 -m /path/to/qwen_mxfp8 -bash run_evaluation.sh -s mxfp8 -t gsm8k -tp 4 -b 256 -m /path/to/qwen_mxfp8 +bash run_evaluation.sh -s mxfp8 -t piqa,hellaswag,mmlu,gsm8k -tp 4 -b 256 -m /path/to/qwen_mxfp8 ``` - MXFP4 ```bash -bash run_evaluation.sh -s mxfp4 -t piqa,hellaswag,mmlu -tp 4 -b 512 -m /path/to/qwen_mxfp4 -bash run_evaluation.sh -s mxfp4 -t gsm8k -tp 4 -b 256 -m /path/to/qwen_mxfp4 +bash run_evaluation.sh -s mxfp4 -t piqa,hellaswag,mmlu,gsm8k -tp 4 -b 256 -m /path/to/qwen_mxfp4 ``` \ No newline at end of file diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_evaluation.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_evaluation.sh index 1811191057b..91f58c37be1 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_evaluation.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/qwen/run_evaluation.sh @@ -102,12 +102,19 @@ if [[ "$TASK_NAME" == *"longbench"* ]]; then fi # update max_length based on the task -if [[ "$TASK_NAME" == *"ruler"* ]]; then +if [[ "$TASK_NAME" == *"ruler"* ]] || [[ "$TASK_NAME" == *"niah_multiquery"* ]]; then + max_gen_toks=128 MODEL_MAX_POS=${RULER_MAX_POS:-131072} + if [[ "$TASK_NAME" == *"ruler_qa_squad"* ]]; then + # if input task is ruler_qa_squad + MODEL_MAX_POS=$((131072 - max_gen_toks)) + TASK_NAME="ruler_qa_squad" + else + # if input task is ruler or niah_multiquery + TASK_NAME="niah_multiquery" + fi max_length=${MODEL_MAX_POS} - max_gen_toks=128 - SEQ_LENGTHS="${MODEL_MAX_POS}" - TASK_NAME="niah_multiquery" + SEQ_LENGTHS=${MODEL_MAX_POS} BATCH_SIZE=32 fi @@ -413,7 +420,7 @@ run_aisbench_eval() { if [[ "$TASK_NAME" == *"longbench"* ]]; then echo "Running LongBench v2 evaluation..." run_longbench_eval -elif [[ "$TASK_NAME" == *"niah"* ]]; then +elif [[ "$TASK_NAME" == *"ruler"* ]] || [[ "$TASK_NAME" == *"niah_multiquery"* ]]; then echo "Running RULER evaluation..." run_ruler_eval elif [[ "$TASK_NAME" == *"aisbench"* ]]; then