Skip to content
Open
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,8 @@ This example provides an end-to-end workflow to quantize DeepSeek models to MXFP
## Requirement
```bash
pip install neural-compressor-pt
# auto-round
pip install auto-round
# vLLM
git clone -b fused-moe-ar --single-branch --quiet https://github.com/yiliu30/vllm-fork.git && cd vllm-fork
VLLM_USE_PRECOMPILED=1 pip install --editable . -vvv
# other requirements
pip install -r requirements.txt
pip uninstall flash_attn
bash setup.sh
```

### Quantize Model
Expand Down Expand Up @@ -65,7 +59,7 @@ bash ./run_generate.sh -s mxfp4 -tp 8 -m /path/to/ds_mxfp4
```
- NVFP4
```bash
bash ./run_generate.sh -s nvfp4 -tp 8 -m /path/to/ds_mxfp4
bash ./run_generate.sh -s nvfp4 -tp 8 -m /path/to/ds_nvfp4
```
### Evaluation

Expand All @@ -75,17 +69,14 @@ Usage:
bash run_evaluation.sh -m [model_path] -s [mxfp4|mxfp8|nvfp4] -t [task_name] -tp [tensor_parallel_size] -b [batch_size]
```
```bash
bash run_evaluation.sh -s mxfp8 -t piqa,hellaswag,mmlu -tp 8 -b 512 -m /path/to/ds_mxfp8
bash run_evaluation.sh -s mxfp8 -t gsm8k -tp 8 -b 256 -m /path/to/ds_mxfp8
bash run_evaluation.sh -s mxfp8 -t piqa,hellaswag,mmlu,gsm8k -tp 8 -b 256 -m /path/to/ds_mxfp8

```
- MXFP4
```bash
bash run_evaluation.sh -s mxfp4 -t piqa,hellaswag,mmlu -tp 8 -b 512 -m /path/to/ds_mxfp4
bash run_evaluation.sh -s mxfp4 -t gsm8k -tp 8 -b 256 -m /path/to/ds_mxfp4
bash run_evaluation.sh -s mxfp4 -t piqa,hellaswag,mmlu,gsm8k -tp 8 -b 256 -m /path/to/ds_mxfp4
```
- NVFP4
```bash
bash run_evaluation.sh -s nvfp4 -t piqa,hellaswag,mmlu -tp 8 -b 512 -m /path/to/ds_nvfp4
bash run_evaluation.sh -s nvfp4 -t gsm8k -tp 8 -b 256 -m /path/to/ds_nvfp4
bash run_evaluation.sh -s nvfp4 -t piqa,hellaswag,mmlu,gsm8k -tp 8 -b 256 -m /path/to/ds_nvfp4
```
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ def quant_model(args):
iters=iters,
ignore_layers=config["fp_layers"],
export_format=args.export_format,
device_map="auto",
output_dir=output_dir,
low_gpu_mem_usage=True,
static_kv_dtype=static_kv_dtype,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,4 @@
lm-eval==0.4.10
lm-eval[api]
loguru
compressed-tensors==0.12.2
compressed-tensors==0.15.0.1
hf_transfer
transformers==4.57.3
torch==2.9.0
# pip install git+https://github.com/yiliu30/long-bench-eval
long-bench-eval @ git+https://github.com/yiliu30/long-bench-eval
transformers==4.57.3
Original file line number Diff line number Diff line change
Expand Up @@ -99,12 +99,19 @@ if [[ "$TASK_NAME" == *"longbench"* ]]; then
fi

# update max_length based on the task
if [[ "$TASK_NAME" == *"ruler"* ]]; then
if [[ "$TASK_NAME" == *"ruler"* ]] || [[ "$TASK_NAME" == *"niah_multiquery"* ]]; then
max_gen_toks=128
MODEL_MAX_POS=${RULER_MAX_POS:-131072}
if [[ "$TASK_NAME" == *"ruler_qa_squad"* ]]; then
# if input task is ruler_qa_squad
MODEL_MAX_POS=$((131072 - max_gen_toks))
TASK_NAME="ruler_qa_squad"
else
# if input task is ruler or niah_multiquery
TASK_NAME="niah_multiquery"
fi
max_length=${MODEL_MAX_POS}
max_gen_toks=128
SEQ_LENGTHS="${MODEL_MAX_POS}"
TASK_NAME="niah_multiquery"
SEQ_LENGTHS=${MODEL_MAX_POS}
BATCH_SIZE=32
fi

Expand Down Expand Up @@ -205,14 +212,28 @@ run_standard_eval() {
# Function to start vLLM server
start_vllm_server() {
echo "Starting vLLM server on port ${SERVER_PORT}..."

# Detect vLLM version for backward-compatible rope scaling
# vLLM >= 0.19 removed --rope-scaling; use --hf-overrides instead
VLLM_VERSION=$(python -c "import vllm; print(vllm.__version__)" 2>/dev/null || echo "0.0.0")
VLLM_MAJOR_MINOR=$(echo "$VLLM_VERSION" | awk -F. '{printf "%d%02d", $1, $2}')
ROPE_SCALING_JSON='{"rope_type":"yarn","factor":4.0,"original_max_position_embeddings":32768}'
if [ "$VLLM_MAJOR_MINOR" -ge 19 ] 2>/dev/null; then
ROPE_FLAG="--hf-overrides"
ROPE_VALUE="{\"rope_scaling\":${ROPE_SCALING_JSON},\"max_position_embeddings\":131072}"
else
ROPE_FLAG="--rope-scaling"
ROPE_VALUE="${ROPE_SCALING_JSON}"
fi
echo "vLLM version: ${VLLM_VERSION}, using: ${ROPE_FLAG} '${ROPE_VALUE}'"

vllm serve ${MODEL_PATH} \
--port ${SERVER_PORT} \
--tensor-parallel-size ${TP_SIZE} \
--max-model-len ${max_length} \
--gpu-memory-utilization 0.8 \
--dtype bfloat16 \
--kv-cache-dtype ${KV_CACHE_DTYPE} \
--disable-log-requests \
> ${OUTPUT_DIR}/vllm_server.log 2>&1 &

VLLM_PID=$!
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ set -e
MODEL=""
TARGET=""
OUTPUT_DIR=""
EXPORT_FORMAT="auto_round"
EXPORT_FORMAT="llm_compressor"
STATIC_KV_DTYPE="None"
STATIC_ATTENTION_DTYPE="None"

Expand All @@ -15,7 +15,7 @@ usage() {
echo " -kv datatype for kv cache (auto, fp8)"
echo " -attn Data type for static attention cache (default: None)"
echo " --output_dir output directory for quantized model"
echo " -f quantize model export_format (default: auto_round)"
echo " -f quantize model export_format (default: llm_compressor)"
exit 1
}

Expand Down
Original file line number Diff line number Diff line change
@@ -1,12 +1,116 @@
pip install -r requirements.txt
pip install setuptools --upgrade
pip install packaging --upgrade
pip install transformers==4.57.3
pip install -U "huggingface_hub[cli]"
# Install vllm
git clone -b fused-moe-ar --single-branch --quiet https://github.com/yiliu30/vllm-fork.git && cd vllm-fork
VLLM_USE_PRECOMPILED=1 uv pip install --prerelease=allow . -v
cd ..
# Uninstall flash_attn to avoid conflicts
pip uninstall flash_attn -y
pip install lm_eval["ruler"]
#!/bin/bash
set -e
# Print the command-line synopsis followed by one description line per
# supported option. Writes to stdout; does not exit.
usage() {
  printf '%s\n' \
    "Usage: $0 --device=[gpu|xpu] --format=[AR|LLMC] --task=[task_list] --bench_tool=[lm_eval|aisbench]" \
    " --device target device for quantization (gpu or xpu)" \
    " --format quantization format (AR for auto_round, LLMC for llm_compressor)" \
    " --task comma-separated list of evaluation tasks (e.g. gsm8k,hellaswag)" \
    " --bench_tool benchmarking tool to use (lm_eval or aisbench)"
}

#######################################
# Detect the system CUDA version.
# Reads the "CUDA Version:" field printed by nvidia-smi first; if that yields
# nothing usable, falls back to the first "release" line of `nvcc --version`.
# Outputs: the dotted numeric version (e.g. 12.4) on stdout; an error message
#          on stderr when no version can be determined.
# Exits:   with status 1 when neither tool yields a valid version. Callers
#          capture the result with $(...), so the exit ends only that
#          subshell and surfaces as a failed assignment.
#######################################
detect_cuda_version() {
  local cuda_version=""
  local candidate_version=""
  # One or more dot-separated numeric components. Stricter than "digits and
  # dots in any order", which would also accept values such as "." or "..".
  local -r version_re='^[0-9]+(\.[0-9]+)*$'

  if command -v nvidia-smi >/dev/null 2>&1; then
    candidate_version=$(nvidia-smi 2>/dev/null | sed -n 's/.*CUDA Version: \([^ ]*\).*/\1/p' | head -n 1)
    if [[ "$candidate_version" =~ $version_re ]]; then
      cuda_version="$candidate_version"
    fi
  fi

  if [[ -z "$cuda_version" ]] && command -v nvcc >/dev/null 2>&1; then
    # Field 6 of the "release" line looks like "V12.4.131"; strip the leading
    # "V" and any comma. Only the first matching line is used so the candidate
    # is always a single line.
    candidate_version=$(nvcc --version | awk '/release/ && !seen++ { v = $6; sub(/^V/, "", v); sub(/,/, "", v); print v }')
    if [[ "$candidate_version" =~ $version_re ]]; then
      cuda_version="$candidate_version"
    fi
  fi

  if [[ -z "$cuda_version" ]]; then
    echo "Unable to detect CUDA version from nvidia-smi or nvcc." >&2
    exit 1
  fi

  echo "$cuda_version"
}

# Defaults. Each setting keeps a non-empty value already present in the
# environment and otherwise takes the value given here.
: "${DEVICE:=gpu}"
: "${FORMAT:=LLMC}"
: "${TASKS:=hellaswag,piqa,mmlu,gsm8k,ruler}"
: "${BENCH_TOOL:=lm_eval}"

# Consume every --key=value argument; command-line values override the
# defaults above. Anything unrecognised prints the usage text and aborts.
while (( $# > 0 )); do
  case "$1" in
    --device=*) DEVICE="${1#--device=}" ;;
    --format=*) FORMAT="${1#--format=}" ;;
    --task=*) TASKS="${1#--task=}" ;;
    --bench_tool=*) BENCH_TOOL="${1#--bench_tool=}" ;;
    *)
      echo "Unknown parameter: $1"
      usage
      exit 1
      ;;
  esac
  shift
done

# Install the dependency stack for the selected device, format and benchmark
# tool. Reads DEVICE, FORMAT, TASKS and BENCH_TOOL; every step is a
# `uv pip` / `git` side effect.
if [[ "$DEVICE" == "xpu" ]]; then
  # support quant only on xpu for now
  uv pip install torch==2.11.0 torchvision==0.26.0 --index-url https://download.pytorch.org/whl/xpu
  uv pip install -r requirements.txt --extra-index-url https://download.pytorch.org/whl/xpu
elif [[ "$DEVICE" == "gpu" ]]; then
  uv pip install -r requirements.txt
  uv pip install setuptools --upgrade
  uv pip install packaging --upgrade
  uv pip install -U "huggingface_hub[cli]"
  if [[ "$FORMAT" == "LLMC" ]]; then
    CUDA_VERSION=$(detect_cuda_version)
    echo "Detected system CUDA version: $CUDA_VERSION"
    if [[ "$CUDA_VERSION" == "12."* ]]; then
      uv pip install vllm==0.22.0 --extra-index-url https://wheels.vllm.ai/0.22.0/cu129 --extra-index-url https://download.pytorch.org/whl/cu129 --index-strategy unsafe-best-match
      uv pip install torch torchvision --extra-index-url https://download.pytorch.org/whl/cu129 --index-strategy unsafe-best-match
    elif [[ "$CUDA_VERSION" == "13."* ]]; then
      uv pip install vllm==0.22.0
    else
      echo "Unsupported CUDA version: $CUDA_VERSION. Supported versions are 12.x and 13.x."
      exit 1
    fi

    uv pip install ray
    # Reuse an existing checkout so that re-running this script does not
    # abort on "destination path already exists".
    if [[ ! -d vllm-qdq-plugin ]]; then
      git clone https://github.com/yiliu30/vllm-qdq-plugin.git
    fi
    uv pip install vllm-qdq-plugin/ -v
  else
    # use default setting for AR format, required by fused-moe-ar
    uv pip install torch==2.9.0
    # The clone is a standalone command: in a `clone && cd` list a failed
    # clone is not caught by `set -e`, and the following install and `cd ..`
    # would then run from the wrong directory.
    if [[ ! -d vllm-fork ]]; then
      git clone -b fused-moe-ar --single-branch --quiet https://github.com/yiliu30/vllm-fork.git
    fi
    # Subshell keeps the directory change local, so no `cd ..` is needed and
    # a failure inside cannot leave the script in vllm-fork.
    (
      cd vllm-fork
      VLLM_USE_PRECOMPILED=1 uv pip install --prerelease=allow . -v
    )
  fi
  if [[ "$BENCH_TOOL" == "lm_eval" ]]; then
    uv pip install lm-eval==0.4.12
    # Extras are quoted so the brackets are never treated as a glob pattern.
    uv pip install "lm-eval[api]"
    uv pip install "lm-eval[ruler]"
    if [[ "$TASKS" == *"longbench"* ]]; then
      uv pip install "long-bench-eval @ git+https://github.com/yiliu30/long-bench-eval"
    fi
  elif [[ "$BENCH_TOOL" == "aisbench" ]]; then
    # NOTE(review): this branch only prints a message; no install command for
    # aisbench is present here — confirm whether one is intended.
    echo "Installing aisbench..."
  fi
  # Uninstall flash_attn to avoid conflicts
  uv pip uninstall flash_attn
else
  echo "Unsupported device: $DEVICE. Supported devices are gpu and xpu."
  usage
  exit 1
fi
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
# Parse command line arguments
KV_CACHE_DTYPE="auto"
STATIC_ATTENTION_DTYPE="auto"
EXPORT_FORMAT="auto_round"
EXPORT_FORMAT="llm_compressor"
while [[ $# -gt 0 ]]; do
case $1 in
--topology=*)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ detect_cuda_version() {
}

DEVICE="${DEVICE:-gpu}"
FORMAT="${FORMAT:-AR}"
FORMAT="${FORMAT:-LLMC}"
TASKS="${TASKS:-hellaswag,piqa,mmlu,gsm8k}"
BENCH_TOOL="${BENCH_TOOL:-lm_eval}"

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,8 @@ This example provides an end-to-end workflow to quantize Qwen models to MXFP4/MX
## Requirement
```bash
uv pip install neural-compressor-pt
# auto-round
uv pip install auto-round
# vLLM
git clone -b fused-moe-ar --single-branch --quiet https://github.com/yiliu30/vllm-fork.git && cd vllm-fork
VLLM_USE_PRECOMPILED=1 uv pip install --editable . -vvv
# other requirements
uv pip install -r requirements.txt
uv pip uninstall flash_attn
bash setup.sh
```

### Quantize Model
Expand Down Expand Up @@ -74,12 +68,10 @@ Usage:
bash run_evaluation.sh -m [model_path] -s [mxfp4|mxfp8] -t [task_name] -tp [tensor_parallel_size] -b [batch_size]
```
```bash
bash run_evaluation.sh -s mxfp8 -t piqa,hellaswag,mmlu -tp 4 -b 512 -m /path/to/qwen_mxfp8
bash run_evaluation.sh -s mxfp8 -t gsm8k -tp 4 -b 256 -m /path/to/qwen_mxfp8
bash run_evaluation.sh -s mxfp8 -t piqa,hellaswag,mmlu,gsm8k -tp 4 -b 256 -m /path/to/qwen_mxfp8

```
- MXFP4
```bash
bash run_evaluation.sh -s mxfp4 -t piqa,hellaswag,mmlu -tp 4 -b 512 -m /path/to/qwen_mxfp4
bash run_evaluation.sh -s mxfp4 -t gsm8k -tp 4 -b 256 -m /path/to/qwen_mxfp4
bash run_evaluation.sh -s mxfp4 -t piqa,hellaswag,mmlu,gsm8k -tp 4 -b 256 -m /path/to/qwen_mxfp4
```
Original file line number Diff line number Diff line change
Expand Up @@ -102,12 +102,19 @@ if [[ "$TASK_NAME" == *"longbench"* ]]; then
fi

# update max_length based on the task
if [[ "$TASK_NAME" == *"ruler"* ]]; then
if [[ "$TASK_NAME" == *"ruler"* ]] || [[ "$TASK_NAME" == *"niah_multiquery"* ]]; then
max_gen_toks=128
MODEL_MAX_POS=${RULER_MAX_POS:-131072}
if [[ "$TASK_NAME" == *"ruler_qa_squad"* ]]; then
# if input task is ruler_qa_squad
MODEL_MAX_POS=$((131072 - max_gen_toks))
TASK_NAME="ruler_qa_squad"
else
# if input task is ruler or niah_multiquery
TASK_NAME="niah_multiquery"
fi
max_length=${MODEL_MAX_POS}
max_gen_toks=128
SEQ_LENGTHS="${MODEL_MAX_POS}"
TASK_NAME="niah_multiquery"
SEQ_LENGTHS=${MODEL_MAX_POS}
BATCH_SIZE=32
fi

Expand Down Expand Up @@ -413,7 +420,7 @@ run_aisbench_eval() {
if [[ "$TASK_NAME" == *"longbench"* ]]; then
echo "Running LongBench v2 evaluation..."
run_longbench_eval
elif [[ "$TASK_NAME" == *"niah"* ]]; then
elif [[ "$TASK_NAME" == *"ruler"* ]] || [[ "$TASK_NAME" == *"niah_multiquery"* ]]; then
echo "Running RULER evaluation..."
run_ruler_eval
elif [[ "$TASK_NAME" == *"aisbench"* ]]; then
Expand Down