diff --git a/examples/README.md b/examples/README.md index 26e4a5792d7..9d590b1b7f5 100644 --- a/examples/README.md +++ b/examples/README.md @@ -15,6 +15,12 @@ Intel® Neural Compressor validated examples with multiple compression technique + + deepseek-ai/DeepSeek-V4 + Natural Language Processing + Quantization (MXFP8/MXFP4) + link + deepseek-ai/DeepSeek-R1 Natural Language Processing diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/README.md new file mode 100644 index 00000000000..5ab41000942 --- /dev/null +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/README.md @@ -0,0 +1,112 @@ +# DeepSeek V4 AutoRound (INC prepare/convert) + +This example demonstrates model-free quantization through the INC API: + +```python +from neural_compressor.torch.quantization import AutoRoundConfig, prepare, convert + +config = AutoRoundConfig( + model_free=True, + scheme="MXFP4", + ignore_layers="compressor,indexer.weights_proj", + export_format="llm_compressor", + output_dir="/path/to/output", +) +model = "/path/or/hf_model_name" +model = prepare(model, config) +model = convert(model) +``` + +## Requirements + +Install dependencies before running quantization or evaluation: + +```bash +uv pip install -U pip +uv pip install -U "git+https://github.com/intel/auto-round.git@main" +uv pip install -U evalscope lm_eval transformers datasets +uv pip install compressed-tensors --no-deps +bash <(curl -fsSL https://raw.githubusercontent.com/vllm-project/vllm/main/tools/install_deepgemm.sh) +uv pip install setuptools_rust setuptools_scm +VLLM_USE_PRECOMPILED=1 uv pip install git+https://github.com/xin3he/vllm-fork.git@support_deepseekv4_mxfp --no-build-isolation +``` + +## Quick Start + +```bash +cd 
examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4 +bash run_quant.sh \ + --dtype=mxfp4_mixed \ + --input_model=/workspace/models/deepseek-ai/DeepSeek-V4-Flash \ + --output_model=/workspace/models/deepseek-ai/DeepSeek-V4-Flash-MXFP4-Mixed +``` + +Then run serving + evaluation in one command: + +```bash +CUDA_VISIBLE_DEVICES=0,1 bash run_evalscope.sh \ + --model /workspace/models/deepseek-ai/DeepSeek-V4-Flash-MXFP4-Mixed \ + --tp 2 \ + --port 8009 \ + --tasks piqa,hellaswag,gsm8k,mmlu_pro,math_500,mmlu,aime26,gpqa_diamond,ruler_qa_squad \ + --temp 1.0 +``` + +Equivalent vLLM defaults inside `run_evalscope.sh`: + +```bash +SAFETENSORS_FAST_GPU=1 CUDA_VISIBLE_DEVICES=0,1 vllm serve \ + --trust-remote-code \ + --kv-cache-dtype fp8 \ + --block-size 256 \ + --tensor-parallel-size 2 \ + --attention_config.use_fp4_indexer_cache=True \ + --port 8009 \ + --no-enable-flashinfer-autotune +``` + +If the model basename is exactly `DeepSeek-V4-Flash` or `DeepSeek-V4-Pro` (without any extra suffix), +`run_evalscope.sh` will also automatically add: + +```bash +--enable-expert-parallel --moe-backend deep_gemm_mega_moe +``` + +Mixed preset example: + +```bash +bash run_quant.sh \ + --dtype=mxfp4_mixed \ + --input_model=/workspace/models/deepseek-ai/DeepSeek-V4-Flash \ + --output_model=/workspace/models/deepseek-ai/DeepSeek-V4-Flash-MXFP4-Mixed +``` + +## CLI Arguments + +- `--dtype`: quantization preset. + - `mxfp4`: `scheme=MXFP4` + - `mxfp4_mixed`: `scheme=MXFP8` + `layer_config={"ffn.experts": {"bits": 4, "data_type": "mx_fp"}}` + - `mxfp8`: `scheme=MXFP8` + - `w4a16`: `scheme=W4A16` + `layer_config={"wo_a": {"bits": 16}}` +- `--input_model`: HF model name or local model path. +- `--output_model`: output directory. +- `--format`: `auto_round` or `llm_compressor` (default: `llm_compressor`). +- `--ignore_layers`: comma-separated layer patterns (default: `compressor,indexer.weights_proj`). 
+ +`run_evalscope.sh` arguments: + +- `--model`: model path for vLLM and evalscope. +- `--port`: vLLM API port (default: `8009`). +- `--temp`: generation temperature used by evalscope (default: `0`). +- `--tasks`: comma-separated subset of `aime26`, `gpqa_diamond`, `ruler_qa_squad`, `piqa`, `hellaswag`, `gsm8k`, `mmlu_pro`, `math_500`, `mmlu` (default: all of them). +- `--skip_serve` (alias `--skip-serve`): skip starting vLLM (use an existing endpoint on the same `--port`). +- `--tp`: tensor parallel size for vLLM (default: `2`). +- `--kv-cache-dtype`: KV cache dtype for vLLM (default: `fp8`). +- `--block-size`: vLLM block size (default: `256`). + +## Notes + +- This flow is enabled only when: + - `config` is an `AutoRoundConfig` + - `config.model_free=True` + - `model` passed to `prepare/convert` is a `str` (model path or model name) +- The example uses `reloading=False` by default and saves quantized artifacts to `--output_model`. diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/quantize.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/quantize.py new file mode 100644 index 00000000000..d64fd3a480d --- /dev/null +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/quantize.py @@ -0,0 +1,116 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import logging + +from neural_compressor.torch.quantization import AutoRoundConfig, convert, prepare + +logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO) + + +_PRESET_CONFIG = { + "mxfp4": { + "scheme": "MXFP4", + "layer_config": None, + }, + # MXFP8 + experts FP4 mixed setup. + "mxfp4_mixed": { + "scheme": "MXFP8", + "layer_config": {"ffn.experts": {"bits": 4, "data_type": "mx_fp"}}, + }, + "mxfp8": { + "scheme": "MXFP8", + "layer_config": None, + }, + "w4a16": { + "scheme": "W4A16", + "layer_config": {"wo_a": {"bits": 16}}, + }, +} + + +def build_config(args: argparse.Namespace) -> AutoRoundConfig: + dtype_key = args.dtype.lower() + if dtype_key not in _PRESET_CONFIG: + raise ValueError(f"Unsupported dtype: {args.dtype}. Supported: {', '.join(_PRESET_CONFIG.keys())}") + + preset = _PRESET_CONFIG[dtype_key] + layer_config = preset["layer_config"] + if args.disable_preset_layer_config: + layer_config = None + + return AutoRoundConfig( + model_free=True, + scheme=preset["scheme"], + ignore_layers=args.ignore_layers, + layer_config=layer_config, + export_format=args.format, + output_dir=args.output_model, + reloading=False, + ) + + +def main() -> None: + parser = argparse.ArgumentParser(description="DeepSeek V4 model-free quantization via INC AutoRound prepare/convert.") + parser.add_argument( + "--dtype", + type=str, + required=True, + choices=sorted(_PRESET_CONFIG.keys()), + help="Quantization preset. e.g. 
mxfp4 or mxfp4_mixed", + ) + parser.add_argument( + "--input_model", + type=str, + required=True, + help="Model name or local path.", + ) + parser.add_argument( + "--output_model", + type=str, + required=True, + help="Output directory for quantized model.", + ) + parser.add_argument( + "--ignore_layers", + type=str, + default="compressor,indexer.weights_proj", + help="Comma-separated layer name patterns to skip.", + ) + parser.add_argument( + "--format", + type=str, + default="llm_compressor", + choices=["auto_round", "llm_compressor"], + help="Export format.", + ) + parser.add_argument( + "--disable_preset_layer_config", + action="store_true", + help="Disable preset layer_config for the selected dtype.", + ) + args = parser.parse_args() + + quant_config = build_config(args) + + model = args.input_model + model = prepare(model, quant_config) + _ = convert(model) + logger.info("Quantized model saved to %s", args.output_model) + + +if __name__ == "__main__": + main() diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_evalscope.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_evalscope.sh new file mode 100644 index 00000000000..6c19df57576 --- /dev/null +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_evalscope.sh @@ -0,0 +1,354 @@ +#!/bin/bash + +set -euo pipefail + +# Usage: +# bash run_evalscope.sh --model MODEL_PATH [--port PORT] [--temp TEMPERATURE] [--tasks TASK1,TASK2] +# +# This script can start vLLM serve and then run evalscope automatically. 
+ +PORT=8009 +MODEL=/workspace/models/deepseek-ai/DeepSeek-V4-Flash +TEMPERATURE=0 +KV_CACHE_DTYPE="fp8" +BLOCK_SIZE=256 +TENSOR_PARALLEL_SIZE=2 +SAFETENSORS_FAST_GPU="1" +TRUST_REMOTE_CODE="true" +NO_ENABLE_FLASHINFER_AUTOTUNE="true" +TASKS="" +SKIP_SERVE="${SKIP_SERVE:-false}" +VLLM_PID="" +LOG_TAIL_PID="" + +SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd) +cd "${SCRIPT_DIR}" + +# EXIT trap: stop the log tail and, unless SKIP_SERVE is "true", kill the vLLM server and its children. +cleanup() { + if [[ -n "${LOG_TAIL_PID}" ]] && kill -0 "${LOG_TAIL_PID}" 2>/dev/null; then + kill "${LOG_TAIL_PID}" 2>/dev/null || true + fi + + if [[ "${SKIP_SERVE}" == "true" ]]; then + return + fi + + if [[ -n "${VLLM_PID}" ]] && kill -0 "${VLLM_PID}" 2>/dev/null; then + CHILDREN=$(pgrep -P "${VLLM_PID}" || true) + if [[ -n "${CHILDREN}" ]]; then + kill -9 ${CHILDREN} 2>/dev/null || true + fi + kill -9 "${VLLM_PID}" 2>/dev/null || true + return + fi + + # Kill the process listening on the specified port to free GPU. + # "|| true": with "set -euo pipefail" an empty grep match would otherwise abort this trap and make the script exit 1. + VLLM_PIDS=$(ps aux | grep -- "vllm serve" | grep -- "--port[ =]${PORT}" | grep -v grep | awk '{print $2}' || true) + if [[ -n "${VLLM_PIDS}" ]]; then + for PID in ${VLLM_PIDS}; do + CHILDREN=$(pgrep -P "${PID}" || true) + if [[ -n "${CHILDREN}" ]]; then + kill -9 ${CHILDREN} 2>/dev/null || true + fi + kill -9 "${PID}" 2>/dev/null || true + done + fi +} + +trap cleanup EXIT + +# Stop the background "tail -f" of the vLLM log if it is still running. +stop_log_tail() { + if [[ -n "${LOG_TAIL_PID}" ]] && kill -0 "${LOG_TAIL_PID}" 2>/dev/null; then + kill "${LOG_TAIL_PID}" 2>/dev/null || true + LOG_TAIL_PID="" + fi +} + +# Print $1 with leading and trailing whitespace removed. +trim_task_name() { + local task_name="$1" + task_name="${task_name#${task_name%%[![:space:]]*}}" + task_name="${task_name%${task_name##*[![:space:]]}}" + echo "${task_name}" +} + +# Return 0 when $1 equals one of the remaining arguments, 1 otherwise. +task_in_list() { + local target_task="$1" + shift + local task_name + for task_name in "$@"; do + if [[ "${task_name}" == "${target_task}" ]]; then + return 0 + fi + done + return 1 +} + +# Log a "[step/total] title" banner to stdout and OUTPUT_FILE, then advance STEP_INDEX. +print_section_header() { + echo "=== [${STEP_INDEX}/${TOTAL_STEPS}] $1 ===" | tee -a "$OUTPUT_FILE" + STEP_INDEX=$((STEP_INDEX + 1)) +} + +while [[ $# -gt 0 ]]; do 
+ case "$1" in + --port) + PORT="$2"; shift 2 ;; + --model) + MODEL="$2"; shift 2 ;; + --temp) + TEMPERATURE="$2"; shift 2 ;; + --tasks) + TASKS="$2"; shift 2 ;; + --skip_serve) + SKIP_SERVE="true"; shift 1 ;; + --skip-serve) + SKIP_SERVE="true"; shift 1 ;; + --tp) + TENSOR_PARALLEL_SIZE="$2"; shift 2 ;; + --kv-cache-dtype) + KV_CACHE_DTYPE="$2"; shift 2 ;; + --block-size) + BLOCK_SIZE="$2"; shift 2 ;; + *) + echo "Unknown option: $1"; exit 1 ;; + esac +done + +SKIP_SERVE="$(echo "${SKIP_SERVE}" | tr '[:upper:]' '[:lower:]')" + +API_URL="http://127.0.0.1:${PORT}/v1" + +if [[ "${SKIP_SERVE}" != "true" ]]; then + echo "Starting vLLM serve on port ${PORT} ..." + MODEL_NORMALIZED="${MODEL%/}" + MODEL_NAME="${MODEL_NORMALIZED##*/}" + EXTRA_ARGS=() + # Only for base DeepSeek-V4-Flash/Pro model names without quantized suffixes. + if [[ "${MODEL_NAME}" == "DeepSeek-V4-Flash" || "${MODEL_NAME}" == "DeepSeek-V4-Pro" ]]; then + EXTRA_ARGS+=(--enable-expert-parallel) + EXTRA_ARGS+=(--moe-backend deep_gemm_mega_moe) + fi + + VLLM_CMD=( + vllm serve "${MODEL}" + --kv-cache-dtype "${KV_CACHE_DTYPE}" + --block-size "${BLOCK_SIZE}" + --tensor-parallel-size "${TENSOR_PARALLEL_SIZE}" + --attention_config.use_fp4_indexer_cache=True + --port "${PORT}" + ) + if [[ "${MODEL_NAME}" == *"DeepSeek-V4-Pro"* ]]; then + VLLM_CMD+=(--max-model-len 1048576) + fi + if [[ "${TRUST_REMOTE_CODE}" == "true" ]]; then + VLLM_CMD+=(--trust-remote-code) + fi + if [[ "${NO_ENABLE_FLASHINFER_AUTOTUNE}" == "true" ]]; then + VLLM_CMD+=(--no-enable-flashinfer-autotune) + fi + VLLM_CMD+=("${EXTRA_ARGS[@]}") + + SAFETENSORS_FAST_GPU="${SAFETENSORS_FAST_GPU}" "${VLLM_CMD[@]}" >/tmp/vllm_${PORT}.log 2>&1 & + VLLM_PID=$! + echo "vLLM launched. Log: /tmp/vllm_${PORT}.log" + echo "vLLM PID: ${VLLM_PID}" + echo "=== vLLM startup log (will stop after API wait ends) ===" + tail -n +1 -f "/tmp/vllm_${PORT}.log" & + LOG_TAIL_PID=$! +fi + +# Wait until the API is ready +echo "Waiting for API at ${API_URL} ..." 
+for _ in $(seq 1 90); do + if curl -sf "${API_URL}/models" -o /dev/null; then + break + fi + if [[ "${SKIP_SERVE}" != "true" ]] && [[ -n "${VLLM_PID}" ]] && ! kill -0 "${VLLM_PID}" 2>/dev/null; then + stop_log_tail + echo "[$(date '+%Y-%m-%d %H:%M:%S')] vLLM exited before API became ready." + echo "----- Last 80 lines of /tmp/vllm_${PORT}.log -----" + tail -n 80 "/tmp/vllm_${PORT}.log" || true + exit 1 + fi + echo "[$(date '+%Y-%m-%d %H:%M:%S')] Port ${PORT} not ready, retrying in 20s..." + sleep 20 +done + +stop_log_tail + +if ! curl -sf "${API_URL}/models" -o /dev/null; then + echo "[$(date '+%Y-%m-%d %H:%M:%S')] Timeout waiting for API at ${API_URL}." + echo "----- Last 80 lines of /tmp/vllm_${PORT}.log -----" + tail -n 80 "/tmp/vllm_${PORT}.log" || true + exit 1 +fi + +echo "[$(date '+%Y-%m-%d %H:%M:%S')] API is ready, starting evaluation." +MODEL_NORMALIZED="${MODEL%/}" +MODEL_NAME="${MODEL_NORMALIZED##*/}" +LOG_DIR="logs/${MODEL_NAME}" +mkdir -p "$LOG_DIR" +OUTPUT_FILE="${LOG_DIR}/eval_results_$(date +%Y%m%d_%H%M%S)_port${PORT}_temp${TEMPERATURE}.log" + +DEFAULT_STANDARD_TASKS=(piqa hellaswag gsm8k mmlu_pro math_500 mmlu) +SUPPORTED_TASKS=(aime26 gpqa_diamond ruler_qa_squad "${DEFAULT_STANDARD_TASKS[@]}") +SELECTED_STANDARD_TASKS=() +RUN_AIME26="true" +RUN_GPQA_DIAMOND="true" +RUN_STANDARD_TASKS="true" +RUN_RULER_QA_SQUAD="true" + +if [[ -n "${TASKS}" ]]; then + RUN_AIME26="false" + RUN_GPQA_DIAMOND="false" + RUN_STANDARD_TASKS="false" + RUN_RULER_QA_SQUAD="false" + + IFS=',' read -r -a REQUESTED_TASKS <<< "${TASKS}" + for raw_task in "${REQUESTED_TASKS[@]}"; do + task_name="$(trim_task_name "${raw_task}")" + if [[ -z "${task_name}" ]]; then + continue + fi + if ! 
task_in_list "${task_name}" "${SUPPORTED_TASKS[@]}"; then + echo "Unsupported task: ${task_name}" + echo "Supported tasks: ${SUPPORTED_TASKS[*]}" + exit 1 + fi + + case "${task_name}" in + aime26) + RUN_AIME26="true" + ;; + gpqa_diamond) + RUN_GPQA_DIAMOND="true" + ;; + ruler_qa_squad) + RUN_RULER_QA_SQUAD="true" + ;; + *) + if ! task_in_list "${task_name}" "${SELECTED_STANDARD_TASKS[@]}"; then + SELECTED_STANDARD_TASKS+=("${task_name}") + RUN_STANDARD_TASKS="true" + fi + ;; + esac + done + + if [[ "${RUN_AIME26}" != "true" ]] && [[ "${RUN_GPQA_DIAMOND}" != "true" ]] \ + && [[ "${RUN_STANDARD_TASKS}" != "true" ]] && [[ "${RUN_RULER_QA_SQUAD}" != "true" ]]; then + echo "No valid tasks selected from --tasks '${TASKS}'." + exit 1 + fi +else + SELECTED_STANDARD_TASKS=("${DEFAULT_STANDARD_TASKS[@]}") +fi + +TOTAL_STEPS=0 +if [[ "${RUN_AIME26}" == "true" ]]; then + TOTAL_STEPS=$((TOTAL_STEPS + 1)) +fi +if [[ "${RUN_GPQA_DIAMOND}" == "true" ]]; then + TOTAL_STEPS=$((TOTAL_STEPS + 1)) +fi +if [[ "${RUN_STANDARD_TASKS}" == "true" ]]; then + TOTAL_STEPS=$((TOTAL_STEPS + 1)) +fi +if [[ "${RUN_RULER_QA_SQUAD}" == "true" ]]; then + TOTAL_STEPS=$((TOTAL_STEPS + 1)) +fi +STEP_INDEX=1 + +echo "=== Evaluation started at $(date) ===" | tee "$OUTPUT_FILE" +echo "Model: $MODEL" | tee -a "$OUTPUT_FILE" +echo "API URL: $API_URL" | tee -a "$OUTPUT_FILE" +echo "Temperature: $TEMPERATURE" | tee -a "$OUTPUT_FILE" +if [[ -n "${TASKS}" ]]; then + echo "Tasks: ${TASKS}" | tee -a "$OUTPUT_FILE" +else + echo "Tasks: all default tasks" | tee -a "$OUTPUT_FILE" +fi +echo "" | tee -a "$OUTPUT_FILE" + + +if [[ "${RUN_AIME26}" == "true" ]]; then + echo "" | tee -a "$OUTPUT_FILE" + print_section_header "aime26 (n=10)" + evalscope eval \ + --model "$MODEL" \ + --eval-type openai_api \ + --api-key EMPTY \ + --datasets aime26 \ + --generation-config "{\"temperature\": ${TEMPERATURE}, \"n\": 10}" \ + --eval-batch-size 10 --timeout 3000 \ + --api-url "$API_URL" 2>&1 | tee -a "$OUTPUT_FILE" +fi + +if [[ 
"${RUN_GPQA_DIAMOND}" == "true" ]]; then + echo "" | tee -a "$OUTPUT_FILE" + print_section_header "gpqa_diamond (n=5)" + evalscope eval \ + --model "$MODEL" \ + --eval-type openai_api \ + --api-key EMPTY \ + --datasets gpqa_diamond \ + --generation-config "{\"temperature\": ${TEMPERATURE}, \"n\": 5}" \ + --eval-batch-size 10 --timeout 3000 \ + --api-url "$API_URL" 2>&1 | tee -a "$OUTPUT_FILE" +fi + +if [[ "${RUN_STANDARD_TASKS}" == "true" ]]; then + echo "" | tee -a "$OUTPUT_FILE" + print_section_header "${SELECTED_STANDARD_TASKS[*]}" + evalscope eval \ + --model "$MODEL" \ + --eval-type openai_api \ + --api-key EMPTY \ + --datasets "${SELECTED_STANDARD_TASKS[@]}" \ + --eval-batch-size 10 --timeout 3000 \ + --api-url "$API_URL" 2>&1 | tee -a "$OUTPUT_FILE" +fi + +if [[ "${RUN_RULER_QA_SQUAD}" == "true" ]]; then + echo "" | tee -a "$OUTPUT_FILE" + print_section_header "ruler_qa_squad (lm_eval, 1M)" + if [[ "${MODEL_NAME}" == *"DeepSeek-V4-Pro"* ]]; then + LMEVAL_OUTPUT_DIR="${LOG_DIR}/lm_eval_ruler_1M_qa" + mkdir -p "${LMEVAL_OUTPUT_DIR}" + LMEVAL_METADATA=$(printf '{"max_seq_lengths":[1000000],"pretrained":"%s/","use_fast":false}' "${MODEL_NORMALIZED}") + lm_eval \ + --model local-completions \ + --tasks ruler_qa_squad \ + --model_args "model=${MODEL_NORMALIZED},base_url=${API_URL}/completions,num_concurrent=1,max_retries=3,max_length=1048576" \ + --gen_kwargs "temperature=${TEMPERATURE},do_sample=False,max_tokens=128" \ + --metadata "${LMEVAL_METADATA}" \ + --batch_size 1 \ + --log_samples \ + --output_path "${LMEVAL_OUTPUT_DIR}" 2>&1 | tee -a "$OUTPUT_FILE" + else + echo "Skip ruler_qa_squad: only DeepSeek-V4-Pro is supported for this test." | tee -a "$OUTPUT_FILE" + fi +fi + + +echo "" | tee -a "$OUTPUT_FILE" +echo "=== Evaluation finished at $(date) ===" | tee -a "$OUTPUT_FILE" +echo "Results saved to: $OUTPUT_FILE" + +# Kill the process listening on the specified port to free GPU +echo "Stopping process on port ${PORT} to free GPU..." 
| tee -a "$OUTPUT_FILE" +VLLM_PIDS=$(ps aux | grep -- "vllm serve" | grep -- "--port[ =]${PORT}" | grep -v grep | awk '{print $2}') +if [[ -n "$VLLM_PIDS" ]]; then + echo "Found vllm serve process(es) with --port ${PORT}: $VLLM_PIDS" | tee -a "$OUTPUT_FILE" + for PID in $VLLM_PIDS; do + # Kill all child processes (including GPU processes) + CHILDREN=$(pgrep -P $PID) + if [[ -n "$CHILDREN" ]]; then + echo "Killing child processes of $PID: $CHILDREN" | tee -a "$OUTPUT_FILE" + kill -9 $CHILDREN 2>/dev/null + fi + kill -9 $PID 2>/dev/null + echo "Killed vllm serve process and its children: $PID $CHILDREN" | tee -a "$OUTPUT_FILE" + done +else + echo "No vllm serve process found with --port ${PORT}." | tee -a "$OUTPUT_FILE" +fi diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_quant.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_quant.sh new file mode 100644 index 00000000000..305c2cd266d --- /dev/null +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_quant.sh @@ -0,0 +1,55 @@ +#!/bin/bash +set -e + +SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd) + +DTYPE="" +INPUT_MODEL="" +OUTPUT_MODEL="" +FORMAT="llm_compressor" +IGNORE_LAYERS="compressor,indexer.weights_proj" + +usage() { + echo "Usage: bash run_quant.sh --dtype= --input_model= --output_model=" + echo "Optional: --format= --ignore_layers=" + exit 1 +} + +for arg in "$@"; do + case $arg in + --dtype=*) + DTYPE="${arg#*=}" + ;; + --input_model=*) + INPUT_MODEL="${arg#*=}" + ;; + --output_model=*) + OUTPUT_MODEL="${arg#*=}" + ;; + --format=*) + FORMAT="${arg#*=}" + ;; + --ignore_layers=*) + IGNORE_LAYERS="${arg#*=}" + ;; + -h|--help) + usage + ;; + *) + echo "Unknown option: $arg" + usage + ;; + esac +done + +[[ -z "$DTYPE" ]] && echo "Error: --dtype is required" && usage +[[ -z "$INPUT_MODEL" ]] && echo "Error: --input_model is required" 
&& usage +[[ -z "$OUTPUT_MODEL" ]] && echo "Error: --output_model is required" && usage + +cd "$SCRIPT_DIR" +python quantize.py \ + --dtype "$DTYPE" \ + --input_model "$INPUT_MODEL" \ + --output_model "$OUTPUT_MODEL" \ + --format "$FORMAT" \ + --ignore_layers "$IGNORE_LAYERS" diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/setup.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/setup.sh new file mode 100644 index 00000000000..0b1215ebfba --- /dev/null +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/setup.sh @@ -0,0 +1,9 @@ +#!/bin/bash +set -e + +uv pip install -U pip setuptools_rust setuptools_scm +uv pip install -U evalscope lm_eval transformers datasets +uv pip install git+https://github.com/intel/auto-round.git@main +uv pip install compressed-tensors --no-deps +bash <(curl -fsSL https://raw.githubusercontent.com/vllm-project/vllm/main/tools/install_deepgemm.sh) +VLLM_USE_PRECOMPILED=1 uv pip install git+https://github.com/xin3he/vllm-fork.git@support_deepseekv4_mxfp --no-build-isolation \ No newline at end of file diff --git a/neural_compressor/torch/algorithms/autoround/autoround.py b/neural_compressor/torch/algorithms/autoround/autoround.py index 94e012b0298..c4601528c96 100644 --- a/neural_compressor/torch/algorithms/autoround/autoround.py +++ b/neural_compressor/torch/algorithms/autoround/autoround.py @@ -158,6 +158,8 @@ def prepare(self, model: torch.nn.Module, *args, **kwargs): Returns: A prepared model. """ + if isinstance(model, str) and bool(getattr(self, "model_free", False)): + return model prepare_model = InputCaptureModule(model) return prepare_model @@ -171,20 +173,26 @@ def convert(self, model: torch.nn.Module, *args, **kwargs): The quantized model. 
""" pipe = kwargs.pop("pipeline", None) - tokenizer = getattr(model.orig_model, "tokenizer", None) - if tokenizer is not None: - delattr(model.orig_model, "tokenizer") - elif pipe is None: - tokenizer = "Placeholder" - self.dataset = CapturedDataloader(model.args_list, model.kwargs_list) - # Retrieve processor/image_processor/template from model if they were attached there - # (moved from quant_config to model to avoid duplicating large objects in per-layer configs) - for _attr in ("processor", "image_processor", "template"): - _val = getattr(model.orig_model, _attr, None) - if _val is not None: - setattr(self, _attr, _val) - delattr(model.orig_model, _attr) - model = model.orig_model + is_model_reference = isinstance(model, str) + if is_model_reference: + tokenizer = getattr(self, "tokenizer", None) + if tokenizer is None and pipe is None: + tokenizer = "Placeholder" + else: + tokenizer = getattr(model.orig_model, "tokenizer", None) + if tokenizer is not None: + delattr(model.orig_model, "tokenizer") + elif pipe is None: + tokenizer = "Placeholder" + self.dataset = CapturedDataloader(model.args_list, model.kwargs_list) + # Retrieve processor/image_processor/template from model if they were attached there + # (moved from quant_config to model to avoid duplicating large objects in per-layer configs) + for _attr in ("processor", "image_processor", "template"): + _val = getattr(model.orig_model, _attr, None) + if _val is not None: + setattr(self, _attr, _val) + delattr(model.orig_model, _attr) + model = model.orig_model if pipe is not None: model = pipe # Remove AutoRound specific args before passing to AutoRound constructor @@ -221,7 +229,8 @@ def convert(self, model: torch.nn.Module, *args, **kwargs): if self._is_w4afp8(): model, weight_config = rounder.quantize() - model.autoround_config = weight_config + if hasattr(model, "__dict__"): + model.autoround_config = weight_config return rounder.save_quantized(output_dir=self.output_dir, inplace=True) else: # pragma: no 
cover _, quantized_model_path = rounder.quantize_and_save( @@ -229,10 +238,12 @@ def convert(self, model: torch.nn.Module, *args, **kwargs): ) self.output_dir = quantized_model_path model = rounder.model - model.autoround_config = rounder.layer_config + if hasattr(model, "__dict__"): + model.autoround_config = rounder.layer_config self.accelerator.empty_cache() - dump_model_op_stats(rounder.layer_config) + if not bool(getattr(self, "model_free", False)): + dump_model_op_stats(rounder.layer_config) reloading = self.__dict__.get("reloading", True) if self.export_format in ["auto_round", "llm_compressor"] and reloading: @@ -248,7 +259,9 @@ def convert(self, model: torch.nn.Module, *args, **kwargs): except Exception as e: logger.error(f"Error reloading model: {e}") - setattr(model, "name_or_path", self.output_dir) # model is saved in a subfolder of output_dir based on scheme + if hasattr(model, "__dict__"): + # model is saved in a subfolder of output_dir based on scheme + setattr(model, "name_or_path", self.output_dir) return model diff --git a/neural_compressor/torch/quantization/algorithm_entry.py b/neural_compressor/torch/quantization/algorithm_entry.py index e15d19fba96..95651695030 100644 --- a/neural_compressor/torch/quantization/algorithm_entry.py +++ b/neural_compressor/torch/quantization/algorithm_entry.py @@ -560,12 +560,12 @@ def teq_quantize_entry( ###################### AUTOROUND Algo Entry ################################## @register_algo(name=AUTOROUND) def autoround_quantize_entry( - model: torch.nn.Module, + model, configs_mapping: Dict[Tuple[str, callable], AutoRoundConfig], mode: Mode = Mode.QUANTIZE, *args, **kwargs, -) -> torch.nn.Module: +): """The main entry to apply AutoRound quantization. 
Args: @@ -630,8 +630,10 @@ def autoround_quantize_entry( kwargs.pop("example_inputs") quantizer = get_quantizer(model, quantizer_cls=AutoRoundQuantizer, quant_config=quant_config, **params_dict) model = quantizer.execute(model=model, mode=mode, *args, **kwargs) - model.qconfig = configs_mapping - model.save = MethodType(save, model) + if hasattr(model, "__dict__"): + model.qconfig = configs_mapping + if isinstance(model, torch.nn.Module): + model.save = MethodType(save, model) postprocess_model(model, mode, quantizer) return model diff --git a/neural_compressor/torch/quantization/quantize.py b/neural_compressor/torch/quantization/quantize.py index 17641d81f2f..3bd7729b2b3 100644 --- a/neural_compressor/torch/quantization/quantize.py +++ b/neural_compressor/torch/quantization/quantize.py @@ -33,6 +33,25 @@ FRAMEWORK_NAME = "torch" +class _AutoRoundModelReference: + """A lightweight container for model-free AutoRound prepare/convert flow.""" + + def __init__(self, model_reference: str, quant_config: BaseConfig, example_inputs: Any = None): + self.model_reference = model_reference + self.quant_config = quant_config + self.example_inputs = example_inputs + self.is_prepared = True + + +def _is_autoround_model_free_string_case(model: Any, quant_config: BaseConfig) -> bool: + """Return True when model-free AutoRound is called with a string model reference.""" + return ( + isinstance(quant_config, AutoRoundConfig) + and bool(getattr(quant_config, "model_free", False)) + and isinstance(model, str) + ) + + def need_apply(configs_mapping: Dict[Tuple[str, callable], BaseConfig], algo_name): """Check whether to apply this algorithm according to configs_mapping. 
@@ -89,12 +108,16 @@ def preprocess_quant_config(model, quant_config, mode="prepare", example_inputs= ) model_info = quant_config.get_model_info(model, example_inputs) elif isinstance(quant_config, AutoRoundConfig): - for _attr in ("tokenizer", "processor", "image_processor", "template"): - _backup = getattr(quant_config, _attr, None) - if _backup is not None: - setattr(model, _attr, _backup) - delattr(quant_config, _attr) - model_info = quant_config.get_model_info(model=model) + if _is_autoround_model_free_string_case(model, quant_config): + # Keep optional large objects on config when model is a string reference. + model_info = quant_config.get_model_info(model=None) + else: + for _attr in ("tokenizer", "processor", "image_processor", "template"): + _backup = getattr(quant_config, _attr, None) + if _backup is not None: + setattr(model, _attr, _backup) + delattr(quant_config, _attr) + model_info = quant_config.get_model_info(model=model) else: model_info = quant_config.get_model_info(model=model) @@ -172,6 +195,9 @@ def prepare( Returns: prepared and calibrated module. """ + if _is_autoround_model_free_string_case(model, quant_config): + return _AutoRoundModelReference(model_reference=model, quant_config=quant_config, example_inputs=example_inputs) + prepared_model = model if inplace else copy.deepcopy(model) prepared_model, configs_mapping = preprocess_quant_config( prepared_model, quant_config, mode="prepare", example_inputs=example_inputs @@ -240,6 +266,13 @@ def convert( Returns: The quantized model. 
""" + if isinstance(model, _AutoRoundModelReference): + if quant_config is None: + quant_config = model.quant_config + else: + logger.warning("quant_config will be ignored since the model has been prepared.") + model = model.model_reference + q_model = model if inplace else copy.deepcopy(model) assert ( @@ -287,7 +320,8 @@ def convert( mode=Mode.CONVERT, **kwargs, ) - setattr(q_model, "is_quantized", True) + if hasattr(q_model, "__dict__"): + setattr(q_model, "is_quantized", True) return q_model diff --git a/test/torch/quantization/test_autoround_cpu.py b/test/torch/quantization/test_autoround_cpu.py index d25fc9bb1e6..f70e95248d7 100644 --- a/test/torch/quantization/test_autoround_cpu.py +++ b/test/torch/quantization/test_autoround_cpu.py @@ -16,6 +16,10 @@ prepare, quantize, ) +from neural_compressor.torch.quantization.quantize import ( + _AutoRoundModelReference, + _is_autoround_model_free_string_case, +) from neural_compressor.torch.utils import logger torch.backends.__allow_nonbracketed_mutation_flag = True @@ -638,3 +642,82 @@ def test_fp8_kv_attn(self, scheme, static_kv_dtype, static_attention_dtype, tmp_ assert ( getattr(attn, "q_scale", None) is not None ), f"Missing q_scale in attention for scheme={scheme}, static_attention_dtype={static_attention_dtype}" + + def test_is_autoround_model_free_string_case_true(self): + """Test detection when model is string and config has model_free=True.""" + config = AutoRoundConfig(model_free=True, scheme="MXFP4") + model = "/path/to/model" + assert _is_autoround_model_free_string_case(model, config) is True + + def test_is_autoround_model_free_string_case_false_not_string(self): + """Test detection returns False when model is not a string.""" + config = AutoRoundConfig(model_free=True, scheme="MXFP4") + model = torch.nn.Linear(10, 10) + assert _is_autoround_model_free_string_case(model, config) is False + + def test_is_autoround_model_free_string_case_false_no_flag(self): + """Test detection returns False when 
model_free is not set.""" + config = AutoRoundConfig(scheme="MXFP4") + model = "/path/to/model" + assert _is_autoround_model_free_string_case(model, config) is False + + def test_is_autoround_model_free_string_case_false_model_free_false(self): + """Test detection returns False when model_free is explicitly False.""" + config = AutoRoundConfig(model_free=False, scheme="MXFP4") + model = "/path/to/model" + assert _is_autoround_model_free_string_case(model, config) is False + + def test_autoround_model_reference_creation(self): + """Test _AutoRoundModelReference wrapper creation.""" + model_ref = "/path/to/deepseek-v4" + config = AutoRoundConfig(model_free=True, scheme="MXFP4") + example_inputs = {"input_ids": torch.ones(1, 10, dtype=torch.long)} + + ref = _AutoRoundModelReference(model_reference=model_ref, quant_config=config, example_inputs=example_inputs) + + assert ref.model_reference == model_ref + assert ref.quant_config is config + assert ref.example_inputs == example_inputs + assert ref.is_prepared is True + + def test_prepare_with_string_model_and_model_free_returns_reference(self): + """Test that prepare() returns _AutoRoundModelReference when called with string model and model_free=True.""" + model = "/path/to/model" + config = AutoRoundConfig( + model_free=True, + scheme="MXFP4", + ignore_layers="compressor", + output_dir="/tmp/test_output", + ) + + result = prepare(model, config) + + assert isinstance(result, _AutoRoundModelReference) + assert result.model_reference == model + assert result.quant_config is config + + def test_model_free_with_string_model(self): + """Test that prepare() preserves all config attributes in _AutoRoundModelReference.""" + model = "facebook/opt-125m" + layer_config = {"fc2": {"bits": 4, "data_type": "mx_fp"}} + config = AutoRoundConfig( + model_free=True, + scheme="MXFP8", + ignore_layers="self_attn", + layer_config=layer_config, + export_format="llm_compressor", + output_dir="/tmp/quantized_model", + ) + + result = 
prepare(model, config) + + assert isinstance(result, _AutoRoundModelReference) + assert result.quant_config.scheme == "MXFP8" + assert result.quant_config.ignore_layers == "self_attn" + assert result.quant_config.layer_config == layer_config + assert result.quant_config.export_format == "llm_compressor" + + result = convert(result) + assert not hasattr(result.model.decoder.layers[0].self_attn.k_proj, "quantization_scheme"), "Ignored layers were not preserved during conversion." + assert result.model.decoder.layers[0].fc1.quantization_scheme.format.value == 'mxfp8-quantized', "Model conversion did not preserve the quantization scheme format." + assert result.model.decoder.layers[0].fc2.quantization_scheme.format.value == 'mxfp4-pack-quantized', "Model conversion did not preserve the quantization scheme format for layer_config."