From 6140eab0c1ff9504dbbe2f265940b54acb3301d1 Mon Sep 17 00:00:00 2001 From: Xin He Date: Mon, 15 Jun 2026 16:23:46 +0800 Subject: [PATCH 01/12] Add DeepSeek V4 AutoRound example and scripts for model-free quantization Signed-off-by: Xin He --- .../auto_round/deepseekv4/README.md | 97 ++++++++++ .../auto_round/deepseekv4/quantize.py | 116 ++++++++++++ .../auto_round/deepseekv4/run_evalscope.sh | 171 ++++++++++++++++++ .../auto_round/deepseekv4/run_quant.sh | 55 ++++++ .../torch/algorithms/autoround/autoround.py | 46 +++-- .../torch/quantization/algorithm_entry.py | 10 +- .../torch/quantization/quantize.py | 48 ++++- 7 files changed, 515 insertions(+), 28 deletions(-) create mode 100644 examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/README.md create mode 100644 examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/quantize.py create mode 100644 examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_evalscope.sh create mode 100644 examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_quant.sh diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/README.md new file mode 100644 index 00000000000..6930104d889 --- /dev/null +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/README.md @@ -0,0 +1,97 @@ +# DeepSeek V4 AutoRound (INC prepare/convert) + +This example demonstrates model-free quantization through INC API: + +```python +from neural_compressor.torch.quantization import AutoRoundConfig, prepare, convert + +config = AutoRoundConfig( + model_free=True, + scheme="MXFP4", + ignore_layers="compressor,indexer.weights_proj", + export_format="llm_compressor", + output_dir="/path/to/output", +) +model = 
"/path/or/hf_model_name" +model = prepare(model, config) +model = convert(model) +``` + +## Quick Start + +```bash +cd examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4 +bash run_quant.sh \ + --dtype=mxfp4_mixed \ + --input_model=/workspace/models/deepseek-ai/DeepSeek-V4-Flash \ + --output_model=/workspace/models/deepseek-ai/DeepSeek-V4-Flash-MXFP4-Mixed +``` + +Then run serving + evaluation in one command: + +```bash +CUDA_VISIBLE_DEVICES=0,1 bash run_evalscope.sh \ + --model /workspace/models/deepseek-ai/DeepSeek-V4-Flash-MXFP4-Mixed \ + --tp 2 \ + --port 8009 \ + --temp 1.0 +``` + +Equivalent vLLM defaults inside `run_evalscope.sh`: + +```bash +SAFETENSORS_FAST_GPU=1 CUDA_VISIBLE_DEVICES=0,1 vllm serve \ + --trust-remote-code \ + --kv-cache-dtype fp8 \ + --block-size 256 \ + --tensor-parallel-size 2 \ + --attention_config.use_fp4_indexer_cache=True \ + --port 8009 \ + --no-enable-flashinfer-autotune +``` + +If model basename is exactly `DeepSeek-V4-Flash` or `DeepSeek-V4-Pro` (without extra suffix), +`run_evalscope.sh` will also add: + +```bash +--enable-expert-parallel --moe-backend deep_gemm_mega_moe +``` + +Mixed preset example: + +```bash +bash run_quant.sh \ + --dtype=mxfp4_mixed \ + --input_model=/workspace/models/deepseek-ai/DeepSeek-V4-Flash \ + --output_model=/workspace/models/deepseek-ai/DeepSeek-V4-Flash-MXFP8 +``` + +## CLI Arguments + +- `--dtype`: quantization preset. + - `mxfp4`: `scheme=MXFP4` + - `mxfp4_mixed`: `scheme=MXFP8` + `layer_config={"ffn.experts": {"bits": 4, "data_type": "mx_fp"}}` + - `mxfp8`: `scheme=MXFP8` + - `w4a16`: `scheme=W4A16` + `layer_config={"wo_a": {"bits": 16}}` +- `--input_model`: HF model name or local model path. +- `--output_model`: output directory. +- `--format`: `auto_round` or `llm_compressor` (default: `llm_compressor`). +- `--ignore_layers`: comma-separated layer patterns (default: `compressor,indexer.weights_proj`). 
+ +`run_evalscope.sh` arguments: + +- `--model`: model path for vLLM and evalscope. +- `--port`: vLLM API port (default: `8009`). +- `--temp`: generation temperature used by evalscope (default: `0`). +- `--skip_serve`: skip starting vLLM (use existing endpoint on the same `--port`). +- `--tp`: tensor parallel size for vLLM (default: `2`). +- `--kv-cache-dtype`: kv cache dtype for vLLM (default: `fp8`). +- `--block-size`: vLLM block size (default: `256`). + +## Notes + +- This flow is enabled only when: + - `config` is `AutoRoundConfig` + - `config.model_free=True` + - `model` passed to `prepare/convert` is a `str` (model path or model name) +- The example uses `reloading=False` by default and saves quantized artifacts to `--output_model`. diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/quantize.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/quantize.py new file mode 100644 index 00000000000..d64fd3a480d --- /dev/null +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/quantize.py @@ -0,0 +1,116 @@ +# Copyright (c) 2026 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import logging + +from neural_compressor.torch.quantization import AutoRoundConfig, convert, prepare + +logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO) + + +_PRESET_CONFIG = { + "mxfp4": { + "scheme": "MXFP4", + "layer_config": None, + }, + # MXFP8 + experts FP4 mixed setup. + "mxfp4_mixed": { + "scheme": "MXFP8", + "layer_config": {"ffn.experts": {"bits": 4, "data_type": "mx_fp"}}, + }, + "mxfp8": { + "scheme": "MXFP8", + "layer_config": None, + }, + "w4a16": { + "scheme": "W4A16", + "layer_config": {"wo_a": {"bits": 16}}, + }, +} + + +def build_config(args: argparse.Namespace) -> AutoRoundConfig: + dtype_key = args.dtype.lower() + if dtype_key not in _PRESET_CONFIG: + raise ValueError(f"Unsupported dtype: {args.dtype}. Supported: {', '.join(_PRESET_CONFIG.keys())}") + + preset = _PRESET_CONFIG[dtype_key] + layer_config = preset["layer_config"] + if args.disable_preset_layer_config: + layer_config = None + + return AutoRoundConfig( + model_free=True, + scheme=preset["scheme"], + ignore_layers=args.ignore_layers, + layer_config=layer_config, + export_format=args.format, + output_dir=args.output_model, + reloading=False, + ) + + +def main() -> None: + parser = argparse.ArgumentParser(description="DeepSeek V4 model-free quantization via INC AutoRound prepare/convert.") + parser.add_argument( + "--dtype", + type=str, + required=True, + choices=sorted(_PRESET_CONFIG.keys()), + help="Quantization preset. e.g. 
mxfp4 or mxfp4_mixed", + ) + parser.add_argument( + "--input_model", + type=str, + required=True, + help="Model name or local path.", + ) + parser.add_argument( + "--output_model", + type=str, + required=True, + help="Output directory for quantized model.", + ) + parser.add_argument( + "--ignore_layers", + type=str, + default="compressor,indexer.weights_proj", + help="Comma-separated layer name patterns to skip.", + ) + parser.add_argument( + "--format", + type=str, + default="llm_compressor", + choices=["auto_round", "llm_compressor"], + help="Export format.", + ) + parser.add_argument( + "--disable_preset_layer_config", + action="store_true", + help="Disable preset layer_config for the selected dtype.", + ) + args = parser.parse_args() + + quant_config = build_config(args) + + model = args.input_model + model = prepare(model, quant_config) + _ = convert(model) + logger.info("Quantized model saved to %s", args.output_model) + + +if __name__ == "__main__": + main() diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_evalscope.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_evalscope.sh new file mode 100644 index 00000000000..7f1f1f97196 --- /dev/null +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_evalscope.sh @@ -0,0 +1,171 @@ +#!/bin/bash + +set -euo pipefail + +# Usage: +# bash run_evalscope.sh --model MODEL_PATH [--port PORT] [--temp TEMPERATURE] +# +# This script can start vLLM serve and then run evalscope automatically. 
+ +PORT=8009 +MODEL=/workspace/models/deepseek-ai/DeepSeek-V4-Flash +TEMPERATURE=0 +KV_CACHE_DTYPE="fp8" +BLOCK_SIZE=256 +TENSOR_PARALLEL_SIZE=2 +SAFETENSORS_FAST_GPU="1" +TRUST_REMOTE_CODE="true" +NO_ENABLE_FLASHINFER_AUTOTUNE="true" +SKIP_SERVE="false" + +SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd) +cd "${SCRIPT_DIR}" + +cleanup() { + if [[ "${SKIP_SERVE}" == "true" ]]; then + return + fi + # Kill the process listening on the specified port to free GPU. + VLLM_PIDS=$(ps aux | grep -- "vllm serve" | grep -- "--port[ =]${PORT}" | grep -v grep | awk '{print $2}') + if [[ -n "${VLLM_PIDS}" ]]; then + for PID in ${VLLM_PIDS}; do + CHILDREN=$(pgrep -P "${PID}" || true) + if [[ -n "${CHILDREN}" ]]; then + kill -9 ${CHILDREN} 2>/dev/null || true + fi + kill -9 "${PID}" 2>/dev/null || true + done + fi +} + +trap cleanup EXIT + +while [[ $# -gt 0 ]]; do + case "$1" in + --port) + PORT="$2"; shift 2 ;; + --model) + MODEL="$2"; shift 2 ;; + --temp) + TEMPERATURE="$2"; shift 2 ;; + --skip_serve) + SKIP_SERVE="true"; shift 1 ;; + --tp) + TENSOR_PARALLEL_SIZE="$2"; shift 2 ;; + --kv-cache-dtype) + KV_CACHE_DTYPE="$2"; shift 2 ;; + --block-size) + BLOCK_SIZE="$2"; shift 2 ;; + *) + echo "Unknown option: $1"; exit 1 ;; + esac +done + +API_URL="http://127.0.0.1:${PORT}/v1" + +if [[ "${SKIP_SERVE}" != "true" ]]; then + echo "Starting vLLM serve on port ${PORT} ..." + MODEL_NORMALIZED="${MODEL%/}" + MODEL_NAME="${MODEL_NORMALIZED##*/}" + EXTRA_ARGS=() + # Only for base DeepSeek-V4-Flash/Pro model names without quantized suffixes. 
+ if [[ "${MODEL_NAME}" == "DeepSeek-V4-Flash" || "${MODEL_NAME}" == "DeepSeek-V4-Pro" ]]; then + EXTRA_ARGS+=(--enable-expert-parallel) + EXTRA_ARGS+=(--moe-backend deep_gemm_mega_moe) + fi + + VLLM_CMD=( + vllm serve "${MODEL}" + --kv-cache-dtype "${KV_CACHE_DTYPE}" + --block-size "${BLOCK_SIZE}" + --tensor-parallel-size "${TENSOR_PARALLEL_SIZE}" + --attention_config.use_fp4_indexer_cache=True + --port "${PORT}" + ) + if [[ "${TRUST_REMOTE_CODE}" == "true" ]]; then + VLLM_CMD+=(--trust-remote-code) + fi + if [[ "${NO_ENABLE_FLASHINFER_AUTOTUNE}" == "true" ]]; then + VLLM_CMD+=(--no-enable-flashinfer-autotune) + fi + VLLM_CMD+=("${EXTRA_ARGS[@]}") + + SAFETENSORS_FAST_GPU="${SAFETENSORS_FAST_GPU}" "${VLLM_CMD[@]}" >/tmp/vllm_${PORT}.log 2>&1 & + echo "vLLM launched. Log: /tmp/vllm_${PORT}.log" +fi + +# Wait until the API is ready +echo "Waiting for API at ${API_URL} ..." +until curl -sf "${API_URL}/models" -o /dev/null; do + echo "[$(date '+%Y-%m-%d %H:%M:%S')] Port ${PORT} not ready, retrying in 20s..." + sleep 20 +done +echo "[$(date '+%Y-%m-%d %H:%M:%S')] API is ready, starting evaluation." 
+ +MODEL_NORMALIZED="${MODEL%/}" +MODEL_NAME="${MODEL_NORMALIZED##*/}" +LOG_DIR="logs/${MODEL_NAME}" +mkdir -p "$LOG_DIR" +OUTPUT_FILE="${LOG_DIR}/eval_results_$(date +%Y%m%d_%H%M%S)_port${PORT}_temp${TEMPERATURE}.log" + +echo "=== Evaluation started at $(date) ===" | tee "$OUTPUT_FILE" +echo "Model: $MODEL" | tee -a "$OUTPUT_FILE" +echo "API URL: $API_URL" | tee -a "$OUTPUT_FILE" +echo "Temperature: $TEMPERATURE" | tee -a "$OUTPUT_FILE" +echo "" | tee -a "$OUTPUT_FILE" + + +echo "" | tee -a "$OUTPUT_FILE" +echo "=== [1/3] aime26 (n=10) ===" | tee -a "$OUTPUT_FILE" +evalscope eval \ + --model "$MODEL" \ + --eval-type openai_api \ + --api-key EMPTY \ + --datasets aime26 \ + --generation-config "{\"temperature\": ${TEMPERATURE}, \"n\": 10}" \ + --eval-batch-size 10 --timeout 3000 \ + --api-url "$API_URL" 2>&1 | tee -a "$OUTPUT_FILE" +echo "" | tee -a "$OUTPUT_FILE" + +echo "=== [2/3] gpqa_diamond (n=5) ===" | tee -a "$OUTPUT_FILE" +evalscope eval \ + --model "$MODEL" \ + --eval-type openai_api \ + --api-key EMPTY \ + --datasets gpqa_diamond \ + --generation-config "{\"temperature\": ${TEMPERATURE}, \"n\": 5}" \ + --eval-batch-size 10 --timeout 3000 \ + --api-url "$API_URL" 2>&1 | tee -a "$OUTPUT_FILE" + +echo "=== [3/3] piqa hellaswag gsm8k mmlu_pro math_500 mmlu ===" | tee -a "$OUTPUT_FILE" +evalscope eval \ + --model "$MODEL" \ + --eval-type openai_api \ + --api-key EMPTY \ + --datasets piqa hellaswag gsm8k mmlu_pro math_500 mmlu \ + --eval-batch-size 10 --timeout 3000 \ + --api-url "$API_URL" 2>&1 | tee -a "$OUTPUT_FILE" + + +echo "" | tee -a "$OUTPUT_FILE" +echo "=== Evaluation finished at $(date) ===" | tee -a "$OUTPUT_FILE" +echo "Results saved to: $OUTPUT_FILE" + +# Kill the process listening on the specified port to free GPU +echo "Stopping process on port ${PORT} to free GPU..." 
| tee -a "$OUTPUT_FILE" +VLLM_PIDS=$(ps aux | grep -- "vllm serve" | grep -- "--port[ =]${PORT}" | grep -v grep | awk '{print $2}' || true) +if [[ -n "$VLLM_PIDS" ]]; then + echo "Found vllm serve process(es) with --port ${PORT}: $VLLM_PIDS" | tee -a "$OUTPUT_FILE" + for PID in $VLLM_PIDS; do + # Kill all child processes (including GPU processes); "|| true" keeps "set -euo pipefail" from aborting when nothing matches or a process is already gone. + CHILDREN=$(pgrep -P "$PID" || true) + if [[ -n "$CHILDREN" ]]; then + echo "Killing child processes of $PID: $CHILDREN" | tee -a "$OUTPUT_FILE" + kill -9 $CHILDREN 2>/dev/null || true + fi + kill -9 "$PID" 2>/dev/null || true + echo "Killed vllm serve process and its children: $PID $CHILDREN" | tee -a "$OUTPUT_FILE" + done +else + echo "No vllm serve process found with --port ${PORT}." | tee -a "$OUTPUT_FILE" +fi diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_quant.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_quant.sh new file mode 100644 index 00000000000..305c2cd266d --- /dev/null +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_quant.sh @@ -0,0 +1,55 @@ +#!/bin/bash +set -e + +SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd) + +DTYPE="" +INPUT_MODEL="" +OUTPUT_MODEL="" +FORMAT="llm_compressor" +IGNORE_LAYERS="compressor,indexer.weights_proj" + +usage() { + echo "Usage: bash run_quant.sh --dtype= --input_model= --output_model=" + echo "Optional: --format= --ignore_layers=" + exit 1 +} + +for arg in "$@"; do + case $arg in + --dtype=*) + DTYPE="${arg#*=}" + ;; + --input_model=*) + INPUT_MODEL="${arg#*=}" + ;; + --output_model=*) + OUTPUT_MODEL="${arg#*=}" + ;; + --format=*) + FORMAT="${arg#*=}" + ;; + --ignore_layers=*) + IGNORE_LAYERS="${arg#*=}" + ;; + -h|--help) + usage + ;; + *) + echo "Unknown option: $arg" + usage + ;; + esac +done + +[[ -z "$DTYPE" ]] && echo "Error: --dtype is required" && usage +[[ -z "$INPUT_MODEL" ]] && echo "Error: --input_model is required" 
&& usage +[[ -z "$OUTPUT_MODEL" ]] && echo "Error: --output_model is required" && usage + +cd "$SCRIPT_DIR" +python quantize.py \ + --dtype "$DTYPE" \ + --input_model "$INPUT_MODEL" \ + --output_model "$OUTPUT_MODEL" \ + --format "$FORMAT" \ + --ignore_layers "$IGNORE_LAYERS" diff --git a/neural_compressor/torch/algorithms/autoround/autoround.py b/neural_compressor/torch/algorithms/autoround/autoround.py index 5b440a57c0b..bf820b083ab 100644 --- a/neural_compressor/torch/algorithms/autoround/autoround.py +++ b/neural_compressor/torch/algorithms/autoround/autoround.py @@ -158,6 +158,8 @@ def prepare(self, model: torch.nn.Module, *args, **kwargs): Returns: A prepared model. """ + if isinstance(model, str) and bool(getattr(self, "model_free", False)): + return model prepare_model = InputCaptureModule(model) return prepare_model @@ -171,20 +173,26 @@ def convert(self, model: torch.nn.Module, *args, **kwargs): The quantized model. """ pipe = kwargs.pop("pipeline", None) - tokenizer = getattr(model.orig_model, "tokenizer", None) - if tokenizer is not None: - delattr(model.orig_model, "tokenizer") - elif pipe is None: - tokenizer = "Placeholder" - self.dataset = CapturedDataloader(model.args_list, model.kwargs_list) - # Retrieve processor/image_processor/template from model if they were attached there - # (moved from quant_config to model to avoid duplicating large objects in per-layer configs) - for _attr in ("processor", "image_processor", "template"): - _val = getattr(model.orig_model, _attr, None) - if _val is not None: - setattr(self, _attr, _val) - delattr(model.orig_model, _attr) - model = model.orig_model + is_model_reference = isinstance(model, str) + if is_model_reference: + tokenizer = getattr(self, "tokenizer", None) + if tokenizer is None and pipe is None: + tokenizer = "Placeholder" + else: + tokenizer = getattr(model.orig_model, "tokenizer", None) + if tokenizer is not None: + delattr(model.orig_model, "tokenizer") + elif pipe is None: + tokenizer = 
"Placeholder" + self.dataset = CapturedDataloader(model.args_list, model.kwargs_list) + # Retrieve processor/image_processor/template from model if they were attached there + # (moved from quant_config to model to avoid duplicating large objects in per-layer configs) + for _attr in ("processor", "image_processor", "template"): + _val = getattr(model.orig_model, _attr, None) + if _val is not None: + setattr(self, _attr, _val) + delattr(model.orig_model, _attr) + model = model.orig_model if pipe is not None: model = pipe # Remove AutoRound specific args before passing to AutoRound constructor @@ -221,7 +229,8 @@ def convert(self, model: torch.nn.Module, *args, **kwargs): if self._is_w4afp8(): model, weight_config = rounder.quantize() - model.autoround_config = weight_config + if hasattr(model, "__dict__"): + model.autoround_config = weight_config return rounder.save_quantized(output_dir=self.output_dir, inplace=True) else: # pragma: no cover _, quantized_model_path = rounder.quantize_and_save( @@ -229,7 +238,8 @@ def convert(self, model: torch.nn.Module, *args, **kwargs): ) self.output_dir = quantized_model_path model = rounder.model - model.autoround_config = rounder.layer_config + if hasattr(model, "__dict__"): + model.autoround_config = rounder.layer_config self.accelerator.empty_cache() dump_model_op_stats(rounder.layer_config) @@ -248,7 +258,9 @@ def convert(self, model: torch.nn.Module, *args, **kwargs): except Exception as e: logger.error(f"Error reloading model: {e}") - setattr(model, "name_or_path", self.output_dir) # model is saved in a subfolder of output_dir based on scheme + if hasattr(model, "__dict__"): + # model is saved in a subfolder of output_dir based on scheme + setattr(model, "name_or_path", self.output_dir) return model diff --git a/neural_compressor/torch/quantization/algorithm_entry.py b/neural_compressor/torch/quantization/algorithm_entry.py index e15d19fba96..95651695030 100644 --- a/neural_compressor/torch/quantization/algorithm_entry.py 
+++ b/neural_compressor/torch/quantization/algorithm_entry.py @@ -560,12 +560,12 @@ def teq_quantize_entry( ###################### AUTOROUND Algo Entry ################################## @register_algo(name=AUTOROUND) def autoround_quantize_entry( - model: torch.nn.Module, + model, configs_mapping: Dict[Tuple[str, callable], AutoRoundConfig], mode: Mode = Mode.QUANTIZE, *args, **kwargs, -) -> torch.nn.Module: +): """The main entry to apply AutoRound quantization. Args: @@ -630,8 +630,10 @@ def autoround_quantize_entry( kwargs.pop("example_inputs") quantizer = get_quantizer(model, quantizer_cls=AutoRoundQuantizer, quant_config=quant_config, **params_dict) model = quantizer.execute(model=model, mode=mode, *args, **kwargs) - model.qconfig = configs_mapping - model.save = MethodType(save, model) + if hasattr(model, "__dict__"): + model.qconfig = configs_mapping + if isinstance(model, torch.nn.Module): + model.save = MethodType(save, model) postprocess_model(model, mode, quantizer) return model diff --git a/neural_compressor/torch/quantization/quantize.py b/neural_compressor/torch/quantization/quantize.py index 17641d81f2f..3bd7729b2b3 100644 --- a/neural_compressor/torch/quantization/quantize.py +++ b/neural_compressor/torch/quantization/quantize.py @@ -33,6 +33,25 @@ FRAMEWORK_NAME = "torch" +class _AutoRoundModelReference: + """A lightweight container for model-free AutoRound prepare/convert flow.""" + + def __init__(self, model_reference: str, quant_config: BaseConfig, example_inputs: Any = None): + self.model_reference = model_reference + self.quant_config = quant_config + self.example_inputs = example_inputs + self.is_prepared = True + + +def _is_autoround_model_free_string_case(model: Any, quant_config: BaseConfig) -> bool: + """Return True when model-free AutoRound is called with a string model reference.""" + return ( + isinstance(quant_config, AutoRoundConfig) + and bool(getattr(quant_config, "model_free", False)) + and isinstance(model, str) + ) + + def 
need_apply(configs_mapping: Dict[Tuple[str, callable], BaseConfig], algo_name): """Check whether to apply this algorithm according to configs_mapping. @@ -89,12 +108,16 @@ def preprocess_quant_config(model, quant_config, mode="prepare", example_inputs= ) model_info = quant_config.get_model_info(model, example_inputs) elif isinstance(quant_config, AutoRoundConfig): - for _attr in ("tokenizer", "processor", "image_processor", "template"): - _backup = getattr(quant_config, _attr, None) - if _backup is not None: - setattr(model, _attr, _backup) - delattr(quant_config, _attr) - model_info = quant_config.get_model_info(model=model) + if _is_autoround_model_free_string_case(model, quant_config): + # Keep optional large objects on config when model is a string reference. + model_info = quant_config.get_model_info(model=None) + else: + for _attr in ("tokenizer", "processor", "image_processor", "template"): + _backup = getattr(quant_config, _attr, None) + if _backup is not None: + setattr(model, _attr, _backup) + delattr(quant_config, _attr) + model_info = quant_config.get_model_info(model=model) else: model_info = quant_config.get_model_info(model=model) @@ -172,6 +195,9 @@ def prepare( Returns: prepared and calibrated module. """ + if _is_autoround_model_free_string_case(model, quant_config): + return _AutoRoundModelReference(model_reference=model, quant_config=quant_config, example_inputs=example_inputs) + prepared_model = model if inplace else copy.deepcopy(model) prepared_model, configs_mapping = preprocess_quant_config( prepared_model, quant_config, mode="prepare", example_inputs=example_inputs @@ -240,6 +266,13 @@ def convert( Returns: The quantized model. 
""" + if isinstance(model, _AutoRoundModelReference): + if quant_config is None: + quant_config = model.quant_config + else: + logger.warning("quant_config will be ignored since the model has been prepared.") + model = model.model_reference + q_model = model if inplace else copy.deepcopy(model) assert ( @@ -287,7 +320,8 @@ def convert( mode=Mode.CONVERT, **kwargs, ) - setattr(q_model, "is_quantized", True) + if hasattr(q_model, "__dict__"): + setattr(q_model, "is_quantized", True) return q_model From a893a69369bdd8f9d33b69f5d40b68eeba2c34f7 Mon Sep 17 00:00:00 2001 From: Xin He Date: Tue, 16 Jun 2026 10:33:44 +0800 Subject: [PATCH 02/12] Refactor AutoRoundQuantizer to conditionally dump model operation stats and add tests for model-free string case detection Signed-off-by: Xin He --- .../torch/algorithms/autoround/autoround.py | 3 +- test/torch/quantization/test_autoround_cpu.py | 83 +++++++++++++++++++ 2 files changed, 85 insertions(+), 1 deletion(-) diff --git a/neural_compressor/torch/algorithms/autoround/autoround.py b/neural_compressor/torch/algorithms/autoround/autoround.py index bf820b083ab..654b58d9501 100644 --- a/neural_compressor/torch/algorithms/autoround/autoround.py +++ b/neural_compressor/torch/algorithms/autoround/autoround.py @@ -242,7 +242,8 @@ def convert(self, model: torch.nn.Module, *args, **kwargs): model.autoround_config = rounder.layer_config self.accelerator.empty_cache() - dump_model_op_stats(rounder.layer_config) + if not bool(getattr(self, "model_free", False)): + dump_model_op_stats(rounder.layer_config) reloading = self.__dict__.get("reloading", True) if self.export_format in ["auto_round", "llm_compressor"] and reloading: diff --git a/test/torch/quantization/test_autoround_cpu.py b/test/torch/quantization/test_autoround_cpu.py index d25fc9bb1e6..f70e95248d7 100644 --- a/test/torch/quantization/test_autoround_cpu.py +++ b/test/torch/quantization/test_autoround_cpu.py @@ -16,6 +16,10 @@ prepare, quantize, ) +from 
neural_compressor.torch.quantization.quantize import ( + _AutoRoundModelReference, + _is_autoround_model_free_string_case, +) from neural_compressor.torch.utils import logger torch.backends.__allow_nonbracketed_mutation_flag = True @@ -638,3 +642,82 @@ def test_fp8_kv_attn(self, scheme, static_kv_dtype, static_attention_dtype, tmp_ assert ( getattr(attn, "q_scale", None) is not None ), f"Missing q_scale in attention for scheme={scheme}, static_attention_dtype={static_attention_dtype}" + + def test_is_autoround_model_free_string_case_true(self): + """Test detection when model is string and config has model_free=True.""" + config = AutoRoundConfig(model_free=True, scheme="MXFP4") + model = "/path/to/model" + assert _is_autoround_model_free_string_case(model, config) is True + + def test_is_autoround_model_free_string_case_false_not_string(self): + """Test detection returns False when model is not a string.""" + config = AutoRoundConfig(model_free=True, scheme="MXFP4") + model = torch.nn.Linear(10, 10) + assert _is_autoround_model_free_string_case(model, config) is False + + def test_is_autoround_model_free_string_case_false_no_flag(self): + """Test detection returns False when model_free is not set.""" + config = AutoRoundConfig(scheme="MXFP4") + model = "/path/to/model" + assert _is_autoround_model_free_string_case(model, config) is False + + def test_is_autoround_model_free_string_case_false_model_free_false(self): + """Test detection returns False when model_free is explicitly False.""" + config = AutoRoundConfig(model_free=False, scheme="MXFP4") + model = "/path/to/model" + assert _is_autoround_model_free_string_case(model, config) is False + + def test_autoround_model_reference_creation(self): + """Test _AutoRoundModelReference wrapper creation.""" + model_ref = "/path/to/deepseek-v4" + config = AutoRoundConfig(model_free=True, scheme="MXFP4") + example_inputs = {"input_ids": torch.ones(1, 10, dtype=torch.long)} + + ref = 
_AutoRoundModelReference(model_reference=model_ref, quant_config=config, example_inputs=example_inputs) + + assert ref.model_reference == model_ref + assert ref.quant_config is config + assert ref.example_inputs == example_inputs + assert ref.is_prepared is True + + def test_prepare_with_string_model_and_model_free_returns_reference(self): + """Test that prepare() returns _AutoRoundModelReference when called with string model and model_free=True.""" + model = "/path/to/model" + config = AutoRoundConfig( + model_free=True, + scheme="MXFP4", + ignore_layers="compressor", + output_dir="/tmp/test_output", + ) + + result = prepare(model, config) + + assert isinstance(result, _AutoRoundModelReference) + assert result.model_reference == model + assert result.quant_config is config + + def test_model_free_with_string_model(self): + """Test that prepare() preserves all config attributes in _AutoRoundModelReference.""" + model = "facebook/opt-125m" + layer_config = {"fc2": {"bits": 4, "data_type": "mx_fp"}} + config = AutoRoundConfig( + model_free=True, + scheme="MXFP8", + ignore_layers="self_attn", + layer_config=layer_config, + export_format="llm_compressor", + output_dir="/tmp/quantized_model", + ) + + result = prepare(model, config) + + assert isinstance(result, _AutoRoundModelReference) + assert result.quant_config.scheme == "MXFP8" + assert result.quant_config.ignore_layers == "self_attn" + assert result.quant_config.layer_config == layer_config + assert result.quant_config.export_format == "llm_compressor" + + result = convert(result) + assert not hasattr(result.model.decoder.layers[0].self_attn.k_proj, "quantization_scheme"), "Ignored layers were not preserved during conversion." + assert result.model.decoder.layers[0].fc1.quantization_scheme.format.value == 'mxfp8-quantized', "Model conversion did not preserve the quantization scheme format." 
+ assert result.model.decoder.layers[0].fc2.quantization_scheme.format.value == 'mxfp4-pack-quantized', "Model conversion did not preserve the quantization scheme format for layer_config." From 214328a3c40f3ed3d61244d725d51aed9e84d7f8 Mon Sep 17 00:00:00 2001 From: Xin He Date: Tue, 16 Jun 2026 10:50:07 +0800 Subject: [PATCH 03/12] update run_evalscope Signed-off-by: Xin He --- .../auto_round/deepseekv4/README.md | 2 +- .../auto_round/deepseekv4/run_evalscope.sh | 56 ++++++++++++++++++- 2 files changed, 55 insertions(+), 3 deletions(-) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/README.md index 6930104d889..9c11772c83f 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/README.md +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/README.md @@ -51,7 +51,7 @@ SAFETENSORS_FAST_GPU=1 CUDA_VISIBLE_DEVICES=0,1 vllm serve \ ``` If model basename is exactly `DeepSeek-V4-Flash` or `DeepSeek-V4-Pro` (without extra suffix), -`run_evalscope.sh` will also add: +`run_evalscope.sh` will also add (automatically): ```bash --enable-expert-parallel --moe-backend deep_gemm_mega_moe diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_evalscope.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_evalscope.sh index 7f1f1f97196..bccb7911341 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_evalscope.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_evalscope.sh @@ -16,15 +16,31 @@ TENSOR_PARALLEL_SIZE=2 SAFETENSORS_FAST_GPU="1" TRUST_REMOTE_CODE="true" NO_ENABLE_FLASHINFER_AUTOTUNE="true" 
-SKIP_SERVE="false" +SKIP_SERVE="${SKIP_SERVE:-false}" +VLLM_PID="" +LOG_TAIL_PID="" SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd) cd "${SCRIPT_DIR}" cleanup() { + if [[ -n "${LOG_TAIL_PID}" ]] && kill -0 "${LOG_TAIL_PID}" 2>/dev/null; then + kill "${LOG_TAIL_PID}" 2>/dev/null || true + fi + if [[ "${SKIP_SERVE}" == "true" ]]; then return fi + + if [[ -n "${VLLM_PID}" ]] && kill -0 "${VLLM_PID}" 2>/dev/null; then + CHILDREN=$(pgrep -P "${VLLM_PID}" || true) + if [[ -n "${CHILDREN}" ]]; then + kill -9 ${CHILDREN} 2>/dev/null || true + fi + kill -9 "${VLLM_PID}" 2>/dev/null || true + return + fi + # Kill the process listening on the specified port to free GPU. VLLM_PIDS=$(ps aux | grep -- "vllm serve" | grep -- "--port[ =]${PORT}" | grep -v grep | awk '{print $2}') if [[ -n "${VLLM_PIDS}" ]]; then @@ -40,6 +56,13 @@ cleanup() { trap cleanup EXIT +stop_log_tail() { + if [[ -n "${LOG_TAIL_PID}" ]] && kill -0 "${LOG_TAIL_PID}" 2>/dev/null; then + kill "${LOG_TAIL_PID}" 2>/dev/null || true + LOG_TAIL_PID="" + fi +} + while [[ $# -gt 0 ]]; do case "$1" in --port) @@ -50,6 +73,8 @@ while [[ $# -gt 0 ]]; do TEMPERATURE="$2"; shift 2 ;; --skip_serve) SKIP_SERVE="true"; shift 1 ;; + --skip-serve) + SKIP_SERVE="true"; shift 1 ;; --tp) TENSOR_PARALLEL_SIZE="$2"; shift 2 ;; --kv-cache-dtype) @@ -61,6 +86,8 @@ while [[ $# -gt 0 ]]; do esac done +SKIP_SERVE="$(echo "${SKIP_SERVE}" | tr '[:upper:]' '[:lower:]')" + API_URL="http://127.0.0.1:${PORT}/v1" if [[ "${SKIP_SERVE}" != "true" ]]; then @@ -91,15 +118,40 @@ if [[ "${SKIP_SERVE}" != "true" ]]; then VLLM_CMD+=("${EXTRA_ARGS[@]}") SAFETENSORS_FAST_GPU="${SAFETENSORS_FAST_GPU}" "${VLLM_CMD[@]}" >/tmp/vllm_${PORT}.log 2>&1 & + VLLM_PID=$! echo "vLLM launched. Log: /tmp/vllm_${PORT}.log" + echo "vLLM PID: ${VLLM_PID}" + echo "=== vLLM startup log (will stop after API wait ends) ===" + tail -n +1 -f "/tmp/vllm_${PORT}.log" & + LOG_TAIL_PID=$! fi # Wait until the API is ready echo "Waiting for API at ${API_URL} ..." 
-until curl -sf "${API_URL}/models" -o /dev/null; do +for _ in $(seq 1 90); do + if curl -sf "${API_URL}/models" -o /dev/null; then + break + fi + if [[ "${SKIP_SERVE}" != "true" ]] && [[ -n "${VLLM_PID}" ]] && ! kill -0 "${VLLM_PID}" 2>/dev/null; then + stop_log_tail + echo "[$(date '+%Y-%m-%d %H:%M:%S')] vLLM exited before API became ready." + echo "----- Last 80 lines of /tmp/vllm_${PORT}.log -----" + tail -n 80 "/tmp/vllm_${PORT}.log" || true + exit 1 + fi echo "[$(date '+%Y-%m-%d %H:%M:%S')] Port ${PORT} not ready, retrying in 20s..." sleep 20 done + +stop_log_tail + +if ! curl -sf "${API_URL}/models" -o /dev/null; then + echo "[$(date '+%Y-%m-%d %H:%M:%S')] Timeout waiting for API at ${API_URL}." + echo "----- Last 80 lines of /tmp/vllm_${PORT}.log -----" + tail -n 80 "/tmp/vllm_${PORT}.log" || true + exit 1 +fi + echo "[$(date '+%Y-%m-%d %H:%M:%S')] API is ready, starting evaluation." MODEL_NORMALIZED="${MODEL%/}" From 09559c90c1d583570d49b4bd5c1a28947ce85aaa Mon Sep 17 00:00:00 2001 From: Xin He Date: Tue, 16 Jun 2026 10:51:53 +0800 Subject: [PATCH 04/12] update readme Signed-off-by: Xin He --- examples/README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/examples/README.md b/examples/README.md index 26e4a5792d7..9d590b1b7f5 100644 --- a/examples/README.md +++ b/examples/README.md @@ -15,6 +15,12 @@ Intel® Neural Compressor validated examples with multiple compression technique + + deepseek-ai/DeepSeek-V4 + Natural Language Processing + Quantization (MXFP8/MXFP4) + link + deepseek-ai/DeepSeek-R1 Natural Language Processing From f156b16604960259edd517e6a355ca8edabb2731 Mon Sep 17 00:00:00 2001 From: changwangss Date: Wed, 17 Jun 2026 08:07:48 +0000 Subject: [PATCH 05/12] add lm_eval ruler_qa_squad Signed-off-by: changwangss --- .../auto_round/deepseekv4/run_evalscope.sh | 27 ++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git
a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_evalscope.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_evalscope.sh index bccb7911341..dcfb75466b7 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_evalscope.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_evalscope.sh @@ -109,6 +109,9 @@ if [[ "${SKIP_SERVE}" != "true" ]]; then --attention_config.use_fp4_indexer_cache=True --port "${PORT}" ) + if [[ "${MODEL_NAME}" == "DeepSeek-V4-Pro" ]]; then + VLLM_CMD+=(--max-model-len 1048576) + fi if [[ "${TRUST_REMOTE_CODE}" == "true" ]]; then VLLM_CMD+=(--trust-remote-code) fi @@ -168,7 +171,7 @@ echo "" | tee -a "$OUTPUT_FILE" echo "" | tee -a "$OUTPUT_FILE" -echo "=== [1/3] aime26 (n=10) ===" | tee -a "$OUTPUT_FILE" +echo "=== [1/4] aime26 (n=10) ===" | tee -a "$OUTPUT_FILE" evalscope eval \ --model "$MODEL" \ --eval-type openai_api \ @@ -179,7 +182,7 @@ evalscope eval \ --api-url "$API_URL" 2>&1 | tee -a "$OUTPUT_FILE" echo "" | tee -a "$OUTPUT_FILE" -echo "=== [2/3] gpqa_diamond (n=5) ===" | tee -a "$OUTPUT_FILE" +echo "=== [2/4] gpqa_diamond (n=5) ===" | tee -a "$OUTPUT_FILE" evalscope eval \ --model "$MODEL" \ --eval-type openai_api \ @@ -189,7 +192,7 @@ evalscope eval \ --eval-batch-size 10 --timeout 3000 \ --api-url "$API_URL" 2>&1 | tee -a "$OUTPUT_FILE" -echo "=== [3/3] piqa hellaswag gsm8k mmlu_pro math_500 mmlu ===" | tee -a "$OUTPUT_FILE" +echo "=== [3/4] piqa hellaswag gsm8k mmlu_pro math_500 mmlu ===" | tee -a "$OUTPUT_FILE" evalscope eval \ --model "$MODEL" \ --eval-type openai_api \ @@ -198,6 +201,24 @@ evalscope eval \ --eval-batch-size 10 --timeout 3000 \ --api-url "$API_URL" 2>&1 | tee -a "$OUTPUT_FILE" +echo "=== [4/4] ruler_qa_squad (lm_eval, 1M) ===" | tee -a "$OUTPUT_FILE" +if [[ "${MODEL_NAME}" == "DeepSeek-V4-Pro" ]]; then + 
LMEVAL_OUTPUT_DIR="${LOG_DIR}/lm_eval_ruler_1M_qa" + mkdir -p "${LMEVAL_OUTPUT_DIR}" + LMEVAL_METADATA=$(printf '{"max_seq_lengths":[1000000],"pretrained":"%s/","use_fast":false}' "${MODEL_NORMALIZED}") + lm_eval \ + --model local-completions \ + --tasks ruler_qa_squad \ + --model_args "model=${MODEL_NORMALIZED},base_url=${API_URL}/completions,num_concurrent=1,max_retries=3,max_length=1048576" \ + --gen_kwargs "temperature=${TEMPERATURE},do_sample=False,max_tokens=128" \ + --metadata "${LMEVAL_METADATA}" \ + --batch_size 1 \ + --log_samples \ + --output_path "${LMEVAL_OUTPUT_DIR}" 2>&1 | tee -a "$OUTPUT_FILE" +else + echo "Skip ruler_qa_squad: only DeepSeek-V4-Pro is supported for this test." | tee -a "$OUTPUT_FILE" +fi + echo "" | tee -a "$OUTPUT_FILE" echo "=== Evaluation finished at $(date) ===" | tee -a "$OUTPUT_FILE" From 602c4a0bc37bb6a910ae7e24f912a2a09c741e35 Mon Sep 17 00:00:00 2001 From: Xin He Date: Wed, 17 Jun 2026 16:25:42 +0800 Subject: [PATCH 06/12] add requirement Signed-off-by: Xin He --- .../quantization/auto_round/deepseekv4/README.md | 11 +++++++++++ .../auto_round/deepseekv4/run_evalscope.sh | 4 ++-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/README.md index 9c11772c83f..d0002c6fec2 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/README.md +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/README.md @@ -17,6 +17,17 @@ model = prepare(model, config) model = convert(model) ``` +## Requirements + +Install dependencies before running quantization or evaluation: + +```bash +uv pip install -U pip +uv pip install -U "git+https://github.com/intel/auto-round.git@main" +uv pip install -U evalscope vllm lm_eval transformers datasets 
compressed-tensors +bash <(curl -fsSL https://raw.githubusercontent.com/vllm-project/vllm/main/tools/install_deepgemm.sh) +``` + ## Quick Start ```bash diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_evalscope.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_evalscope.sh index dcfb75466b7..0736d4ff2cb 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_evalscope.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_evalscope.sh @@ -109,7 +109,7 @@ if [[ "${SKIP_SERVE}" != "true" ]]; then --attention_config.use_fp4_indexer_cache=True --port "${PORT}" ) - if [[ "${MODEL_NAME}" == "DeepSeek-V4-Pro" ]]; then + if [[ "${MODEL_NAME}" == *"DeepSeek-V4-Pro"* ]]; then VLLM_CMD+=(--max-model-len 1048576) fi if [[ "${TRUST_REMOTE_CODE}" == "true" ]]; then @@ -202,7 +202,7 @@ evalscope eval \ --api-url "$API_URL" 2>&1 | tee -a "$OUTPUT_FILE" echo "=== [4/4] ruler_qa_squad (lm_eval, 1M) ===" | tee -a "$OUTPUT_FILE" -if [[ "${MODEL_NAME}" == "DeepSeek-V4-Pro" ]]; then +if [[ "${MODEL_NAME}" == *"DeepSeek-V4-Pro"* ]]; then LMEVAL_OUTPUT_DIR="${LOG_DIR}/lm_eval_ruler_1M_qa" mkdir -p "${LMEVAL_OUTPUT_DIR}" LMEVAL_METADATA=$(printf '{"max_seq_lengths":[1000000],"pretrained":"%s/","use_fast":false}' "${MODEL_NORMALIZED}") From a51e4115e150ac7a24ba851e9326fcd3887cb7ca Mon Sep 17 00:00:00 2001 From: Xin He Date: Thu, 18 Jun 2026 16:14:18 +0800 Subject: [PATCH 07/12] Update README and run_evalscope.sh to enhance task handling and installation instructions Signed-off-by: Xin He --- .../auto_round/deepseekv4/README.md | 5 +- .../auto_round/deepseekv4/run_evalscope.sh | 206 ++++++++++++++---- 2 files changed, 162 insertions(+), 49 deletions(-) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/README.md 
b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/README.md index d0002c6fec2..5e9ca74dae2 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/README.md +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/README.md @@ -24,8 +24,10 @@ Install dependencies before running quantization or evaluation: ```bash uv pip install -U pip uv pip install -U "git+https://github.com/intel/auto-round.git@main" -uv pip install -U evalscope vllm lm_eval transformers datasets compressed-tensors +uv pip install -U evalscope lm_eval transformers datasets +uv pip install compressed-tensors --no-deps bash <(curl -fsSL https://raw.githubusercontent.com/vllm-project/vllm/main/tools/install_deepgemm.sh) +VLLM_USE_PRECOMPILED=1 pip install git+https://github.com/xin3he/vllm-fork.git@support_deepseekv4_mxfp ``` ## Quick Start @@ -45,6 +47,7 @@ CUDA_VISIBLE_DEVICES=0,1 bash run_evalscope.sh \ --model /workspace/models/deepseek-ai/DeepSeek-V4-Flash-MXFP4-Mixed \ --tp 2 \ --port 8009 \ + --tasks piqa,hellaswag,gsm8k,mmlu_pro,math_500,mmlu,aime26,gpqa_diamond,ruler_qa_squad \ --temp 1.0 ``` diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_evalscope.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_evalscope.sh index 0736d4ff2cb..6c19df57576 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_evalscope.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/run_evalscope.sh @@ -3,7 +3,7 @@ set -euo pipefail # Usage: -# bash run_evalscope.sh --model MODEL_PATH [--port PORT] [--temp TEMPERATURE] +# bash run_evalscope.sh --model MODEL_PATH [--port PORT] [--temp TEMPERATURE] [--tasks TASK1,TASK2] # # This script can start vLLM serve and then run
evalscope automatically. @@ -16,6 +16,7 @@ TENSOR_PARALLEL_SIZE=2 SAFETENSORS_FAST_GPU="1" TRUST_REMOTE_CODE="true" NO_ENABLE_FLASHINFER_AUTOTUNE="true" +TASKS="" SKIP_SERVE="${SKIP_SERVE:-false}" VLLM_PID="" LOG_TAIL_PID="" @@ -63,6 +64,30 @@ stop_log_tail() { fi } +trim_task_name() { + local task_name="$1" + task_name="${task_name#${task_name%%[![:space:]]*}}" + task_name="${task_name%${task_name##*[![:space:]]}}" + echo "${task_name}" +} + +task_in_list() { + local target_task="$1" + shift + local task_name + for task_name in "$@"; do + if [[ "${task_name}" == "${target_task}" ]]; then + return 0 + fi + done + return 1 +} + +print_section_header() { + echo "=== [${STEP_INDEX}/${TOTAL_STEPS}] $1 ===" | tee -a "$OUTPUT_FILE" + STEP_INDEX=$((STEP_INDEX + 1)) +} + while [[ $# -gt 0 ]]; do case "$1" in --port) @@ -71,6 +96,8 @@ while [[ $# -gt 0 ]]; do MODEL="$2"; shift 2 ;; --temp) TEMPERATURE="$2"; shift 2 ;; + --tasks) + TASKS="$2"; shift 2 ;; --skip_serve) SKIP_SERVE="true"; shift 1 ;; --skip-serve) @@ -156,67 +183,150 @@ if ! curl -sf "${API_URL}/models" -o /dev/null; then fi echo "[$(date '+%Y-%m-%d %H:%M:%S')] API is ready, starting evaluation." 
- MODEL_NORMALIZED="${MODEL%/}" MODEL_NAME="${MODEL_NORMALIZED##*/}" LOG_DIR="logs/${MODEL_NAME}" mkdir -p "$LOG_DIR" OUTPUT_FILE="${LOG_DIR}/eval_results_$(date +%Y%m%d_%H%M%S)_port${PORT}_temp${TEMPERATURE}.log" +DEFAULT_STANDARD_TASKS=(piqa hellaswag gsm8k mmlu_pro math_500 mmlu) +SUPPORTED_TASKS=(aime26 gpqa_diamond ruler_qa_squad "${DEFAULT_STANDARD_TASKS[@]}") +SELECTED_STANDARD_TASKS=() +RUN_AIME26="true" +RUN_GPQA_DIAMOND="true" +RUN_STANDARD_TASKS="true" +RUN_RULER_QA_SQUAD="true" + +if [[ -n "${TASKS}" ]]; then + RUN_AIME26="false" + RUN_GPQA_DIAMOND="false" + RUN_STANDARD_TASKS="false" + RUN_RULER_QA_SQUAD="false" + + IFS=',' read -r -a REQUESTED_TASKS <<< "${TASKS}" + for raw_task in "${REQUESTED_TASKS[@]}"; do + task_name="$(trim_task_name "${raw_task}")" + if [[ -z "${task_name}" ]]; then + continue + fi + if ! task_in_list "${task_name}" "${SUPPORTED_TASKS[@]}"; then + echo "Unsupported task: ${task_name}" + echo "Supported tasks: ${SUPPORTED_TASKS[*]}" + exit 1 + fi + + case "${task_name}" in + aime26) + RUN_AIME26="true" + ;; + gpqa_diamond) + RUN_GPQA_DIAMOND="true" + ;; + ruler_qa_squad) + RUN_RULER_QA_SQUAD="true" + ;; + *) + if ! task_in_list "${task_name}" "${SELECTED_STANDARD_TASKS[@]}"; then + SELECTED_STANDARD_TASKS+=("${task_name}") + RUN_STANDARD_TASKS="true" + fi + ;; + esac + done + + if [[ "${RUN_AIME26}" != "true" ]] && [[ "${RUN_GPQA_DIAMOND}" != "true" ]] \ + && [[ "${RUN_STANDARD_TASKS}" != "true" ]] && [[ "${RUN_RULER_QA_SQUAD}" != "true" ]]; then + echo "No valid tasks selected from --tasks '${TASKS}'." 
+ exit 1 + fi +else + SELECTED_STANDARD_TASKS=("${DEFAULT_STANDARD_TASKS[@]}") +fi + +TOTAL_STEPS=0 +if [[ "${RUN_AIME26}" == "true" ]]; then + TOTAL_STEPS=$((TOTAL_STEPS + 1)) +fi +if [[ "${RUN_GPQA_DIAMOND}" == "true" ]]; then + TOTAL_STEPS=$((TOTAL_STEPS + 1)) +fi +if [[ "${RUN_STANDARD_TASKS}" == "true" ]]; then + TOTAL_STEPS=$((TOTAL_STEPS + 1)) +fi +if [[ "${RUN_RULER_QA_SQUAD}" == "true" ]]; then + TOTAL_STEPS=$((TOTAL_STEPS + 1)) +fi +STEP_INDEX=1 + echo "=== Evaluation started at $(date) ===" | tee "$OUTPUT_FILE" echo "Model: $MODEL" | tee -a "$OUTPUT_FILE" echo "API URL: $API_URL" | tee -a "$OUTPUT_FILE" echo "Temperature: $TEMPERATURE" | tee -a "$OUTPUT_FILE" +if [[ -n "${TASKS}" ]]; then + echo "Tasks: ${TASKS}" | tee -a "$OUTPUT_FILE" +else + echo "Tasks: all default tasks" | tee -a "$OUTPUT_FILE" +fi echo "" | tee -a "$OUTPUT_FILE" -echo "" | tee -a "$OUTPUT_FILE" -echo "=== [1/4] aime26 (n=10) ===" | tee -a "$OUTPUT_FILE" -evalscope eval \ - --model "$MODEL" \ - --eval-type openai_api \ - --api-key EMPTY \ - --datasets aime26 \ - --generation-config "{\"temperature\": ${TEMPERATURE}, \"n\": 10}" \ - --eval-batch-size 10 --timeout 3000 \ - --api-url "$API_URL" 2>&1 | tee -a "$OUTPUT_FILE" -echo "" | tee -a "$OUTPUT_FILE" +if [[ "${RUN_AIME26}" == "true" ]]; then + echo "" | tee -a "$OUTPUT_FILE" + print_section_header "aime26 (n=10)" + evalscope eval \ + --model "$MODEL" \ + --eval-type openai_api \ + --api-key EMPTY \ + --datasets aime26 \ + --generation-config "{\"temperature\": ${TEMPERATURE}, \"n\": 10}" \ + --eval-batch-size 10 --timeout 3000 \ + --api-url "$API_URL" 2>&1 | tee -a "$OUTPUT_FILE" +fi -echo "=== [2/4] gpqa_diamond (n=5) ===" | tee -a "$OUTPUT_FILE" -evalscope eval \ - --model "$MODEL" \ - --eval-type openai_api \ - --api-key EMPTY \ - --datasets gpqa_diamond \ - --generation-config "{\"temperature\": ${TEMPERATURE}, \"n\": 5}" \ - --eval-batch-size 10 --timeout 3000 \ - --api-url "$API_URL" 2>&1 | tee -a "$OUTPUT_FILE" - -echo "=== 
[3/4] piqa hellaswag gsm8k mmlu_pro math_500 mmlu ===" | tee -a "$OUTPUT_FILE" -evalscope eval \ - --model "$MODEL" \ - --eval-type openai_api \ - --api-key EMPTY \ - --datasets piqa hellaswag gsm8k mmlu_pro math_500 mmlu \ - --eval-batch-size 10 --timeout 3000 \ - --api-url "$API_URL" 2>&1 | tee -a "$OUTPUT_FILE" - -echo "=== [4/4] ruler_qa_squad (lm_eval, 1M) ===" | tee -a "$OUTPUT_FILE" -if [[ "${MODEL_NAME}" == *"DeepSeek-V4-Pro"* ]]; then - LMEVAL_OUTPUT_DIR="${LOG_DIR}/lm_eval_ruler_1M_qa" - mkdir -p "${LMEVAL_OUTPUT_DIR}" - LMEVAL_METADATA=$(printf '{"max_seq_lengths":[1000000],"pretrained":"%s/","use_fast":false}' "${MODEL_NORMALIZED}") - lm_eval \ - --model local-completions \ - --tasks ruler_qa_squad \ - --model_args "model=${MODEL_NORMALIZED},base_url=${API_URL}/completions,num_concurrent=1,max_retries=3,max_length=1048576" \ - --gen_kwargs "temperature=${TEMPERATURE},do_sample=False,max_tokens=128" \ - --metadata "${LMEVAL_METADATA}" \ - --batch_size 1 \ - --log_samples \ - --output_path "${LMEVAL_OUTPUT_DIR}" 2>&1 | tee -a "$OUTPUT_FILE" -else - echo "Skip ruler_qa_squad: only DeepSeek-V4-Pro is supported for this test." 
| tee -a "$OUTPUT_FILE" +if [[ "${RUN_GPQA_DIAMOND}" == "true" ]]; then + echo "" | tee -a "$OUTPUT_FILE" + print_section_header "gpqa_diamond (n=5)" + evalscope eval \ + --model "$MODEL" \ + --eval-type openai_api \ + --api-key EMPTY \ + --datasets gpqa_diamond \ + --generation-config "{\"temperature\": ${TEMPERATURE}, \"n\": 5}" \ + --eval-batch-size 10 --timeout 3000 \ + --api-url "$API_URL" 2>&1 | tee -a "$OUTPUT_FILE" +fi + +if [[ "${RUN_STANDARD_TASKS}" == "true" ]]; then + echo "" | tee -a "$OUTPUT_FILE" + print_section_header "${SELECTED_STANDARD_TASKS[*]}" + evalscope eval \ + --model "$MODEL" \ + --eval-type openai_api \ + --api-key EMPTY \ + --datasets "${SELECTED_STANDARD_TASKS[@]}" \ + --eval-batch-size 10 --timeout 3000 \ + --api-url "$API_URL" 2>&1 | tee -a "$OUTPUT_FILE" +fi + +if [[ "${RUN_RULER_QA_SQUAD}" == "true" ]]; then + echo "" | tee -a "$OUTPUT_FILE" + print_section_header "ruler_qa_squad (lm_eval, 1M)" + if [[ "${MODEL_NAME}" == *"DeepSeek-V4-Pro"* ]]; then + LMEVAL_OUTPUT_DIR="${LOG_DIR}/lm_eval_ruler_1M_qa" + mkdir -p "${LMEVAL_OUTPUT_DIR}" + LMEVAL_METADATA=$(printf '{"max_seq_lengths":[1000000],"pretrained":"%s/","use_fast":false}' "${MODEL_NORMALIZED}") + lm_eval \ + --model local-completions \ + --tasks ruler_qa_squad \ + --model_args "model=${MODEL_NORMALIZED},base_url=${API_URL}/completions,num_concurrent=1,max_retries=3,max_length=1048576" \ + --gen_kwargs "temperature=${TEMPERATURE},do_sample=False,max_tokens=128" \ + --metadata "${LMEVAL_METADATA}" \ + --batch_size 1 \ + --log_samples \ + --output_path "${LMEVAL_OUTPUT_DIR}" 2>&1 | tee -a "$OUTPUT_FILE" + else + echo "Skip ruler_qa_squad: only DeepSeek-V4-Pro is supported for this test." 
| tee -a "$OUTPUT_FILE" + fi fi From 88cfd46a816dec4829fa56771a52e3b454160399 Mon Sep 17 00:00:00 2001 From: chensuyue Date: Fri, 19 Jun 2026 17:16:30 +0800 Subject: [PATCH 08/12] add setup.sh Signed-off-by: chensuyue --- .../quantization/auto_round/deepseekv4/setup.sh | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/setup.sh diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/setup.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/setup.sh new file mode 100644 index 00000000000..63b4d0ab6e6 --- /dev/null +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/setup.sh @@ -0,0 +1,8 @@ +#!/bin/bash +set -e + +uv pip install -U pip +uv pip install -U evalscope lm_eval transformers datasets +uv pip install compressed-tensors --no-deps +bash <(curl -fsSL https://raw.githubusercontent.com/vllm-project/vllm/main/tools/install_deepgemm.sh) +VLLM_USE_PRECOMPILED=1 pip install git+https://github.com/xin3he/vllm-fork.git@support_deepseekv4_mxfp \ No newline at end of file From 46456a723c704a4186ae747f0f91676f25ca54f3 Mon Sep 17 00:00:00 2001 From: chensuyue Date: Fri, 19 Jun 2026 18:15:04 +0800 Subject: [PATCH 09/12] update install Signed-off-by: chensuyue --- .../quantization/auto_round/deepseekv4/README.md | 2 +- .../quantization/auto_round/deepseekv4/setup.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/README.md index 5e9ca74dae2..03b21cf3023 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/README.md +++ 
b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/README.md @@ -27,7 +27,7 @@ uv pip install -U "git+https://github.com/intel/auto-round.git@main" uv pip install -U evalscope lm_eval transformers datasets uv pip install compressed-tensors --no-deps bash <(curl -fsSL https://raw.githubusercontent.com/vllm-project/vllm/main/tools/install_deepgemm.sh) -VLLM_USE_PRECOMPILED=1 pip install git+https://github.com/xin3he/vllm-fork.git@support_deepseekv4_mxfp +VLLM_USE_PRECOMPILED=1 uv pip install git+https://github.com/xin3he/vllm-fork.git@support_deepseekv4_mxfp ``` ## Quick Start diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/setup.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/setup.sh index 63b4d0ab6e6..ddfe2ddebfc 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/setup.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/setup.sh @@ -5,4 +5,4 @@ uv pip install -U pip uv pip install -U evalscope lm_eval transformers datasets uv pip install compressed-tensors --no-deps bash <(curl -fsSL https://raw.githubusercontent.com/vllm-project/vllm/main/tools/install_deepgemm.sh) -VLLM_USE_PRECOMPILED=1 pip install git+https://github.com/xin3he/vllm-fork.git@support_deepseekv4_mxfp \ No newline at end of file +VLLM_USE_PRECOMPILED=1 uv pip install git+https://github.com/xin3he/vllm-fork.git@support_deepseekv4_mxfp \ No newline at end of file From 8c98036639b35a75cd65ef500014819a5852d7a5 Mon Sep 17 00:00:00 2001 From: chensuyue Date: Mon, 22 Jun 2026 15:25:54 +0800 Subject: [PATCH 10/12] update setup env Signed-off-by: chensuyue --- .../quantization/auto_round/deepseekv4/setup.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/setup.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/setup.sh index ddfe2ddebfc..19685ec1176 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/setup.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/setup.sh @@ -1,8 +1,8 @@ #!/bin/bash set -e -uv pip install -U pip +uv pip install -U pip setuptools_rust setuptools_scm uv pip install -U evalscope lm_eval transformers datasets uv pip install compressed-tensors --no-deps bash <(curl -fsSL https://raw.githubusercontent.com/vllm-project/vllm/main/tools/install_deepgemm.sh) -VLLM_USE_PRECOMPILED=1 uv pip install git+https://github.com/xin3he/vllm-fork.git@support_deepseekv4_mxfp \ No newline at end of file +VLLM_USE_PRECOMPILED=1 uv pip install git+https://github.com/xin3he/vllm-fork.git@support_deepseekv4_mxfp --no-build-isolation \ No newline at end of file From cd86e70a14c6c7f990501677e60c4a5b01cfe3e9 Mon Sep 17 00:00:00 2001 From: chensuyue Date: Mon, 22 Jun 2026 22:21:35 +0800 Subject: [PATCH 11/12] update env setup Signed-off-by: chensuyue --- .../quantization/auto_round/deepseekv4/setup.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/setup.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/setup.sh index 19685ec1176..0b1215ebfba 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/setup.sh +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/setup.sh @@ -3,6 +3,7 @@ set -e uv pip install -U pip setuptools_rust setuptools_scm uv pip install -U evalscope lm_eval transformers datasets +uv pip install 
git+https://github.com/intel/auto-round.git@main uv pip install compressed-tensors --no-deps bash <(curl -fsSL https://raw.githubusercontent.com/vllm-project/vllm/main/tools/install_deepgemm.sh) VLLM_USE_PRECOMPILED=1 uv pip install git+https://github.com/xin3he/vllm-fork.git@support_deepseekv4_mxfp --no-build-isolation \ No newline at end of file From 90f9379c34bfd3caf49fbe795eee8e8bc3eb26be Mon Sep 17 00:00:00 2001 From: Xin He Date: Wed, 24 Jun 2026 07:30:24 +0000 Subject: [PATCH 12/12] update requirement Signed-off-by: Xin He --- .../quantization/auto_round/deepseekv4/README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/README.md index 03b21cf3023..5ab41000942 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/README.md +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseekv4/README.md @@ -27,7 +27,8 @@ uv pip install -U "git+https://github.com/intel/auto-round.git@main" uv pip install -U evalscope lm_eval transformers datasets uv pip install compressed-tensors --no-deps bash <(curl -fsSL https://raw.githubusercontent.com/vllm-project/vllm/main/tools/install_deepgemm.sh) -VLLM_USE_PRECOMPILED=1 uv pip install git+https://github.com/xin3he/vllm-fork.git@support_deepseekv4_mxfp +uv pip install setuptools_rust setuptools_scm +VLLM_USE_PRECOMPILED=1 uv pip install git+https://github.com/xin3he/vllm-fork.git@support_deepseekv4_mxfp --no-build-isolation ``` ## Quick Start