vllm-project · aoshen02 · Jun 16, 2026
diff --git a/scripts/low_precision/run-kimi-k2-Thinking-int4.sh b/scripts/low_precision/run-kimi-k2-Thinking-int4.sh
@@ -0,0 +1,182 @@
+#!/bin/bash
+
+# Kill leftovers of a previous run so the task can be relaunched. Runs before `set -e`, presumably because pkill exits non-zero when nothing matches. NOTE: `pkill -9 python` hits every matching python process, not only this job's.
+pkill -9 vllm
+sleep 3
+ray stop --force
+pkill -9 ray
+pkill -9 python
+sleep 3
+pkill -9 ray
+pkill -9 python
+
+set -ex
+
+# will prevent ray from buffering stdout/stderr
+export PYTHONUNBUFFERED=1
+
+# Detect NVLink from the GPU topology matrix, whose cells read NV<n> for
+# NVLink-connected GPU pairs. The plain `nvidia-smi` summary does not normally
+# contain the word "NVLink", so grepping it gave 0 and forced NCCL_NVLS_ENABLE=0.
+NVLINK_COUNT=$(nvidia-smi topo -m 2>/dev/null | grep -o 'NV[0-9][0-9]*' | wc -l)
+HAS_NVLINK=0
+if [ "$NVLINK_COUNT" -gt 0 ]; then HAS_NVLINK=1; fi
+echo "HAS_NVLINK: $HAS_NVLINK (detected $NVLINK_COUNT NVLink references)"
+# Source the model definition relative to this script. MODEL_ARGS (used at submit time) is not set in this file, so it presumably comes from there.
+SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
+source "${SCRIPT_DIR}/../models/kimi-k2-thinking.sh"
+# Checkpoint locations; --load and --save share one directory, so a rerun presumably resumes from the last save.
+CKPT_ARGS=(
+   --hf-checkpoint /root/Kimi-K2-Thinking/
+   --ref-load /root/Kimi-K2_thinking_torch_dist/
+   --load /root/Kimi-K2-thinking_vime/
+   --save /root/Kimi-K2-thinking_vime/
+   --save-interval 20
+)
+
+ROLLOUT_ARGS=(
+   --prompt-data /root/dapo-math-17k/dapo-math-17k.jsonl
+   --input-key prompt
+   --label-key label
+   --apply-chat-template
+   --rollout-shuffle
+
+   --rm-type math
+
+   --num-rollout 100
+   --rollout-batch-size 128
+   --n-samples-per-prompt 8
+   --rollout-max-response-len 16384
+   --rollout-temperature 0.8
+   # NOTE(review): 128 prompts x 8 samples / 4 steps per rollout = 256, the value of the disabled flag below; presumably it is derived - confirm.
+   # --global-batch-size 256
+
+   --over-sampling-batch-size 256
+   --dynamic-sampling-filter-path vime.rollout.filter_hub.dynamic_sampling_filters.check_reward_nonzero_std
+
+   --num-steps-per-rollout 4
+   --balance-data
+)
+# Evaluation on AIME-2024; the response-length cap equals the rollout cap (16384).
+EVAL_ARGS=(
+   --eval-interval 10
+   --eval-prompt-data aime /root/aime-2024/aime-2024.jsonl
+   --n-samples-per-eval-prompt 16
+   --eval-max-response-len 16384
+   --eval-top-p 0.7
+)
+
+PERF_ARGS=(
+   --tensor-model-parallel-size 8
+   --sequence-parallel
+   --pipeline-model-parallel-size 8
+   --context-parallel-size 4
+   --expert-model-parallel-size 32
+   --expert-tensor-parallel-size 1
+   --decoder-last-pipeline-num-layers 5
+   # NOTE(review): TP 8 x PP 8 x CP 4 = 256, the same as the 32 nodes x 8 GPUs requested at submit time.
+   --recompute-granularity full
+   --recompute-method uniform
+   --recompute-num-layers 1
+
+   --use-dynamic-batch-size
+   --max-tokens-per-gpu 16384
+)
+# GRPO objective; the KL loss is switched on but weighted 0.00, and the entropy coefficient is 0.00 as well.
+GRPO_ARGS=(
+   --advantage-estimator grpo
+   --use-kl-loss
+   --kl-loss-coef 0.00
+   --kl-loss-type low_var_kl
+   # --kl-coef 0.00
+   --entropy-coef 0.00
+   --eps-clip 0.2
+   --eps-clip-high 0.28
+   --use-tis
+)
+# Adam at a constant 1e-6 learning rate, with the CPU-offload optimizer flags enabled.
+OPTIMIZER_ARGS=(
+   --optimizer adam
+   --lr 1e-6
+
+   --lr-decay-style constant
+   --weight-decay 0.1
+   --adam-beta1 0.9
+   --adam-beta2 0.98
+
+   --optimizer-cpu-offload
+   --overlap-cpu-optimizer-d2h-h2d
+   --use-precision-aware-optimizer
+)
+# W&B logging is off: every entry is commented out, so this array expands to nothing.
+WANDB_ARGS=(
+   # --use-wandb
+   # --wandb-project vime-dev
+   # --wandb-group kimi-k2-thinking-test
+   # --wandb-key ${WANDB_KEY}
+)
+
+VLLM_ARGS=(
+   --rollout-num-gpus-per-engine 8
+   --vllm-gpu-memory-utilization 0.7
+
+   # DP attention (currently disabled)
+   # --vllm-data-parallel-size 8
+
+   --vllm-enable-expert-parallel
+
+   # enable DeepEP for vLLM (NOTE(review): no flag follows this comment)
+
+   # 1024 total, i.e. 128 concurrent requests per DP rank if the DP size of 8 above is re-enabled
+   --vllm-server-concurrency 1024
+)
+
+
+MISC_ARGS=(
+   # Megatron's default dropout is 0.1; turn it off
+   --attention-dropout 0.0
+   --hidden-dropout 0.0
+   # should be good for model performance
+   --accumulate-allreduce-grads-in-fp32
+   --attention-softmax-in-fp32
+   # comment this out when the model uses MLA (NOTE(review): it is left enabled here - confirm)
+   --attention-backend flash
+
+   # DeepEP for Megatron (disabled in this script)
+   # --moe-enable-deepep
+   # --moe-token-dispatcher-type flex
+   --no-check-for-nan-in-loss-and-grad
+)
+
+# Ray runtime env as inline JSON; ${...} is expanded here, by this shell. MASTER_ADDR and no_proxy are not set in this script, so they become "" unless the caller exported them.
+RUNTIME_ENV_JSON="{
+  \"env_vars\": {
+    \"PYTHONPATH\": \"/root/Megatron-LM/\",
+    \"CUDA_DEVICE_MAX_CONNECTIONS\": \"1\",
+    \"NCCL_NVLS_ENABLE\": \"${HAS_NVLINK}\",
+    \"NCCL_TIMEOUT_MS\":\"360000000\",
+    \"no_proxy\": \"${no_proxy}\",
+    \"MASTER_ADDR\": \"${MASTER_ADDR}\",
+    \"OPEN_TRAINING_INT4_FAKE_QAT_FLAG\": \"1\",
+    \"OPEN_TRAINING_INT4_GROUP_SIZE\": \"32\"
+  }
+}"
+
+# Submit the training job to the local Ray dashboard; arrays are quoted so each element stays exactly one argument (no re-splitting or globbing).
+ray job submit --address="http://127.0.0.1:8265" \
+   --runtime-env-json="${RUNTIME_ENV_JSON}" \
+   -- python3 /root/vime/train.py \
+   --actor-num-nodes 32 \
+   --actor-num-gpus-per-node 8 \
+   --colocate \
+   --update-weight-buffer-size "$(( 4 * 512 * 1024 * 1024 ))" \
+   "${MODEL_ARGS[@]}" \
+   "${CKPT_ARGS[@]}" \
+   "${ROLLOUT_ARGS[@]}" \
+   "${OPTIMIZER_ARGS[@]}" \
+   "${GRPO_ARGS[@]}" \
+   "${WANDB_ARGS[@]}" \
+   "${PERF_ARGS[@]}" \
+   "${EVAL_ARGS[@]}" \
+   "${VLLM_ARGS[@]}" \
+   "${MISC_ARGS[@]}"
diff --git a/scripts/low_precision/run-moonlight-16B-A3B-int4.sh b/scripts/low_precision/run-moonlight-16B-A3B-int4.sh
@@ -0,0 +1,165 @@
+#!/bin/bash
+
+# Kill leftovers of a previous run so the task can be relaunched. Runs before `set -e`, presumably because pkill exits non-zero when nothing matches. NOTE: `pkill -9 python` hits every matching python process, not only this job's.
+pkill -9 vllm
+sleep 3
+ray stop --force
+pkill -9 ray
+pkill -9 python
+sleep 3
+pkill -9 ray
+pkill -9 python
+pkill -9 redis
+
+set -ex
+
+# Unbuffered Python output, so Ray does not hold back stdout/stderr
+export PYTHONUNBUFFERED=1
+
+# Count NV<n> cells in the GPU topology matrix; any hit is treated as NVLink.
+# stderr is discarded and the pipeline ends in wc, so a missing nvidia-smi
+# yields a count of 0 instead of aborting under `set -e`.
+NVLINK_COUNT=$(nvidia-smi topo -m 2>/dev/null | grep -o 'NV[0-9][0-9]*' | wc -l)
+HAS_NVLINK=0
+if [ "$NVLINK_COUNT" -gt 0 ]; then HAS_NVLINK=1; fi
+echo "HAS_NVLINK: $HAS_NVLINK (detected $NVLINK_COUNT NVLink references)"
+# Source the model definition relative to this script. MODEL_ARGS (used at submit time) is not set in this file, so it presumably comes from there.
+SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
+source "${SCRIPT_DIR}/../models/moonlight.sh"
+# Checkpoint locations; --load and --save share one directory, so a rerun presumably resumes from the last save.
+CKPT_ARGS=(
+   --hf-checkpoint /root/Moonlight-16B-A3B-Instruct-INT4
+   --ref-load /root/Moonlight-16B-A3B-Instruct-INT4_torch_dist
+   --load /root/Moonlight-16B-A3B_vime/
+   --save /root/Moonlight-16B-A3B_vime/
+   --save-interval 20
+)
+
+ROLLOUT_ARGS=(
+   --prompt-data /root/dapo-math-17k/dapo-math-17k.jsonl
+   --input-key prompt
+   --label-key label
+   --apply-chat-template
+   --rollout-shuffle
+   --rm-type math
+   --num-rollout 3000
+   --rollout-batch-size 128
+   --n-samples-per-prompt 8
+   --rollout-max-response-len 4096
+   --rollout-temperature 0.8
+
+   --over-sampling-batch-size 256
+   --dynamic-sampling-filter-path vime.rollout.filter_hub.dynamic_sampling_filters.check_reward_nonzero_std
+   # NOTE(review): 128 prompts x 8 samples / 4 steps per rollout = 256, the value of the disabled flag below; presumably it is derived - confirm.
+   --num-steps-per-rollout 4
+   # --global-batch-size 256
+   --balance-data
+)
+# Evaluation on AIME-2024; the response-length cap equals the rollout cap (4096).
+EVAL_ARGS=(
+   --eval-interval 20
+   --eval-prompt-data aime /root/aime-2024/aime-2024.jsonl
+   --n-samples-per-eval-prompt 8
+   --eval-max-response-len 4096
+   --eval-top-p 0.7
+)
+
+PERF_ARGS=(
+   --tensor-model-parallel-size 2
+   --sequence-parallel
+   --pipeline-model-parallel-size 1
+   --context-parallel-size 1
+   --expert-model-parallel-size 4
+   --expert-tensor-parallel-size 1
+   # NOTE(review): TP 2 x PP 1 x CP 1 on the 4 GPUs requested at submit time presumably leaves a data-parallel size of 2 - confirm.
+   --recompute-granularity full
+   --recompute-method uniform
+   --recompute-num-layers 1
+
+   # --micro-batch-size 1
+   --use-dynamic-batch-size
+   --max-tokens-per-gpu 8192
+)
+# GRPO objective; the KL loss is switched on but weighted 0.00, and the entropy coefficient is 0.00 as well.
+GRPO_ARGS=(
+   --advantage-estimator grpo
+   --use-kl-loss
+   --kl-loss-coef 0.00
+   --kl-loss-type low_var_kl
+   --entropy-coef 0.00
+   --eps-clip 0.2
+   --eps-clip-high 0.28
+)
+# Adam at a constant 1e-6 learning rate, with the CPU-offload optimizer flags enabled.
+OPTIMIZER_ARGS=(
+   --optimizer adam
+   --lr 1e-6
+   --lr-decay-style constant
+   --weight-decay 0.1
+   --adam-beta1 0.9
+   --adam-beta2 0.98
+
+   --optimizer-cpu-offload
+   --overlap-cpu-optimizer-d2h-h2d
+   --use-precision-aware-optimizer
+)
+# W&B logging is off: every entry is commented out, so this array expands to nothing.
+WANDB_ARGS=(
+   # --use-wandb
+   # --wandb-project vime-dev
+   # --wandb-group moonlight-16B-A3B-test
+   # --wandb-key ${WANDB_KEY}
+)
+# vLLM rollout engine; capture sizes are 1 2 4 8 plus 16..256 in steps of 8 (the seq substitution is left unquoted so its output word-splits into separate sizes).
+VLLM_ARGS=(
+   --rollout-num-gpus-per-engine 4
+   --vllm-gpu-memory-utilization 0.7
+   --vllm-cudagraph-capture-sizes 1 2 4 8 $(seq 16 8 256)
+)
+
+MISC_ARGS=(
+   # Megatron's default dropout is 0.1; turn it off
+   --attention-dropout 0.0
+   --hidden-dropout 0.0
+   # should be good for model performance
+   --accumulate-allreduce-grads-in-fp32
+   --attention-softmax-in-fp32
+   # keep this commented out when the model uses MLA (it is disabled here)
+   # --attention-backend flash
+
+   # DeepEP for Megatron (enabled in this script)
+   --moe-enable-deepep
+   --moe-token-dispatcher-type flex
+)
+
+# Start the Ray head node inside the container; MASTER_ADDR falls back to 127.0.0.1 and is quoted so it is always passed as one argument.
+export MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
+ray start --head --node-ip-address "${MASTER_ADDR}" --num-gpus 4 --disable-usage-stats --dashboard-host=0.0.0.0 --dashboard-port=8265
+
+# Ray runtime env as inline JSON; ${HAS_NVLINK} is expanded here, by this shell, when the string is built.
+RUNTIME_ENV_JSON="{
+  \"env_vars\": {
+    \"PYTHONPATH\": \"/root/Megatron-LM/\",
+    \"CUDA_DEVICE_MAX_CONNECTIONS\": \"1\",
+    \"NCCL_NVLS_ENABLE\": \"${HAS_NVLINK}\",
+    \"OPEN_TRAINING_INT4_FAKE_QAT_FLAG\": \"1\",
+    \"OPEN_TRAINING_INT4_GROUP_SIZE\": \"128\"
+  }
+}"
+# Submit the training job to the local Ray dashboard; arrays are quoted so each element stays exactly one argument (no re-splitting or globbing).
+ray job submit --address="http://127.0.0.1:8265" \
+   --runtime-env-json="${RUNTIME_ENV_JSON}" \
+   -- python3 train.py \
+   --actor-num-nodes 1 \
+   --actor-num-gpus-per-node 4 \
+   --colocate \
+   "${MODEL_ARGS[@]}" \
+   "${CKPT_ARGS[@]}" \
+   "${ROLLOUT_ARGS[@]}" \
+   "${OPTIMIZER_ARGS[@]}" \
+   "${GRPO_ARGS[@]}" \
+   "${WANDB_ARGS[@]}" \
+   "${PERF_ARGS[@]}" \
+   "${EVAL_ARGS[@]}" \
+   "${VLLM_ARGS[@]}" \
+   "${MISC_ARGS[@]}"