Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
182 changes: 182 additions & 0 deletions scripts/low_precision/run-kimi-k2-Thinking-int4.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
#!/bin/bash

# Re-run hygiene: tear down whatever a previous launch left behind.
# This section sits ahead of the script's `set -ex`, so a pkill that matches
# nothing (non-zero exit status) does not abort the script.
# NOTE: `pkill -9 python` is not restricted to processes started by this job.
kill_ray_and_python() {
  pkill -9 ray
  pkill -9 python
}

pkill -9 vllm
sleep 3
ray stop --force
# Two SIGKILL sweeps over ray/python, three seconds apart.
kill_ray_and_python
sleep 3
kill_ray_and_python

# From here on: stop at the first failing command and trace every command.
set -o errexit
set -o xtrace

# will prevent ray from buffering stdout/stderr
PYTHONUNBUFFERED=1
export PYTHONUNBUFFERED

# Detect NVLink from the GPU topology matrix.
#
# `nvidia-smi topo -m` marks NVLink-connected GPU pairs with NV<n> cells; each
# such cell is counted. The previous check grepped the plain `nvidia-smi`
# summary for the word "NVLink", which that summary normally does not print,
# so HAS_NVLINK stayed 0 even on NVLink machines. This now uses the same
# detection as run-moonlight-16B-A3B-int4.sh.
#
# count_nvlink_refs
#   stdin:  topology matrix text
#   stdout: number of NV<n> tokens found (0 when there are none)
# The pipeline status is that of `wc`, so "no match" never trips `set -e`.
count_nvlink_refs() {
  grep -o 'NV[0-9][0-9]*' | wc -l
}

# 2>/dev/null: a missing or failing nvidia-smi simply yields a count of 0.
NVLINK_COUNT=$(nvidia-smi topo -m 2>/dev/null | count_nvlink_refs)
if [ "$NVLINK_COUNT" -gt 0 ]; then
  HAS_NVLINK=1
else
  HAS_NVLINK=0
fi
echo "HAS_NVLINK: $HAS_NVLINK (detected $NVLINK_COUNT NVLink references)"

# Resolve the directory that holds this script (independent of the caller's
# working directory), then load the model definition from ../models.
SCRIPT_DIR="$(
  cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd
)"
. "${SCRIPT_DIR}/../models/kimi-k2-thinking.sh"

CKPT_ARGS=(
--hf-checkpoint /root/Kimi-K2-Thinking/
--ref-load /root/Kimi-K2_thinking_torch_dist/
--load /root/Kimi-K2-thinking_vime/
--save /root/Kimi-K2-thinking_vime/
--save-interval 20
)

# Rollout settings passed to train.py, assembled group by group.
ROLLOUT_ARGS=()

# Prompt dataset and the keys used for the input and the label.
ROLLOUT_ARGS+=(
  --prompt-data /root/dapo-math-17k/dapo-math-17k.jsonl
  --input-key prompt
  --label-key label
  --apply-chat-template
  --rollout-shuffle
)

# Reward model type.
ROLLOUT_ARGS+=(--rm-type math)

# Rollout count, batch shape and sampling parameters.
ROLLOUT_ARGS+=(
  --num-rollout 100
  --rollout-batch-size 128
  --n-samples-per-prompt 8
  --rollout-max-response-len 16384
  --rollout-temperature 0.8
)

# Disabled alternative, kept for reference:
#   --global-batch-size 256

# Over-sampling combined with a dynamic sampling filter.
ROLLOUT_ARGS+=(
  --over-sampling-batch-size 256
  --dynamic-sampling-filter-path vime.rollout.filter_hub.dynamic_sampling_filters.check_reward_nonzero_std
)

ROLLOUT_ARGS+=(
  --num-steps-per-rollout 4
  --balance-data
)

# Evaluation settings passed to train.py.
# --eval-prompt-data takes two values here: the name "aime" and a JSONL path.
EVAL_ARGS=(
  --eval-interval 10
  --eval-prompt-data aime /root/aime-2024/aime-2024.jsonl
  --n-samples-per-eval-prompt 16
  --eval-max-response-len 16384
  --eval-top-p 0.7
)

# Parallelism, recomputation and batching settings passed to train.py.
PERF_ARGS=()

# Parallel layout.
PERF_ARGS+=(
  --tensor-model-parallel-size 8
  --sequence-parallel
  --pipeline-model-parallel-size 8
  --context-parallel-size 4
  --expert-model-parallel-size 32
  --expert-tensor-parallel-size 1
  --decoder-last-pipeline-num-layers 5
)

# Activation recomputation.
PERF_ARGS+=(
  --recompute-granularity full
  --recompute-method uniform
  --recompute-num-layers 1
)

# Dynamic batching with a per-GPU token cap.
PERF_ARGS+=(
  --use-dynamic-batch-size
  --max-tokens-per-gpu 16384
)

# GRPO algorithm settings passed to train.py.
# The KL loss is switched on, but its coefficient is 0.00.
GRPO_ARGS=(
  --advantage-estimator grpo

  --use-kl-loss
  --kl-loss-coef 0.00
  --kl-loss-type low_var_kl
  # Disabled alternative, kept for reference:
  #   --kl-coef 0.00

  --entropy-coef 0.00

  # Clipping range: lower 0.2, upper 0.28.
  --eps-clip 0.2
  --eps-clip-high 0.28

  --use-tis
)

# Optimizer settings passed to train.py.
OPTIMIZER_ARGS=(
  # Adam with a constant learning rate.
  --optimizer adam
  --lr 1e-6
  --lr-decay-style constant
  --weight-decay 0.1
  --adam-beta1 0.9
  --adam-beta2 0.98

  # Optimizer CPU offload and related flags.
  --optimizer-cpu-offload
  --overlap-cpu-optimizer-d2h-h2d
  --use-precision-aware-optimizer
)

# Weights & Biases logging is disabled: the array is intentionally empty.
# To enable it, move these flags into the array:
#   --use-wandb
#   --wandb-project vime-dev
#   --wandb-group kimi-k2-thinking-test
#   --wandb-key ${WANDB_KEY}
WANDB_ARGS=()

# vLLM rollout engine settings passed to train.py.
VLLM_ARGS=(
  --rollout-num-gpus-per-engine 8
  --vllm-gpu-memory-utilization 0.7

  # dp attention (disabled):
  #   --vllm-data-parallel-size 8

  --vllm-enable-expert-parallel

  # NOTE(review): the original carried an "enable deepep for vllm" comment
  # here with no flag after it.

  # Server concurrency. The original note says this gives every dp rank a
  # concurrency of 128 (1024 / 8).
  # NOTE(review): --vllm-data-parallel-size 8 is commented out above; confirm
  # the value is still intended.
  --vllm-server-concurrency 1024
)


# Miscellaneous Megatron-side settings passed to train.py.
MISC_ARGS=(
  # Turn dropout off explicitly (default dropout in megatron is 0.1).
  --attention-dropout 0.0
  --hidden-dropout 0.0

  # fp32 accumulation/softmax; should be good for model performance.
  --accumulate-allreduce-grads-in-fp32
  --attention-softmax-in-fp32

  # NOTE(review): the original comment says this flag needs to be commented
  # out when using a model with MLA, yet it is enabled here. Confirm it is
  # intended for this model.
  --attention-backend flash

  # deepep for megatron (disabled):
  #   --moe-enable-deepep
  #   --moe-token-dispatcher-type flex

  --no-check-for-nan-in-loss-and-grad
)

# Build the runtime environment JSON for `ray job submit`.
# The here-document is unquoted, so ${HAS_NVLINK}, ${no_proxy} and
# ${MASTER_ADDR} are expanded by the shell; an unset variable becomes an empty
# string. Values are inserted verbatim, without JSON escaping.
# The body is kept at column 0 so the resulting string is unchanged.
RUNTIME_ENV_JSON=$(cat <<EOF
{
"env_vars": {
"PYTHONPATH": "/root/Megatron-LM/",
"CUDA_DEVICE_MAX_CONNECTIONS": "1",
"NCCL_NVLS_ENABLE": "${HAS_NVLINK}",
"NCCL_TIMEOUT_MS":"360000000",
"no_proxy": "${no_proxy}",
"MASTER_ADDR": "${MASTER_ADDR}",
"OPEN_TRAINING_INT4_FAKE_QAT_FLAG": "1",
"OPEN_TRAINING_INT4_GROUP_SIZE": "32"
}
}
EOF
)


# Submit the training job through the Ray dashboard on this host.
#
# Every argument array is expanded as "${ARR[@]}", so each element reaches
# train.py as exactly one argument: no re-splitting on whitespace and no
# pathname expansion of elements that contain glob characters. An empty array
# (WANDB_ARGS above) still expands to nothing.
#
# MODEL_ARGS is not defined in this script; presumably it comes from the
# sourced ../models/kimi-k2-thinking.sh (confirm).
# --update-weight-buffer-size evaluates to 2147483648.
ray job submit --address="http://127.0.0.1:8265" \
  --runtime-env-json="${RUNTIME_ENV_JSON}" \
  -- python3 /root/vime/train.py \
  --actor-num-nodes 32 \
  --actor-num-gpus-per-node 8 \
  --colocate \
  --update-weight-buffer-size $(( 4 * 512 * 1024 * 1024)) \
  "${MODEL_ARGS[@]}" \
  "${CKPT_ARGS[@]}" \
  "${ROLLOUT_ARGS[@]}" \
  "${OPTIMIZER_ARGS[@]}" \
  "${GRPO_ARGS[@]}" \
  "${WANDB_ARGS[@]}" \
  "${PERF_ARGS[@]}" \
  "${EVAL_ARGS[@]}" \
  "${VLLM_ARGS[@]}" \
  "${MISC_ARGS[@]}"
165 changes: 165 additions & 0 deletions scripts/low_precision/run-moonlight-16B-A3B-int4.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
#!/bin/bash

# Re-run hygiene: tear down whatever a previous launch left behind.
# This section sits ahead of the script's `set -ex`, so a pkill that matches
# nothing (non-zero exit status) does not abort the script.
# NOTE: `pkill -9 python` is not restricted to processes started by this job.
pkill -9 vllm
sleep 3
ray stop --force
pkill -9 ray
pkill -9 python
sleep 3
# Second sweep, this time including redis.
for leftover in ray python redis; do
  pkill -9 "${leftover}"
done

# From here on: stop at the first failing command and trace every command.
set -o errexit -o xtrace

# will prevent ray from buffering stdout/stderr
PYTHONUNBUFFERED=1
export PYTHONUNBUFFERED

# Detect NVLink by counting NV<n> tokens in the GPU topology matrix.
# stderr is discarded and the pipeline status is that of `wc`, so a missing or
# failing nvidia-smi results in a count of 0 rather than an error.
NVLINK_COUNT=$(
  nvidia-smi topo -m 2>/dev/null \
    | grep -o 'NV[0-9][0-9]*' \
    | wc -l
)
# 1 when at least one NV<n> token was seen, otherwise 0.
HAS_NVLINK=$(( NVLINK_COUNT > 0 ? 1 : 0 ))
echo "HAS_NVLINK: $HAS_NVLINK (detected $NVLINK_COUNT NVLink references)"

# Resolve the directory that holds this script (independent of the caller's
# working directory), then load the model definition from ../models.
SCRIPT_DIR="$(
  cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd
)"
. "${SCRIPT_DIR}/../models/moonlight.sh"

CKPT_ARGS=(
--hf-checkpoint /root/Moonlight-16B-A3B-Instruct-INT4
--ref-load /root/Moonlight-16B-A3B-Instruct-INT4_torch_dist
--load /root/Moonlight-16B-A3B_vime/
--save /root/Moonlight-16B-A3B_vime/
--save-interval 20
)

# Rollout settings passed to train.py, assembled group by group.
ROLLOUT_ARGS=()

# Prompt dataset and the keys used for the input and the label.
ROLLOUT_ARGS+=(
  --prompt-data /root/dapo-math-17k/dapo-math-17k.jsonl
  --input-key prompt
  --label-key label
  --apply-chat-template
  --rollout-shuffle
)

# Reward model type, rollout count, batch shape and sampling parameters.
ROLLOUT_ARGS+=(
  --rm-type math
  --num-rollout 3000
  --rollout-batch-size 128
  --n-samples-per-prompt 8
  --rollout-max-response-len 4096
  --rollout-temperature 0.8
)

# Over-sampling combined with a dynamic sampling filter.
ROLLOUT_ARGS+=(
  --over-sampling-batch-size 256
  --dynamic-sampling-filter-path vime.rollout.filter_hub.dynamic_sampling_filters.check_reward_nonzero_std
)

# Disabled alternative, kept for reference:
#   --global-batch-size 256
ROLLOUT_ARGS+=(
  --num-steps-per-rollout 4
  --balance-data
)

# Evaluation settings passed to train.py.
# --eval-prompt-data takes two values here: the name "aime" and a JSONL path.
EVAL_ARGS=(
  --eval-interval 20
  --eval-prompt-data aime /root/aime-2024/aime-2024.jsonl
  --n-samples-per-eval-prompt 8
  --eval-max-response-len 4096
  --eval-top-p 0.7
)

# Parallelism, recomputation and batching settings passed to train.py.
PERF_ARGS=()

# Parallel layout.
PERF_ARGS+=(
  --tensor-model-parallel-size 2
  --sequence-parallel
  --pipeline-model-parallel-size 1
  --context-parallel-size 1
  --expert-model-parallel-size 4
  --expert-tensor-parallel-size 1
)

# Activation recomputation.
PERF_ARGS+=(
  --recompute-granularity full
  --recompute-method uniform
  --recompute-num-layers 1
)

# Dynamic batching with a per-GPU token cap.
# Disabled alternative, kept for reference:
#   --micro-batch-size 1
PERF_ARGS+=(
  --use-dynamic-batch-size
  --max-tokens-per-gpu 8192
)

# GRPO algorithm settings passed to train.py.
# The KL loss is switched on, but its coefficient is 0.00.
GRPO_ARGS=(
  --advantage-estimator grpo

  --use-kl-loss
  --kl-loss-coef 0.00
  --kl-loss-type low_var_kl

  --entropy-coef 0.00

  # Clipping range: lower 0.2, upper 0.28.
  --eps-clip 0.2
  --eps-clip-high 0.28
)

# Optimizer settings passed to train.py.
OPTIMIZER_ARGS=(
  # Adam with a constant learning rate.
  --optimizer adam
  --lr 1e-6
  --lr-decay-style constant
  --weight-decay 0.1
  --adam-beta1 0.9
  --adam-beta2 0.98

  # Optimizer CPU offload and related flags.
  --optimizer-cpu-offload
  --overlap-cpu-optimizer-d2h-h2d
  --use-precision-aware-optimizer
)

# Weights & Biases logging is disabled: the array is intentionally empty.
# To enable it, move these flags into the array:
#   --use-wandb
#   --wandb-project vime-dev
#   --wandb-group moonlight-16B-A3B-test
#   --wandb-key ${WANDB_KEY}
WANDB_ARGS=()

# vLLM rollout engine settings passed to train.py.
# The cudagraph capture-size list is 1 2 4 8 followed by 16, 24, ..., 256.
VLLM_ARGS=(
  --rollout-num-gpus-per-engine 4
  --vllm-gpu-memory-utilization 0.7
  --vllm-cudagraph-capture-sizes 1 2 4 8
)
# Append the step-8 tail of the capture-size list, one element per size.
for (( capture_size = 16; capture_size <= 256; capture_size += 8 )); do
  VLLM_ARGS+=("${capture_size}")
done

# Miscellaneous Megatron-side settings passed to train.py.
MISC_ARGS=(
  # Turn dropout off explicitly (default dropout in megatron is 0.1).
  --attention-dropout 0.0
  --hidden-dropout 0.0

  # fp32 accumulation/softmax; should be good for model performance.
  --accumulate-allreduce-grads-in-fp32
  --attention-softmax-in-fp32

  # Left disabled; per the original note it needs to be commented out when
  # using a model with MLA:
  #   --attention-backend flash

  # use deepep for megatron
  --moe-enable-deepep
  --moe-token-dispatcher-type flex
)

# Launch the Ray head node inside the container.
# MASTER_ADDR keeps the caller's value when it is set and non-empty; otherwise
# it defaults to 127.0.0.1. It is exported so child processes see it.
export MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
# The address is quoted so a value containing whitespace or glob characters
# is passed to --node-ip-address as a single, literal argument.
ray start --head --node-ip-address "${MASTER_ADDR}" --num-gpus 4 --disable-usage-stats --dashboard-host=0.0.0.0 --dashboard-port=8265

# Build the runtime environment JSON for `ray job submit`.
# The here-document is unquoted, so ${HAS_NVLINK} is expanded by the shell.
# The value is inserted verbatim, without JSON escaping.
# The body is kept at column 0 so the resulting string is unchanged.
RUNTIME_ENV_JSON=$(cat <<EOF
{
"env_vars": {
"PYTHONPATH": "/root/Megatron-LM/",
"CUDA_DEVICE_MAX_CONNECTIONS": "1",
"NCCL_NVLS_ENABLE": "${HAS_NVLINK}",
"OPEN_TRAINING_INT4_FAKE_QAT_FLAG": "1",
"OPEN_TRAINING_INT4_GROUP_SIZE": "128"
}
}
EOF
)

# Submit the training job through the Ray dashboard on this host.
#
# Every argument array is expanded as "${ARR[@]}", so each element reaches
# train.py as exactly one argument: no re-splitting on whitespace and no
# pathname expansion of elements that contain glob characters. An empty array
# (WANDB_ARGS above) still expands to nothing.
#
# MODEL_ARGS is not defined in this script; presumably it comes from the
# sourced ../models/moonlight.sh (confirm).
# train.py is given as a relative path here.
ray job submit --address="http://127.0.0.1:8265" \
  --runtime-env-json="${RUNTIME_ENV_JSON}" \
  -- python3 train.py \
  --actor-num-nodes 1 \
  --actor-num-gpus-per-node 4 \
  --colocate \
  "${MODEL_ARGS[@]}" \
  "${CKPT_ARGS[@]}" \
  "${ROLLOUT_ARGS[@]}" \
  "${OPTIMIZER_ARGS[@]}" \
  "${GRPO_ARGS[@]}" \
  "${WANDB_ARGS[@]}" \
  "${PERF_ARGS[@]}" \
  "${EVAL_ARGS[@]}" \
  "${VLLM_ARGS[@]}" \
  "${MISC_ARGS[@]}"
Loading