Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ distributed:
strategy: fsdp2
dp_size: none
tp_size: 4
tp_plan: llama_nemotron_super_tp_plan # registered DeciLM/nemotron-nas TP plan; custom-code arch has no default plan
cp_size: 1
ep_size: 1
pipeline:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -113,11 +113,9 @@ ci:
time: "00:15:00"
checkpoint_robustness:
hf_kl_threshold: 5e-3
# tp_size=2 with bf16 row-parallel all-reduces produces ULP-level drift
# (~1e-3) between trainer and restored logits even with bit-identical
# weights; relax the Phase-3 threshold accordingly.
kl_threshold: 5e-3
distributed.tp_size: 2
# NemotronFlash (hybrid mamba/deltanet, custom code) has no TP plan; reload at tp=1.
distributed.tp_size: 1
tokenizer_name: nvidia/Nemotron-Flash-1B
trust_remote_code: true
check_fused_qkv_keys: true
Expand Down
Loading