diff --git a/examples/llm_finetune/nemotron/llama3_3_nemotron_super_49B_squad.yaml b/examples/llm_finetune/nemotron/llama3_3_nemotron_super_49B_squad.yaml index aadd070b85..8f575e0b60 100644 --- a/examples/llm_finetune/nemotron/llama3_3_nemotron_super_49B_squad.yaml +++ b/examples/llm_finetune/nemotron/llama3_3_nemotron_super_49B_squad.yaml @@ -54,6 +54,7 @@ distributed: strategy: fsdp2 dp_size: none tp_size: 4 + tp_plan: llama_nemotron_super_tp_plan # registered DeciLM/nemotron-nas TP plan; custom-code arch has no default plan cp_size: 1 ep_size: 1 pipeline: diff --git a/examples/llm_finetune/nemotron_flash/nemotron_flash_1b_squad_peft.yaml b/examples/llm_finetune/nemotron_flash/nemotron_flash_1b_squad_peft.yaml index c62c5abcb9..98015d7055 100755 --- a/examples/llm_finetune/nemotron_flash/nemotron_flash_1b_squad_peft.yaml +++ b/examples/llm_finetune/nemotron_flash/nemotron_flash_1b_squad_peft.yaml @@ -113,11 +113,9 @@ ci: time: "00:15:00" checkpoint_robustness: hf_kl_threshold: 5e-3 - # tp_size=2 with bf16 row-parallel all-reduces produces ULP-level drift - # (~1e-3) between trainer and restored logits even with bit-identical - # weights; relax the Phase-3 threshold accordingly. kl_threshold: 5e-3 - distributed.tp_size: 2 + # NemotronFlash (hybrid mamba/deltanet, custom code) has no TP plan; reload at tp=1. + distributed.tp_size: 1 tokenizer_name: nvidia/Nemotron-Flash-1B trust_remote_code: true check_fused_qkv_keys: true