NVIDIA-NeMo · adil-a · Jun 16, 2026
@@ -54,6 +54,7 @@ distributed:
   strategy: fsdp2
   dp_size: none
   tp_size: 4
+  tp_plan: llama_nemotron_super_tp_plan  # explicitly select the registered DeciLM/nemotron-nas TP plan; this custom-code arch has no default plan
   cp_size: 1
   ep_size: 1
   pipeline:

@@ -113,11 +113,9 @@ ci:
   time: "00:15:00"
   checkpoint_robustness:
     hf_kl_threshold: 5e-3
-    # tp_size=2 with bf16 row-parallel all-reduces produces ULP-level drift
-    # (~1e-3) between trainer and restored logits even with bit-identical
-    # weights; relax the Phase-3 threshold accordingly.
     kl_threshold: 5e-3
-    distributed.tp_size: 2
+    # NemotronFlash (hybrid Mamba/DeltaNet, custom code) has no TP plan, so reload the checkpoint with tp_size=1.
+    distributed.tp_size: 1
     tokenizer_name: nvidia/Nemotron-Flash-1B
     trust_remote_code: true
     check_fused_qkv_keys: true