diff --git a/examples/llm_finetune/nemotron/llama3_3_nemotron_super_49B_squad.yaml b/examples/llm_finetune/nemotron/llama3_3_nemotron_super_49B_squad.yaml
index aadd070b85..8f575e0b60 100644
--- a/examples/llm_finetune/nemotron/llama3_3_nemotron_super_49B_squad.yaml
+++ b/examples/llm_finetune/nemotron/llama3_3_nemotron_super_49B_squad.yaml
@@ -54,6 +54,7 @@ distributed:
   strategy: fsdp2
   dp_size: none
   tp_size: 4
+  tp_plan: llama_nemotron_super_tp_plan  # registered DeciLM/nemotron-nas TP plan; custom-code arch has no default plan
   cp_size: 1
   ep_size: 1
   pipeline:
diff --git a/examples/llm_finetune/nemotron_flash/nemotron_flash_1b_squad_peft.yaml b/examples/llm_finetune/nemotron_flash/nemotron_flash_1b_squad_peft.yaml
index c62c5abcb9..98015d7055 100755
--- a/examples/llm_finetune/nemotron_flash/nemotron_flash_1b_squad_peft.yaml
+++ b/examples/llm_finetune/nemotron_flash/nemotron_flash_1b_squad_peft.yaml
@@ -113,11 +113,9 @@ ci:
   time: "00:15:00"
   checkpoint_robustness:
     hf_kl_threshold: 5e-3
-    # tp_size=2 with bf16 row-parallel all-reduces produces ULP-level drift
-    # (~1e-3) between trainer and restored logits even with bit-identical
-    # weights; relax the Phase-3 threshold accordingly.
     kl_threshold: 5e-3
-    distributed.tp_size: 2
+    # NemotronFlash (hybrid mamba/deltanet, custom code) has no TP plan; reload at tp=1.
+    distributed.tp_size: 1
     tokenizer_name: nvidia/Nemotron-Flash-1B
     trust_remote_code: true
     check_fused_qkv_keys: true