From 78114b22b452ae3038febab557115c09da3df7ee Mon Sep 17 00:00:00 2001 From: Adil Asif Date: Tue, 16 Jun 2026 14:14:37 +0000 Subject: [PATCH] fix: TP config for nemotron-flash-1b + super-49B vllm_deploy cascade Both *_vllm_deploy tests (jobs 337980668 nemotron-flash-1b PEFT, 337980592 llama-3.3-nemotron-super-49B SFT) cascade from "No checkpoint found": the upstream finetune/robustness job dies because a custom-code (trust_remote_code) architecture runs without a usable TP plan (none exists for NemotronFlash; none is selected for super-49B) and now hard-errors at tp_size>1 (torch DTensor shard_order assert; #2244 fail-fast in parallelizer.py). These used to ride AutoModel's default base plan on older torch. - nemotron-flash-1b: NemotronFlash (hybrid mamba2/deltanet) has no TP plan in any transformers version (5.5.0/5.12.1), in the model's Hub code, or in AutoModel; its hybrid layers aren't expressible with the standard TP styles. The robustness cross-TP phase ran at tp_size=2 and aborted before the checkpoint was saved. It's a 1B model that doesn't need TP -> run the robustness reload at tp_size=1. - super-49B (DeciLM/nemotron-nas): AutoModel already ships a TP plan (get_decilm_nemotron_tp_plan, named "llama_nemotron_super_tp_plan", since #1487) but the recipe never selected it, so the finetune fell through to the broken default plan at tp_size=4. Wire distributed.tp_plan: llama_nemotron_super_tp_plan. All 49 real-attention blocks have 8 KV heads, divisible by tp_size 4 (finetune) and 8 (robustness). 
Co-Authored-By: Claude Opus 4.8 (1M context) Signed-off-by: Adil Asif --- .../nemotron/llama3_3_nemotron_super_49B_squad.yaml | 1 + .../nemotron_flash/nemotron_flash_1b_squad_peft.yaml | 6 ++---- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/examples/llm_finetune/nemotron/llama3_3_nemotron_super_49B_squad.yaml b/examples/llm_finetune/nemotron/llama3_3_nemotron_super_49B_squad.yaml index aadd070b85..8f575e0b60 100644 --- a/examples/llm_finetune/nemotron/llama3_3_nemotron_super_49B_squad.yaml +++ b/examples/llm_finetune/nemotron/llama3_3_nemotron_super_49B_squad.yaml @@ -54,6 +54,7 @@ distributed: strategy: fsdp2 dp_size: none tp_size: 4 + tp_plan: llama_nemotron_super_tp_plan # registered DeciLM/nemotron-nas TP plan; custom-code arch has no default plan cp_size: 1 ep_size: 1 pipeline: diff --git a/examples/llm_finetune/nemotron_flash/nemotron_flash_1b_squad_peft.yaml b/examples/llm_finetune/nemotron_flash/nemotron_flash_1b_squad_peft.yaml index c62c5abcb9..98015d7055 100755 --- a/examples/llm_finetune/nemotron_flash/nemotron_flash_1b_squad_peft.yaml +++ b/examples/llm_finetune/nemotron_flash/nemotron_flash_1b_squad_peft.yaml @@ -113,11 +113,9 @@ ci: time: "00:15:00" checkpoint_robustness: hf_kl_threshold: 5e-3 - # tp_size=2 with bf16 row-parallel all-reduces produces ULP-level drift - # (~1e-3) between trainer and restored logits even with bit-identical - # weights; relax the Phase-3 threshold accordingly. kl_threshold: 5e-3 - distributed.tp_size: 2 + # NemotronFlash (hybrid mamba/deltanet, custom code) has no TP plan; reload at tp=1. + distributed.tp_size: 1 tokenizer_name: nvidia/Nemotron-Flash-1B trust_remote_code: true check_fused_qkv_keys: true