NVIDIA · hychiang-git · Jun 28, 2026 · Jun 28, 2026 · Jun 28, 2026 · Jun 28, 2026
@@ -0,0 +1,111 @@
+# NEL 0.3.x (engine) — SIMPLE / no-tools, SLURM-sharded.
+# Built-in + skills math/multichoice benchmarks for a reasoning checkpoint.
+# Fill every <...> placeholder. See references/nel-v0.3.0-migration.md.
+#
+#   nel eval run r030_example_eval.yaml --dry-run
+#   nel eval run r030_example_eval.yaml
+#   # after all shards finish:
+#   nel eval merge <output.dir>/<run-id>
+#   # then export from the LOGIN node (SLURM does NOT auto-export — see migration doc "Export results to MLFlow"):
+#   nel export <output.dir>/<run-id>/<bench> --dest mlflow -o tracking_uri=<uri> -o experiment_name=<exp>
+
+services:
+  model:
+    type: vllm
+    # `model` is the checkpoint path on the HOST. The engine mounts it read-only (<hostpath>:/model:ro) and
+    # runs `vllm serve /model`, so do NOT hand-mount it as well. For an HF handle use `model: <org/model>` (no mount).
+    model: /path/to/checkpoint
+    served_model_name: my-model      # required; benchmarks reference this name
+    protocol: chat_completions
+    port: 8000
+    node_pool: gpu
+    # One serving instance. Choose TP/PP from model size and GPU count (references/parallelism.md).
+    # For more THROUGHPUT raise cluster.shards below instead of data_parallel (DP can be fragile on stock images).
+    tensor_parallel_size: 4
+    pipeline_parallel_size: 1
+    data_parallel_size: 1
+    num_nodes: 1
+    image: <vllm-image.sqsh-or-handle>   # pin a version; never :latest. Cross-check recipes.vllm.ai.
+    startup_timeout: 3600.0
+    extra_args:
+      - '--gpu-memory-utilization=0.9'
+      - '--max-model-len=262144'             # must exceed prompt + generation.max_tokens
+      - '--trust-remote-code'                # needed if config.json has auto_map
+      # Reasoning checkpoint: set the parser YOUR model uses (plus the plugin path if it ships a custom one).
+      - '--reasoning-parser=<your-reasoning-parser>'
+      # - '--reasoning-parser-plugin=/model/<custom_parser>.py'   # only if the checkpoint bundles one
+      - '--enable-expert-parallel'           # MoE only; no-op for dense
+      - '--max-num-seqs=512'                 # serving-side concurrency knob — raise to use the GPU
+    extra_env:
+      HF_TOKEN: ${HF_TOKEN}
+      HF_HOME: /cache/huggingface
+    container_mounts:
+      - /path/to/hf-cache:/cache/huggingface
+    proxy:
+      verbose: true                          # REQUIRED, otherwise request_timeout is not applied
+      request_timeout: 36000.0               # keep LARGE for reasoning models (the 120s default kills long gens)
+      # max_concurrent_upstream: 512         # the default of 64 is the hidden throttle — raise to use the GPU
+    generation:
+      temperature: 1.0                       # from model card
+      top_p: 0.95                            # from model card
+      # Cap PER model call. Keep headroom below --max-model-len, AND keep a single call short enough to
+      # finish inside the SLURM wall so one long sample can't make auto_resume loop forever.
+      max_tokens: 196608
+
+benchmarks:
+  - name: gpqa                               # built-in
+    repeats: 8
+    max_concurrent: 64
+    solver:
+      type: simple
+      service: model
+  - name: mmlu_pro                           # built-in
+    repeats: 1
+    max_concurrent: 64
+    solver:
+      type: simple
+      service: model
+  - name: skills://aime25                    # nemo-skills math (per-sample) — needs nemo-skills baked in
+    repeats: 64
+    max_concurrent: 64
+    solver:
+      type: simple
+      service: model
+  # WITH-TOOLS variant of a native benchmark: keep the SAME backend/name and only swap in a tool_calling
+  # solver plus a local sandbox — NO gym server is involved. Use it instead of the simple `gpqa` entry
+  # above to give the model tools.
+  # NOTE: the served model must then be launched with --enable-auto-tool-choice + --tool-call-parser=<parser>
+  # (add both to services.model.extra_args), otherwise the tool calls won't be parsed.
+  # - name: gpqa                             # still BUILT-IN
+  #   repeats: 8
+  #   max_concurrent: 64
+  #   solver:
+  #     type: tool_calling
+  #     service: model
+  #     sandbox_tools: true
+  #     max_turns: 100
+  #   sandbox:
+  #     type: local
+  #     concurrency: 64
+
+cluster:
+  type: slurm
+  account: <your-slurm-account>
+  walltime: "04:00:00"                       # cluster wall; long evals continue via auto_resume / shards
+  auto_resume: true                          # safe here (no-tools can't loop a single call past the wall)
+  shards: 8                                  # N independent single-node workers; combine results with `nel eval merge`
+  eval_image: <eval-image.sqsh>              # the one eval container, with nemo-skills + lm-eval pip-installed
+  mount_home: false                          # avoid a stale host ~/.local/bin/nel shadowing the CLI
+  container_mounts:
+    - /path/to/hf-cache:/cache/huggingface   # same HF cache mount as services.model.container_mounts
+  container_env:
+    HF_HOME: /cache/huggingface
+    HF_TOKEN: ${HF_TOKEN}
+  node_pools:
+    gpu:                                     # pool name referenced by services.model.node_pool
+      partition: <gpu-partition>
+      nodes: 1
+      ntasks_per_node: 1
+      gres: "gpu:4"                          # GPUs per node: match the node's GPU count and TP×PP×DP of services.model
+  sbatch_extra_flags:
+    exclusive: true
+
+output:
+  dir: /abs/path/to/rundir                   # MUST be absolute
+  export:
+    - mlflow                                 # NOTE: NOT auto-exported on SLURM — run `nel export --dest mlflow` manually
+  export_config:
+    mlflow:
+      tracking_uri: <your-mlflow-uri>
+      experiment_name: <experiment>
+      tags:
+        model: my-model                      # matches services.model.served_model_name above
+        precision: <bf16|fp8|nvfp4>          # placeholder — presumably pick one of the listed values
+        pipeline: r0.3.0
@@ -0,0 +1,56 @@
+# NEL 0.3.x (engine) — gym:// (NeMo-Gym resource server + reward).
+# The FAITHFUL path for agentic / judge / batch benchmarks, which skills:// can only score by exact match
+# (the † rows in the matrix). The engine connects to a RUNNING gym resource server, drives the
+# rollout against your model, and reads the reward back from the server's /verify.
+# Validated with the self-contained `mcqa` + `ifbench` servers (native protocol). Fill every <...>.
+# Full setup (install gym, start the server, find its port, dataset rcp format): github.com/NVIDIA-NeMo/Gym
+#
+# PREREQUISITE — start ONE resource server first (separate process), then find its dynamic port:
+#   export RAY_TMPDIR=/tmp     # Lustre AF_UNIX socket paths exceed 107 bytes → Ray fails without this
+#   gym env start --resources-server <name> &   # background it; loads ONLY the grader (not the agent)
+#   # the head server is at :11000; each resource server registers there on a DYNAMIC port. Read it back:
+#   curl -s http://127.0.0.1:11000/server_instances | python3 -m json.tool   # → [{"name": .., "url": ".../<PORT>"}]
+#   # running >1 server (e.g. ifbench + ifeval) lists them all — match the row by "name" to get each port.
+#
+#   nel eval run r030_gym.yaml --dry-run
+#   nel eval run r030_gym.yaml
+
+services:
+  model:                                   # the POLICY the engine rolls out against the gym server
+    # A hosted OpenAI-compatible endpoint needs no GPU (ideal for a smoke test). To evaluate your own
+    # checkpoint, run it on a GPU as a SEPARATE `vllm serve` job and point `url:` at it (migration doc
+    # §4/§5), keeping `type: api` here. `type: vllm` + `cluster: local` does NOT work: NEL would run
+    # `python -m vllm` from the nel venv, which has no vLLM, so serving must be its own GPU job.
+    type: api
+    url: <https://your-openai-compatible-endpoint/v1/chat/completions>
+    protocol: chat_completions
+    model: <model-id>
+    api_key: ${YOUR_API_KEY}               # resolved from env at run time (set -a; source <.env>; set +a)
+    generation:
+      temperature: 0.0
+      max_tokens: 512
+
+benchmarks:
+  # What gets evaluated is decided by the running server and the data file, NOT by text after the prefix.
+  #   gym://<host:port>?protocol=native&data=<tasks.jsonl>
+  # native  = the evaluator holds the dataset, seeds locally, and posts the answer to the server's /verify.
+  # <PORT>  = the resource server's dynamic port from /server_instances above (NOT the head port 11000).
+  - name: 'gym://127.0.0.1:<PORT>?protocol=native&data=/abs/path/to/tasks.jsonl'
+    repeats: 1
+    max_concurrent: 5
+    # The simple solver takes one single-shot policy answer and lets the server score it
+    # (mcqa/ifbench-style). Agentic gym envs that need tool use should switch to
+    # `type: tool_calling` plus a `sandbox:`.
+    solver:
+      type: simple
+      service: model
+
+cluster:
+  # `local` = NEL runs the evaluator in-process on the node you launch it from, so launch from a CPU compute
+  # node (sbatch --partition=cpu), NOT the login node. The gym scorer is CPU; only the policy needs a GPU
+  # (and here the policy is a hosted API, so no GPU at all). This single-node path is the validated one.
+  # NOTE: sharding gym is NOT a free swap — the one shared resource server would have to be at a routable
+  # host (not 127.0.0.1) reachable from every shard node; with loopback + shards, scoring silently fails.
+  type: local
-  - name: "gym://127.0.0.1:<PORT>?protocol=native&data=/abs/path/to/tasks.jsonl"
-    repeats: 1
-    max_concurrent: 5
-    # simple solver = single-shot policy answer scored by the server (mcqa/ifbench-style). For agentic
-    # gym envs that need tool use, switch to `type: tool_calling` + a `sandbox:`.
-    solver: {type: simple, service: model}
-
-cluster:
-  # `local` = in-process on the login/cpu node — the gym scorer is always CPU, only the policy needs a
-  # GPU (and here the policy is a hosted API, so no GPU at all). For a real run, swap to a slurm block
-  # (see r030_example_eval.yaml). NOTE: gym:// supports cluster.shards the same way native does.
-  type: local
+  - name: "gym://<gym-server-host>:<PORT>?protocol=native&data=/abs/path/to/tasks.jsonl"
+    repeats: 1
+    max_concurrent: 5
+    # simple solver = single-shot policy answer scored by the server (mcqa/ifbench-style). For agentic
+    # gym envs that need tool use, switch to `type: tool_calling` + a `sandbox:`.
+    solver: {type: simple, service: model}
+
+cluster:
+  # `local` = in-process on the login/cpu node — the gym scorer is always CPU, only the policy needs a
+  # GPU (and here the policy is a hosted API, so no GPU at all). For a real run, swap to a slurm block
+  # (see r030_example_eval.yaml). With `local`, `<gym-server-host>` above is 127.0.0.1; if you move off `local`,
+  # set it to a host reachable from the worker node(s). NOTE: gym:// supports cluster.shards the same way native does.
+  type: local
-  - name: "gym://127.0.0.1:<PORT>?protocol=native&data=/abs/path/to/tasks.jsonl"
-    repeats: 1
-    max_concurrent: 5
-    # simple solver = single-shot policy answer scored by the server (mcqa/ifbench-style). For agentic
-    # gym envs that need tool use, switch to `type: tool_calling` + a `sandbox:`.
-    solver: {type: simple, service: model}
-
-cluster:
-  # `local` = in-process on the login/cpu node — the gym scorer is always CPU, only the policy needs a
-  # GPU (and here the policy is a hosted API, so no GPU at all). For a real run, swap to a slurm block
-  # (see r030_example_eval.yaml). NOTE: gym:// supports cluster.shards the same way native does.
-  type: local
+  - name: "gym://<gym-server-host>:<PORT>?protocol=native&data=/abs/path/to/tasks.jsonl"
+    repeats: 1
+    max_concurrent: 5
+    # simple solver = single-shot policy answer scored by the server (mcqa/ifbench-style). For agentic
+    # gym envs that need tool use, switch to `type: tool_calling` + a `sandbox:`.
+    solver: {type: simple, service: model}
+
+cluster:
+  # `local` = in-process on the login/cpu node — the gym scorer is always CPU, only the policy needs a
+  # GPU (and here the policy is a hosted API, so no GPU at all). For a real run, swap to a slurm block
+  # (see r030_example_eval.yaml). With `local`, `<gym-server-host>` above is 127.0.0.1; if you move off `local`,
+  # set it to a host reachable from the worker node(s). NOTE: gym:// supports cluster.shards the same way native does.
+  type: local
+
+output:
+  dir: /abs/path/to/rundir                 # MUST be absolute
+  # No `export:` list here — export manually after the job (see r030_example_eval.yaml / migration doc "Export to MLFlow"):
+  #   nel export <rundir>/<bundle> --dest mlflow -o tracking_uri=<uri> -o experiment_name=<exp>   # <rundir> = `dir` above