From 7d3bdbbe3047a83bafad226d08c2bbe7f5971384 Mon Sep 17 00:00:00 2001 From: Hung-Yueh Chiang Date: Sat, 27 Jun 2026 18:37:31 -0700 Subject: [PATCH 1/7] docs(eval): add NEL v0.3.0 migration guide + example configs Add a guide for migrating checkpoint evaluation from the NEL 0.2.x launcher (nemo_evaluator_launcher) to the 0.3.x engine (nemo_evaluator / `nel eval run`): - references/nel-v0.3.0-migration.md: 0.2.x->0.3.x config mapping, backend support matrix (built-in / skills:// / gym://), per-backend launch patterns, single-node vs sharded runs, and the MLflow export flow. - recipes/examples/r030_example_eval.yaml: built-in + skills:// native example. - recipes/examples/r030_gym.yaml: gym:// (NeMo-Gym resource server + reward). Signed-off-by: Hung-Yueh Chiang Co-Authored-By: Claude Opus 4.8 (1M context) --- .../recipes/examples/r030_example_eval.yaml | 111 ++++ .../evaluation/recipes/examples/r030_gym.yaml | 56 ++ .../references/nel-v0.3.0-migration.md | 592 ++++++++++++++++++ 3 files changed, 759 insertions(+) create mode 100644 .agents/skills/evaluation/recipes/examples/r030_example_eval.yaml create mode 100644 .agents/skills/evaluation/recipes/examples/r030_gym.yaml create mode 100644 .agents/skills/evaluation/references/nel-v0.3.0-migration.md diff --git a/.agents/skills/evaluation/recipes/examples/r030_example_eval.yaml b/.agents/skills/evaluation/recipes/examples/r030_example_eval.yaml new file mode 100644 index 00000000000..78b27fb60e8 --- /dev/null +++ b/.agents/skills/evaluation/recipes/examples/r030_example_eval.yaml @@ -0,0 +1,111 @@ +# NEL 0.3.x (engine) — SIMPLE / no-tools, SLURM-sharded. +# Built-in + skills math/multichoice benchmarks for a reasoning checkpoint. +# Fill every <...> placeholder. See references/nel-v0.3.0-migration.md. 
+# +# nel eval run r030_example_eval.yaml --dry-run +# nel eval run r030_example_eval.yaml +# # after all shards finish: +# nel eval merge / +# # then export from the LOGIN node (SLURM does NOT auto-export — see migration doc "Export results to MLFlow"): +# nel export // --dest mlflow -o tracking_uri= -o experiment_name= + +services: + model: + type: vllm + # HOST checkpoint path; the engine auto-mounts it :/model:ro and serves `vllm serve /model`. + # Do NOT also hand-mount it. For an HF handle, set `model: ` (no mount needed). + model: /path/to/checkpoint + served_model_name: my-model # required; referenced by benchmarks + protocol: chat_completions + port: 8000 + node_pool: gpu + # Single-instance serving. Pick TP/PP from model size & GPU count (references/parallelism.md). + # Scale THROUGHPUT with cluster.shards below, not data_parallel (DP can be fragile on stock images). + tensor_parallel_size: 4 + pipeline_parallel_size: 1 + data_parallel_size: 1 + num_nodes: 1 + image: # pin a version; never :latest. Cross-check recipes.vllm.ai. + startup_timeout: 3600.0 + extra_args: + - "--gpu-memory-utilization=0.9" + - "--max-model-len=262144" # must exceed prompt + generation.max_tokens + - "--trust-remote-code" # if config.json has auto_map + # Reasoning checkpoint: use YOUR model's parser (and plugin path if it ships a custom one). 
+ - "--reasoning-parser=" + # - "--reasoning-parser-plugin=/model/.py" # only if the checkpoint bundles one + - "--enable-expert-parallel" # MoE only; no-op for dense + - "--max-num-seqs=512" # serving-side concurrency knob — raise to use the GPU + extra_env: + HF_TOKEN: ${HF_TOKEN} + HF_HOME: /cache/huggingface + container_mounts: + - /path/to/hf-cache:/cache/huggingface + proxy: + verbose: true # REQUIRED for request_timeout to apply + request_timeout: 36000.0 # LARGE for reasoning models (120s default kills long gens) + # max_concurrent_upstream: 512 # default 64 is the hidden throttle — raise to use the GPU + generation: + temperature: 1.0 # from model card + top_p: 0.95 # from model card + # PER model-call cap. Leave headroom under --max-model-len AND bound one call under the SLURM wall + # so a single long sample can't loop auto_resume forever. + max_tokens: 196608 + +benchmarks: + - name: gpqa # built-in + repeats: 8 + max_concurrent: 64 + solver: {type: simple, service: model} + - name: mmlu_pro # built-in + repeats: 1 + max_concurrent: 64 + solver: {type: simple, service: model} + - name: skills://aime25 # nemo-skills math (per-sample) — needs nemo-skills baked in + repeats: 64 + max_concurrent: 64 + solver: {type: simple, service: model} + # WITH-TOOLS variant of a native benchmark: SAME backend/name, just a tool_calling solver + a local + # sandbox — NO gym server involved. Use in place of the simple `gpqa` entry above to give it tools. + # NOTE: the served model must be launched with --enable-auto-tool-choice + --tool-call-parser= + # (add them to services.model.extra_args) or the tool calls won't be parsed. 
+ # - name: gpqa # still BUILT-IN + # repeats: 8 + # max_concurrent: 64 + # solver: {type: tool_calling, service: model, sandbox_tools: true, max_turns: 100} + # sandbox: {type: local, concurrency: 64} + +cluster: + type: slurm + account: + walltime: "04:00:00" # cluster wall; long evals continue via auto_resume / shards + auto_resume: true # safe here (no-tools can't loop a single call past the wall) + shards: 8 # N independent single-node workers + `nel eval merge` + eval_image: # the one eval container, with nemo-skills + lm-eval pip-installed + mount_home: false # avoid a stale host ~/.local/bin/nel shadowing the CLI + container_mounts: + - /path/to/hf-cache:/cache/huggingface + container_env: + HF_HOME: /cache/huggingface + HF_TOKEN: ${HF_TOKEN} + node_pools: + gpu: + partition: + nodes: 1 + ntasks_per_node: 1 + gres: "gpu:4" # match the node's GPU count + TP×DP + sbatch_extra_flags: + exclusive: true + +output: + dir: /abs/path/to/rundir # MUST be absolute + export: + - mlflow # NOTE: NOT auto-exported on SLURM — run `nel export` manually + export_config: + mlflow: + tracking_uri: + experiment_name: + tags: + model: my-model + precision: + pipeline: r0.3.0 diff --git a/.agents/skills/evaluation/recipes/examples/r030_gym.yaml b/.agents/skills/evaluation/recipes/examples/r030_gym.yaml new file mode 100644 index 00000000000..c854e2665cc --- /dev/null +++ b/.agents/skills/evaluation/recipes/examples/r030_gym.yaml @@ -0,0 +1,56 @@ +# NEL 0.3.x (engine) — gym:// (NeMo-Gym resource server + reward). +# The FAITHFUL path for agentic / judge / batch benchmarks that skills:// only exact-match-scores +# (the † rows in the matrix). The engine connects to a RUNNING gym resource server, drives the +# rollout against your model, and reads the reward back from the server's /verify. +# Validated with the self-contained `mcqa` + `ifbench` servers (native protocol). Fill every <...>. 
+# Full setup (install gym, start the server, find its port, dataset rcp format): github.com/NVIDIA-NeMo/Gym +# +# PREREQUISITE — start ONE resource server first (separate process), then find its dynamic port: +# export RAY_TMPDIR=/tmp # Lustre AF_UNIX socket paths exceed 107 bytes → Ray fails without this +# gym env start --resources-server & # background it; loads ONLY the grader (not the agent) +# # the head server is at :11000; each resource server registers there on a DYNAMIC port. Read it back: +# curl -s http://127.0.0.1:11000/server_instances | python3 -m json.tool # → [{"name": .., "url": ".../"}] +# # running >1 server (e.g. ifbench + ifeval) lists them all — match the row by "name" to get each port. +# +# nel eval run r030_gym.yaml --dry-run +# nel eval run r030_gym.yaml + +services: + model: # the POLICY the engine rolls out against the gym server + # Hosted OpenAI-compatible endpoint = no GPU needed (ideal for a smoke). To self-serve a checkpoint, + # serve it on a GPU as a SEPARATE `vllm serve` job and point url: at it (migration doc §4/§5) — keep + # type: api here. (type: vllm + cluster: local does NOT work: NEL would run `python -m vllm` from the + # nel venv, which has no vLLM; serving must be its own GPU job reached via type: api.) + type: api + url: + protocol: chat_completions + model: + api_key: ${YOUR_API_KEY} # resolved from env at run time (set -a; source <.env>; set +a) + generation: + temperature: 0.0 + max_tokens: 512 + +benchmarks: + # The benchmark lives in the running server + the data file, NOT after the prefix. + # gym://?protocol=native&data= + # native = evaluator holds the dataset, seeds locally, posts the answer to the server's /verify. + # = the dynamic resource-server port from /server_instances above (NOT the 11000 head port). + - name: "gym://127.0.0.1:?protocol=native&data=/abs/path/to/tasks.jsonl" + repeats: 1 + max_concurrent: 5 + # simple solver = single-shot policy answer scored by the server (mcqa/ifbench-style). 
For agentic + # gym envs that need tool use, switch to `type: tool_calling` + a `sandbox:`. + solver: {type: simple, service: model} + +cluster: + # `local` = NEL runs the evaluator in-process on the node you launch it from — use a CPU compute node + # (sbatch --partition=cpu), NOT the login node. The gym scorer is CPU; only the policy needs a GPU + # (and here the policy is a hosted API, so no GPU at all). This single-node path is the validated one. + # NOTE: sharding gym is NOT a free swap — the one shared resource server would have to be at a routable + # host (not 127.0.0.1) reachable from every shard node; loopback + shards silently fails to score. + type: local + +output: + dir: /abs/path/to/rundir # MUST be absolute + # export is run manually after the job (see r030_example_eval.yaml / migration doc "Export results to MLFlow"): + # nel export / --dest mlflow -o tracking_uri= -o experiment_name= diff --git a/.agents/skills/evaluation/references/nel-v0.3.0-migration.md b/.agents/skills/evaluation/references/nel-v0.3.0-migration.md new file mode 100644 index 00000000000..5e503b4c3f6 --- /dev/null +++ b/.agents/skills/evaluation/references/nel-v0.3.0-migration.md @@ -0,0 +1,592 @@ +# NEL v0.3.0 Migration + +## NEL 0.2.x (launcher) vs 0.3.x (engine) + +| | **0.2.x launcher** (`nemo_evaluator_launcher`) | **0.3.x engine** (`nemo_evaluator`) | +| --- | --- | --- | +| Benchmark source | each task runs in its **own published eval-factory container** (`nvcr.io/nvidia/eval-factory/*`); you never build an eval image | **one** `cluster.eval_image`; benchmarks resolve from Python `@register()` modules | +| Where harnesses live | baked into per-task containers upstream | **you pip-install** external harnesses (`nemo-skills`, `lm-eval`) **into the one eval image** | +| Benchmark families | eval-factory `container/` adapters, `ns_*` nemo-skills | **built-in** (17, `nel list -s builtin`), **`skills://`** (nemo-skills), **`lm-eval://`**, + legacy **`container://`** (BC) | +| CLI | 
`nel run` / `ls` / `status` / `info` | `nel eval run` / `list` / `export` | +| Config schema | Hydra `deployment:` / `evaluation:` / `execution:` | `services:` / `benchmarks:` / `cluster:` / `output:` (one file) | +| Serving | a `deployment:` block | integrated under `services.model` | +| "with tools" | task-specific | `solver: {type: tool_calling, sandbox_tools: true}` + a `sandbox:` | + +### Comparing the Config File Structure + +**0.2.x launcher** — Hydra, several top-level blocks (+ a `defaults:` preset chain), typically one file per task: + +```yaml +defaults: [...] # Hydra preset chain (resolved at launch) +execution: # SLURM: account, partition, walltime, num_nodes, output_dir, sbatch flags +deployment: # serving: image, hf_model_handle, served_model_name, TP/PP/DP, extra_args +target: # how to reach the served model + api_endpoint: {...} +evaluation: # the benchmark(s) + tasks: [...] # task names live here +export: # mlflow export + mlflow: {...} +``` + +**0.3.x engine** — one self-contained file, four blocks, all tasks in a single `benchmarks:` list: + +```yaml +services: # serving (replaces `deployment:` + `target:`) + model: {type: vllm, model: ..., tensor_parallel_size: 4, extra_args: [...]} +benchmarks: # the tasks (replaces `evaluation.tasks:`), one `- name:` each + - name: gpqa # prefix picks the backend (see matrix below) +cluster: # SLURM (replaces `execution:`): type, account, walltime, shards, eval_image +output: # results + export (replaces `export:`) + dir: ... 
+ export: [mlflow] + export_config: {mlflow: {...}} +``` + +Key moves: +- `deployment:` + `target:` → **`services.model`** (serving + endpoint unified; the engine wires it to a benchmark via `solver.service`) +- `evaluation.tasks:` → **`benchmarks:`** (a list of `- name:`; the prefix selects the backend) +- `execution:` → **`cluster:`** (adds `shards:` for multi-node sharding and `eval_image:` for the one eval container) +- `export:` → **`output.export` / `output.export_config`** +- `defaults:` (Hydra presets) → **gone** — one explicit file, no preset resolution step + + +### Backward compatibility with v0.2.x + +0.3.x keeps a backward-compat path for v0.2.x's eval-factory `container/` task variants (e.g. +`mmlu_pro_aa_v3`, `gpqa_diamond_aa_v3`), which have **no _native_ 0.3.x benchmark of the same +name**. Two options: + +1. run them via the **legacy `container://` backend** (`solver: {type: container}`, which injects a v1-format `run_config.yaml` so v1 configs port with minimal changes — but **`cluster.shards` is unsupported** for legacy container runs) + +2. **recommended** — switch to the native built-in (`mmlu_pro`, `gpqa`) or `skills://` equivalents (sharding + one eval image). Note the launcher's AIME entry was already `ns_aime2025` (nemo-skills, *not* a container variant), so it ports directly to `skills://aime25`. + + +## NEL v0.3.0 Backend Support Matrix + +### Supported Backends + +NEL 0.3.x resolves a benchmark through one of several **backends**, chosen by the `name:` prefix you +put in `benchmarks:`. The same benchmark is often reachable from **more than one** backend, and the +prefix picks which harness actually runs — which also affects scoring fidelity (whether a benchmark is +scored correctly vs silently mis-scored). That's detailed per-benchmark in **Supported Benchmarks** below. 
+ +```text +backend name: in the config find available names +---------- ------------------------------ ------------------------------------------------------- +built-in (e.g. gpqa) `nel list -s builtin` +skills:// skills:// `nel list -s skills` (after prep) / nemo_skills/dataset/ +lm-eval:// lm-eval:// `lm_eval --tasks list` +gym:// gym://?protocol= Gym repo benchmarks/ + resources_servers/ + native&data= (needs a running server → github.com/NVIDIA-NeMo/Gym) +``` + +Example — a `benchmarks:` block, one entry per benchmark. The `name:` prefix selects the backend (it is **not** uniformly `{backend}://{benchmark}`): built-in has **no prefix** (just `{benchmark}`); `skills://` and `lm-eval://` are `{backend}://{benchmark}`; `gym://` points at a running server, `gym://{host:port}?...&data=` (the benchmark lives in the data file, not after the prefix): + +```yaml +benchmarks: # the benchmarks block + - name: gpqa # built-in + - name: skills://aime25 # NeMo-Skills + - name: lm-eval://hellaswag # lm-eval + - name: gym://127.0.0.1:8000?protocol=native&data=tasks.jsonl # NeMo-Gym (running server) +``` + +### Supported Benchmarks + +The list of supported benchmarks are from three NVIDIA-NeMo repositories: +- `NVIDIA-NeMo/Evaluator`: built-in engine [`@register`](https://github.com/NVIDIA-NeMo/Evaluator/tree/main/src/nemo_evaluator/benchmarks) +- `NVIDIA-NeMo/Skills`: **`skills://`** from [`nemo_skills/dataset/`](https://github.com/NVIDIA-NeMo/Skills/tree/main/nemo_skills/dataset), +- `NVIDIA-NeMo/Gym`: **`gym://`** from Gym [`benchmarks/`](https://github.com/NVIDIA-NeMo/Gym/tree/main/benchmarks)+[`resources_servers/`](https://github.com/NVIDIA-NeMo/Gym/tree/main/resources_servers). + +The matrix below covers only these three backends. **`lm-eval://`** (e.g. `lm-eval://hellaswag`) is also +supported but not enumerated here — discover its tasks with `lm_eval --tasks list`. 
Only the **built-in** +column is verified against this engine (`nel list -s builtin`); the **`skills://`** and **`gym://`** +columns are compiled from the upstream repos and have **not** all been run — treat them as a routing +guide, not a guarantee (see the per-row notes). + + +> **`†`** = `skills://` **may mis-score** this **judge / batch / code** benchmark: its bridge ([`skills.py`](https://github.com/NVIDIA-NeMo/Evaluator/blob/main/src/nemo_evaluator/environments/skills.py)) only scores math/multichoice; other types fall back to exact-match → ~0. Use `gym://` (or the nemo-skills container / 0.2.x launcher) instead. + +```text +benchmark built-in skills:// gym:// notes +------------------------- -------- ---------- ------ ---------------------------------------- +KNOWLEDGE / QA + gpqa yes yes yes gym: gpqa (+ gpqa_diamond server) + mmlu yes yes yes + mmlu_pro yes yes yes + simpleqa yes yes yes + triviaqa yes — — built-in only + drop yes — — built-in only + healthbench yes — — built-in only + hotpotqa — yes yes gym: hotpotqa_qa / hotpotqa_closedbook + supergpqa — yes yes + hle — yes†(judge) yes gym recommended (skills may mis-score) + omniscience — yes†(judge) yes gym recommended (skills may mis-score) + critpt — yes†(judge) yes gym recommended (skills may mis-score) +MATH + math500 yes yes yes skills/gym: math-500 + gsm8k yes yes yes + aime25 — yes yes + aime24 — yes yes + hmmt_feb25 — yes yes + polymath — yes yes + putnam_bench — yes yes + minif2f — yes yes + proofnet — yes yes + ugphysics — yes yes + physics — yes yes + minerva_math — yes — skills only + olympiadbench — yes — skills only + omni_math — yes — skills only +CODE / SWE + humaneval yes yes†(code) yes built-in faithful; skills code-exec=batch + mbpp — yes†(code) yes + livecodebench — yes†(code) yes gym recommended (skills may mis-score) + bigcodebench — yes†(code) yes gym recommended (skills may mis-score) + scicode — yes†(code) yes gym recommended (skills may mis-score) + ioi — yes†(code) yes + cvdp 
— yes†(code) yes + swe-bench — yes — gym has only SWE-RL servers (swe_pivot/swerl_*) + bird_sql — — yes gym only + spider2_lite — — yes gym only + code_gen — — yes gym only + evalplus — — yes gym only +INSTRUCTION / FORMAT + ifbench — yes†(batch) yes gym recommended (skills may mis-score) + ifeval — yes†(batch) yes gym recommended (skills may mis-score) + instruction_following — — yes gym only + structured_outputs — — yes gym only + multichallenge — — yes gym only +LONG-CONTEXT + ruler — yes yes + mrcr — yes yes + longbench_v2 — yes yes + longcodebench — yes yes + aalcr — yes†(judge) yes gym recommended (skills may mis-score) +AGENTIC / TOOL / RL (mostly gym-only; the few built-ins are noted) + terminal-bench-v1 yes — — built-in; agentic terminal tasks + terminal-bench-hard yes — — built-in (+ -aa-split variant) + nmp_harbor yes — — built-in (harbor packaging) + tau2 — — yes + gdpval — — yes + browsecomp — — yes + mcqa — — yes + blackjack — — yes + gymnasium — — yes + reasoning_gym — — yes + arc_agi — — yes + aviary — — yes + google_search — — yes + tavily_search — — yes + xlam_fc — — yes + ns_tools — — yes + swe_pivot — — yes + pinchbench yes — — built-in only (NOT gym) +ARENA / JUDGE + arena_hard — yes†(judge) yes the "LM Arena" family + arena_hard_v2 — yes†(judge) yes + arena_judge — — yes gym only + genrm_compare — — yes gym only +SAFETY + xstest yes — yes + abstention — — yes gym only + over_refusal_detection — — yes gym only + jailbreak_detection — — yes gym only + indirect_prompt_injection — — yes gym only +MULTILINGUAL / MULTIMODAL / AUDIO + mgsm yes yes — built-in (multilingual GSM8K) + mmmlu — yes yes + flores200 — yes yes + wmt24pp — yes yes + mmmu_pro — yes — skills only (multimodal) + covost2 — yes — skills only (audio) + fleurs — yes — skills only (audio) + labbench2_vlm — — yes gym only (multimodal) + vlm_eval_kit — — yes gym only (multimodal) +``` + + +## Launch NEL v0.3.0 + +The launch pattern follows the **backend**, not the category. 
Map each matrix category to the matching +section below: + +```text +section (the ### headings below) categories / when +--------------------------------- -------------------------------------------------------------- +built-in and skills:// (native) KNOWLEDGE/QA · MATH · LONG-CONTEXT · MULTILINGUAL · SAFETY + (every row `yes` under built-in or skills with NO †) +gym:// (server + reward) AGENTIC/TOOL/RL (gym-only) + every † row (CODE/SWE, + INSTRUCTION/FORMAT, ARENA/JUDGE, LONG-CONTEXT aalcr) +legacy container:// v0.2.x `*_aa_v3` eval-factory container tasks +``` + +> **Multimodal / audio — not validated here.** Our migration only covered text benchmarks. The engine +> registers a `vlmevalkit://` backend (`environments/registry.py` → `VLMEvalKitEnvironment`), and +> multimodal/audio datasets (e.g. `mmmu_pro`, `covost2`, `fleurs`) may be reachable via `skills://` or +> `gym://` — but we ran **none** of them, so the exact name→backend routing is unconfirmed. Treat as a +> starting pointer, not a tested recipe. + +### Basic Usage + +Core 0.3.x CLI — the same for any benchmark; the backend (the three subsections below) only changes what goes in `benchmarks:`. 
+ +```bash +# Check the version +nel --version # v0.3.x +# Config mode (recommended): one file with services + benchmarks + cluster + output +nel eval run config.yaml +nel eval run config.yaml --dry-run # generate scripts, don't run (inspect per-shard configs) +nel eval run config.yaml --resume # resume a partial / timed-out run + +# Quick mode: a single built-in benchmark against an already-served model +nel eval run --bench gpqa --model-url http://localhost:8000/v1 --model-id my-model --api-key dummy + +# Discover names +nel list -s builtin # also: -s skills / -s lm-eval + +# Validate before a long run: serve + run a few samples (catches serving/config errors cheap) +nel validate config.yaml + +# Track / control a submitted run +nel eval jobs # list tracked runs +nel eval status # progress of the current/last run +nel eval logs # tail logs +nel eval stop # cancel + +# Sharded run: after all shards finish, merge then export +nel eval merge / +nel export [ ...] --dest mlflow -o tracking_uri= -o experiment_name= # takes 1+ bundles +``` + +### Single-node vs sharded + +Sharding is **orthogonal to the benchmark config** — the `services:` / `benchmarks:` / `solver:` blocks +shown in the backend subsections below are identical either way. The only difference is one line in the +`cluster:` block plus a post-run merge: + +```text + single-node (no shards) sharded (cluster.shards: N) +---------------- --------------------------------- ---------------------------------------- +config diff cluster.shards absent (or = 1) cluster.shards: N (one added line) +benchmarks/solver identical identical +how it runs 1 worker serves the model, N independent single-node TP workers, each + runs ALL (problems × repeats) serves its OWN model copy, runs 1/N of them +launch nel eval run config.yaml nel eval run config.yaml (same command) +results one result bundle per benchmark N shard results -> nel eval merge / +export nel export ... nel export ... 
+when small / quick runs the long pole (e.g. aime25 repeats=64); ~N× faster +``` + +Key points: + +- **`repeats` itself is *not* reduced — the combined `problems × repeats` task list is what gets split.** Each shard runs a disjoint slice of all + `problems × repeats` tasks, so the merged result is mathematically identical to a single-node run, + just ~N× faster wall-clock. +- **This is the throughput mechanism, not data-parallel serving.** Sharding launches N separate + single-node jobs (each its own model instance); it does **not** rely on vLLM data-parallel (DP is + fragile on some stock vLLM builds — DP can fail in engine-core init). +- **The only extra step is `nel eval merge /`** — it stitches the N shard outputs into one + bundle per benchmark before you `nel export` (see "Export results to MLFlow" below). +- **Works cleanly for native (built-in / `skills://`).** Legacy `container://` is single-node only. A + `gym://` run *can* shard but it isn't a free swap — its one shared resource server must be at a + routable host (not loopback) reachable from every shard (see the `gym://` section); the validated gym + path is single-node `cluster: {type: local}`. + +### The `solver:` block — `simple` vs `tool_calling` + +`solver:` sets **how the model answers each problem**; `service:` names which `services:` entry to run +against (e.g. `service: model`). + +- **`simple`** — one model call per problem, no tools. For pure reasoning (knowledge / math / multichoice). +- **`tool_calling`** — a multi-turn ReAct loop: the model calls tools that a `sandbox:` block runs, up to + `max_turns` (turn-exhaustion = a failed episode). Needs `sandbox_tools: true` + a `sandbox:`, and a model + served with tool calling enabled (vLLM: `--enable-auto-tool-choice` + `--tool-call-parser`). For + tool-augmented tasks (run code, compute, search). + +**Backend and solver are independent.** A native built-in / `skills://` benchmark can run *either* solver +— e.g. 
`gpqa` with `simple` (no-tools) or with `tool_calling` + a `local` sandbox (the "with-tools" +variant). `tool_calling` is **not** exclusive to `gym://`. + + +### built-in and skills:// (native) + +The common path — every matrix row that is `yes` under **built-in** or **skills** with **no `†`**. The +engine serves the model itself and scores per-sample; just list the names in `benchmarks:` with a +`simple` solver (no sandbox, no server). Built-in takes a bare name; nemo-skills takes the `skills://` +prefix. + +Built-in benchmarks need nothing extra. For **`skills://`**, first install **NeMo-Skills** into your eval +image — `pip install nemo-skills` from [NVIDIA-NeMo/Skills](https://github.com/NVIDIA-NeMo/Skills); see the +repo README. It's baked into `cluster.eval_image` (and the dataset is prepped there). Likewise `lm-eval://` +needs `lm-eval` in the image. + +```yaml +benchmarks: + - name: gpqa # built-in (multichoice) + repeats: 8 + max_concurrent: 64 + solver: {type: simple, service: model} + - name: mmlu_pro # built-in (multichoice) + repeats: 1 + max_concurrent: 64 + solver: {type: simple, service: model} + - name: skills://aime25 # nemo-skills (math) + repeats: 64 + max_concurrent: 256 + solver: {type: simple, service: model} + # WITH-TOOLS variant of the SAME native benchmark — backend unchanged, just swap the solver and add a + # sandbox (see "The solver: block" above). Use instead of the simple gpqa entry to give it tools: + # - name: gpqa + # repeats: 8 + # max_concurrent: 64 + # solver: {type: tool_calling, service: model, sandbox_tools: true, max_turns: 100} + # sandbox: {type: local, concurrency: 64} + +cluster: + shards: 8 # N single-node workers, each runs 1/N of (problems × repeats); `nel eval merge` after + # ... + type/account/walltime/eval_image/node_pools (see r030_example_eval.yaml) +``` + +`nel eval run config.yaml`. `repeats` matches your model card's sampling (e.g. 
gpqa avg-of-8, aime25 avg-of-64); +sampling (`temperature`, `top_p`, `max_tokens`) lives under `services.model.generation`. Full runnable +config: [`recipes/examples/r030_example_eval.yaml`](../recipes/examples/r030_example_eval.yaml) +(simple / no-tools). For a tool-calling benchmark see the `gym://` example below. + +### gym:// (server + reward) + +`gym://` runs a NeMo-Gym **resource server** that scores each response — the faithful path for the `†` +benchmarks `skills://` mis-scores and for agentic envs. We run it as a **two-job split**: the model on a +GPU node (its own serve job), and the grader + evaluator on a CPU node. Why split: serving needs the vLLM +container, while `nel` and the Gym server live in their own venvs (see the last caveat in §7). + +#### 1. The pieces (two jobs) + +```text +program what it does runs on +----------------------------- ---------------------------------------------------- ------------------------------ +the grader (gym env start) checks each answer, returns a score CPU node (you start it there) +the conductor (nel eval run) sends questions -> model, answers -> grader, tallies CPU node — in-process + (cluster: {type: local}) +the model answers the questions GPU node — its own vllm serve + job (services.model.type: api) +``` + +Both the grader and the conductor run on the CPU node that `gym_eval.sbatch` allocates (`--partition=cpu`, +§6) — `cluster: {type: local}` means NEL runs the conductor **in-process there, not on the login node**. The +conductor reaches the **model** over the network (the GPU node's hostname) and the **grader** at `127.0.0.1` +(same CPU node). + +#### 2. Start the gym grader + get its port + +First install NeMo-Gym (the `gym`/`ng` CLI) — clone [NVIDIA-NeMo/Gym](https://github.com/NVIDIA-NeMo/Gym) +and `uv venv --python 3.12 && uv sync`; see the repo README. + +The grader is **not** started by NEL — you launch it on the **CPU node** (same node as the evaluator, so +it's reachable at `127.0.0.1`). 
Use **`--resources-server `** to bring up *only* the grader: a +benchmark's bundled config also defines an **agent** that needs its own model server, which you don't want +here (NEL is the conductor and serves its own model). It registers with a head process on `:11000` and +gets a **dynamic** port: + +```bash +export RAY_TMPDIR=/tmp # Lustre socket-path fix +gym env start --resources-server & # e.g. --resources-server ifbench +curl -s http://127.0.0.1:11000/server_instances | python3 -m json.tool # -> [{"name": .., "url": ".../"}] +``` + +The port is **assigned dynamically** — read it from `/server_instances` (the `url` field ends in +`:`) and drop it into the conductor config's `gym://127.0.0.1:` URI (§4). For scripting, parse it: + +```bash +PORT=$(curl -s http://127.0.0.1:11000/server_instances \ + | python3 -c "import json,sys; d=json.load(sys.stdin); print(d[0]['url'].rsplit(':',1)[1] if d else '')") +``` + +Running more than one server lists them all — match the row by `name`. + + +#### 3. Get the data + +The questions live in a **local `tasks.jsonl`** that the conductor reads (`data=` points at it). Two ways: + +**Way 1 - no download (quick):** use the bundled [`example.jsonl`](https://github.com/NVIDIA-NeMo/Gym/blob/main/resources_servers/ifbench/data/example.jsonl) already in the Gym repo (each server has one under +`resources_servers//data/`; small, prompt field pre-built): + +```text +data=/resources_servers//data/example.jsonl +``` + +**Way 2 - full set (downloads from HuggingFace):** run the benchmark's `prepare.py` — one per benchmark dir under [`Gym/benchmarks/`](https://github.com/NVIDIA-NeMo/Gym/tree/main/benchmarks) (e.g. 
[`benchmarks/ifbench/prepare.py`](https://github.com/NVIDIA-NeMo/Gym/blob/main/benchmarks/ifbench/prepare.py)): + +```bash +cd # your NeMo-Gym clone +export HF_HOME=/huggingface # keep the HF download off your home dir +PYTHONPATH=$PWD .venv/bin/python benchmarks//prepare.py # downloads from HF -> benchmarks//data/ +# then point data= at the output, e.g.: +# data=/benchmarks/ifbench/data/ifbench_benchmark_eval.jsonl +``` + +(Native path: each row needs a `responses_create_params` prompt field - bundled `example.jsonl` already +has it; `prepare.py` output may need it added.) + + +#### 4. The conductor config (what `nel eval run` reads) + +This is the config the **conductor** (`nel eval run` — the conductor row in §1) reads. It tells the conductor +which **model** to call (§5), which **grader** + **data** to score against (§2/§3), where it runs, and where +to write results. It is *not* the grader's config (that's §2's gym server) — it's the run itself. + +Its placeholders are filled by the other steps — the grader port (§2), the data path (§3), and the model host (§5): +model = external endpoint (`type: api`), benchmark = the local gym grader, evaluator in-process +(`cluster: {type: local}`): + +```yaml +services: + model: + type: api + url: http://:8000/v1/chat/completions # from §5 (serve_host.txt) + model: + api_key: dummy + generation: + temperature: 1.0 + top_p: 0.95 + max_tokens: 32768 # reasoning model: big enough to finish reasoning + emit the answer +benchmarks: + # simple = single-shot (ifbench / ifeval); for agentic envs use `tool_calling` + a `sandbox:` block. + - name: "gym://127.0.0.1:?protocol=native&data=/.../tasks.jsonl" # from §2, data from §3 + repeats: 1 + max_concurrent: 64 + solver: {type: simple, service: model} +cluster: + type: local # evaluator runs in-process on the CPU node +output: + dir: +``` + +Full runnable config: [`recipes/examples/r030_gym.yaml`](../recipes/examples/r030_gym.yaml). + + +#### 5. 
Serve the model on a GPU node + +`type: api` means you serve the model yourself — a plain `vllm serve` in its own GPU `sbatch` job (the +vLLM **container**). Publish the node's hostname so the eval job can build `url`: + +```bash +#!/bin/bash +# serve.sbatch +#SBATCH --job-name=serve-policy +#SBATCH --partition= +#SBATCH --gres=gpu:4 +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --time=04:00:00 +#SBATCH --exclusive +hostname > /serve_host.txt # publish the node for the eval job +srun --container-image \ + --container-mounts :/model:ro,:/cache \ + bash -lc 'vllm serve /model --served-model-name --host 0.0.0.0 --port 8000 \ + --tensor-parallel-size 4 --max-model-len 262144 --trust-remote-code \ + --reasoning-parser

 <parser> --tool-call-parser <parser>

...' +``` + +- **`--host 0.0.0.0`** (not `127.0.0.1`) so the CPU eval node can reach it cross-node; the GPU node serves only the model. +- **`` is resolved at runtime** — you don't know the node until SLURM dispatches the job, so the serve job writes its `hostname` to a shared file and the eval job reads it (`POLICY_HOST=$(cat /serve_host.txt)`) to fill `url`. + + +#### 6. Run it — the two-job pattern + +From the login node you submit **two** jobs — `serve.sbatch` (§5) and `gym_eval.sbatch` (below): + +```bash +sbatch serve.sbatch # GPU: §5 — vllm serve (publishes serve_host.txt) +sbatch gym_eval.sbatch # CPU: the script below +``` + +`gym_eval.sbatch` bundles §2 (start grader + get port) + the host hand-off + the conductor (§4 config): + +```bash +#!/bin/bash +# gym_eval.sbatch +#SBATCH --job-name=gym-eval +#SBATCH --partition=cpu +#SBATCH --nodes=1 +#SBATCH --cpus-per-task=32 +#SBATCH --mem=128G +#SBATCH --time=02:00:00 +export RAY_TMPDIR=/tmp # Lustre socket-path fix +# 1. start the grader (§2) on this node, then wait for it to register + read its DYNAMIC port +cd # so --resources-server resolves +gym env start --resources-server & +PORT="" +for i in $(seq 1 60); do # server takes a few s to register + PORT=$(curl -s http://127.0.0.1:11000/server_instances \ + | python3 -c "import json,sys; d=json.load(sys.stdin); print(d[0]['url'].rsplit(':',1)[1] if d else '')") + [ -n "$PORT" ] && break; sleep 5 +done +[ -z "$PORT" ] && { echo "gym server did not come up"; exit 1; } +# 2. wait for the GPU serve (serve_host.txt + model live), then fill the §4 config placeholders +until [ -f /serve_host.txt ]; do sleep 5; done +POLICY_HOST=$(cat /serve_host.txt) +until curl -s --max-time 8 "http://$POLICY_HOST:8000/v1/models" | grep -q '"id"'; do sleep 10; done +sed -i "s##$POLICY_HOST#; s##$PORT#" config.yaml +# 3. 
run the conductor (§4) +nel eval run config.yaml +``` + +Submit `serve.sbatch` first (a plain `sbatch --dependency=after` only waits for the serve job to *start*, +not for vLLM to finish loading — that's why `gym_eval.sbatch` polls `…/v1/models` above before running). + +#### 7. Caveats + +- **Reasoning models need a big enough `generation.max_tokens`** — too small truncates mid-reasoning, the + grader sees empty output, and the score collapses. +- **`example.jsonl` is a small subset** — use `prepare.py` (§3) for the full benchmark. +- **Sharding:** a gym run has one shared grader, so it doesn't shard across nodes cleanly (loopback breaks) + — keep it single-node. +- **Why not "one GPU node, NEL serves the model" (`type: vllm` + `cluster: {type: local}`)?** NEL's local + path runs `python -m vllm` from the `nel` venv (it ignores the container `image:`), and that venv has no + vLLM — so it errors at model startup. Serving needs the vLLM container, which is why §5 is a separate job. + + +### legacy container:// + +Backward-compat path for v0.2.x's eval-factory `*_aa_v3` container tasks that have no native 0.3.x +benchmark. The `container` solver injects a v1-format `run_config.yaml`, so v1 configs port with +minimal change — but **`cluster.shards:` is unsupported** for legacy container runs (single-node only). + +```yaml +benchmarks: + - name: mmlu_pro_aa_v3 # v0.2.x eval-factory container task + solver: {type: container, service: model} + +cluster: + # shards: N # UNSUPPORTED for legacy container — single-node only (omit it) + # ... + type/account/walltime/eval_image/node_pools +``` + +Prefer the native built-in / `skills://` equivalent (`mmlu_pro`, `gpqa`) when one exists — you get +sharding + the one eval image. Use `container://` only when there is no native port. + + +## Export results to MLFlow + +SLURM runs do **not** auto-export (the `output.export` block only fires for local runs). 
After all +shards finish, merge then export from the login node: + +```bash +# RUN = the run dir: / (one timestamped subdir per run; holds shard_0/ ... shard_N/) +RUN=/ + +# 1. merge the N shards -> writes the merged per-benchmark bundles BACK INTO $RUN (alongside the shards) +nel eval merge "$RUN" + +# 2. export each merged benchmark bundle ($RUN/) to MLflow +nel export "$RUN/gpqa" "$RUN/mmlu_pro" "$RUN/skills___aime25" --dest mlflow \ + -o tracking_uri= \ + -o experiment_name= +``` + +Both commands use the **same** run dir (`/`): merge reads its `shard_*/` +subdirectories and writes the merged `/` bundles next to them, which export then consumes. Tags +from `output.export_config.mlflow.tags` (e.g. `model`, `checkpoint_path`, `num_nodes`, `precision`, +`variant`) ride along and feed your MLflow dashboards / downstream viewers. The `/` in a +`skills://` name becomes `___` in the on-disk bundle dir (`skills___aime25`). From 4b4f4ee6d97b4300a8dd629fb20542cca166e644 Mon Sep 17 00:00:00 2001 From: Hung-Yueh Chiang Date: Sat, 27 Jun 2026 23:00:38 -0700 Subject: [PATCH 2/7] =?UTF-8?q?docs(eval):=20gym=20caveats=20=E2=80=94=20e?= =?UTF-8?q?nable=5Fthinking=20via=20proxy.extra=5Fbody=20+=20manual=20resu?= =?UTF-8?q?me?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - §7: how to enable reasoning (`chat_template_kwargs: {enable_thinking: true}` under services.model.proxy.extra_body) — a model setting, applies to native runs too. - §7: clarify that cluster:{type: local} has no auto_resume (slurm-only); recover a killed gym run by re-submitting with `nel eval run --resume` (progress is checkpointed). 
Signed-off-by: Hung-Yueh Chiang Co-Authored-By: Claude Opus 4.8 (1M context) --- .../skills/evaluation/references/nel-v0.3.0-migration.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.agents/skills/evaluation/references/nel-v0.3.0-migration.md b/.agents/skills/evaluation/references/nel-v0.3.0-migration.md index 5e503b4c3f6..46cc895ce41 100644 --- a/.agents/skills/evaluation/references/nel-v0.3.0-migration.md +++ b/.agents/skills/evaluation/references/nel-v0.3.0-migration.md @@ -539,6 +539,13 @@ not for vLLM to finish loading — that's why `gym_eval.sbatch` polls `…/v1/mo - **Reasoning models need a big enough `generation.max_tokens`** — too small truncates mid-reasoning, the grader sees empty output, and the score collapses. +- **Reasoning models: enable thinking via `proxy.extra_body`** — pass `chat_template_kwargs: {enable_thinking: + true}` under `services.model.proxy.extra_body` (it's merged into every request) so the chat template emits + the reasoning block. This is a *model* setting (applies to native runs too), not gym-specific. +- **Resuming a killed run is manual.** `cluster: {type: local}` does not auto-restart after a SLURM + wall-clock kill (NEL's `auto_resume` only chains `slurm`-cluster jobs). Finished rollouts are + checkpointed, though — just re-submit the job with `nel eval run --resume` and it skips the done ones + and continues. - **`example.jsonl` is a small subset** — use `prepare.py` (§3) for the full benchmark. - **Sharding:** a gym run has one shared grader, so it doesn't shard across nodes cleanly (loopback breaks) — keep it single-node. 
From 465bce79a9c20d4ee120fb02b020b3c1393ada75 Mon Sep 17 00:00:00 2001 From: Hung-Yueh Chiang Date: Sat, 27 Jun 2026 23:03:04 -0700 Subject: [PATCH 3/7] docs(eval): native (cluster:slurm) auto-resumes; gym (local) is manual --resume MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Clarify in the native section that built-in/skills runs use cluster:{type: slurm} (auto_resume: true) so NEL chains a successor job on a wall-clock kill — no manual step — in contrast to gym://'s cluster:{type: local} (manual `nel eval run --resume`). Signed-off-by: Hung-Yueh Chiang Co-Authored-By: Claude Opus 4.8 (1M context) --- .agents/skills/evaluation/references/nel-v0.3.0-migration.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.agents/skills/evaluation/references/nel-v0.3.0-migration.md b/.agents/skills/evaluation/references/nel-v0.3.0-migration.md index 46cc895ce41..b5496997ae4 100644 --- a/.agents/skills/evaluation/references/nel-v0.3.0-migration.md +++ b/.agents/skills/evaluation/references/nel-v0.3.0-migration.md @@ -352,6 +352,11 @@ sampling (`temperature`, `top_p`, `max_tokens`) lives under `services.model.gene config: [`recipes/examples/r030_example_eval.yaml`](../recipes/examples/r030_example_eval.yaml) (simple / no-tools). For a tool-calling benchmark see the `gym://` example below. +Because the native path uses `cluster: {type: slurm}` (`auto_resume: true` by default), NEL **auto-resumes** +these runs on a wall-clock kill — it chains a successor job, no manual step. (This differs from `gym://`, +which runs under `cluster: {type: local}` and must be resumed by hand with `nel eval run --resume` — see the +gym `§7` caveats.) 
+ ### gym:// (server + reward) `gym://` runs a NeMo-Gym **resource server** that scores each response — the faithful path for the `†` From bb53eaebd1864aaaeb69c35e643015bd00c59854 Mon Sep 17 00:00:00 2001 From: Hung-Yueh Chiang Date: Sat, 27 Jun 2026 23:23:23 -0700 Subject: [PATCH 4/7] =?UTF-8?q?docs(eval):=20gym=20=C2=A72=20=E2=80=94=20d?= =?UTF-8?q?ocument=20bundled-config=20instances=20+=20keep/drop=20rule?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Explain that a NeMo-Gym bundled config (resources_servers//configs/.yaml) packs multiple server instances: the resources_servers grader, plus a sample responses_api_agents agent that references an undefined responses_api_models. Add a component table (linked to upstream ifbench.yaml) and a keep/drop rule: NEL native needs only the grader; the agent is the gym-driven Responses-API path. Note that --resources-server / --config .yaml start the whole file and fail on the undefined model, so point --config at a grader-only config. Align §6 sbatch to match. Signed-off-by: Hung-Yueh Chiang Co-Authored-By: Claude Opus 4.8 (1M context) --- .../references/nel-v0.3.0-migration.md | 36 +++++++++++++++---- 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/.agents/skills/evaluation/references/nel-v0.3.0-migration.md b/.agents/skills/evaluation/references/nel-v0.3.0-migration.md index b5496997ae4..03a84a84f41 100644 --- a/.agents/skills/evaluation/references/nel-v0.3.0-migration.md +++ b/.agents/skills/evaluation/references/nel-v0.3.0-migration.md @@ -387,14 +387,36 @@ First install NeMo-Gym (the `gym`/`ng` CLI) — clone [NVIDIA-NeMo/Gym](https:// and `uv venv --python 3.12 && uv sync`; see the repo README. The grader is **not** started by NEL — you launch it on the **CPU node** (same node as the evaluator, so -it's reachable at `127.0.0.1`). 
Use **`--resources-server `** to bring up *only* the grader: a -benchmark's bundled config also defines an **agent** that needs its own model server, which you don't want -here (NEL is the conductor and serves its own model). It registers with a head process on `:11000` and -gets a **dynamic** port: +it's reachable at `127.0.0.1`). A benchmark's bundled config (`resources_servers//configs/.yaml`) +usually packs **more than one server instance** into one file — you only start one of them. For +[ifbench](https://github.com/NVIDIA-NeMo/Gym/blob/main/resources_servers/ifbench/configs/ifbench.yaml): + +```text +top-level key in .yaml kind role NEL native: start it? +------------------------------ -------------------- ------------------------------- ---------------------- +ifbench resources_servers the GRADER — scores via /verify KEEP (all NEL needs) +ifbench_simple_agent responses_api_agents a sample gym-driven agent loop DROP + └ model_server: policy_model responses_api_models the model that agent would run (placeholder name only; + (referenced, NOT — gym-driven path, not NEL not defined in the file) + defined in the file) +``` + +**Keep / drop — which and when:** +- **NEL `gym://…?protocol=native` (this guide)** — NEL *is* the conductor: it calls the model itself, then + posts each answer to the grader's `/verify`. Start **only the `resources_servers` grader** (`ifbench`); + drop the agent. +- **gym-driven (Responses API)** — gym runs the `*_simple_agent` itself and needs a real + `responses_api_models` wired in (the bundle ships `policy_model` as an unresolved placeholder). Different + path; not what NEL native uses. + +`gym env start --resources-server ` (and `--config .yaml`) load the **whole** bundled file, so +they also try to start `*_simple_agent` and **fail** on the undefined `policy_model`. To bring up only the +grader, point `--config` at a config that keeps **just** the `resources_servers:` block (drop +`responses_api_agents:`). 
The grader registers with a head process on `:11000` and gets a **dynamic** port: ```bash export RAY_TMPDIR=/tmp # Lustre socket-path fix -gym env start --resources-server & # e.g. --resources-server ifbench +gym env start --config resources_servers//configs/.yaml & # "&" runs the grader in the background; the config must keep only resources_servers: (drop the agent) curl -s http://127.0.0.1:11000/server_instances | python3 -m json.tool # -> [{"name": .., "url": ".../"}] ``` @@ -519,8 +541,8 @@ sbatch gym_eval.sbatch # CPU: the script below #SBATCH --time=02:00:00 export RAY_TMPDIR=/tmp # Lustre socket-path fix # 1. start the grader (§2) on this node, then wait for it to register + read its DYNAMIC port -cd # so --resources-server resolves -gym env start --resources-server & +cd +gym env start --config & # keep only resources_servers:, drop the agent (§2) PORT="" for i in $(seq 1 60); do # server takes a few s to register PORT=$(curl -s http://127.0.0.1:11000/server_instances \ From 7abb8e9e065a5a490d93d9c0dfa735cd8409caa8 Mon Sep 17 00:00:00 2001 From: Hung-Yueh Chiang Date: Sat, 27 Jun 2026 23:25:08 -0700 Subject: [PATCH 5/7] =?UTF-8?q?docs(eval):=20gym=20=C2=A76=20=E2=80=94=20m?= =?UTF-8?q?atch=20=C2=A72's=20grader=20start=20line=20+=20comment?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Align the §6 sbatch grader command with §2 (same --config resources_servers//configs/.yaml line and the "&" = background / keep resources_servers, drop the agent comment) so both code blocks are consistent. 
Signed-off-by: Hung-Yueh Chiang Co-Authored-By: Claude Opus 4.8 (1M context) --- .agents/skills/evaluation/references/nel-v0.3.0-migration.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.agents/skills/evaluation/references/nel-v0.3.0-migration.md b/.agents/skills/evaluation/references/nel-v0.3.0-migration.md index 03a84a84f41..9fa44383347 100644 --- a/.agents/skills/evaluation/references/nel-v0.3.0-migration.md +++ b/.agents/skills/evaluation/references/nel-v0.3.0-migration.md @@ -542,7 +542,8 @@ sbatch gym_eval.sbatch # CPU: the script below export RAY_TMPDIR=/tmp # Lustre socket-path fix # 1. start the grader (§2) on this node, then wait for it to register + read its DYNAMIC port cd -gym env start --config & # keep only resources_servers:, drop the agent (§2) +# Launch gym grader in background by using "&". Keep only resources_servers and drop the agent (§2) as needed +gym env start --config resources_servers//configs/.yaml & PORT="" for i in $(seq 1 60); do # server takes a few s to register PORT=$(curl -s http://127.0.0.1:11000/server_instances \ From 5314022f6f1e06c541250852acd40820cf3d09c1 Mon Sep 17 00:00:00 2001 From: Hung-Yueh Chiang Date: Sat, 27 Jun 2026 23:54:48 -0700 Subject: [PATCH 6/7] =?UTF-8?q?docs(eval):=20gym=20=C2=A77=20resume=20?= =?UTF-8?q?=E2=80=94=20two-step=20procedure=20+=20verified-cache=20note?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rewrite the gym "Resume the jobs" section into a clean two-step procedure (relaunch the serve, then resume the eval with --export=ALL,RESUME=--resume) and thread ${RESUME} into the §6 conductor line so the command works. Add that completed rollouts survive the resume even though the new serve node changes the model URL: NEL keys the skip on its verified log, which is retained on a config-hash change (only the un-scored inference cache is dropped). 
Tighten the native auto-resume paragraph and fix its cross-reference, and clarify the single-node/local sharding note in the §4 config snippet. Signed-off-by: Hung-Yueh Chiang Co-Authored-By: Claude Opus 4.8 (1M context) --- .../references/nel-v0.3.0-migration.md | 40 +++++++++++++------ 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/.agents/skills/evaluation/references/nel-v0.3.0-migration.md b/.agents/skills/evaluation/references/nel-v0.3.0-migration.md index 9fa44383347..f25ba5c93c8 100644 --- a/.agents/skills/evaluation/references/nel-v0.3.0-migration.md +++ b/.agents/skills/evaluation/references/nel-v0.3.0-migration.md @@ -352,10 +352,10 @@ sampling (`temperature`, `top_p`, `max_tokens`) lives under `services.model.gene config: [`recipes/examples/r030_example_eval.yaml`](../recipes/examples/r030_example_eval.yaml) (simple / no-tools). For a tool-calling benchmark see the `gym://` example below. -Because the native path uses `cluster: {type: slurm}` (`auto_resume: true` by default), NEL **auto-resumes** -these runs on a wall-clock kill — it chains a successor job, no manual step. (This differs from `gym://`, -which runs under `cluster: {type: local}` and must be resumed by hand with `nel eval run --resume` — see the -gym `§7` caveats.) +The native path runs under `cluster: {type: slurm}` with `auto_resume: true` (the default), so NEL +**auto-resumes** a wall-clock kill itself — it chains a successor SLURM job, no manual step. (A `gym://` run +is different: it runs under `cluster: {type: local}`, which has no `auto_resume`, so you resume it by hand — +see the gym section's "Resume the jobs".) ### gym:// (server + reward) @@ -484,7 +484,8 @@ benchmarks: max_concurrent: 64 solver: {type: simple, service: model} cluster: - type: local # evaluator runs in-process on the CPU node + # gym's single grader lives on 127.0.0.1, so a gym run can't shard across nodes — keep it single-node. 
+ type: local # evaluator runs in-process on the CPU node (no shards) output: dir: ``` @@ -556,24 +557,39 @@ until [ -f /serve_host.txt ]; do sleep 5; done POLICY_HOST=$(cat /serve_host.txt) until curl -s --max-time 8 "http://$POLICY_HOST:8000/v1/models" | grep -q '"id"'; do sleep 10; done sed -i "s##$POLICY_HOST#; s##$PORT#" config.yaml -# 3. run the conductor (§4) -nel eval run config.yaml +# 3. run the conductor (§4); ${RESUME} is empty on a fresh run, "--resume" when resuming (§7) +nel eval run config.yaml ${RESUME:-} ``` Submit `serve.sbatch` first (a plain `sbatch --dependency=after` only waits for the serve job to *start*, not for vLLM to finish loading — that's why `gym_eval.sbatch` polls `…/v1/models` above before running). -#### 7. Caveats +#### 7. Resume the jobs + +Resuming after a wall-clock kill is **manual** here. `cluster: {type: local}` has no `auto_resume` (that only +chains `slurm`-cluster jobs — see the native section), so neither job restarts itself, and the GPU serve +job dies too. Before resubmitting, delete the old `serve_host.txt` and restore the placeholder copy of `config.yaml`: `gym_eval.sbatch` (§6) accepts whatever `serve_host.txt` already exists and has already rewritten `config.yaml` in place (`sed -i`), so leftovers would pin the resumed run to the previous host and grader port. A resume is then **two steps** — relaunch the serve, then resume the eval: + +```bash +# 1. relaunch the GPU serve — it rewrites serve_host.txt with the new node's hostname +sbatch serve.sbatch + +# 2. resume the eval — it waits for the fresh serve_host.txt + /v1/models, then continues +sbatch --export=ALL,RESUME=--resume gym_eval.sbatch # RESUME -> nel eval run … --resume (§6 step 3) +``` + +Completed rollouts are checkpointed per `(problem, repeat)`, so the resume **skips everything already +scored** and re-runs only what was in flight — even though the new serve node changes the model URL. (NEL +keys the skip on its *verified* log, which survives the config-hash change; only the un-scored inference +cache is dropped and regenerated.) + +#### 8. Caveats - **Reasoning models need a big enough `generation.max_tokens`** — too small truncates mid-reasoning, the grader sees empty output, and the score collapses. 
- **Reasoning models: enable thinking via `proxy.extra_body`** — pass `chat_template_kwargs: {enable_thinking: true}` under `services.model.proxy.extra_body` (it's merged into every request) so the chat template emits the reasoning block. This is a *model* setting (applies to native runs too), not gym-specific. -- **Resuming a killed run is manual.** `cluster: {type: local}` does not auto-restart after a SLURM - wall-clock kill (NEL's `auto_resume` only chains `slurm`-cluster jobs). Finished rollouts are - checkpointed, though — just re-submit the job with `nel eval run --resume` and it skips the done ones - and continues. - **`example.jsonl` is a small subset** — use `prepare.py` (§3) for the full benchmark. - **Sharding:** a gym run has one shared grader, so it doesn't shard across nodes cleanly (loopback breaks) — keep it single-node. From 835db3f082ce0480ddfefd17cdb3af416ed07381 Mon Sep 17 00:00:00 2001 From: Hung-Yueh Chiang Date: Sun, 28 Jun 2026 01:10:04 -0700 Subject: [PATCH 7/7] =?UTF-8?q?docs(eval):=20export=20=E2=80=94=20require?= =?UTF-8?q?=20a=20model=20tag=20for=20run=20attribution=20+=20tags=20examp?= =?UTF-8?q?le?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a reminder in "Export results to MLflow" that downstream MLflow dashboards key/group runs by the `model` (and `checkpoint_path`) tag: a run exported without them can't be attributed (shows as an orphan/"no checkpoint" row), and a drifting `model` value splits one logical row. Include a concrete `output.export_config.mlflow.tags` YAML example. 
Signed-off-by: Hung-Yueh Chiang Co-Authored-By: Claude Opus 4.8 (1M context) --- .../references/nel-v0.3.0-migration.md | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/.agents/skills/evaluation/references/nel-v0.3.0-migration.md b/.agents/skills/evaluation/references/nel-v0.3.0-migration.md index f25ba5c93c8..56a15fd7a7c 100644 --- a/.agents/skills/evaluation/references/nel-v0.3.0-migration.md +++ b/.agents/skills/evaluation/references/nel-v0.3.0-migration.md @@ -641,3 +641,30 @@ subdirectories and writes the merged `/` bundles next to them, which expo from `output.export_config.mlflow.tags` (e.g. `model`, `checkpoint_path`, `num_nodes`, `precision`, `variant`) ride along and feed your MLflow dashboards / downstream viewers. The `/` in a `skills://` name becomes `___` in the on-disk bundle dir (`skills___aime25`). + +> **Always set a stable `model` (and ideally `checkpoint_path`) tag.** Dashboards key/group runs by +> these tags to attribute a score to a model or checkpoint. A run exported **without** them can't be +> attributed — it shows up as an orphan/"no checkpoint" row instead of landing on the right model — and +> a `model` value that drifts between runs (or differs from a baseline's) splits what should be one row. +> The engine logs only a generic headline metric (`pass_at_1` / `mean_reward`); the benchmark identity +> rides in the `benchmark` tag and the model identity in `model`, so both tags are what make the export +> usable downstream. + +Example `output` block with the tags set (lives in the run config; `nel export` also accepts the same +tracking URI / experiment via `-o`): + +```yaml +output: + dir: + export: [mlflow] + export_config: + mlflow: + tracking_uri: + experiment_name: + tags: + model: # set this — dashboards group/attribute runs by it + checkpoint_path: # the per-checkpoint row label downstream + benchmark: # benchmark identity (the engine logs only a generic metric key) + precision: bf16 + variant: # e.g. 
base / with-tools / 96k-thinking +```