Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 22 additions & 0 deletions tools/launcher/common/smoke/nvidia_smi.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Stop at the first failing command, on any unset variable, and when any
# stage of a pipeline fails, so a broken probe fails the whole job.
set -o errexit -o nounset -o pipefail

# The START/DONE sentinel lines bracket the probe output. Because errexit is
# on, DONE is printed only when both probes below exited successfully.
printf '%s\n' "MODEL_OPT_SMOKE_START"
hostname    # record which node the job landed on
nvidia-smi  # exits non-zero when the driver/GPU is not reachable
printf '%s\n' "MODEL_OPT_SMOKE_DONE"
91 changes: 70 additions & 21 deletions tools/launcher/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,9 +75,19 @@ def set_slurm_config_type(cls):
"""Register the SlurmConfig dataclass type used by SandboxTask."""
global _SLURM_CONFIG_TYPE
_SLURM_CONFIG_TYPE = cls
# Patch SandboxTask's type annotation so nemo-run's CLI parser can resolve factories
SandboxTask.__dataclass_fields__["slurm_config"].type = cls
SandboxTask.__annotations__["slurm_config"] = cls
# Patch every task dataclass so nemo-run's CLI parser sees the concrete
# SlurmConfig type for task_0/task_1/... fields, not the base `object`.
for task_cls in (
SandboxTask,
SandboxTask0,
SandboxTask1,
SandboxTask2,
SandboxTask3,
SandboxTask4,
):
task_cls.__dataclass_fields__["slurm_config"].type = cls
task_cls.__annotations__["slurm_config"] = cls
task_cls.__init__.__annotations__["slurm_config"] = cls


def register_factory(name, fn):
Expand Down Expand Up @@ -386,25 +396,64 @@ def build_docker_executor(

def _git_info(path):
    """Return ``(short_commit, branch)`` for the git checkout containing ``path``.

    The repository metadata is read directly from disk, so no ``git`` binary is
    needed. ``path`` may be the checkout root or any directory nested inside it.
    Regular checkouts and checkouts whose ``.git`` is a ``gitdir:`` pointer file
    (linked worktrees, submodules) are both handled.

    Args:
        path: Directory at, or below, the root of a git checkout.

    Returns:
        A ``(commit, branch)`` tuple of strings:

        * ``commit`` is the first 7 characters of the HEAD object id.
        * ``branch`` is the HEAD ref with ``refs/heads/`` stripped, or ``"HEAD"``
          when HEAD is detached.
        * ``("unknown", "unknown")`` when no repository is found, or its
          metadata is unreadable or malformed.
        * ``("unknown", branch)`` when the branch is known but its ref cannot be
          resolved to an object id.
    """
    try:
        git_dir = _find_git_dir(path)
        if git_dir is None:
            return "unknown", "unknown"

        with open(os.path.join(git_dir, "HEAD"), encoding="utf-8") as file:
            head = file.read().strip()

        if not head.startswith("ref:"):
            # A detached HEAD stores the object id itself; anything else
            # (empty file, garbage) is treated as unreadable metadata.
            if not _is_object_id(head):
                return "unknown", "unknown"
            return head[:7], "HEAD"

        ref = head.removeprefix("ref:").strip()
        commit = _resolve_git_ref(git_dir, ref)
    except (OSError, UnicodeDecodeError):
        # Best effort only: unreadable or non-UTF-8 metadata must not crash the caller.
        return "unknown", "unknown"

    return (commit[:7] if commit else "unknown"), ref.removeprefix("refs/heads/")


def _is_object_id(value):
    """Return True when ``value`` is a non-empty string of hexadecimal digits."""
    return bool(value) and all(char in "0123456789abcdef" for char in value.lower())


def _find_git_dir(path):
    """Walk up from ``path`` and return the enclosing git directory, or None."""
    worktree_dir = os.path.abspath(path)
    while True:
        git_path = os.path.join(worktree_dir, ".git")
        if os.path.isdir(git_path):
            return git_path
        if os.path.isfile(git_path):
            # Worktrees and submodules use a pointer file: "gitdir: <path>".
            with open(git_path, encoding="utf-8") as file:
                marker = file.read().strip()
            if not marker.startswith("gitdir:"):
                return None
            git_dir = marker.removeprefix("gitdir:").strip()
            if not os.path.isabs(git_dir):
                # Relative pointers are relative to the directory holding the file.
                git_dir = os.path.normpath(os.path.join(worktree_dir, git_dir))
            return git_dir

        parent = os.path.dirname(worktree_dir)
        if parent == worktree_dir:
            # Reached the filesystem root without finding a repository.
            return None
        worktree_dir = parent


def _git_common_dir(git_dir):
    """Return the directory holding the refs shared by all worktrees of ``git_dir``."""
    commondir_path = os.path.join(git_dir, "commondir")
    if not os.path.isfile(commondir_path):
        return git_dir
    with open(commondir_path, encoding="utf-8") as file:
        common_dir = file.read().strip()
    if not os.path.isabs(common_dir):
        common_dir = os.path.normpath(os.path.join(git_dir, common_dir))
    return common_dir


def _resolve_git_ref(git_dir, ref):
    """Return the object id ``ref`` points at, or ``""`` when it cannot be found."""
    common_dir = _git_common_dir(git_dir)

    # Loose refs first: the per-worktree directory, then the shared one.
    # dict.fromkeys drops the duplicate when both are the same directory.
    for refs_dir in dict.fromkeys((git_dir, common_dir)):
        ref_path = os.path.join(refs_dir, *ref.split("/"))
        # isfile (not exists): a stale directory with the same name must not
        # hide a ref that lives in packed-refs.
        if os.path.isfile(ref_path):
            with open(ref_path, encoding="utf-8") as file:
                value = file.read().strip()
            if _is_object_id(value):
                return value

    packed_refs = os.path.join(common_dir, "packed-refs")
    if os.path.isfile(packed_refs):
        with open(packed_refs, encoding="utf-8") as file:
            for line in file:
                # "#" is the header line, "^" is a peeled-tag continuation line.
                if line.startswith(("#", "^")):
                    continue
                sha, _, packed_ref = line.strip().partition(" ")
                if packed_ref == ref and _is_object_id(sha):
                    return sha
    return ""


Expand Down
21 changes: 21 additions & 0 deletions tools/launcher/examples/smoke/nvidia_smi.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Minimal Slurm smoke test for launcher/MCP integration.
#
# This intentionally avoids model downloads and HF cache mounts. It verifies:
# MCP submit_job -> launcher YAML parse -> Slurm submit -> container start -> GPU visibility.

job_name: nvidia_smi_smoke
pipeline:
skip: false
allow_to_fail: false
note: "Slurm container GPU smoke test"

task_0:
script: common/smoke/nvidia_smi.sh
slurm_config:
_factory_: "slurm_factory"
nodes: 1
ntasks_per_node: 1
gpus_per_node: 1
time: "00:10:00"
container: nvcr.io/nvidia/cuda:12.4.1-base-ubuntu22.04
container_mounts: []
46 changes: 32 additions & 14 deletions tools/launcher/launch.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,9 @@
"""

import getpass
import glob
import os
import subprocess # nosec B404
import subprocess # nosec B404 - required for explicit git clean command; no shell is used.
import warnings

import modelopt_launcher as _pkg
Expand Down Expand Up @@ -79,20 +80,33 @@
EXPERIMENT_TITLE = "cicd"
DEFAULT_SLURM_ENV, DEFAULT_LOCAL_ENV = get_default_env(EXPERIMENT_TITLE)

_include_pattern = ["examples/*", "common/*"]
_relative_path = [LAUNCHER_DIR, LAUNCHER_DIR]
_include_pattern = []
_relative_path = []


def _add_package_path(path: str) -> None:
    """Register ``path`` for packaging with ``LAUNCHER_DIR`` as its tar root.

    A path that does not exist on disk is skipped silently. Otherwise one entry
    is appended to each of ``_include_pattern`` and ``_relative_path``, keeping
    the two lists index-aligned.
    """
    if not os.path.exists(path):
        return
    _include_pattern.append(path)
    _relative_path.append(LAUNCHER_DIR)


def _add_package_glob(pattern: str) -> None:
    """Register every path matching ``pattern`` for packaging.

    Matches are processed in sorted order so the resulting package list does
    not depend on the order in which the filesystem returns them.
    """
    matches = glob.glob(pattern)
    matches.sort()
    for match in matches:
        _add_package_path(match)


_add_package_path(os.path.join(LAUNCHER_DIR, "examples"))
_add_package_path(os.path.join(LAUNCHER_DIR, "common"))

if _has_modelopt_src:
_include_pattern = [
"modules/Megatron-LM/megatron/*",
"modules/Megatron-LM/examples/*",
"modules/Megatron-LM/*.py",
"modules/Model-Optimizer/modelopt/*",
"modules/Model-Optimizer/modelopt_recipes/*",
"modules/Model-Optimizer/examples/*",
*_include_pattern,
]
_relative_path = [LAUNCHER_DIR] * 6 + _relative_path
_add_package_path(os.path.join(LAUNCHER_DIR, "modules/Megatron-LM/megatron"))
_add_package_path(os.path.join(LAUNCHER_DIR, "modules/Megatron-LM/examples"))
_add_package_glob(os.path.join(LAUNCHER_DIR, "modules/Megatron-LM/*.py"))
_add_package_path(os.path.join(LAUNCHER_DIR, "modules/Model-Optimizer/modelopt"))
_add_package_path(os.path.join(LAUNCHER_DIR, "modules/Model-Optimizer/modelopt_recipes"))
_add_package_path(os.path.join(LAUNCHER_DIR, "modules/Model-Optimizer/examples"))

packager = run.PatternPackager(
include_pattern=_include_pattern,
Expand Down Expand Up @@ -127,7 +141,11 @@ def launch(
raise ValueError("--clean requires a dev checkout; modelopt source not found.")
examples_dir = os.path.join(_mo_symlink, "examples")
print(f"Cleaning {examples_dir} with git clean -xdf ...")
subprocess.run(["git", "clean", "-xdf", "."], cwd=examples_dir, check=True) # nosec B603 B607
subprocess.run( # nosec B603 B607 - fixed git CLI argv; no shell.
["git", "clean", "-xdf", "."],
cwd=examples_dir,
check=True,
)

if "NEMORUN_HOME" not in os.environ:
warnings.warn("NEMORUN_HOME is not set. Defaulting to current working directory.")
Expand Down
16 changes: 14 additions & 2 deletions tools/launcher/tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,9 @@
SandboxTask,
SandboxTask0,
SandboxTask1,
SandboxTask2,
SandboxTask3,
SandboxTask4,
create_task_from_yaml,
get_default_env,
register_factory,
Expand Down Expand Up @@ -185,8 +188,17 @@ class MockSlurmConfig:
host: str = "test"

set_slurm_config_type(MockSlurmConfig)
assert SandboxTask.__annotations__["slurm_config"] is MockSlurmConfig
assert SandboxTask.__dataclass_fields__["slurm_config"].type is MockSlurmConfig
for task_cls in (
SandboxTask,
SandboxTask0,
SandboxTask1,
SandboxTask2,
SandboxTask3,
SandboxTask4,
):
assert task_cls.__annotations__["slurm_config"] is MockSlurmConfig
assert task_cls.__dataclass_fields__["slurm_config"].type is MockSlurmConfig
assert task_cls.__init__.__annotations__["slurm_config"] is MockSlurmConfig


class TestGetDefaultEnv:
Expand Down
6 changes: 6 additions & 0 deletions tools/launcher/tests/test_core_extended.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,12 @@ def test_valid_git_repo(self):
assert branch != "unknown"
assert len(commit) >= 7 # short hash

def test_valid_git_repo_from_nested_directory(self):
    """_git_info resolves the repository when given a directory below its root."""
    # Anchor on this test file's own directory instead of the process cwd, so
    # the probed path is a nested checkout directory wherever pytest is started.
    nested_dir = os.path.dirname(os.path.abspath(__file__))
    commit, branch = _git_info(nested_dir)
    assert commit != "unknown"
    assert branch != "unknown"
    assert len(commit) >= 7  # short hash

def test_nonexistent_directory(self):
commit, branch = _git_info("/tmp/nonexistent_xyz_12345")
assert commit == "unknown"
Expand Down
30 changes: 27 additions & 3 deletions tools/mcp/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ Mode is determined by which args you pass, not by which tool you call. One tool,
|---|---|
| `list_examples` | Enumerate bundled launcher YAMLs under `tools/launcher/examples/` with model + description metadata extracted from each YAML. Discovery primitive — call this first when you don't know which YAML to launch. |
| `verify_setup(executor, ...)` | Fail-fast probe for the named executor. Docker: `docker info` (daemon up) + `docker info --format` runtime-registry check (looks for `"nvidia"` runtime registered by the NVIDIA Container Toolkit — no image pull, daemon-fast). Slurm: `ssh -o BatchMode=yes -o ConnectTimeout=5` to the cluster login node. Returns structured failure on auth / network / daemon issues — no exception. |
| `submit_job(yaml_path, hf_local? \| cluster_host?, ..., dry_run?)` | Submit a launcher YAML. Mode resolved from mutually-exclusive args. Returns `experiment_id` (Slurm) or PID (Docker) immediately; the actual job runs detached. Auto-runs `verify_setup` first by default (skippable). **Pass `dry_run=True`** to validate the YAML via `launch.py --dryrun --yes` without contacting the cluster / spawning a container / running sbatch — returns `{ok, dry_run: True, validated: bool, diagnostic?, exit_code, stdout_tail, stderr_tail, argv}` instead of `experiment_id`. Used by verify-task workflow stages (deployment_support, hidden_state_dump_support, mlm_eval, ...). |
| `submit_job(yaml_path, hf_local? \| cluster_host?, ..., dry_run?, source_ref?, source_repo?)` | Submit a launcher YAML. Mode resolved from mutually-exclusive args. Before launching, materializes a managed Model-Optimizer checkout at `source_ref` (branch, tag, or SHA; default `main`) and initializes recursive submodules, then runs that checkout's launcher. Returns `experiment_id` (Slurm) or PID (Docker) immediately; the actual job runs detached. Auto-runs `verify_setup` first by default (skippable). **Pass `dry_run=True`** to validate the YAML via `launch.py --dryrun --yes` without contacting the cluster / spawning a container / running sbatch — returns `{ok, dry_run: True, validated: bool, diagnostic?, exit_code, stdout_tail, stderr_tail, argv, source_sha, source_root}` instead of `experiment_id`. Used by verify-task workflow stages (deployment_support, hidden_state_dump_support, mlm_eval, ...). |
| `job_status(experiment_id)` | Filesystem-based status from nemo_run's experiment dir (`_DONE`, `status_*.out`). Returns `done` / `failed` / `running` plus per-task statuses. No in-memory registry; survives MCP server restarts. |
| `job_logs(experiment_id, task?, tail?)` | Read `log_<task>.out` from the experiment dir. Per-task filtering + optional tail to truncate. |
| `wait_for_experiment(experiment_id, timeout_sec?, poll_interval_sec?)` | Block until `job_status` returns `done` / `failed`, or until `timeout_sec` elapses. Single tool call replaces the agent's `while True: status; sleep` loop — saves tool-call turns and avoids overshooting the poll interval. Returns the final status plus `waited_seconds`. |
Expand All @@ -47,7 +47,7 @@ codex mcp add modelopt -- uvx --from \
modelopt-mcp
```

`uvx` clones the whole repo to its cache, installs `tools/mcp/` as the entry point, and resolves the sibling `modelopt-launcher` dep via `[tool.uv.sources]` (path → `../launcher`) inside the cloned tree.
`uvx` clones the whole repo to its cache, installs `tools/mcp/` as the entry point, and resolves the sibling `modelopt-launcher` dep via `[tool.uv.sources]` (path → `../launcher`) inside the cloned tree. That install clone is only the server runtime; job submission uses the managed source checkout described below.

### Dev install (local checkout)

Expand All @@ -59,6 +59,24 @@ modelopt-mcp # stdio server entry on PATH

Both packages share the launcher's `core.py` orchestrator. The dev path relies on `[tool.uv.sources]` to point `modelopt-launcher` at `../launcher`.

## Managed source checkouts

`submit_job` does not rely on the uvx install clone or on the caller being inside a Model-Optimizer checkout. For each launch it picks the source ref from the first of the following that is set:

1. `source_ref` argument, if provided.
2. `MODELOPT_MCP_SOURCE_REF`, if set.
3. `main`.

It resolves that ref against `source_repo` / `MODELOPT_MCP_SOURCE_REPO` / `https://github.com/NVIDIA/Model-Optimizer.git`, creates a cached checkout under `MODELOPT_MCP_SOURCE_CACHE` (default `$XDG_CACHE_HOME/modelopt-mcp/sources` or `~/.cache/modelopt-mcp/sources`), and runs:

```bash
uv run --project <source_root>/tools/launcher modelopt-launcher --yaml <resolved-yaml> ...
```

The checkout is keyed by resolved commit SHA, so multiple agents using different branches or SHAs get separate source roots. Recursive submodules are initialized in the managed checkout, so launcher packagers can include `tools/launcher/modules/...` content even when MCP was installed outside a repo checkout.

Set `MODELOPT_MCP_DISABLE_MANAGED_SOURCE=1` only for local development when you deliberately want the already-installed `modelopt-launcher` entrypoint.

### Why no plain `pip install` today

`modelopt-mcp` and `modelopt-launcher` are not on PyPI. Plain `pip` doesn't read `[tool.uv.sources]`, so even from a local checkout, `pip install -e tools/mcp` fails to resolve the bare `modelopt-launcher` name. Stick with `uv` / `uvx` while we're git-only.
Expand Down Expand Up @@ -96,8 +114,9 @@ result = mcp__modelopt__submit_job(
cluster_user="alice",
identity="/home/alice/.ssh/id_ed25519",
skip_verify=True, # we just probed
source_ref="main", # optional; when omitted, falls back to MODELOPT_MCP_SOURCE_REF, then main
)
# {"ok": True, "experiment_id": "cicd_1781240000", "slurm_job_id": "12345", ...}
# {"ok": True, "experiment_id": "cicd_1781240000", "slurm_job_id": "12345", "source_sha": "...", ...}

# 4. Poll until done
while True:
Expand All @@ -122,6 +141,11 @@ For local Docker execution, drop `cluster_host`/`cluster_user`/`identity` and pa
| `NEMORUN_HOME` | submit + status + logs | Where the launcher writes experiment artifacts. Defaults to cwd if unset. `job_status` / `job_logs` search `$NEMORUN_HOME/experiments/<id>/`. |
| `MODELOPT_MCP_LOG` | (optional) server | Log level. Defaults to `INFO`. Logs go to stderr — stdout is the MCP wire. |
| `MODELOPT_MCP_SKIP_GPU_CHECK` | (optional) `verify_setup(executor='docker')` | Set to skip the `docker info --format` runtime-registry check. Useful for CI hosts where the daemon is up but the NVIDIA Container Toolkit isn't installed. |
| `MODELOPT_MCP_SOURCE_REPO` | (optional) `submit_job` | Default git repository for managed source checkouts. Defaults to `https://github.com/NVIDIA/Model-Optimizer.git`. |
| `MODELOPT_MCP_SOURCE_REF` | (optional) `submit_job` | Default branch, tag, or SHA when `source_ref` is omitted. Defaults to `main`. |
| `MODELOPT_MCP_SOURCE_CACHE` | (optional) `submit_job` | Root for managed source checkouts. Defaults to `$XDG_CACHE_HOME/modelopt-mcp/sources` or `~/.cache/modelopt-mcp/sources`. |
| `MODELOPT_MCP_DISABLE_MANAGED_SOURCE` | (optional) local dev | Set to `1` to skip managed checkout and invoke the installed `modelopt-launcher` entrypoint directly. |
| `MODELOPT_MCP_UV` | (optional) `submit_job` | Override the `uv` binary used for `uv run --project <source>/tools/launcher ...`. |
| `MODELOPT_LAUNCHER_EXAMPLES_DIR` | (optional) `list_examples` | Override the examples directory location. Defaults to `../launcher/examples/` relative to this package. |

## Design principles
Expand Down
Loading
Loading