NVIDIA · ChenhanYu · Jun 23, 2026 · Jun 23, 2026 · Jun 23, 2026 · Jun 23, 2026
@@ -0,0 +1,22 @@
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Smoke test: print the host name and the nvidia-smi GPU report between a
+# START marker line and a DONE marker line.
+
+# Abort on the first failing command, on unset variables, and on pipe failures.
+set -euo pipefail
+
+echo "MODEL_OPT_SMOKE_START"
+hostname
+nvidia-smi
+# Because of "set -e", this marker is only printed when both commands above
+# exited successfully.
+echo "MODEL_OPT_SMOKE_DONE"
@@ -75,9 +75,19 @@ def set_slurm_config_type(cls):
     """Register the SlurmConfig dataclass type used by SandboxTask."""
     global _SLURM_CONFIG_TYPE
     _SLURM_CONFIG_TYPE = cls
-    # Patch SandboxTask's type annotation so nemo-run's CLI parser can resolve factories
-    SandboxTask.__dataclass_fields__["slurm_config"].type = cls
-    SandboxTask.__annotations__["slurm_config"] = cls
+    # Patch every task dataclass so nemo-run's CLI parser sees the concrete
+    # SlurmConfig type for task_0/task_1/... fields, not the base `object`.
+    for task_cls in (
+        SandboxTask,
+        SandboxTask0,
+        SandboxTask1,
+        SandboxTask2,
+        SandboxTask3,
+        SandboxTask4,
+    ):
+        task_cls.__dataclass_fields__["slurm_config"].type = cls
+        task_cls.__annotations__["slurm_config"] = cls
+        task_cls.__init__.__annotations__["slurm_config"] = cls
 
 
 def register_factory(name, fn):
@@ -386,25 +396,64 @@ def build_docker_executor(
 
 def _git_info(path):
-    """Get git commit hash and branch for a directory."""
-    import subprocess  # nosec B404
-
+    """Return ``(short_commit, branch)`` for the git checkout containing *path*.
+
+    The repository files are read directly; no ``git`` subprocess is spawned.
+    Starting at *path*, parent directories are searched for a ``.git`` entry,
+    which may be a directory or a ``gitdir: <path>`` pointer file.
+
+    Args:
+        path: Location inside the checkout to inspect.
+
+    Returns:
+        A ``(commit, branch)`` tuple of strings. ``commit`` is the first seven
+        characters of the resolved hash. On a detached HEAD ``branch`` is
+        ``"HEAD"``. ``commit`` is ``"unknown"`` when the ref cannot be
+        resolved, and both values are ``"unknown"`` when no repository is
+        found or its files cannot be read or decoded.
+    """
     try:
-        commit = subprocess.run(  # nosec B603 B607
-            ["git", "rev-parse", "--short", "HEAD"],
-            cwd=path,
-            capture_output=True,
-            text=True,
-            timeout=5,
-        ).stdout.strip()
-        branch = subprocess.run(  # nosec B603 B607
-            ["git", "rev-parse", "--abbrev-ref", "HEAD"],
-            cwd=path,
-            capture_output=True,
-            text=True,
-            timeout=5,
-        ).stdout.strip()
-        return commit, branch
-    except Exception:
+        # Walk upward until a ".git" entry is found or the filesystem root is hit.
+        worktree_dir = os.path.abspath(path)
+        while True:
+            git_path = os.path.join(worktree_dir, ".git")
+            if os.path.isdir(git_path):
+                git_dir = git_path
+                break
+            if os.path.isfile(git_path):
+                # A ".git" file holds a "gitdir: <path>" pointer to the real
+                # git directory (git uses this for linked worktrees and
+                # submodules); a relative pointer is resolved against the
+                # directory that contains the file.
+                with open(git_path, encoding="utf-8") as file:
+                    marker = file.read().strip()
+                if not marker.startswith("gitdir:"):
+                    return "unknown", "unknown"
+                git_dir = marker.removeprefix("gitdir:").strip()
+                if not os.path.isabs(git_dir):
+                    git_dir = os.path.normpath(os.path.join(worktree_dir, git_dir))
+                break
+
+            parent = os.path.dirname(worktree_dir)
+            if parent == worktree_dir:
+                return "unknown", "unknown"
+            worktree_dir = parent
+
+        # An optional "commondir" file redirects shared data (refs and
+        # packed-refs) to another directory; a relative value is resolved
+        # against git_dir.
+        common_dir = git_dir
+        commondir_path = os.path.join(git_dir, "commondir")
+        if os.path.exists(commondir_path):
+            with open(commondir_path, encoding="utf-8") as file:
+                common_dir = file.read().strip()
+            if not os.path.isabs(common_dir):
+                common_dir = os.path.normpath(os.path.join(git_dir, common_dir))
+
+        with open(os.path.join(git_dir, "HEAD"), encoding="utf-8") as file:
+            head = file.read().strip()
+        if not head.startswith("ref:"):
+            # Detached HEAD: the file holds the commit hash itself.
+            return head[:7], "HEAD"
+
+        ref = head.removeprefix("ref:").strip()
+        branch = ref.removeprefix("refs/heads/")
+        commit = ""
+        # Look for a loose ref file, first in this git dir, then in the common dir.
+        for refs_dir in (git_dir, common_dir):
+            ref_path = os.path.join(refs_dir, *ref.split("/"))
+            if os.path.exists(ref_path):
+                with open(ref_path, encoding="utf-8") as file:
+                    commit = file.read().strip()
+                break
+
+        if not commit:
+            # Fall back to packed-refs; lines starting with "#" (header) or
+            # "^" (peeled tag target) do not name a ref and are skipped.
+            packed_refs = os.path.join(common_dir, "packed-refs")
+            if os.path.exists(packed_refs):
+                with open(packed_refs, encoding="utf-8") as file:
+                    for line in file:
+                        if line.startswith(("#", "^")):
+                            continue
+                        sha, _, packed_ref = line.strip().partition(" ")
+                        if packed_ref == ref:
+                            commit = sha
+                            break
+        return (commit[:7] if commit else "unknown"), branch
+    except (OSError, ValueError):
+        # ValueError covers UnicodeDecodeError (metadata that is not valid
+        # UTF-8) and the "embedded null byte" error open() raises for a
+        # corrupted pointer; this lookup is best-effort and must not raise.
         return "unknown", "unknown"
 
 

@@ -0,0 +1,21 @@
+# Minimal Slurm smoke test for launcher/MCP integration.
+#
+# This intentionally avoids model downloads and HF cache mounts. It verifies:
+#   MCP submit_job -> launcher YAML parse -> Slurm submit -> container start -> GPU visibility.
+
+job_name: nvidia_smi_smoke
+pipeline:
+  skip: false
+  allow_to_fail: false
+  note: "Slurm container GPU smoke test"
+
+  task_0:
+    script: common/smoke/nvidia_smi.sh
+    slurm_config:
+      _factory_: "slurm_factory"
+      nodes: 1
+      ntasks_per_node: 1
+      gpus_per_node: 1
+      time: "00:10:00"
+      container: nvcr.io/nvidia/cuda:12.4.1-base-ubuntu22.04
+      container_mounts: []
@@ -30,8 +30,9 @@
 """
 
 import getpass
+import glob
 import os
-import subprocess  # nosec B404
+import subprocess  # nosec B404 - required for explicit git clean command; no shell is used.
 import warnings
 
 import modelopt_launcher as _pkg
@@ -79,20 +80,33 @@
 EXPERIMENT_TITLE = "cicd"
 DEFAULT_SLURM_ENV, DEFAULT_LOCAL_ENV = get_default_env(EXPERIMENT_TITLE)
 
-_include_pattern = ["examples/*", "common/*"]
-_relative_path = [LAUNCHER_DIR, LAUNCHER_DIR]
+_include_pattern = []
+_relative_path = []
+
+
+def _add_package_path(path: str) -> None:
+    """Register *path* for packaging, with LAUNCHER_DIR as its tar root.
+
+    Paths that do not exist on disk are ignored.
+    """
+    if not os.path.exists(path):
+        return
+    _include_pattern.append(path)
+    _relative_path.append(LAUNCHER_DIR)
+
+
+def _add_package_glob(pattern: str) -> None:
+    """Register every path matching the glob *pattern*, in sorted order."""
+    matches = sorted(glob.glob(pattern))
+    for match in matches:
+        _add_package_path(match)
+
+
+_add_package_path(os.path.join(LAUNCHER_DIR, "examples"))
+_add_package_path(os.path.join(LAUNCHER_DIR, "common"))
 
 if _has_modelopt_src:
-    _include_pattern = [
-        "modules/Megatron-LM/megatron/*",
-        "modules/Megatron-LM/examples/*",
-        "modules/Megatron-LM/*.py",
-        "modules/Model-Optimizer/modelopt/*",
-        "modules/Model-Optimizer/modelopt_recipes/*",
-        "modules/Model-Optimizer/examples/*",
-        *_include_pattern,
-    ]
-    _relative_path = [LAUNCHER_DIR] * 6 + _relative_path
+    _add_package_path(os.path.join(LAUNCHER_DIR, "modules/Megatron-LM/megatron"))
+    _add_package_path(os.path.join(LAUNCHER_DIR, "modules/Megatron-LM/examples"))
+    _add_package_glob(os.path.join(LAUNCHER_DIR, "modules/Megatron-LM/*.py"))
+    _add_package_path(os.path.join(LAUNCHER_DIR, "modules/Model-Optimizer/modelopt"))
+    _add_package_path(os.path.join(LAUNCHER_DIR, "modules/Model-Optimizer/modelopt_recipes"))
+    _add_package_path(os.path.join(LAUNCHER_DIR, "modules/Model-Optimizer/examples"))
 
 packager = run.PatternPackager(
     include_pattern=_include_pattern,
@@ -127,7 +141,11 @@ def launch(
             raise ValueError("--clean requires a dev checkout; modelopt source not found.")
         examples_dir = os.path.join(_mo_symlink, "examples")
         print(f"Cleaning {examples_dir} with git clean -xdf ...")
-        subprocess.run(["git", "clean", "-xdf", "."], cwd=examples_dir, check=True)  # nosec B603 B607
+        subprocess.run(  # nosec B603 B607 - fixed git CLI argv; no shell.
+            ["git", "clean", "-xdf", "."],
+            cwd=examples_dir,
+            check=True,
+        )
 
     if "NEMORUN_HOME" not in os.environ:
         warnings.warn("NEMORUN_HOME is not set. Defaulting to current working directory.")

@@ -35,6 +35,9 @@
     SandboxTask,
     SandboxTask0,
     SandboxTask1,
+    SandboxTask2,
+    SandboxTask3,
+    SandboxTask4,
     create_task_from_yaml,
     get_default_env,
     register_factory,
@@ -185,8 +188,17 @@ class MockSlurmConfig:
             host: str = "test"
 
         set_slurm_config_type(MockSlurmConfig)
-        assert SandboxTask.__annotations__["slurm_config"] is MockSlurmConfig
-        assert SandboxTask.__dataclass_fields__["slurm_config"].type is MockSlurmConfig
+        for task_cls in (
+            SandboxTask,
+            SandboxTask0,
+            SandboxTask1,
+            SandboxTask2,
+            SandboxTask3,
+            SandboxTask4,
+        ):
+            assert task_cls.__annotations__["slurm_config"] is MockSlurmConfig
+            assert task_cls.__dataclass_fields__["slurm_config"].type is MockSlurmConfig
+            assert task_cls.__init__.__annotations__["slurm_config"] is MockSlurmConfig
 
 
 class TestGetDefaultEnv:

@@ -129,6 +129,12 @@ def test_valid_git_repo(self):
         assert branch != "unknown"
         assert len(commit) >= 7  # short hash
 
+    def test_valid_git_repo_from_nested_directory(self):
+        """Resolve git info starting from ``tests`` under the current working directory."""
+        nested_dir = os.path.join(os.getcwd(), "tests")
+        short_hash, branch_name = _git_info(nested_dir)
+        assert short_hash != "unknown"
+        assert branch_name != "unknown"
+        assert len(short_hash) >= 7  # short hash
+
     def test_nonexistent_directory(self):
         commit, branch = _git_info("/tmp/nonexistent_xyz_12345")
         assert commit == "unknown"

@@ -21,7 +21,7 @@ Mode is determined by which args you pass, not by which tool you call. One tool,
 |---|---|
 | `list_examples` | Enumerate bundled launcher YAMLs under `tools/launcher/examples/` with model + description metadata extracted from each YAML. Discovery primitive — call this first when you don't know which YAML to launch. |
 | `verify_setup(executor, ...)` | Fail-fast probe for the named executor. Docker: `docker info` (daemon up) + `docker info --format` runtime-registry check (looks for `"nvidia"` runtime registered by the NVIDIA Container Toolkit — no image pull, daemon-fast). Slurm: `ssh -o BatchMode=yes -o ConnectTimeout=5` to the cluster login node. Returns structured failure on auth / network / daemon issues — no exception. |
-| `submit_job(yaml_path, hf_local? \| cluster_host?, ..., dry_run?)` | Submit a launcher YAML. Mode resolved from mutually-exclusive args. Returns `experiment_id` (Slurm) or PID (Docker) immediately; the actual job runs detached. Auto-runs `verify_setup` first by default (skippable). **Pass `dry_run=True`** to validate the YAML via `launch.py --dryrun --yes` without contacting the cluster / spawning a container / running sbatch — returns `{ok, dry_run: True, validated: bool, diagnostic?, exit_code, stdout_tail, stderr_tail, argv}` instead of `experiment_id`. Used by verify-task workflow stages (deployment_support, hidden_state_dump_support, mlm_eval, ...). |
+| `submit_job(yaml_path, hf_local? \| cluster_host?, ..., dry_run?, source_ref?, source_repo?)` | Submit a launcher YAML. Mode resolved from mutually-exclusive args. Before launching, materializes a managed Model-Optimizer checkout at `source_ref` (branch, tag, or SHA; default `main`) and initializes recursive submodules, then runs that checkout's launcher. Returns `experiment_id` (Slurm) or PID (Docker) immediately; the actual job runs detached. Auto-runs `verify_setup` first by default (skippable). **Pass `dry_run=True`** to validate the YAML via `launch.py --dryrun --yes` without contacting the cluster / spawning a container / running sbatch — returns `{ok, dry_run: True, validated: bool, diagnostic?, exit_code, stdout_tail, stderr_tail, argv, source_sha, source_root}` instead of `experiment_id`. Used by verify-task workflow stages (deployment_support, hidden_state_dump_support, mlm_eval, ...). |
 | `job_status(experiment_id)` | Filesystem-based status from nemo_run's experiment dir (`_DONE`, `status_*.out`). Returns `done` / `failed` / `running` plus per-task statuses. No in-memory registry; survives MCP server restarts. |
 | `job_logs(experiment_id, task?, tail?)` | Read `log_<task>.out` from the experiment dir. Per-task filtering + optional tail to truncate. |
 | `wait_for_experiment(experiment_id, timeout_sec?, poll_interval_sec?)` | Block until `job_status` returns `done` / `failed`, or until `timeout_sec` elapses. Single tool call replaces the agent's `while True: status; sleep` loop — saves tool-call turns and avoids overshooting the poll interval. Returns the final status plus `waited_seconds`. |
@@ -47,7 +47,7 @@ codex mcp add modelopt -- uvx --from \
   modelopt-mcp
 ```
 
-`uvx` clones the whole repo to its cache, installs `tools/mcp/` as the entry point, and resolves the sibling `modelopt-launcher` dep via `[tool.uv.sources]` (path → `../launcher`) inside the cloned tree.
+`uvx` clones the whole repo to its cache, installs `tools/mcp/` as the entry point, and resolves the sibling `modelopt-launcher` dep via `[tool.uv.sources]` (path → `../launcher`) inside the cloned tree. That install clone is only the server runtime; job submission uses the managed source checkout described below.
 
 ### Dev install (local checkout)
 
@@ -59,6 +59,24 @@ modelopt-mcp                         # stdio server entry on PATH
 
 Both packages share the launcher's `core.py` orchestrator. The dev path relies on `[tool.uv.sources]` to point `modelopt-launcher` at `../launcher`.
 
+## Managed source checkouts
+
+`submit_job` does not rely on the uvx install clone or on the caller being inside a Model-Optimizer checkout. For each launch it picks the source ref from the first of the following that is set:
+
+1. `source_ref` argument, if provided.
+2. `MODELOPT_MCP_SOURCE_REF`, if set.
+3. `main`.
+
+It resolves that ref against `source_repo` / `MODELOPT_MCP_SOURCE_REPO` / `https://github.com/NVIDIA/Model-Optimizer.git`, creates a cached checkout under `MODELOPT_MCP_SOURCE_CACHE` (default `$XDG_CACHE_HOME/modelopt-mcp/sources` or `~/.cache/modelopt-mcp/sources`), and runs:
+
+```bash
+uv run --project <source_root>/tools/launcher modelopt-launcher --yaml <resolved-yaml> ...
+```
+
+The checkout is keyed by resolved commit SHA, so multiple agents using different branches or SHAs get separate source roots. Recursive submodules are initialized in the managed checkout, so launcher packagers can include `tools/launcher/modules/...` content even when MCP was installed outside a repo checkout.
+
+Set `MODELOPT_MCP_DISABLE_MANAGED_SOURCE=1` only for local development when you deliberately want the already-installed `modelopt-launcher` entrypoint.
+
 ### Why no plain `pip install` today
 
 `modelopt-mcp` and `modelopt-launcher` are not on PyPI. Plain `pip` doesn't read `[tool.uv.sources]`, so even from a local checkout, `pip install -e tools/mcp` fails to resolve the bare `modelopt-launcher` name. Stick with `uv` / `uvx` while we're git-only.
@@ -96,8 +114,9 @@ result = mcp__modelopt__submit_job(
     cluster_user="alice",
     identity="/home/alice/.ssh/id_ed25519",
     skip_verify=True,  # we just probed
+    source_ref="main",  # optional; if omitted, MODELOPT_MCP_SOURCE_REF is used, then main
 )
-# {"ok": True, "experiment_id": "cicd_1781240000", "slurm_job_id": "12345", ...}
+# {"ok": True, "experiment_id": "cicd_1781240000", "slurm_job_id": "12345", "source_sha": "...", ...}
 
 # 4. Poll until done
 while True:
@@ -122,6 +141,11 @@ For local Docker execution, drop `cluster_host`/`cluster_user`/`identity` and pa
 | `NEMORUN_HOME` | submit + status + logs | Where the launcher writes experiment artifacts. Defaults to cwd if unset. `job_status` / `job_logs` search `$NEMORUN_HOME/experiments/<id>/`. |
 | `MODELOPT_MCP_LOG` | (optional) server | Log level. Defaults to `INFO`. Logs go to stderr — stdout is the MCP wire. |
 | `MODELOPT_MCP_SKIP_GPU_CHECK` | (optional) `verify_setup(executor='docker')` | Set to skip the `docker info --format` runtime-registry check. Useful for CI hosts where the daemon is up but the NVIDIA Container Toolkit isn't installed. |
+| `MODELOPT_MCP_SOURCE_REPO` | (optional) `submit_job` | Default git repository for managed source checkouts. Defaults to `https://github.com/NVIDIA/Model-Optimizer.git`. |
+| `MODELOPT_MCP_SOURCE_REF` | (optional) `submit_job` | Default branch, tag, or SHA when `source_ref` is omitted. Defaults to `main`. |
+| `MODELOPT_MCP_SOURCE_CACHE` | (optional) `submit_job` | Root for managed source checkouts. Defaults to `$XDG_CACHE_HOME/modelopt-mcp/sources` or `~/.cache/modelopt-mcp/sources`. |
+| `MODELOPT_MCP_DISABLE_MANAGED_SOURCE` | (optional) local dev | Set to `1` to skip the managed checkout and invoke the installed `modelopt-launcher` entrypoint directly. |
+| `MODELOPT_MCP_UV` | (optional) `submit_job` | Override the `uv` binary used for `uv run --project <source>/tools/launcher ...`. |
 | `MODELOPT_LAUNCHER_EXAMPLES_DIR` | (optional) `list_examples` | Override the examples directory location. Defaults to `../launcher/examples/` relative to this package. |
 
 ## Design principles