From cd7a8817d066793911622063d789ea64d0cb69dd Mon Sep 17 00:00:00 2001 From: Matthew Grossman Date: Mon, 15 Jun 2026 11:48:57 -0700 Subject: [PATCH 01/17] fix(jobs): Separate compute and executor payload shapes Signed-off-by: Matthew Grossman --- .../src/nemo_platform_plugin/job.py | 3 + .../nemo_platform_plugin/jobs/api_factory.py | 5 + packages/nmp_testing/src/nmp/testing/jobs.py | 5 +- .../nemo_agents_plugin/jobs/analyze_batch.py | 2 +- .../nemo_agents_plugin/jobs/evaluate_agent.py | 2 +- .../nemo_agents_plugin/jobs/evaluate_suite.py | 2 +- .../nemo_agents_plugin/jobs/optimize_agent.py | 2 +- .../jobs/optimize_skills.py | 2 +- .../tests/unit/test_evaluate_agent_job.py | 2 +- .../tests/unit/test_improvement_jobs.py | 8 +- .../api/v2/jobs/endpoints.py | 2 +- .../tests/unit/test_jobs.py | 2 +- .../nmp/core/jobs/api/v2/jobs/endpoints.py | 35 +----- .../jobs/src/nmp/core/jobs/app/providers.py | 118 +++++++----------- .../src/nmp/core/jobs/app/test_helpers.py | 4 +- .../core/jobs/controllers/backends/config.py | 4 +- .../core/jobs/controllers/backends/docker.py | 18 ++- .../backends/kubernetes/kubernetes_job.py | 14 +-- .../backends/kubernetes/volcano_job.py | 6 +- .../jobs/controllers/backends/registry.py | 3 +- .../jobs/controllers/backends/subprocess.py | 2 +- .../core/jobs/controllers/backends/test.py | 16 +-- services/core/jobs/tests/conftest.py | 30 ++--- .../core/jobs/tests/controllers/test_base.py | 4 +- .../tests/controllers/test_docker_backend.py | 31 +++-- .../controllers/test_kubernetes_backend.py | 14 +-- .../controllers/test_subprocess_backend.py | 16 +-- .../tests/controllers/test_volcano_backend.py | 21 ++-- services/core/jobs/tests/test_config.py | 25 ++-- services/core/jobs/tests/test_jobs_api.py | 34 ++--- .../tests/test_jobs_endpoint_translation.py | 38 +----- 31 files changed, 184 insertions(+), 286 deletions(-) diff --git a/packages/nemo_platform_plugin/src/nemo_platform_plugin/job.py 
b/packages/nemo_platform_plugin/src/nemo_platform_plugin/job.py index ea91db6f39..265a56dcfb 100644 --- a/packages/nemo_platform_plugin/src/nemo_platform_plugin/job.py +++ b/packages/nemo_platform_plugin/src/nemo_platform_plugin/job.py @@ -172,6 +172,9 @@ class NemoJob(_NamedPlugin): description: ClassVar[str] = "" container: ClassVar[str] = "cpu-tasks" execution_provider: ClassVar[str] = "cpu" + # Execution kind: "container" (default) or "subprocess". + # Subprocess jobs override this to "subprocess". + execution_kind: ClassVar[str] = "container" # ------------------------------------------------------------------ # # Spec schemas — canonical ``spec_schema``; optional ``input_spec_schema`` diff --git a/packages/nemo_platform_plugin/src/nemo_platform_plugin/jobs/api_factory.py b/packages/nemo_platform_plugin/src/nemo_platform_plugin/jobs/api_factory.py index 63981b18e2..0d06ce0a27 100644 --- a/packages/nemo_platform_plugin/src/nemo_platform_plugin/jobs/api_factory.py +++ b/packages/nemo_platform_plugin/src/nemo_platform_plugin/jobs/api_factory.py @@ -79,6 +79,9 @@ GPUExecutionProviderSpec = GPUExecutionProviderParam DistributedGPUExecutionProviderSpec = DistributedGPUExecutionProviderParam SubprocessExecutionProviderSpec = SubprocessExecutionProviderParam +# Container providers (cpu, gpu, gpu_distributed) share the same shape today. +# This alias will point at a dedicated SDK type once the SDK is regenerated. 
+ContainerExecutionProviderSpec = CPUExecutionProviderParam ResourcesSpec = ComputeResourcesParam ResourcesLimitsSpec = ComputeResourceSpecParam ResourcesRequestsSpec = ComputeResourceSpecParam @@ -102,6 +105,8 @@ class BaseJobRequest(BaseModel, Generic[JobConfigT]): spec: JobConfigT ownership: dict | None = None custom_fields: dict | None = None + profile: str | None = None + options: dict | None = None class BaseJob(BaseModel, Generic[JobConfigT]): diff --git a/packages/nmp_testing/src/nmp/testing/jobs.py b/packages/nmp_testing/src/nmp/testing/jobs.py index c5388c39bd..1bd5b0f234 100644 --- a/packages/nmp_testing/src/nmp/testing/jobs.py +++ b/packages/nmp_testing/src/nmp/testing/jobs.py @@ -30,7 +30,10 @@ def subprocess_job_executor_patch( from nmp.core.jobs.api.v2.jobs import endpoints as jobs_endpoints patched_executors = list(executors) - if not any(executor.provider == "subprocess" and executor.profile == profile for executor in patched_executors): + if not any( + getattr(executor, "backend", None) == "subprocess" and getattr(executor, "profile", None) == profile + for executor in patched_executors + ): patched_executors.insert(0, SubprocessJobExecutionProfile(profile=profile)) with ExitStack() as stack: diff --git a/plugins/nemo-agents/src/nemo_agents_plugin/jobs/analyze_batch.py b/plugins/nemo-agents/src/nemo_agents_plugin/jobs/analyze_batch.py index 36f55842f6..81d6abab78 100644 --- a/plugins/nemo-agents/src/nemo_agents_plugin/jobs/analyze_batch.py +++ b/plugins/nemo-agents/src/nemo_agents_plugin/jobs/analyze_batch.py @@ -107,7 +107,7 @@ async def compile( # type: ignore[override] PlatformJobStep( name="analyze", executor=SubprocessExecutionProviderSpec( - provider="subprocess", + provider="cpu", command=["python", "-m", "nemo_agents_plugin.tasks.analyze"], ), config=spec_dict, diff --git a/plugins/nemo-agents/src/nemo_agents_plugin/jobs/evaluate_agent.py b/plugins/nemo-agents/src/nemo_agents_plugin/jobs/evaluate_agent.py index 2eeda79456..db74b9364d 
100644 --- a/plugins/nemo-agents/src/nemo_agents_plugin/jobs/evaluate_agent.py +++ b/plugins/nemo-agents/src/nemo_agents_plugin/jobs/evaluate_agent.py @@ -182,7 +182,7 @@ async def compile( # type: ignore[override] PlatformJobStep( name="evaluate-agent", executor=SubprocessExecutionProviderSpec( - provider="subprocess", + provider="cpu", command=["python", "-m", "nemo_agents_plugin.tasks.evaluate"], ), config=spec_dict, diff --git a/plugins/nemo-agents/src/nemo_agents_plugin/jobs/evaluate_suite.py b/plugins/nemo-agents/src/nemo_agents_plugin/jobs/evaluate_suite.py index 61146380ab..0f9d56c37a 100644 --- a/plugins/nemo-agents/src/nemo_agents_plugin/jobs/evaluate_suite.py +++ b/plugins/nemo-agents/src/nemo_agents_plugin/jobs/evaluate_suite.py @@ -201,7 +201,7 @@ async def compile( # type: ignore[override] PlatformJobStep( name="evaluate-suite", executor=SubprocessExecutionProviderSpec( - provider="subprocess", + provider="cpu", command=["python", "-m", "nemo_agents_plugin.tasks.evaluate_suite"], ), config=spec_dict, diff --git a/plugins/nemo-agents/src/nemo_agents_plugin/jobs/optimize_agent.py b/plugins/nemo-agents/src/nemo_agents_plugin/jobs/optimize_agent.py index 9f7edd10e1..5c2b1ea79d 100644 --- a/plugins/nemo-agents/src/nemo_agents_plugin/jobs/optimize_agent.py +++ b/plugins/nemo-agents/src/nemo_agents_plugin/jobs/optimize_agent.py @@ -164,7 +164,7 @@ async def compile( # type: ignore[override] PlatformJobStep( name="optimize-agent", executor=SubprocessExecutionProviderSpec( - provider="subprocess", + provider="cpu", command=["python", "-m", "nemo_agents_plugin.tasks.optimize"], ), config=spec_dict, diff --git a/plugins/nemo-agents/src/nemo_agents_plugin/jobs/optimize_skills.py b/plugins/nemo-agents/src/nemo_agents_plugin/jobs/optimize_skills.py index 6993b491bc..a5118457d7 100644 --- a/plugins/nemo-agents/src/nemo_agents_plugin/jobs/optimize_skills.py +++ b/plugins/nemo-agents/src/nemo_agents_plugin/jobs/optimize_skills.py @@ -147,7 +147,7 @@ async def compile( 
# type: ignore[override] PlatformJobStep( name="optimize-skills", executor=SubprocessExecutionProviderSpec( - provider="subprocess", + provider="cpu", command=["python", "-m", "nemo_agents_plugin.tasks.optimize_skills"], ), config=spec_dict, diff --git a/plugins/nemo-agents/tests/unit/test_evaluate_agent_job.py b/plugins/nemo-agents/tests/unit/test_evaluate_agent_job.py index 83252f691e..f626e71cb4 100644 --- a/plugins/nemo-agents/tests/unit/test_evaluate_agent_job.py +++ b/plugins/nemo-agents/tests/unit/test_evaluate_agent_job.py @@ -36,7 +36,7 @@ async def test_compile_produces_single_cpu_step() -> None: assert len(steps) == 1 step = steps[0] assert step["name"] == "evaluate-agent" - assert step["executor"]["provider"] == "subprocess" + assert step["executor"]["provider"] == "cpu" assert step["executor"]["command"] == ["python", "-m", "nemo_agents_plugin.tasks.evaluate"] assert step["config"]["agent"] == "calc" assert step["config"]["eval_config"] == "config.yml" diff --git a/plugins/nemo-agents/tests/unit/test_improvement_jobs.py b/plugins/nemo-agents/tests/unit/test_improvement_jobs.py index a80e3bd803..011d472fd2 100644 --- a/plugins/nemo-agents/tests/unit/test_improvement_jobs.py +++ b/plugins/nemo-agents/tests/unit/test_improvement_jobs.py @@ -86,7 +86,7 @@ async def test_evaluate_suite_compile_produces_single_subprocess_step() -> None: assert len(steps) == 1 step = steps[0] assert step["name"] == "evaluate-suite" - assert step["executor"]["provider"] == "subprocess" + assert step["executor"]["provider"] == "cpu" assert step["executor"]["command"] == ["python", "-m", "nemo_agents_plugin.tasks.evaluate_suite"] assert step["config"]["evals"] == "/abs/evals" assert step["config"]["agent"] == "/abs/agent" @@ -199,7 +199,7 @@ async def test_optimize_skills_compile_produces_single_subprocess_step() -> None step = steps[0] assert step["name"] == "optimize-skills" executor = step["executor"] - assert executor.get("provider") == "subprocess" + assert 
executor.get("provider") == "cpu" assert executor.get("command") == ["python", "-m", "nemo_agents_plugin.tasks.optimize_skills"] env = {e["name"]: e for e in step["environment"]} @@ -279,7 +279,7 @@ async def test_analyze_compile_produces_single_subprocess_step() -> None: step = next(iter(platform_spec["steps"])) assert step["name"] == "analyze" executor = step["executor"] - assert executor.get("provider") == "subprocess" + assert executor.get("provider") == "cpu" assert executor.get("command") == ["python", "-m", "nemo_agents_plugin.tasks.analyze"] @@ -328,7 +328,7 @@ async def test_optimize_agent_compile_produces_single_subprocess_step() -> None: step = next(iter(platform_spec["steps"])) assert step["name"] == "optimize-agent" executor = step["executor"] - assert executor.get("provider") == "subprocess" + assert executor.get("provider") == "cpu" assert executor.get("command") == ["python", "-m", "nemo_agents_plugin.tasks.optimize"] assert step["config"]["workspace"] == "staging" diff --git a/plugins/nemo-safe-synthesizer/src/nemo_safe_synthesizer_plugin/api/v2/jobs/endpoints.py b/plugins/nemo-safe-synthesizer/src/nemo_safe_synthesizer_plugin/api/v2/jobs/endpoints.py index bbb663cdc6..18466d8dbb 100644 --- a/plugins/nemo-safe-synthesizer/src/nemo_safe_synthesizer_plugin/api/v2/jobs/endpoints.py +++ b/plugins/nemo-safe-synthesizer/src/nemo_safe_synthesizer_plugin/api/v2/jobs/endpoints.py @@ -62,7 +62,7 @@ def _create_job_step(job_config: SafeSynthesizerJobConfig, environment: list[Env return PlatformJobStep( name="safe-synthesizer", executor=SubprocessExecutionProviderSpec( - provider="subprocess", + provider="cpu", profile=config.job_executor_profile, command=command, ), diff --git a/plugins/nemo-safe-synthesizer/tests/unit/test_jobs.py b/plugins/nemo-safe-synthesizer/tests/unit/test_jobs.py index 446aee4637..00c118062f 100644 --- a/plugins/nemo-safe-synthesizer/tests/unit/test_jobs.py +++ b/plugins/nemo-safe-synthesizer/tests/unit/test_jobs.py @@ -92,7 +92,7 @@ 
async def test_job_config_compiler_with_classify_provider(mock_sdk): mock_sdk.inference.providers.retrieve.assert_awaited_once_with("my-nim", workspace="default") step = next(iter(result["steps"])) - assert step["executor"]["provider"] == "subprocess" + assert step["executor"]["provider"] == "cpu" assert step["executor"]["command"] == ["/runtime/bin/python", "-m", TASK_MODULE] env = {e["name"]: e.get("value") for e in step.get("environment", [])} assert env["CLASSIFY_LLM_ENDPOINT_PATH"] == "/apis/inference-gateway/v2/workspaces/default/provider/my-nim/-/v1" diff --git a/services/core/jobs/src/nmp/core/jobs/api/v2/jobs/endpoints.py b/services/core/jobs/src/nmp/core/jobs/api/v2/jobs/endpoints.py index 9a9312895e..2fa61dba8d 100644 --- a/services/core/jobs/src/nmp/core/jobs/api/v2/jobs/endpoints.py +++ b/services/core/jobs/src/nmp/core/jobs/api/v2/jobs/endpoints.py @@ -46,11 +46,10 @@ from nmp.core.jobs.app.ctx import JobContext from nmp.core.jobs.app.dispatcher import JobDispatcher, StateTransitionConflictError from nmp.core.jobs.app.profiles import ExecutionProfileT -from nmp.core.jobs.app.providers import CPUExecutionProvider, SubprocessExecutionProvider from nmp.core.jobs.app.schemas import ( PlatformJobSpec, ) -from nmp.core.jobs.config import config, profiles +from nmp.core.jobs.config import profiles from nmp.core.jobs.entities import PlatformJobStep, PlatformJobTask from pydantic import ValidationError from starlette.responses import FileResponse @@ -102,34 +101,6 @@ def validate_job_spec( ) from e -def translate_cpu_container_steps_to_subprocess( - job_spec: PlatformJobSpec, - subprocess_profiles: set[str], -) -> PlatformJobSpec: - """Translate CPU container steps when explicitly configured for subprocess compatibility.""" - if not subprocess_profiles: - return job_spec - - translated_spec = job_spec.model_copy(deep=True) - for step in translated_spec.steps: - executor = step.executor - if not isinstance(executor, CPUExecutionProvider) or executor.profile not 
in subprocess_profiles: - continue - command = [*executor.container.entrypoint, *executor.container.command] - if not command: - raise HTTPException( - status_code=status.HTTP_422_UNPROCESSABLE_CONTENT, - detail=f"Subprocess execution for step '{step.name}' requires container.entrypoint and/or container.command.", - ) - step.executor = SubprocessExecutionProvider(provider="subprocess", profile=executor.profile, command=command) - return translated_spec - - -def configured_subprocess_translation_profiles() -> set[str]: - """Return explicitly configured subprocess profiles that should accept CPU container jobs.""" - return {profile.profile for profile in config.executors if profile.provider == "subprocess"} - - # Execution Profiles Endpoint @router.get("/v2/execution-profiles") async def get_execution_profiles() -> list[ExecutionProfileT]: @@ -149,10 +120,6 @@ async def create_job( sdk: AsyncNeMoPlatform = Depends(get_sdk_client), ) -> PlatformJobResponse: """Create a new platform job.""" - platform_spec = translate_cpu_container_steps_to_subprocess( - request.platform_spec, configured_subprocess_translation_profiles() - ) - request = request.model_copy(update={"platform_spec": platform_spec}) validate_job_spec(request.platform_spec, profiles) try: diff --git a/services/core/jobs/src/nmp/core/jobs/app/providers.py b/services/core/jobs/src/nmp/core/jobs/app/providers.py index 6d5c5486a9..4a5f00a9ce 100644 --- a/services/core/jobs/src/nmp/core/jobs/app/providers.py +++ b/services/core/jobs/src/nmp/core/jobs/app/providers.py @@ -2,9 +2,9 @@ # SPDX-License-Identifier: Apache-2.0 import re -from typing import Annotated, Literal, Union +from typing import Annotated, Any, Literal, Union -from pydantic import BaseModel, Field, field_validator, model_validator +from pydantic import BaseModel, BeforeValidator, Field, field_validator, model_validator # SHM: megabyte/gigabyte scale only — Mi, Gi (binary) or M, G (decimal SI). 
# Ki / Ti / Pi / Ei and other suffixes are not accepted for /dev/shm. @@ -85,80 +85,44 @@ class TaskSpec(BaseModel): """Arguments to pass to the command. Can be a list of strings or a single string.""" -class CPUExecutionProvider(BaseModel): - """ - CPU-based execution provider. +class ContainerExecutionProvider(BaseModel): + """Container-based execution provider. - Provides configuration for running jobs on CPU resources with - resource requests and limits. + Runs a job step inside a container image. The ``provider`` field + expresses compute intent (cpu, gpu, gpu_distributed) while ``kind`` + identifies the payload shape. """ - provider: Literal["cpu"] = "cpu" - """The provider type, always 'cpu' for CPU execution.""" - - profile: str = "default" - """The execution profile to use. Defaults to 'default'.""" - - container: ContainerSpec - """Container specification defining the execution environment.""" - - resources: ComputeResources = Field( - default_factory=ComputeResources, description="Resource requests and limits for CPU execution." - ) + kind: Literal["container"] = "container" + """Executor payload shape — always ``"container"`` for image-backed work.""" - -class GPUExecutionProvider(BaseModel): - """ - GPU-based execution provider. - - Provides configuration for running jobs on GPU resources with - resource requests and limits. - """ - - provider: Literal["gpu"] = "gpu" - """The provider type, always 'gpu' for GPU execution.""" + provider: Literal["cpu", "gpu", "gpu_distributed"] = "cpu" + """Compute requirement: ``cpu``, ``gpu``, or ``gpu_distributed``.""" profile: str = "default" - """The execution profile to use. Defaults to 'default'.""" + """Operator-configured execution profile (e.g. ``"default"``, ``"a100"``).""" container: ContainerSpec """Container specification defining the execution environment.""" - resources: ComputeResources = Field( - default_factory=ComputeResources, description="Resource requests and limits for GPU execution." 
- ) + resources: ComputeResources = Field(default_factory=ComputeResources, description="Resource requests and limits.") -class DistributedGPUExecutionProvider(BaseModel): - """ - GPU-based execution provider. +class SubprocessExecutionProvider(BaseModel): + """Host subprocess execution provider. - Provides configuration for running jobs on GPU resources with - resource requests and limits. + Runs a job step as a local OS process. The ``provider`` field + expresses compute intent while ``kind`` identifies the payload shape. """ - provider: Literal["gpu_distributed"] = "gpu_distributed" - """The provider type, always 'gpu_distributed' for distributed GPU execution.""" - - profile: str = "default" - """The execution profile to use. Defaults to 'default'.""" - - container: ContainerSpec - """Container specification defining the execution environment.""" + kind: Literal["subprocess"] = "subprocess" + """Executor payload shape — always ``"subprocess"`` for host command execution.""" - resources: ComputeResources = Field( - default_factory=ComputeResources, description="Resource requests and limits for distributed GPU execution." - ) + provider: Literal["cpu", "gpu"] = "cpu" + """Compute requirement: ``"cpu"`` or ``"gpu"`` (GPU subprocess inherits host devices).""" - -class SubprocessExecutionProvider(BaseModel): - """Host subprocess execution provider.""" - - provider: Literal["subprocess"] = "subprocess" - """The provider type, always 'subprocess' for host subprocess execution.""" - - profile: str = "default" - """The execution profile to use. Defaults to 'default'.""" + profile: str = "subprocess" + """Execution profile. 
Defaults to ``"subprocess"`` to match the registered backend.""" command: list[str] """The host command to execute as a list of strings (e.g., ['python', '-m', 'my_task']).""" @@ -170,20 +134,28 @@ def validate_command(self) -> "SubprocessExecutionProvider": return self -# Type alias for the current execution provider implementation -ExecutionProviderT = Union[ - CPUExecutionProvider, GPUExecutionProvider, DistributedGPUExecutionProvider, SubprocessExecutionProvider -] -"""Type alias representing the current execution provider type.""" +def _infer_executor_kind(v: Any) -> Any: + """Infer ``kind`` from payload shape when absent. -# Discriminated union type for execution providers + The SDK types haven't been regenerated yet, so incoming requests may + omit ``kind``. Infer it from the presence of ``container`` (→ container) + vs ``command`` (→ subprocess), or from the legacy ``provider="subprocess"``. + """ + if not isinstance(v, dict) or "kind" in v: + return v + if v.get("provider") == "subprocess" or "command" in v: + v = {**v, "kind": "subprocess"} + if v.get("provider") == "subprocess": + v["provider"] = "cpu" + else: + v = {**v, "kind": "container"} + return v + + +# Discriminated union type for execution providers. +# Uses ``kind`` to distinguish container vs subprocess payload shapes. Provider = Annotated[ - ExecutionProviderT, - Field(discriminator="provider"), + Union[ContainerExecutionProvider, SubprocessExecutionProvider], + BeforeValidator(_infer_executor_kind), + Field(discriminator="kind"), ] -""" -Discriminated union type for execution providers. - -Uses the 'provider' field to determine the specific provider type. -Currently supports CPU execution providers, with extensibility for future provider types. 
-""" diff --git a/services/core/jobs/src/nmp/core/jobs/app/test_helpers.py b/services/core/jobs/src/nmp/core/jobs/app/test_helpers.py index 004560f00b..1d0f469577 100644 --- a/services/core/jobs/src/nmp/core/jobs/app/test_helpers.py +++ b/services/core/jobs/src/nmp/core/jobs/app/test_helpers.py @@ -7,7 +7,7 @@ that are shared across test files. """ -from nmp.core.jobs.app.providers import ComputeResources, ComputeResourceSpec, ContainerSpec, CPUExecutionProvider +from nmp.core.jobs.app.providers import ComputeResources, ComputeResourceSpec, ContainerExecutionProvider, ContainerSpec from nmp.core.jobs.app.schemas import ( PlatformJobSpec, PlatformJobStepSpec, @@ -45,7 +45,7 @@ class TestConstants: # etc.) all set both fields; the fixture mirrors that so submissions through the # core /apis/jobs/v2/workspaces/{ws}/jobs endpoint validate successfully and we # exercise the same translation path the user-facing tutorials do. - TEST_EXECUTOR = CPUExecutionProvider( + TEST_EXECUTOR = ContainerExecutionProvider( provider="cpu", profile="default", container=ContainerSpec( diff --git a/services/core/jobs/src/nmp/core/jobs/controllers/backends/config.py b/services/core/jobs/src/nmp/core/jobs/controllers/backends/config.py index f5d4724fae..8c2870c663 100644 --- a/services/core/jobs/src/nmp/core/jobs/controllers/backends/config.py +++ b/services/core/jobs/src/nmp/core/jobs/controllers/backends/config.py @@ -96,8 +96,8 @@ def get_default_executor_profiles_for_runtime( if enable_subprocess_executor: executors.append( SubprocessJobExecutionProfile( - provider="subprocess", - profile="default", + provider="cpu", + profile="subprocess", backend="subprocess", config=defaults.subprocess, ) diff --git a/services/core/jobs/src/nmp/core/jobs/controllers/backends/docker.py b/services/core/jobs/src/nmp/core/jobs/controllers/backends/docker.py index 74ce48e097..573bacc072 100644 --- a/services/core/jobs/src/nmp/core/jobs/controllers/backends/docker.py +++ 
b/services/core/jobs/src/nmp/core/jobs/controllers/backends/docker.py @@ -61,9 +61,7 @@ from nmp.core.jobs.app.ctx import JobContext from nmp.core.jobs.app.providers import ( ComputeResources, - CPUExecutionProvider, - ExecutionProviderT, - GPUExecutionProvider, + ContainerExecutionProvider, ) from nmp.core.jobs.app.schemas import BaseExecutionProfile from nmp.core.jobs.controllers.backends.base import ( @@ -112,7 +110,7 @@ def k8s_shm_quantity_to_docker(quantity: str) -> str: DOCKER_STOP_TIMEOUT = int(os.getenv("NEMO_JOBS_DEFAULT_DOCKER_STOP_TIMEOUT", "30")) -ProviderT = TypeVar("ProviderT", bound=ExecutionProviderT) +ProviderT = TypeVar("ProviderT", bound=ContainerExecutionProvider) class DockerVolumeMount(BaseModel): @@ -1342,12 +1340,12 @@ def name_for_step(self, step: PlatformJobStepWithContext) -> str: return f"{step.job}-{step.name}" -class CPUDockerJobBackend(DockerJobBackend[CPUExecutionProvider]): +class CPUDockerJobBackend(DockerJobBackend[ContainerExecutionProvider]): """Docker job backend for CPU execution.""" def schedule( self, - executor_config: CPUExecutionProvider, + executor_config: ContainerExecutionProvider, step: PlatformJobStepWithContext, ) -> JobUpdate: return self.schedule_single_container(executor_config, step) @@ -1358,12 +1356,12 @@ def sync( ) -> JobUpdate: return self._sync(step) - def configure_container(self, container_args: dict, executor_config: CPUExecutionProvider) -> dict: + def configure_container(self, container_args: dict, executor_config: ContainerExecutionProvider) -> dict: """Customize container arguments for CPU execution.""" return self.apply_resource_limits(container_args, executor_config.resources) -class GPUDockerJobBackend(DockerJobBackend[GPUExecutionProvider]): +class GPUDockerJobBackend(DockerJobBackend[ContainerExecutionProvider]): """Docker job backend for GPU execution.""" def init(self) -> None: @@ -1381,7 +1379,7 @@ def init(self) -> None: def schedule( self, - executor_config: GPUExecutionProvider, + 
executor_config: ContainerExecutionProvider, step: PlatformJobStepWithContext, ) -> JobUpdate: return self.schedule_single_container(executor_config, step) @@ -1402,7 +1400,7 @@ def sync( self.gpu_pool.release_gpu(step.id) return job_update - def configure_container(self, container_args: dict, executor_config: GPUExecutionProvider) -> dict: + def configure_container(self, container_args: dict, executor_config: ContainerExecutionProvider) -> dict: """Customize container arguments for GPU execution.""" # Apply resource limits container_args = self.apply_resource_limits(container_args, executor_config.resources) diff --git a/services/core/jobs/src/nmp/core/jobs/controllers/backends/kubernetes/kubernetes_job.py b/services/core/jobs/src/nmp/core/jobs/controllers/backends/kubernetes/kubernetes_job.py index d00870793e..16dab8de18 100644 --- a/services/core/jobs/src/nmp/core/jobs/controllers/backends/kubernetes/kubernetes_job.py +++ b/services/core/jobs/src/nmp/core/jobs/controllers/backends/kubernetes/kubernetes_job.py @@ -24,10 +24,8 @@ ) from nmp.core.jobs.app.providers import ( ComputeResources, + ContainerExecutionProvider, ContainerSpec, - CPUExecutionProvider, - ExecutionProviderT, - GPUExecutionProvider, ) from nmp.core.jobs.app.schemas import BaseExecutionProfile from nmp.core.jobs.controllers.backends.base import JobBackend, JobUpdate, staleness_error_message @@ -51,7 +49,7 @@ logger = logging.getLogger(__name__) -ProviderT = TypeVar("ProviderT", bound=ExecutionProviderT) +ProviderT = TypeVar("ProviderT", bound=ContainerExecutionProvider) class KubernetesJobExecutionProfileConfig(BaseKubernetesExecutionProfileConfig): @@ -546,12 +544,12 @@ def cleanup_steps(self): self.terminate_job(job) -class CPUKubernetesJobBackend(KubernetesJobBackend[CPUExecutionProvider]): +class CPUKubernetesJobBackend(KubernetesJobBackend[ContainerExecutionProvider]): """Kubernetes job backend for CPU execution.""" def schedule( self, - executor_config: CPUExecutionProvider, + 
executor_config: ContainerExecutionProvider, step: PlatformJobStepWithContext, ) -> JobUpdate: return self.schedule_job(executor_config.container, step) @@ -563,12 +561,12 @@ def sync( return self._sync(step) -class GPUKubernetesJobBackend(KubernetesJobBackend[GPUExecutionProvider]): +class GPUKubernetesJobBackend(KubernetesJobBackend[ContainerExecutionProvider]): """Kubernetes job backend for GPU execution.""" def schedule( self, - executor_config: GPUExecutionProvider, + executor_config: ContainerExecutionProvider, step: PlatformJobStepWithContext, ) -> JobUpdate: if executor_config.resources is not None and executor_config.resources.num_gpus is not None: diff --git a/services/core/jobs/src/nmp/core/jobs/controllers/backends/kubernetes/volcano_job.py b/services/core/jobs/src/nmp/core/jobs/controllers/backends/kubernetes/volcano_job.py index 0d07a89aa0..3d8ca76b21 100644 --- a/services/core/jobs/src/nmp/core/jobs/controllers/backends/kubernetes/volcano_job.py +++ b/services/core/jobs/src/nmp/core/jobs/controllers/backends/kubernetes/volcano_job.py @@ -24,7 +24,7 @@ JOB_WORKSPACE_ID_LABEL, KUBE_JOB_SELECTOR_LABELS, ) -from nmp.core.jobs.app.providers import DistributedGPUExecutionProvider +from nmp.core.jobs.app.providers import ContainerExecutionProvider from nmp.core.jobs.app.schemas import BaseExecutionProfile from nmp.core.jobs.controllers.backends.base import JobBackend, JobUpdate, staleness_error_message from nmp.core.jobs.controllers.backends.kubernetes.common import ( @@ -86,7 +86,7 @@ def supports_persistent_storage(self) -> bool: class VolcanoJobBackend( - JobBackend[DistributedGPUExecutionProvider, VolcanoJobExecutionProfileConfig], + JobBackend[ContainerExecutionProvider, VolcanoJobExecutionProfileConfig], ): BACKEND_NAME: str = "volcano_job" @@ -151,7 +151,7 @@ def get_volcano_job_list_by_labels(self, labels: dict[str, str]) -> list[dict]: def schedule( self, - executor_config: DistributedGPUExecutionProvider, + executor_config: 
ContainerExecutionProvider, step: PlatformJobStepWithContext, ) -> JobUpdate: """ diff --git a/services/core/jobs/src/nmp/core/jobs/controllers/backends/registry.py b/services/core/jobs/src/nmp/core/jobs/controllers/backends/registry.py index f5fd76accc..73ec1d9bad 100644 --- a/services/core/jobs/src/nmp/core/jobs/controllers/backends/registry.py +++ b/services/core/jobs/src/nmp/core/jobs/controllers/backends/registry.py @@ -56,7 +56,8 @@ class BackendKey: BackendKey("cpu", "kubernetes_job"): CPUKubernetesJobBackend, BackendKey("gpu", "kubernetes_job"): GPUKubernetesJobBackend, BackendKey("gpu_distributed", "volcano_job"): VolcanoJobBackend, - BackendKey("subprocess", "subprocess"): SubprocessJobBackend, + BackendKey("cpu", "subprocess"): SubprocessJobBackend, + BackendKey("gpu", "subprocess"): SubprocessJobBackend, BackendKey("cpu", "e2e"): TestE2ECPUJobBackend, BackendKey("gpu", "e2e"): TestE2EGPUJobBackend, } diff --git a/services/core/jobs/src/nmp/core/jobs/controllers/backends/subprocess.py b/services/core/jobs/src/nmp/core/jobs/controllers/backends/subprocess.py index 35610fd19a..5405176fcd 100644 --- a/services/core/jobs/src/nmp/core/jobs/controllers/backends/subprocess.py +++ b/services/core/jobs/src/nmp/core/jobs/controllers/backends/subprocess.py @@ -83,7 +83,7 @@ class SubprocessJobExecutionProfileConfig(JobExecutionProfileConfig): class SubprocessJobExecutionProfile(BaseExecutionProfile): - provider: Literal["subprocess"] = "subprocess" + provider: Literal["cpu"] = "cpu" backend: Literal["subprocess"] = "subprocess" config: SubprocessJobExecutionProfileConfig = Field( default_factory=SubprocessJobExecutionProfileConfig, diff --git a/services/core/jobs/src/nmp/core/jobs/controllers/backends/test.py b/services/core/jobs/src/nmp/core/jobs/controllers/backends/test.py index 0604f82657..7ead19ac6a 100644 --- a/services/core/jobs/src/nmp/core/jobs/controllers/backends/test.py +++ b/services/core/jobs/src/nmp/core/jobs/controllers/backends/test.py @@ -6,14 
+6,14 @@ from nemo_platform.types.jobs import PlatformJobStepWithContext from nmp.common.jobs.schemas import PlatformJobStatus -from nmp.core.jobs.app.providers import CPUExecutionProvider, ExecutionProviderT, GPUExecutionProvider +from nmp.core.jobs.app.providers import ContainerExecutionProvider from nmp.core.jobs.app.schemas import BaseExecutionProfile from nmp.core.jobs.controllers.backends.base import JobBackend, JobExecutionProfileConfig, JobUpdate from nmp.core.jobs.controllers.backends.docker import DockerJobExecutionProfileConfig from nmp.core.jobs.controllers.backends.kubernetes import KubernetesJobExecutionProfileConfig from pydantic import Field -ProviderT = TypeVar("ProviderT", bound=ExecutionProviderT) +ProviderT = TypeVar("ProviderT", bound=ContainerExecutionProvider) class E2EJobExecutionProfile(BaseExecutionProfile): @@ -136,25 +136,25 @@ def cleanup_steps(self): return -class TestE2ECPUJobBackend(TestE2EJobBackend[CPUExecutionProvider]): +class TestE2ECPUJobBackend(TestE2EJobBackend[ContainerExecutionProvider]): pass -class TestE2EGPUJobBackend(TestE2EJobBackend[GPUExecutionProvider]): +class TestE2EGPUJobBackend(TestE2EJobBackend[ContainerExecutionProvider]): pass -class MockDockerCPUJobBackend(MockDockerJobBackend[CPUExecutionProvider]): +class MockDockerCPUJobBackend(MockDockerJobBackend[ContainerExecutionProvider]): pass -class MockDockerGPUJobBackend(MockDockerJobBackend[GPUExecutionProvider]): +class MockDockerGPUJobBackend(MockDockerJobBackend[ContainerExecutionProvider]): pass -class MockKubernetesCPUJobBackend(MockKubernetesJobBackend[CPUExecutionProvider]): +class MockKubernetesCPUJobBackend(MockKubernetesJobBackend[ContainerExecutionProvider]): pass -class MockKubernetesGPUJobBackend(MockKubernetesJobBackend[GPUExecutionProvider]): +class MockKubernetesGPUJobBackend(MockKubernetesJobBackend[ContainerExecutionProvider]): pass diff --git a/services/core/jobs/tests/conftest.py b/services/core/jobs/tests/conftest.py index 
03dbfbbbc4..c50df7f6ff 100644 --- a/services/core/jobs/tests/conftest.py +++ b/services/core/jobs/tests/conftest.py @@ -35,7 +35,7 @@ PlatformJobStepWithContext, ) from nmp.core.jobs.app.dispatcher import JobDispatcher -from nmp.core.jobs.app.providers import ContainerSpec, CPUExecutionProvider +from nmp.core.jobs.app.providers import ContainerExecutionProvider, ContainerSpec from nmp.core.jobs.app.schemas import ( PlatformJobEnvironmentVariable, PlatformJobStepSpec, @@ -212,6 +212,7 @@ def sample_job_dict(): { "name": "docker-step-cpu-1", "executor": { + "kind": "container", "provider": "cpu", "profile": "default", "container": {"image": "ubuntu:latest", "command": ["c1", "c2"], "entrypoint": ["a1", "a2"]}, @@ -221,6 +222,7 @@ def sample_job_dict(): { "name": "docker-step-gpu", "executor": { + "kind": "container", "provider": "gpu", "profile": "default", "container": {"image": "ubuntu:latest", "command": ["c1", "c2"], "entrypoint": ["a1", "a2"]}, @@ -231,6 +233,7 @@ def sample_job_dict(): { "name": "docker-step-no-command-or-entrypoint", "executor": { + "kind": "container", "provider": "cpu", "profile": "default", "container": {"image": "ubuntu:latest"}, @@ -341,7 +344,7 @@ def create_step_with_status(status: PlatformJobStatus) -> PlatformJobStepWithCon fileset="test-logs-fileset", step_spec=PlatformJobStepSpec( name="test-step", - executor=CPUExecutionProvider( + executor=ContainerExecutionProvider( provider="cpu", profile="default", container=ContainerSpec(image="test-image") ), config={}, @@ -385,16 +388,10 @@ def job_config_with_many_profiles() -> JobsServiceConfig: volume_name: test_jobs_storage jobs: - # Executor profiles configuration. The subprocess/default entry mirrors what - # ships in `packages/nmp_platform/config/local.yaml` and opts the documented - # `cpu/default` plugin steps into the cpu→subprocess translation in the Jobs - # API (see `translate_cpu_container_steps_to_subprocess`). 
Tests that submit - # jobs through the core /apis/jobs/v2/workspaces/{ws}/jobs endpoint with a - # `cpu/default` step will get rewritten to `subprocess/default` before - # validation, matching production deployment behavior. + # Executor profiles configuration. executors: - - provider: subprocess - profile: default + - provider: cpu + profile: subprocess backend: subprocess config: working_directory: /tmp/nmp-subprocess-jobs @@ -464,16 +461,14 @@ def backend_registry(mock_nmp_client, job_config_with_many_profiles) -> BackendR nmp_sdk=mock_nmp_client, profiles=job_config_with_many_profiles.executors, # Mock the backends. Register the real SubprocessJobBackend to satisfy - # the subprocess/default executor that ships in - # `job_config_with_many_profiles` (added so test_client picks up the - # subprocess profile and the cpu→subprocess translation in the Jobs - # API fires consistently with production deployments). + # the cpu/subprocess executor that ships in + # `job_config_with_many_profiles`. backends={ BackendKey("cpu", "docker"): MockDockerCPUJobBackend, BackendKey("gpu", "docker"): MockDockerGPUJobBackend, BackendKey("cpu", "kubernetes_job"): MockKubernetesCPUJobBackend, BackendKey("gpu", "kubernetes_job"): MockKubernetesGPUJobBackend, - BackendKey("subprocess", "subprocess"): SubprocessJobBackend, + BackendKey("cpu", "subprocess"): SubprocessJobBackend, }, ) @@ -513,8 +508,7 @@ def hello_world_job_config( @pytest_asyncio.fixture async def test_client(mock_dispatcher, mock_store, job_config_with_many_profiles) -> AsyncGenerator[AsyncClient, None]: - # Mock the config.executors to have the test execution profiles, including - # subprocess/default for cpu/default to subprocess/default translation. + # Mock the config.executors to have the test execution profiles. 
from nmp.common.auth.middleware import AuthorizationMiddleware from nmp.common.service.dependencies import get_sdk_client diff --git a/services/core/jobs/tests/controllers/test_base.py b/services/core/jobs/tests/controllers/test_base.py index 759e1e5f81..a74d816d47 100644 --- a/services/core/jobs/tests/controllers/test_base.py +++ b/services/core/jobs/tests/controllers/test_base.py @@ -9,7 +9,7 @@ from nmp.common.config import PlatformConfig from nmp.common.jobs.schemas import PlatformJobStatus from nmp.core.jobs.api.v2.jobs.schemas import PlatformJobStepWithContext -from nmp.core.jobs.app.providers import ContainerSpec, CPUExecutionProvider +from nmp.core.jobs.app.providers import ContainerSpec, ContainerExecutionProvider from nmp.core.jobs.app.schemas import PlatformJobStepSpec, StepLifecycle from nmp.core.jobs.controllers.backends.base import get_logs_endpoint_from_fileset from nmp.core.jobs.controllers.backends.test import MockKubernetesCPUJobBackend @@ -164,7 +164,7 @@ def _make_step( if step_spec is ...: step_spec = PlatformJobStepSpec( name="test-step", - executor=CPUExecutionProvider(provider="cpu", profile="default", container=ContainerSpec(image="img")), + executor=ContainerExecutionProvider(provider="cpu", profile="default", container=ContainerSpec(image="img")), config={}, lifecycle=StepLifecycle(staleness_timeout_seconds=staleness_timeout), ) diff --git a/services/core/jobs/tests/controllers/test_docker_backend.py b/services/core/jobs/tests/controllers/test_docker_backend.py index 0a9e1e4b08..7f8dfbc19f 100644 --- a/services/core/jobs/tests/controllers/test_docker_backend.py +++ b/services/core/jobs/tests/controllers/test_docker_backend.py @@ -39,8 +39,7 @@ ComputeResources, ComputeResourceSpec, ContainerSpec, - CPUExecutionProvider, - GPUExecutionProvider, + ContainerExecutionProvider, ) from nmp.core.jobs.app.schemas import ( PlatformJobEnvironmentVariable, @@ -128,7 +127,7 @@ def test_job_step(): name="test-step", step_spec=PlatformJobStepSpec( 
name="test-step", - executor=CPUExecutionProvider( + executor=ContainerExecutionProvider( provider="cpu", profile="default", container=ContainerSpec(image="test-image"), @@ -161,7 +160,7 @@ def test_job_step_with_persistence(): workspace="default", step_spec=PlatformJobStepSpec( name="test-step", - executor=CPUExecutionProvider( + executor=ContainerExecutionProvider( provider="cpu", profile="default", container=ContainerSpec(image="test-image"), @@ -467,7 +466,7 @@ def test_docker_job_sync_cancelling_sigkill(docker_job, docker_client_mock, test def test_docker_job_schedule_no_resources(docker_job, docker_client_mock): """Test that scheduling works with providers that don't have resources attribute.""" # Create a provider without resources attribute - provider = CPUExecutionProvider(container=ContainerSpec(image="test-image:latest")) + provider = ContainerExecutionProvider(container=ContainerSpec(image="test-image:latest")) test_job_step = PlatformJobStepWithContext( id="test-step-id", @@ -478,7 +477,7 @@ def test_docker_job_schedule_no_resources(docker_job, docker_client_mock): name="test-step", step_spec=PlatformJobStepSpec( name="test-step", - executor=CPUExecutionProvider( + executor=ContainerExecutionProvider( provider="cpu", profile="default", container=ContainerSpec(image="test-image"), @@ -516,7 +515,7 @@ def test_docker_job_schedule_no_resources(docker_job, docker_client_mock): def test_docker_job_schedule_with_secrets(docker_job, docker_client_mock): """Test that scheduling works when secrets are provided.""" # Create a provider without resources attribute - provider = CPUExecutionProvider(container=ContainerSpec(image="test-image:latest")) + provider = ContainerExecutionProvider(container=ContainerSpec(image="test-image:latest")) test_job_step = PlatformJobStepWithContext( id="test-step-id", @@ -527,7 +526,7 @@ def test_docker_job_schedule_with_secrets(docker_job, docker_client_mock): name="test-step", step_spec=PlatformJobStepSpec( name="test-step", - 
executor=CPUExecutionProvider( + executor=ContainerExecutionProvider( provider="cpu", profile="default", container=ContainerSpec(image="test-image"), @@ -576,7 +575,7 @@ def test_docker_job_nemo_job_secrets_format_same_and_cross_workspace(docker_job, Format must be ENV_VAR=workspace/secret_name per SECRETS.md; cross-workspace refs use the explicit workspace/secret_name from from_secret.name. """ - provider = CPUExecutionProvider(container=ContainerSpec(image="test-image:latest")) + provider = ContainerExecutionProvider(container=ContainerSpec(image="test-image:latest")) # Step in workspace "default"; one secret in same workspace, one in other workspace test_job_step = PlatformJobStepWithContext( id="test-step-id", @@ -587,7 +586,7 @@ def test_docker_job_nemo_job_secrets_format_same_and_cross_workspace(docker_job, name="test-step", step_spec=PlatformJobStepSpec( name="test-step", - executor=CPUExecutionProvider( + executor=ContainerExecutionProvider( provider="cpu", profile="default", container=ContainerSpec(image="test-image"), @@ -633,7 +632,7 @@ def test_docker_job_nemo_job_secrets_format_same_and_cross_workspace(docker_job, def test_docker_job_profile_environment_applied(mock_nmp_client, docker_client_mock, mock_platform_config): """Profile environment (e.g. 
HOME=/tmp) is applied to scheduled job containers.""" - provider = CPUExecutionProvider(container=ContainerSpec(image="test-image:latest")) + provider = ContainerExecutionProvider(container=ContainerSpec(image="test-image:latest")) config = DockerJobExecutionProfileConfig( storage=DockerJobStorageConfig(volume_name="test_jobs_storage"), env={"HOME": "/tmp"}, @@ -651,7 +650,7 @@ def test_docker_job_profile_environment_applied(mock_nmp_client, docker_client_m name="test-step", step_spec=PlatformJobStepSpec( name="test-step", - executor=CPUExecutionProvider( + executor=ContainerExecutionProvider( provider="cpu", profile="default", container=ContainerSpec(image="test-image"), @@ -690,7 +689,7 @@ def test_schedule_docker_gpu(mock_nmp_client, docker_client_mock): """Test successful job scheduling.""" gpus = 2 - gpu_executor_config = GPUExecutionProvider.model_validate( + gpu_executor_config = ContainerExecutionProvider.model_validate( { "provider": "gpu", "profile": "default", @@ -828,7 +827,7 @@ def test_schedule_docker_gpu(mock_nmp_client, docker_client_mock): def test_gpu_cleanup_on_job_completion(mock_nmp_client, docker_client_mock): """Test that GPU resources are released when a job completes successfully.""" - gpu_executor_config = GPUExecutionProvider.model_validate( + gpu_executor_config = ContainerExecutionProvider.model_validate( { "provider": "gpu", "profile": "default", @@ -928,7 +927,7 @@ def test_gpu_cleanup_on_job_completion(mock_nmp_client, docker_client_mock): def test_gpu_cleanup_on_job_error(mock_nmp_client, docker_client_mock): """Test that GPU resources are released when a job fails with an error.""" - gpu_executor_config = GPUExecutionProvider.model_validate( + gpu_executor_config = ContainerExecutionProvider.model_validate( { "provider": "gpu", "profile": "default", @@ -1836,7 +1835,7 @@ def test_job_step_with_auth_context(): name="test-step", step_spec=PlatformJobStepSpec( name="test-step", - executor=CPUExecutionProvider( + 
executor=ContainerExecutionProvider( provider="cpu", profile="default", container=ContainerSpec(image="test-image"), diff --git a/services/core/jobs/tests/controllers/test_kubernetes_backend.py b/services/core/jobs/tests/controllers/test_kubernetes_backend.py index 2986c892bd..a54b473b6d 100644 --- a/services/core/jobs/tests/controllers/test_kubernetes_backend.py +++ b/services/core/jobs/tests/controllers/test_kubernetes_backend.py @@ -28,7 +28,7 @@ JOB_WORKSPACE_ID_LABEL, KUBE_JOB_SELECTOR_LABELS, ) -from nmp.core.jobs.app.providers import ContainerSpec, CPUExecutionProvider, GPUExecutionProvider +from nmp.core.jobs.app.providers import ContainerSpec, ContainerExecutionProvider from nmp.core.jobs.app.schemas import ( PlatformJobEnvironmentVariable, PlatformJobSecretEnvironmentVariableRef, @@ -126,7 +126,7 @@ def kubernetes_execution_profile_config(): @pytest.fixture def cpu_execution_provider(): """Create a test CPU execution provider.""" - return CPUExecutionProvider( + return ContainerExecutionProvider( container=ContainerSpec( image="nvidia/cuda:11.8-runtime-ubuntu20.04", command=["python", "-c", "print('Hello World')"], @@ -1184,7 +1184,7 @@ def test_name_for_job_truncation(kubernetes_job): name="test-step-", # Job name with trailing dash. 
step_spec=PlatformJobStepSpec( name="test-step", - executor=CPUExecutionProvider( + executor=ContainerExecutionProvider( provider="cpu", profile="k8s_profile", container=ContainerSpec(image="test-image") ), config={"command": ["echo", "Hello"]}, @@ -1204,7 +1204,7 @@ def test_name_for_job_truncation(kubernetes_job): def test_schedule_kubernetes_gpu(mock_nmp_client, kubernetes_execution_profile_config): """Test successful job scheduling.""" - gpu_executor_config = GPUExecutionProvider.model_validate( + gpu_executor_config = ContainerExecutionProvider.model_validate( { "provider": "gpu", "profile": "default", @@ -1392,7 +1392,7 @@ def test_schedule_nemo_job_secrets_format_same_and_cross_workspace(kubernetes_jo fileset="test-logs-fileset", step_spec=PlatformJobStepSpec( name="test-step", - executor=CPUExecutionProvider( + executor=ContainerExecutionProvider( provider="cpu", profile="default", container=ContainerSpec(image="test-image") ), config={}, @@ -1439,7 +1439,7 @@ def test_schedule_without_storage_no_label(kubernetes_job, cpu_execution_provide fileset="test-logs-fileset", step_spec=PlatformJobStepSpec( name="test-step", - executor=CPUExecutionProvider( + executor=ContainerExecutionProvider( provider="cpu", profile="default", container=ContainerSpec(image="test-image") ), config={}, @@ -1705,7 +1705,7 @@ def test_step_pending_with_auth_context() -> PlatformJobStepWithContext: name="test-step", step_spec=PlatformJobStepSpec( name="test-step", - executor=CPUExecutionProvider( + executor=ContainerExecutionProvider( provider="cpu", profile="default", container=ContainerSpec(image="nvidia/cuda:11.8-runtime-ubuntu20.04"), diff --git a/services/core/jobs/tests/controllers/test_subprocess_backend.py b/services/core/jobs/tests/controllers/test_subprocess_backend.py index 2f40410326..0862a4beb2 100644 --- a/services/core/jobs/tests/controllers/test_subprocess_backend.py +++ b/services/core/jobs/tests/controllers/test_subprocess_backend.py @@ -27,7 +27,7 @@ def 
_subprocess_backend(mock_nmp_client, tmp_path, mock_platform_config) -> Subp def _step_with_command(step, command: list[str]): updated_step = step.model_copy(deep=True) updated_step.step_spec.executor = SubprocessExecutionProvider( - provider="subprocess", profile="default", command=command + provider="cpu", profile="subprocess", command=command ) return updated_step @@ -35,7 +35,7 @@ def _step_with_command(step, command: list[str]): def _step_with_unvalidated_command(step, command: list[str]): updated_step = step.model_copy(deep=True) updated_step.step_spec.executor = SubprocessExecutionProvider.model_construct( - provider="subprocess", profile="default", command=command + provider="cpu", profile="subprocess", command=command ) return updated_step @@ -219,8 +219,8 @@ def test_missing_command_fails_without_process(mock_nmp_client, tmp_path, mock_p def test_build_command_uses_current_interpreter_for_python_module_commands() -> None: executor = SubprocessExecutionProvider( - provider="subprocess", - profile="default", + provider="cpu", + profile="subprocess", command=["python", "-m", "nemo_evaluator.tasks.evaluate"], ) @@ -233,8 +233,8 @@ def test_build_command_uses_current_interpreter_for_python_module_commands() -> def test_build_command_uses_current_interpreter_for_python3_commands() -> None: executor = SubprocessExecutionProvider( - provider="subprocess", - profile="default", + provider="cpu", + profile="subprocess", command=["python3", "-m", "nemo_evaluator.tasks.evaluate"], ) @@ -251,8 +251,8 @@ def test_build_command_prefers_virtual_env_python(tmp_path) -> None: venv_python.write_text("#!/bin/sh\n", encoding="utf-8") venv_python.chmod(0o755) executor = SubprocessExecutionProvider( - provider="subprocess", - profile="default", + provider="cpu", + profile="subprocess", command=["python", "-m", "nemo_evaluator.tasks.evaluate"], ) diff --git a/services/core/jobs/tests/controllers/test_volcano_backend.py 
b/services/core/jobs/tests/controllers/test_volcano_backend.py index fcaf352ec2..17c034c7f2 100644 --- a/services/core/jobs/tests/controllers/test_volcano_backend.py +++ b/services/core/jobs/tests/controllers/test_volcano_backend.py @@ -29,7 +29,7 @@ JOB_WORKSPACE_ID_LABEL, KUBE_JOB_SELECTOR_LABELS, ) -from nmp.core.jobs.app.providers import ComputeResources, ContainerSpec, DistributedGPUExecutionProvider +from nmp.core.jobs.app.providers import ComputeResources, ContainerSpec, ContainerExecutionProvider from nmp.core.jobs.app.schemas import ( PlatformJobEnvironmentVariable, PlatformJobSecretEnvironmentVariableRef, @@ -115,7 +115,8 @@ def volcano_execution_profile_config(): @pytest.fixture def distributed_gpu_execution_provider(): """Create a test Distributed GPU execution provider.""" - return DistributedGPUExecutionProvider( + return ContainerExecutionProvider( + provider="gpu_distributed", container=ContainerSpec( image="nvidia/cuda:11.8-runtime-ubuntu20.04", command=["python", "-c", "print('Hello World')"], @@ -335,7 +336,8 @@ def test_schedule_job_single_node_success( volcano_job._custom_v1.create_namespaced_custom_object.return_value = MagicMock() # ty: ignore[invalid-assignment] # Tweak the distributed_gpu_execution_provider for this one - distributed_gpu_execution_provider = DistributedGPUExecutionProvider( + distributed_gpu_execution_provider = ContainerExecutionProvider( + provider="gpu_distributed", container=ContainerSpec( image="nvidia/cuda:11.8-runtime-ubuntu20.04", command=["python", "-c", "print('Hello World')"], @@ -469,7 +471,7 @@ def test_volcano_job_nemo_job_secrets_format_same_and_cross_workspace( fileset="test-logs-fileset", step_spec=PlatformJobStepSpec( name="test-step", - executor=DistributedGPUExecutionProvider( + executor=ContainerExecutionProvider( provider="gpu_distributed", profile="default", container=ContainerSpec(image="test-image"), @@ -551,7 +553,8 @@ def test_multi_node_networking_annotations_added( 
volcano_job._custom_v1.create_namespaced_custom_object.return_value = MagicMock() # ty: ignore[invalid-assignment] # Create multi-node job (num_nodes > 1) - distributed_gpu_execution_provider = DistributedGPUExecutionProvider( + distributed_gpu_execution_provider = ContainerExecutionProvider( + provider="gpu_distributed", container=ContainerSpec( image="nvidia/cuda:11.8-runtime-ubuntu20.04", command=["python", "-c", "print('Hello World')"], @@ -591,7 +594,8 @@ def test_single_node_no_networking_annotations( volcano_job._custom_v1.create_namespaced_custom_object.return_value = MagicMock() # ty: ignore[invalid-assignment] # Create single-node job (num_nodes = 1) - distributed_gpu_execution_provider = DistributedGPUExecutionProvider( + distributed_gpu_execution_provider = ContainerExecutionProvider( + provider="gpu_distributed", container=ContainerSpec( image="nvidia/cuda:11.8-runtime-ubuntu20.04", command=["python", "-c", "print('Hello World')"], @@ -658,7 +662,8 @@ def test_networking_annotations_disabled_via_config( volcano_job._custom_v1.create_namespaced_custom_object.return_value = MagicMock() # ty: ignore[invalid-assignment] # Create multi-node job (num_nodes > 1) - distributed_gpu_execution_provider = DistributedGPUExecutionProvider( + distributed_gpu_execution_provider = ContainerExecutionProvider( + provider="gpu_distributed", container=ContainerSpec( image="nvidia/cuda:11.8-runtime-ubuntu20.04", command=["python", "-c", "print('Hello World')"], @@ -801,7 +806,7 @@ def test_name_for_job_truncation(volcano_job: VolcanoJobBackend): name="test-step-", # Job name with trailing dash. 
step_spec=PlatformJobStepSpec( name="test-step", - executor=DistributedGPUExecutionProvider( + executor=ContainerExecutionProvider( provider="gpu_distributed", profile="volcano_profile", container=ContainerSpec(image="test-image") ), config={"command": ["echo", "Hello"]}, diff --git a/services/core/jobs/tests/test_config.py b/services/core/jobs/tests/test_config.py index f26eb23a48..b808ac5595 100644 --- a/services/core/jobs/tests/test_config.py +++ b/services/core/jobs/tests/test_config.py @@ -8,9 +8,8 @@ from nmp.common.config import Configuration, Runtime from nmp.core.jobs.app.providers import ( ComputeResources, + ContainerExecutionProvider, ContainerSpec, - CPUExecutionProvider, - GPUExecutionProvider, SubprocessExecutionProvider, ) from nmp.core.jobs.app.schemas import PlatformJobEnvironmentVariable @@ -67,7 +66,7 @@ def test_job_instantiation_and_validation(sample_job_dict): assert cpu_step.executor.profile == "default" assert cpu_step.executor.container.image == "ubuntu:latest" assert cpu_step.environment == [PlatformJobEnvironmentVariable(name="TEST_ENV", value="test_value")] - assert isinstance(cpu_step.executor, CPUExecutionProvider) + assert isinstance(cpu_step.executor, ContainerExecutionProvider) # Validate second step (GPU) gpu_step = job.platform_spec.steps[1] @@ -77,7 +76,7 @@ def test_job_instantiation_and_validation(sample_job_dict): assert gpu_step.executor.container.image == "ubuntu:latest" assert gpu_step.environment == [PlatformJobEnvironmentVariable(name="TEST_ENV", value="test_value")] assert gpu_step.executor.resources.num_gpus == 2 - assert isinstance(gpu_step.executor, GPUExecutionProvider) + assert isinstance(gpu_step.executor, ContainerExecutionProvider) def test_step_container_command_configuration(sample_job_dict): @@ -309,13 +308,13 @@ def test_default_profiles_include_subprocess_for_docker_runtime(): assert ("cpu", "default", "docker") in [(p.provider, p.profile, p.backend) for p in profiles] assert ("gpu", "default", "docker") in 
[(p.provider, p.profile, p.backend) for p in profiles] - assert ("subprocess", "default", "subprocess") in [(p.provider, p.profile, p.backend) for p in profiles] + assert ("cpu", "subprocess", "subprocess") in [(p.provider, p.profile, p.backend) for p in profiles] def test_default_profiles_include_subprocess_for_none_runtime(): profiles = get_default_executor_profiles_for_runtime(Runtime.NONE, DefaultExecutionProfileConfig()) - assert [(p.provider, p.profile, p.backend) for p in profiles] == [("subprocess", "default", "subprocess")] + assert [(p.provider, p.profile, p.backend) for p in profiles] == [("cpu", "subprocess", "subprocess")] def test_backend_registry_resolves_subprocess_default(mock_nmp_client): @@ -330,23 +329,23 @@ def __init__(self, nmp_sdk, execution_profile_config, profile_name): registry = BackendRegistry.from_config( nmp_sdk=mock_nmp_client, profiles=profiles, - backends={BackendKey("subprocess", "subprocess"): DummyBackend}, + backends={BackendKey("cpu", "subprocess"): DummyBackend}, ) - assert registry.get_backend(provider="subprocess", profile="default") is not None + assert registry.get_backend(provider="cpu", profile="subprocess") is not None -def test_subprocess_execution_profile_defaults_provider_to_subprocess(): - profile = SubprocessJobExecutionProfile(profile="default") +def test_subprocess_execution_profile_defaults_provider_to_cpu(): + profile = SubprocessJobExecutionProfile(profile="subprocess") - assert profile.provider == "subprocess" + assert profile.provider == "cpu" assert profile.backend == "subprocess" def test_default_profiles_exclude_subprocess_for_kubernetes_runtime(): profiles = get_default_executor_profiles_for_runtime(Runtime.KUBERNETES, DefaultExecutionProfileConfig()) - assert ("subprocess", "default", "subprocess") not in [(p.provider, p.profile, p.backend) for p in profiles] + assert ("cpu", "subprocess", "subprocess") not in [(p.provider, p.profile, p.backend) for p in profiles] def test_merged_profiles(): @@ -422,7 
+421,7 @@ def test_merged_profiles(): assert type(gpu_distributed.config) is VolcanoJobExecutionProfileConfig assert gpu_distributed.config.storage.pvc_name == "default-pvc" - subprocess_default = next((p for p in merged if p.provider == "subprocess" and p.profile == "default"), None) + subprocess_default = next((p for p in merged if p.provider == "cpu" and p.profile == "subprocess"), None) assert subprocess_default is not None assert type(subprocess_default.config) is SubprocessJobExecutionProfileConfig diff --git a/services/core/jobs/tests/test_jobs_api.py b/services/core/jobs/tests/test_jobs_api.py index 218c6f6c05..2bd8a4c076 100644 --- a/services/core/jobs/tests/test_jobs_api.py +++ b/services/core/jobs/tests/test_jobs_api.py @@ -25,7 +25,7 @@ PlatformJobStepsListFilter, ) from nmp.core.jobs.app.dispatcher import JobDispatcher -from nmp.core.jobs.app.providers import ContainerSpec, GPUExecutionProvider, SubprocessExecutionProvider +from nmp.core.jobs.app.providers import ContainerExecutionProvider, ContainerSpec, SubprocessExecutionProvider from nmp.core.jobs.app.schemas import ( PlatformJobSpec, PlatformJobStepSpec, @@ -48,23 +48,13 @@ def to_sdk_create_params(request: CreatePlatformJobRequest) -> Dict[str, Any]: return data -def expected_translated_executor_dump() -> Dict[str, Any]: +def expected_persisted_executor_dump() -> Dict[str, Any]: """Return the expected persisted executor for ``TestConstants.TEST_EXECUTOR``. - The Jobs API rewrites ``cpu/`` steps into ``subprocess/`` - steps before persistence (see - ``translate_cpu_container_steps_to_subprocess`` in - ``services/core/jobs/src/nmp/core/jobs/api/v2/jobs/endpoints.py``), so the - round-trip representation of a step submitted with ``TestConstants.TEST_EXECUTOR`` - is the translated subprocess executor — with ``command`` set to - ``container.entrypoint + container.command``. 
+ The executor is stored as-is — a ``ContainerExecutionProvider`` with + ``provider="cpu"`` and the container spec from the test constant. """ - container = TestConstants.TEST_EXECUTOR.container - return SubprocessExecutionProvider( - provider="subprocess", - profile=TestConstants.TEST_EXECUTOR.profile, - command=[*container.entrypoint, *container.command], - ).model_dump() + return TestConstants.TEST_EXECUTOR.model_dump() @pytest.mark.asyncio @@ -81,12 +71,6 @@ async def test_create_job_using_sdk(test_sdk: AsyncNeMoPlatform): "executor": { "provider": "cpu", "profile": "default", - # entrypoint+command are required so the cpu→subprocess - # translation hop in the Jobs API (see - # `translate_cpu_container_steps_to_subprocess`) can - # produce a non-empty subprocess command. Real plugin - # compilers always set both; mirroring that here keeps - # the SDK round-trip path realistic. "container": { "image": "test-image", "entrypoint": ["python", "-m"], @@ -238,7 +222,7 @@ async def test_create_job_gpu_fail_fast_when_docker_no_gpus(test_client: AsyncCl """Direct Jobs API create with GPU step fails fast with 422 when platform is Docker with no GPUs.""" from nmp.common.config import Runtime - gpu_executor = GPUExecutionProvider( + gpu_executor = ContainerExecutionProvider( provider="gpu", profile="default", container=ContainerSpec(image="gpu-image"), @@ -398,7 +382,7 @@ async def test_job_lifecycle_single_step(test_client: AsyncClient): # Assert that the platform_spec is created correctly assert len(get_data["platform_spec"]["steps"]) == 1 assert get_data["platform_spec"]["steps"][0]["name"] == "step1" - assert get_data["platform_spec"]["steps"][0]["executor"] == expected_translated_executor_dump() + assert get_data["platform_spec"]["steps"][0]["executor"] == expected_persisted_executor_dump() assert get_data["platform_spec"]["steps"][0]["config"] == {} # list all steps (scoped to this job name — list_steps injects filter.job = name) @@ -557,10 +541,10 @@ async def 
test_job_lifecycle_multi_step(test_client: AsyncClient): # Assert that the platform_spec is created correctly assert len(get_data["platform_spec"]["steps"]) == 2 assert get_data["platform_spec"]["steps"][0]["name"] == "step1" - assert get_data["platform_spec"]["steps"][0]["executor"] == expected_translated_executor_dump() + assert get_data["platform_spec"]["steps"][0]["executor"] == expected_persisted_executor_dump() assert get_data["platform_spec"]["steps"][0]["config"] == {} assert get_data["platform_spec"]["steps"][1]["name"] == "step2" - assert get_data["platform_spec"]["steps"][1]["executor"] == expected_translated_executor_dump() + assert get_data["platform_spec"]["steps"][1]["executor"] == expected_persisted_executor_dump() assert get_data["platform_spec"]["steps"][1]["config"] == {} # Assert from the api that the first step is created correctly diff --git a/services/core/jobs/tests/test_jobs_endpoint_translation.py b/services/core/jobs/tests/test_jobs_endpoint_translation.py index ed424ef5a7..4687a25d10 100644 --- a/services/core/jobs/tests/test_jobs_endpoint_translation.py +++ b/services/core/jobs/tests/test_jobs_endpoint_translation.py @@ -3,48 +3,18 @@ import pytest from fastapi import HTTPException -from nmp.core.jobs.api.v2.jobs.endpoints import translate_cpu_container_steps_to_subprocess, validate_job_spec -from nmp.core.jobs.app.providers import ContainerSpec, CPUExecutionProvider, SubprocessExecutionProvider +from nmp.core.jobs.api.v2.jobs.endpoints import validate_job_spec +from nmp.core.jobs.app.providers import SubprocessExecutionProvider from nmp.core.jobs.app.schemas import PlatformJobSpec, PlatformJobStepSpec from nmp.core.jobs.controllers.backends.docker import DockerJobExecutionProfile, DockerJobExecutionProfileConfig -def _cpu_step(name: str, profile: str = "default") -> PlatformJobStepSpec: - return PlatformJobStepSpec( - name=name, - executor=CPUExecutionProvider( - provider="cpu", - profile=profile, - 
container=ContainerSpec(image="image", entrypoint=["python", "-m"], command=["task"]), - ), - ) - - -def test_translate_cpu_container_steps_to_subprocess_uses_explicit_compat_profiles() -> None: - spec = PlatformJobSpec(steps=[_cpu_step("local-step"), _cpu_step("docker-step", profile="docker")]) - - translated = translate_cpu_container_steps_to_subprocess(spec, {"default"}) - - assert isinstance(translated.steps[0].executor, SubprocessExecutionProvider) - assert translated.steps[0].executor.command == ["python", "-m", "task"] - assert isinstance(translated.steps[1].executor, CPUExecutionProvider) - assert isinstance(spec.steps[0].executor, CPUExecutionProvider) - - -def test_translate_cpu_container_steps_to_subprocess_does_not_use_implicit_defaults() -> None: - spec = PlatformJobSpec(steps=[_cpu_step("docker-step")]) - - translated = translate_cpu_container_steps_to_subprocess(spec, set()) - - assert isinstance(translated.steps[0].executor, CPUExecutionProvider) - - def test_validate_job_spec_matches_provider_and_profile() -> None: spec = PlatformJobSpec( steps=[ PlatformJobStepSpec( name="local-step", - executor=SubprocessExecutionProvider(provider="subprocess", profile="default", command=["true"]), + executor=SubprocessExecutionProvider(provider="cpu", profile="subprocess", command=["true"]), ) ] ) @@ -54,5 +24,5 @@ def test_validate_job_spec_matches_provider_and_profile() -> None: ) ] - with pytest.raises(HTTPException, match="subprocess/default"): + with pytest.raises(HTTPException, match="cpu/subprocess"): validate_job_spec(spec, profiles) From 665e334a36fec4c86d91723cb252ab418c3cfc91 Mon Sep 17 00:00:00 2001 From: Matthew Grossman Date: Mon, 15 Jun 2026 12:50:48 -0700 Subject: [PATCH 02/17] self review Signed-off-by: Matthew Grossman --- e2e/test_jobs.py | 2 +- openapi/ga/individual/platform.openapi.yaml | 280 ++++++------------ openapi/ga/openapi.yaml | 280 ++++++------------ openapi/openapi.yaml | 280 ++++++------------ 
.../nemo_platform_plugin/jobs/api_factory.py | 11 +- .../src/nemo_platform_plugin/jobs/profile.py | 7 +- .../tests/api_factory/test_api_factory.py | 19 +- packages/nmp_common/tests/jobs/test_docker.py | 15 +- plugins/nemo-agents/openapi/openapi.yaml | 35 +++ .../nemo_agents_plugin/jobs/analyze_batch.py | 1 + .../nemo_agents_plugin/jobs/evaluate_agent.py | 1 + .../nemo_agents_plugin/jobs/evaluate_suite.py | 1 + .../nemo_agents_plugin/jobs/optimize_agent.py | 1 + .../jobs/optimize_skills.py | 1 + .../src/nemo_anonymizer_plugin/jobs/run.py | 5 +- .../nemo-data-designer/openapi/openapi.yaml | 7 + .../nemo_data_designer_plugin/jobs/create.py | 5 +- plugins/nemo-evaluator/openapi/openapi.yaml | 7 + .../src/nemo_evaluator/jobs/compiler.py | 5 +- .../openapi/openapi.yaml | 7 + .../api/v2/jobs/endpoints.py | 6 +- .../nemo-platform/.nmpcontext/openapi.yaml | 280 ++++++------------ .../nemo-platform/.nmpcontext/stainless.yaml | 8 +- .../src/nemo_platform/resources/jobs/api.md | 8 +- .../src/nemo_platform/types/jobs/__init__.py | 10 +- ...der.py => container_execution_provider.py} | 15 +- ... 
=> container_execution_provider_param.py} | 15 +- .../jobs/cpu_execution_provider_param.py | 46 --- .../distributed_gpu_execution_provider.py | 46 --- ...istributed_gpu_execution_provider_param.py | 46 --- .../types/jobs/gpu_execution_provider.py | 46 --- .../types/jobs/platform_job_step_spec.py | 7 +- .../jobs/platform_job_step_spec_param.py | 11 +- .../jobs/subprocess_execution_provider.py | 10 +- .../subprocess_execution_provider_param.py | 10 +- .../jobs/subprocess_job_execution_profile.py | 2 +- .../tests/api_resources/test_jobs.py | 18 +- sdk/stainless.yaml | 8 +- .../src/nmp/automodel/app/jobs/compiler.py | 11 +- .../automodel/app/jobs/training/compiler.py | 9 +- .../jobs/src/nmp/core/jobs/app/providers.py | 23 +- services/core/jobs/tests/conftest.py | 3 +- services/core/jobs/tests/test_jobs_api.py | 1 + .../src/nmp/core/models/api/v2/models.py | 5 +- .../nmp/hello_world/api/v2/jobs/endpoints.py | 5 +- .../src/nmp/unsloth/app/jobs/compiler.py | 11 +- .../nmp/unsloth/app/jobs/training/compiler.py | 5 +- 47 files changed, 581 insertions(+), 1054 deletions(-) rename sdk/python/nemo-platform/src/nemo_platform/types/jobs/{cpu_execution_provider.py => container_execution_provider.py} (75%) rename sdk/python/nemo-platform/src/nemo_platform/types/jobs/{gpu_execution_provider_param.py => container_execution_provider_param.py} (75%) delete mode 100644 sdk/python/nemo-platform/src/nemo_platform/types/jobs/cpu_execution_provider_param.py delete mode 100644 sdk/python/nemo-platform/src/nemo_platform/types/jobs/distributed_gpu_execution_provider.py delete mode 100644 sdk/python/nemo-platform/src/nemo_platform/types/jobs/distributed_gpu_execution_provider_param.py delete mode 100644 sdk/python/nemo-platform/src/nemo_platform/types/jobs/gpu_execution_provider.py diff --git a/e2e/test_jobs.py b/e2e/test_jobs.py index 12255b52df..fc9bd25c38 100644 --- a/e2e/test_jobs.py +++ b/e2e/test_jobs.py @@ -1,6 +1,6 @@ """E2E tests for platform jobs via the subprocess executor. 
-These tests submit jobs with CPUExecutionProviderSpec (container image + command). +These tests submit jobs with ContainerExecutionProviderSpec (container image + command). In subprocess mode, the jobs service translates cpu/default steps to subprocess steps automatically — the container image is discarded and the command runs directly on the host. diff --git a/openapi/ga/individual/platform.openapi.yaml b/openapi/ga/individual/platform.openapi.yaml index 386598e5cf..f387108325 100644 --- a/openapi/ga/individual/platform.openapi.yaml +++ b/openapi/ga/individual/platform.openapi.yaml @@ -8028,60 +8028,6 @@ components: title: Name title: BaseModelFilter type: object - CPUExecutionProviderInput: - properties: - provider: - type: string - const: cpu - title: Provider - default: cpu - profile: - type: string - title: Profile - default: default - container: - $ref: '#/components/schemas/ContainerSpec' - resources: - allOf: - - $ref: '#/components/schemas/ComputeResources' - description: Resource requests and limits for CPU execution. - type: object - required: - - container - title: CPUExecutionProviderInput - description: 'CPU-based execution provider. - - - Provides configuration for running jobs on CPU resources with - - resource requests and limits.' - CPUExecutionProviderOutput: - properties: - provider: - type: string - const: cpu - title: Provider - default: cpu - profile: - type: string - title: Profile - default: default - container: - $ref: '#/components/schemas/ContainerSpec' - resources: - allOf: - - $ref: '#/components/schemas/ComputeResources' - description: Resource requests and limits for CPU execution. - type: object - required: - - container - title: CPUExecutionProviderOutput - description: 'CPU-based execution provider. - - - Provides configuration for running jobs on CPU resources with - - resource requests and limits.' 
CacheStatsConfig: properties: enabled: @@ -8523,6 +8469,80 @@ components: type: object title: ComputeResources description: Resource requirements matching k8s ResourceRequirements format. + ContainerExecutionProviderInput: + properties: + kind: + type: string + const: container + title: Kind + default: container + provider: + type: string + enum: + - cpu + - gpu + - gpu_distributed + title: Provider + default: cpu + profile: + type: string + title: Profile + default: default + container: + $ref: '#/components/schemas/ContainerSpec' + resources: + allOf: + - $ref: '#/components/schemas/ComputeResources' + description: Resource requests and limits. + type: object + required: + - container + title: ContainerExecutionProviderInput + description: 'Container-based execution provider. + + + Runs a job step inside a container image. The ``provider`` field + + expresses compute intent (cpu, gpu, gpu_distributed) while ``kind`` + + identifies the payload shape.' + ContainerExecutionProviderOutput: + properties: + kind: + type: string + const: container + title: Kind + default: container + provider: + type: string + enum: + - cpu + - gpu + - gpu_distributed + title: Provider + default: cpu + profile: + type: string + title: Profile + default: default + container: + $ref: '#/components/schemas/ContainerSpec' + resources: + allOf: + - $ref: '#/components/schemas/ComputeResources' + description: Resource requests and limits. + type: object + required: + - container + title: ContainerExecutionProviderOutput + description: 'Container-based execution provider. + + + Runs a job step inside a container image. The ``provider`` field + + expresses compute intent (cpu, gpu, gpu_distributed) while ``kind`` + + identifies the payload shape.' ContainerExecutorConfig: properties: gpu: @@ -9246,60 +9266,6 @@ components: type: object title: DialogRails description: Configuration of topical rails. 
- DistributedGPUExecutionProviderInput: - properties: - provider: - type: string - const: gpu_distributed - title: Provider - default: gpu_distributed - profile: - type: string - title: Profile - default: default - container: - $ref: '#/components/schemas/ContainerSpec' - resources: - allOf: - - $ref: '#/components/schemas/ComputeResources' - description: Resource requests and limits for distributed GPU execution. - type: object - required: - - container - title: DistributedGPUExecutionProviderInput - description: 'GPU-based execution provider. - - - Provides configuration for running jobs on GPU resources with - - resource requests and limits.' - DistributedGPUExecutionProviderOutput: - properties: - provider: - type: string - const: gpu_distributed - title: Provider - default: gpu_distributed - profile: - type: string - title: Profile - default: default - container: - $ref: '#/components/schemas/ContainerSpec' - resources: - allOf: - - $ref: '#/components/schemas/ComputeResources' - description: Resource requests and limits for distributed GPU execution. - type: object - required: - - container - title: DistributedGPUExecutionProviderOutput - description: 'GPU-based execution provider. - - - Provides configuration for running jobs on GPU resources with - - resource requests and limits.' DockerJobExecutionProfile: properties: provider: @@ -10753,60 +10719,6 @@ components: type: object title: GLiNERDetectionOptions description: Configuration options for GLiNER. - GPUExecutionProviderInput: - properties: - provider: - type: string - const: gpu - title: Provider - default: gpu - profile: - type: string - title: Profile - default: default - container: - $ref: '#/components/schemas/ContainerSpec' - resources: - allOf: - - $ref: '#/components/schemas/ComputeResources' - description: Resource requests and limits for GPU execution. - type: object - required: - - container - title: GPUExecutionProviderInput - description: 'GPU-based execution provider. 
- - - Provides configuration for running jobs on GPU resources with - - resource requests and limits.' - GPUExecutionProviderOutput: - properties: - provider: - type: string - const: gpu - title: Provider - default: gpu - profile: - type: string - title: Profile - default: default - container: - $ref: '#/components/schemas/ContainerSpec' - resources: - allOf: - - $ref: '#/components/schemas/ComputeResources' - description: Resource requests and limits for GPU execution. - type: object - required: - - container - title: GPUExecutionProviderOutput - description: 'GPU-based execution provider. - - - Provides configuration for running jobs on GPU resources with - - resource requests and limits.' GenerationLog: properties: activated_rails: @@ -14576,18 +14488,14 @@ components: type: array executor: oneOf: - - $ref: '#/components/schemas/CPUExecutionProviderInput' - - $ref: '#/components/schemas/GPUExecutionProviderInput' - - $ref: '#/components/schemas/DistributedGPUExecutionProviderInput' + - $ref: '#/components/schemas/ContainerExecutionProviderInput' - $ref: '#/components/schemas/SubprocessExecutionProvider' title: Executor description: The executor for the step discriminator: - propertyName: provider + propertyName: kind mapping: - cpu: '#/components/schemas/CPUExecutionProviderInput' - gpu: '#/components/schemas/GPUExecutionProviderInput' - gpu_distributed: '#/components/schemas/DistributedGPUExecutionProviderInput' + container: '#/components/schemas/ContainerExecutionProviderInput' subprocess: '#/components/schemas/SubprocessExecutionProvider' config: additionalProperties: true @@ -14626,18 +14534,14 @@ components: type: array executor: oneOf: - - $ref: '#/components/schemas/CPUExecutionProviderOutput' - - $ref: '#/components/schemas/GPUExecutionProviderOutput' - - $ref: '#/components/schemas/DistributedGPUExecutionProviderOutput' + - $ref: '#/components/schemas/ContainerExecutionProviderOutput' - $ref: '#/components/schemas/SubprocessExecutionProvider' title: 
Executor description: The executor for the step discriminator: - propertyName: provider + propertyName: kind mapping: - cpu: '#/components/schemas/CPUExecutionProviderOutput' - gpu: '#/components/schemas/GPUExecutionProviderOutput' - gpu_distributed: '#/components/schemas/DistributedGPUExecutionProviderOutput' + container: '#/components/schemas/ContainerExecutionProviderOutput' subprocess: '#/components/schemas/SubprocessExecutionProvider' config: additionalProperties: true @@ -16469,15 +16373,22 @@ components: type: object SubprocessExecutionProvider: properties: - provider: + kind: type: string const: subprocess - title: Provider + title: Kind default: subprocess + provider: + type: string + enum: + - cpu + - gpu + title: Provider + default: cpu profile: type: string title: Profile - default: default + default: subprocess command: items: type: string @@ -16487,14 +16398,19 @@ components: required: - command title: SubprocessExecutionProvider - description: Host subprocess execution provider. + description: 'Host subprocess execution provider. + + + Runs a job step as a local OS process. The ``provider`` field + + expresses compute intent while ``kind`` identifies the payload shape.' 
SubprocessJobExecutionProfile: properties: provider: type: string - const: subprocess + const: cpu title: Provider - default: subprocess + default: cpu profile: type: string title: Profile diff --git a/openapi/ga/openapi.yaml b/openapi/ga/openapi.yaml index 386598e5cf..f387108325 100644 --- a/openapi/ga/openapi.yaml +++ b/openapi/ga/openapi.yaml @@ -8028,60 +8028,6 @@ components: title: Name title: BaseModelFilter type: object - CPUExecutionProviderInput: - properties: - provider: - type: string - const: cpu - title: Provider - default: cpu - profile: - type: string - title: Profile - default: default - container: - $ref: '#/components/schemas/ContainerSpec' - resources: - allOf: - - $ref: '#/components/schemas/ComputeResources' - description: Resource requests and limits for CPU execution. - type: object - required: - - container - title: CPUExecutionProviderInput - description: 'CPU-based execution provider. - - - Provides configuration for running jobs on CPU resources with - - resource requests and limits.' - CPUExecutionProviderOutput: - properties: - provider: - type: string - const: cpu - title: Provider - default: cpu - profile: - type: string - title: Profile - default: default - container: - $ref: '#/components/schemas/ContainerSpec' - resources: - allOf: - - $ref: '#/components/schemas/ComputeResources' - description: Resource requests and limits for CPU execution. - type: object - required: - - container - title: CPUExecutionProviderOutput - description: 'CPU-based execution provider. - - - Provides configuration for running jobs on CPU resources with - - resource requests and limits.' CacheStatsConfig: properties: enabled: @@ -8523,6 +8469,80 @@ components: type: object title: ComputeResources description: Resource requirements matching k8s ResourceRequirements format. 
+ ContainerExecutionProviderInput: + properties: + kind: + type: string + const: container + title: Kind + default: container + provider: + type: string + enum: + - cpu + - gpu + - gpu_distributed + title: Provider + default: cpu + profile: + type: string + title: Profile + default: default + container: + $ref: '#/components/schemas/ContainerSpec' + resources: + allOf: + - $ref: '#/components/schemas/ComputeResources' + description: Resource requests and limits. + type: object + required: + - container + title: ContainerExecutionProviderInput + description: 'Container-based execution provider. + + + Runs a job step inside a container image. The ``provider`` field + + expresses compute intent (cpu, gpu, gpu_distributed) while ``kind`` + + identifies the payload shape.' + ContainerExecutionProviderOutput: + properties: + kind: + type: string + const: container + title: Kind + default: container + provider: + type: string + enum: + - cpu + - gpu + - gpu_distributed + title: Provider + default: cpu + profile: + type: string + title: Profile + default: default + container: + $ref: '#/components/schemas/ContainerSpec' + resources: + allOf: + - $ref: '#/components/schemas/ComputeResources' + description: Resource requests and limits. + type: object + required: + - container + title: ContainerExecutionProviderOutput + description: 'Container-based execution provider. + + + Runs a job step inside a container image. The ``provider`` field + + expresses compute intent (cpu, gpu, gpu_distributed) while ``kind`` + + identifies the payload shape.' ContainerExecutorConfig: properties: gpu: @@ -9246,60 +9266,6 @@ components: type: object title: DialogRails description: Configuration of topical rails. 
- DistributedGPUExecutionProviderInput: - properties: - provider: - type: string - const: gpu_distributed - title: Provider - default: gpu_distributed - profile: - type: string - title: Profile - default: default - container: - $ref: '#/components/schemas/ContainerSpec' - resources: - allOf: - - $ref: '#/components/schemas/ComputeResources' - description: Resource requests and limits for distributed GPU execution. - type: object - required: - - container - title: DistributedGPUExecutionProviderInput - description: 'GPU-based execution provider. - - - Provides configuration for running jobs on GPU resources with - - resource requests and limits.' - DistributedGPUExecutionProviderOutput: - properties: - provider: - type: string - const: gpu_distributed - title: Provider - default: gpu_distributed - profile: - type: string - title: Profile - default: default - container: - $ref: '#/components/schemas/ContainerSpec' - resources: - allOf: - - $ref: '#/components/schemas/ComputeResources' - description: Resource requests and limits for distributed GPU execution. - type: object - required: - - container - title: DistributedGPUExecutionProviderOutput - description: 'GPU-based execution provider. - - - Provides configuration for running jobs on GPU resources with - - resource requests and limits.' DockerJobExecutionProfile: properties: provider: @@ -10753,60 +10719,6 @@ components: type: object title: GLiNERDetectionOptions description: Configuration options for GLiNER. - GPUExecutionProviderInput: - properties: - provider: - type: string - const: gpu - title: Provider - default: gpu - profile: - type: string - title: Profile - default: default - container: - $ref: '#/components/schemas/ContainerSpec' - resources: - allOf: - - $ref: '#/components/schemas/ComputeResources' - description: Resource requests and limits for GPU execution. - type: object - required: - - container - title: GPUExecutionProviderInput - description: 'GPU-based execution provider. 
- - - Provides configuration for running jobs on GPU resources with - - resource requests and limits.' - GPUExecutionProviderOutput: - properties: - provider: - type: string - const: gpu - title: Provider - default: gpu - profile: - type: string - title: Profile - default: default - container: - $ref: '#/components/schemas/ContainerSpec' - resources: - allOf: - - $ref: '#/components/schemas/ComputeResources' - description: Resource requests and limits for GPU execution. - type: object - required: - - container - title: GPUExecutionProviderOutput - description: 'GPU-based execution provider. - - - Provides configuration for running jobs on GPU resources with - - resource requests and limits.' GenerationLog: properties: activated_rails: @@ -14576,18 +14488,14 @@ components: type: array executor: oneOf: - - $ref: '#/components/schemas/CPUExecutionProviderInput' - - $ref: '#/components/schemas/GPUExecutionProviderInput' - - $ref: '#/components/schemas/DistributedGPUExecutionProviderInput' + - $ref: '#/components/schemas/ContainerExecutionProviderInput' - $ref: '#/components/schemas/SubprocessExecutionProvider' title: Executor description: The executor for the step discriminator: - propertyName: provider + propertyName: kind mapping: - cpu: '#/components/schemas/CPUExecutionProviderInput' - gpu: '#/components/schemas/GPUExecutionProviderInput' - gpu_distributed: '#/components/schemas/DistributedGPUExecutionProviderInput' + container: '#/components/schemas/ContainerExecutionProviderInput' subprocess: '#/components/schemas/SubprocessExecutionProvider' config: additionalProperties: true @@ -14626,18 +14534,14 @@ components: type: array executor: oneOf: - - $ref: '#/components/schemas/CPUExecutionProviderOutput' - - $ref: '#/components/schemas/GPUExecutionProviderOutput' - - $ref: '#/components/schemas/DistributedGPUExecutionProviderOutput' + - $ref: '#/components/schemas/ContainerExecutionProviderOutput' - $ref: '#/components/schemas/SubprocessExecutionProvider' title: 
Executor description: The executor for the step discriminator: - propertyName: provider + propertyName: kind mapping: - cpu: '#/components/schemas/CPUExecutionProviderOutput' - gpu: '#/components/schemas/GPUExecutionProviderOutput' - gpu_distributed: '#/components/schemas/DistributedGPUExecutionProviderOutput' + container: '#/components/schemas/ContainerExecutionProviderOutput' subprocess: '#/components/schemas/SubprocessExecutionProvider' config: additionalProperties: true @@ -16469,15 +16373,22 @@ components: type: object SubprocessExecutionProvider: properties: - provider: + kind: type: string const: subprocess - title: Provider + title: Kind default: subprocess + provider: + type: string + enum: + - cpu + - gpu + title: Provider + default: cpu profile: type: string title: Profile - default: default + default: subprocess command: items: type: string @@ -16487,14 +16398,19 @@ components: required: - command title: SubprocessExecutionProvider - description: Host subprocess execution provider. + description: 'Host subprocess execution provider. + + + Runs a job step as a local OS process. The ``provider`` field + + expresses compute intent while ``kind`` identifies the payload shape.' SubprocessJobExecutionProfile: properties: provider: type: string - const: subprocess + const: cpu title: Provider - default: subprocess + default: cpu profile: type: string title: Profile diff --git a/openapi/openapi.yaml b/openapi/openapi.yaml index 386598e5cf..f387108325 100644 --- a/openapi/openapi.yaml +++ b/openapi/openapi.yaml @@ -8028,60 +8028,6 @@ components: title: Name title: BaseModelFilter type: object - CPUExecutionProviderInput: - properties: - provider: - type: string - const: cpu - title: Provider - default: cpu - profile: - type: string - title: Profile - default: default - container: - $ref: '#/components/schemas/ContainerSpec' - resources: - allOf: - - $ref: '#/components/schemas/ComputeResources' - description: Resource requests and limits for CPU execution. 
- type: object - required: - - container - title: CPUExecutionProviderInput - description: 'CPU-based execution provider. - - - Provides configuration for running jobs on CPU resources with - - resource requests and limits.' - CPUExecutionProviderOutput: - properties: - provider: - type: string - const: cpu - title: Provider - default: cpu - profile: - type: string - title: Profile - default: default - container: - $ref: '#/components/schemas/ContainerSpec' - resources: - allOf: - - $ref: '#/components/schemas/ComputeResources' - description: Resource requests and limits for CPU execution. - type: object - required: - - container - title: CPUExecutionProviderOutput - description: 'CPU-based execution provider. - - - Provides configuration for running jobs on CPU resources with - - resource requests and limits.' CacheStatsConfig: properties: enabled: @@ -8523,6 +8469,80 @@ components: type: object title: ComputeResources description: Resource requirements matching k8s ResourceRequirements format. + ContainerExecutionProviderInput: + properties: + kind: + type: string + const: container + title: Kind + default: container + provider: + type: string + enum: + - cpu + - gpu + - gpu_distributed + title: Provider + default: cpu + profile: + type: string + title: Profile + default: default + container: + $ref: '#/components/schemas/ContainerSpec' + resources: + allOf: + - $ref: '#/components/schemas/ComputeResources' + description: Resource requests and limits. + type: object + required: + - container + title: ContainerExecutionProviderInput + description: 'Container-based execution provider. + + + Runs a job step inside a container image. The ``provider`` field + + expresses compute intent (cpu, gpu, gpu_distributed) while ``kind`` + + identifies the payload shape.' 
+ ContainerExecutionProviderOutput: + properties: + kind: + type: string + const: container + title: Kind + default: container + provider: + type: string + enum: + - cpu + - gpu + - gpu_distributed + title: Provider + default: cpu + profile: + type: string + title: Profile + default: default + container: + $ref: '#/components/schemas/ContainerSpec' + resources: + allOf: + - $ref: '#/components/schemas/ComputeResources' + description: Resource requests and limits. + type: object + required: + - container + title: ContainerExecutionProviderOutput + description: 'Container-based execution provider. + + + Runs a job step inside a container image. The ``provider`` field + + expresses compute intent (cpu, gpu, gpu_distributed) while ``kind`` + + identifies the payload shape.' ContainerExecutorConfig: properties: gpu: @@ -9246,60 +9266,6 @@ components: type: object title: DialogRails description: Configuration of topical rails. - DistributedGPUExecutionProviderInput: - properties: - provider: - type: string - const: gpu_distributed - title: Provider - default: gpu_distributed - profile: - type: string - title: Profile - default: default - container: - $ref: '#/components/schemas/ContainerSpec' - resources: - allOf: - - $ref: '#/components/schemas/ComputeResources' - description: Resource requests and limits for distributed GPU execution. - type: object - required: - - container - title: DistributedGPUExecutionProviderInput - description: 'GPU-based execution provider. - - - Provides configuration for running jobs on GPU resources with - - resource requests and limits.' 
- DistributedGPUExecutionProviderOutput: - properties: - provider: - type: string - const: gpu_distributed - title: Provider - default: gpu_distributed - profile: - type: string - title: Profile - default: default - container: - $ref: '#/components/schemas/ContainerSpec' - resources: - allOf: - - $ref: '#/components/schemas/ComputeResources' - description: Resource requests and limits for distributed GPU execution. - type: object - required: - - container - title: DistributedGPUExecutionProviderOutput - description: 'GPU-based execution provider. - - - Provides configuration for running jobs on GPU resources with - - resource requests and limits.' DockerJobExecutionProfile: properties: provider: @@ -10753,60 +10719,6 @@ components: type: object title: GLiNERDetectionOptions description: Configuration options for GLiNER. - GPUExecutionProviderInput: - properties: - provider: - type: string - const: gpu - title: Provider - default: gpu - profile: - type: string - title: Profile - default: default - container: - $ref: '#/components/schemas/ContainerSpec' - resources: - allOf: - - $ref: '#/components/schemas/ComputeResources' - description: Resource requests and limits for GPU execution. - type: object - required: - - container - title: GPUExecutionProviderInput - description: 'GPU-based execution provider. - - - Provides configuration for running jobs on GPU resources with - - resource requests and limits.' - GPUExecutionProviderOutput: - properties: - provider: - type: string - const: gpu - title: Provider - default: gpu - profile: - type: string - title: Profile - default: default - container: - $ref: '#/components/schemas/ContainerSpec' - resources: - allOf: - - $ref: '#/components/schemas/ComputeResources' - description: Resource requests and limits for GPU execution. - type: object - required: - - container - title: GPUExecutionProviderOutput - description: 'GPU-based execution provider. 
- - - Provides configuration for running jobs on GPU resources with - - resource requests and limits.' GenerationLog: properties: activated_rails: @@ -14576,18 +14488,14 @@ components: type: array executor: oneOf: - - $ref: '#/components/schemas/CPUExecutionProviderInput' - - $ref: '#/components/schemas/GPUExecutionProviderInput' - - $ref: '#/components/schemas/DistributedGPUExecutionProviderInput' + - $ref: '#/components/schemas/ContainerExecutionProviderInput' - $ref: '#/components/schemas/SubprocessExecutionProvider' title: Executor description: The executor for the step discriminator: - propertyName: provider + propertyName: kind mapping: - cpu: '#/components/schemas/CPUExecutionProviderInput' - gpu: '#/components/schemas/GPUExecutionProviderInput' - gpu_distributed: '#/components/schemas/DistributedGPUExecutionProviderInput' + container: '#/components/schemas/ContainerExecutionProviderInput' subprocess: '#/components/schemas/SubprocessExecutionProvider' config: additionalProperties: true @@ -14626,18 +14534,14 @@ components: type: array executor: oneOf: - - $ref: '#/components/schemas/CPUExecutionProviderOutput' - - $ref: '#/components/schemas/GPUExecutionProviderOutput' - - $ref: '#/components/schemas/DistributedGPUExecutionProviderOutput' + - $ref: '#/components/schemas/ContainerExecutionProviderOutput' - $ref: '#/components/schemas/SubprocessExecutionProvider' title: Executor description: The executor for the step discriminator: - propertyName: provider + propertyName: kind mapping: - cpu: '#/components/schemas/CPUExecutionProviderOutput' - gpu: '#/components/schemas/GPUExecutionProviderOutput' - gpu_distributed: '#/components/schemas/DistributedGPUExecutionProviderOutput' + container: '#/components/schemas/ContainerExecutionProviderOutput' subprocess: '#/components/schemas/SubprocessExecutionProvider' config: additionalProperties: true @@ -16469,15 +16373,22 @@ components: type: object SubprocessExecutionProvider: properties: - provider: + kind: type: 
string const: subprocess - title: Provider + title: Kind default: subprocess + provider: + type: string + enum: + - cpu + - gpu + title: Provider + default: cpu profile: type: string title: Profile - default: default + default: subprocess command: items: type: string @@ -16487,14 +16398,19 @@ components: required: - command title: SubprocessExecutionProvider - description: Host subprocess execution provider. + description: 'Host subprocess execution provider. + + + Runs a job step as a local OS process. The ``provider`` field + + expresses compute intent while ``kind`` identifies the payload shape.' SubprocessJobExecutionProfile: properties: provider: type: string - const: subprocess + const: cpu title: Provider - default: subprocess + default: cpu profile: type: string title: Profile diff --git a/packages/nemo_platform_plugin/src/nemo_platform_plugin/jobs/api_factory.py b/packages/nemo_platform_plugin/src/nemo_platform_plugin/jobs/api_factory.py index 0d06ce0a27..f5080b80a0 100644 --- a/packages/nemo_platform_plugin/src/nemo_platform_plugin/jobs/api_factory.py +++ b/packages/nemo_platform_plugin/src/nemo_platform_plugin/jobs/api_factory.py @@ -34,10 +34,8 @@ from nemo_platform.types.jobs import ( ComputeResourcesParam, ComputeResourceSpecParam, + ContainerExecutionProviderParam, ContainerSpecParam, - CPUExecutionProviderParam, - DistributedGPUExecutionProviderParam, - GPUExecutionProviderParam, PlatformJobEnvironmentVariableParam, PlatformJobSecretEnvironmentVariableRefParam, PlatformJobSpecParam, @@ -75,13 +73,8 @@ PlatformJobStep = PlatformJobStepSpecParam StepLifecycle = StepLifecycleParam ExecutorSpec = Executor -CPUExecutionProviderSpec = CPUExecutionProviderParam -GPUExecutionProviderSpec = GPUExecutionProviderParam -DistributedGPUExecutionProviderSpec = DistributedGPUExecutionProviderParam +ContainerExecutionProviderSpec = ContainerExecutionProviderParam SubprocessExecutionProviderSpec = SubprocessExecutionProviderParam -# Container providers (cpu, gpu, 
gpu_distributed) share the same shape today. -# This alias will point at a dedicated SDK type once the SDK is regenerated. -ContainerExecutionProviderSpec = CPUExecutionProviderParam ResourcesSpec = ComputeResourcesParam ResourcesLimitsSpec = ComputeResourceSpecParam ResourcesRequestsSpec = ComputeResourceSpecParam diff --git a/packages/nemo_platform_plugin/src/nemo_platform_plugin/jobs/profile.py b/packages/nemo_platform_plugin/src/nemo_platform_plugin/jobs/profile.py index 1e45c7405a..fc82afcc75 100644 --- a/packages/nemo_platform_plugin/src/nemo_platform_plugin/jobs/profile.py +++ b/packages/nemo_platform_plugin/src/nemo_platform_plugin/jobs/profile.py @@ -23,10 +23,9 @@ def compile(self, *, profile, ...): Why it lives in ``nemo_platform_plugin`` and not in the Jobs service: Each step's ``executor`` is a discriminated union -(``CPUExecutionProviderParam`` / ``GPUExecutionProviderParam`` / -``DistributedGPUExecutionProviderParam``) from the generated -``nemo_platform`` SDK. All three carry a ``profile: str`` field — that's the -only attribute the stamper touches. Keeping the helper in ``nemo_platform_plugin`` +(``ContainerExecutionProviderParam`` / ``SubprocessExecutionProviderParam``) +from the generated ``nemo_platform`` SDK. Both carry a ``profile: str`` +field — that's the only attribute the stamper touches. Keeping the helper in ``nemo_platform_plugin`` alongside the factory avoids dragging plugin-service code through the Jobs service's internals and matches where ``add_job_routes()`` already lives. 
""" diff --git a/packages/nmp_common/tests/api_factory/test_api_factory.py b/packages/nmp_common/tests/api_factory/test_api_factory.py index cff87af28f..0d11c2d21e 100644 --- a/packages/nmp_common/tests/api_factory/test_api_factory.py +++ b/packages/nmp_common/tests/api_factory/test_api_factory.py @@ -18,8 +18,8 @@ from nemo_platform.types.shared.platform_job_status import PlatformJobStatus from nemo_platform_plugin.entities import EntityClient from nemo_platform_plugin.jobs.api_factory import ( + ContainerExecutionProviderSpec, ContainerSpec, - CPUExecutionProviderSpec, FileResultSerializer, JobRouteOption, PlatformJobResultRoute, @@ -69,7 +69,8 @@ def foo_job_config_compiler( steps=[ PlatformJobStep( name="foo_step", - executor=CPUExecutionProviderSpec( + executor=ContainerExecutionProviderSpec( + kind="container", provider="cpu", profile="default", container=ContainerSpec( @@ -121,7 +122,7 @@ def test_api_factory_routes(): def test_validate_job_spec(): - executor = CPUExecutionProviderSpec(provider="cpu", profile="default", container=ContainerSpec(image="foo_image")) + executor = ContainerExecutionProviderSpec(kind="container", provider="cpu", profile="default", container=ContainerSpec(image="foo_image")) valid_job = PlatformJobSpec( steps=[ PlatformJobStep( @@ -1396,7 +1397,8 @@ def compiler( steps=[ PlatformJobStep( name="test_step", - executor=CPUExecutionProviderSpec( + executor=ContainerExecutionProviderSpec( + kind="container", provider="cpu", profile="default", container=ContainerSpec(image="test_image"), @@ -1443,7 +1445,8 @@ def sync_compiler( steps=[ PlatformJobStep( name="test_step", - executor=CPUExecutionProviderSpec( + executor=ContainerExecutionProviderSpec( + kind="container", provider="cpu", profile="default", container=ContainerSpec(image="test_image"), @@ -1659,7 +1662,8 @@ def _make_platform_spec(self, output_spec: FooJobConfig) -> PlatformJobSpec: steps=[ PlatformJobStep( name="step", - executor=CPUExecutionProviderSpec( + 
executor=ContainerExecutionProviderSpec( + kind="container", provider="cpu", profile="default", container=ContainerSpec(image="img"), @@ -1719,7 +1723,8 @@ def compiler_bad_config(workspace, input_spec, output_spec, entity_client, job_n steps=[ PlatformJobStep( name="step", - executor=CPUExecutionProviderSpec( + executor=ContainerExecutionProviderSpec( + kind="container", provider="cpu", profile="default", container=ContainerSpec(image="img"), diff --git a/packages/nmp_common/tests/jobs/test_docker.py b/packages/nmp_common/tests/jobs/test_docker.py index 60b54000b0..042a3c057f 100644 --- a/packages/nmp_common/tests/jobs/test_docker.py +++ b/packages/nmp_common/tests/jobs/test_docker.py @@ -7,9 +7,8 @@ import pytest from nemo_platform_plugin.jobs.api_factory import ( + ContainerExecutionProviderSpec, ContainerSpec, - CPUExecutionProviderSpec, - GPUExecutionProviderSpec, PlatformJobSpec, PlatformJobStep, ) @@ -20,11 +19,11 @@ def test_spec_has_gpu_step(): """spec_has_gpu_step returns True when any step has provider gpu or gpu_distributed.""" - cpu_executor = CPUExecutionProviderSpec( - provider="cpu", profile="default", container=ContainerSpec(image="foo_image") + cpu_executor = ContainerExecutionProviderSpec( + kind="container", provider="cpu", profile="default", container=ContainerSpec(image="foo_image") ) - gpu_executor = GPUExecutionProviderSpec( - provider="gpu", profile="default", container=ContainerSpec(image="gpu_image") + gpu_executor = ContainerExecutionProviderSpec( + kind="container", provider="gpu", profile="default", container=ContainerSpec(image="gpu_image") ) cpu_only_job = PlatformJobSpec( steps=[ @@ -61,8 +60,8 @@ def test_spec_has_gpu_step(): ) def test_validate_gpu_available_for_docker(runtime, reserved_gpu_ids, config_raises, expect_raise, message_contains): """GPU job validation: raise when Docker has no GPUs; pass or skip otherwise.""" - gpu_executor = GPUExecutionProviderSpec( - provider="gpu", profile="default", 
container=ContainerSpec(image="gpu_image") + gpu_executor = ContainerExecutionProviderSpec( + kind="container", provider="gpu", profile="default", container=ContainerSpec(image="gpu_image") ) gpu_job = PlatformJobSpec( steps=[ diff --git a/plugins/nemo-agents/openapi/openapi.yaml b/plugins/nemo-agents/openapi/openapi.yaml index 18c511b9f1..5e52cecc7f 100644 --- a/plugins/nemo-agents/openapi/openapi.yaml +++ b/plugins/nemo-agents/openapi/openapi.yaml @@ -2549,6 +2549,13 @@ components: title: Custom Fields additionalProperties: true type: object + profile: + title: Profile + type: string + options: + title: Options + additionalProperties: true + type: object type: object required: - spec @@ -2838,6 +2845,13 @@ components: title: Custom Fields additionalProperties: true type: object + profile: + title: Profile + type: string + options: + title: Options + additionalProperties: true + type: object type: object required: - spec @@ -3053,6 +3067,13 @@ components: title: Custom Fields additionalProperties: true type: object + profile: + title: Profile + type: string + options: + title: Options + additionalProperties: true + type: object type: object required: - spec @@ -3408,6 +3429,13 @@ components: title: Custom Fields additionalProperties: true type: object + profile: + title: Profile + type: string + options: + title: Options + additionalProperties: true + type: object type: object required: - spec @@ -3624,6 +3652,13 @@ components: title: Custom Fields additionalProperties: true type: object + profile: + title: Profile + type: string + options: + title: Options + additionalProperties: true + type: object type: object required: - spec diff --git a/plugins/nemo-agents/src/nemo_agents_plugin/jobs/analyze_batch.py b/plugins/nemo-agents/src/nemo_agents_plugin/jobs/analyze_batch.py index 81d6abab78..9141a21865 100644 --- a/plugins/nemo-agents/src/nemo_agents_plugin/jobs/analyze_batch.py +++ b/plugins/nemo-agents/src/nemo_agents_plugin/jobs/analyze_batch.py @@ -107,6 +107,7 
@@ async def compile( # type: ignore[override] PlatformJobStep( name="analyze", executor=SubprocessExecutionProviderSpec( + kind="subprocess", provider="cpu", command=["python", "-m", "nemo_agents_plugin.tasks.analyze"], ), diff --git a/plugins/nemo-agents/src/nemo_agents_plugin/jobs/evaluate_agent.py b/plugins/nemo-agents/src/nemo_agents_plugin/jobs/evaluate_agent.py index db74b9364d..ee9ca7007d 100644 --- a/plugins/nemo-agents/src/nemo_agents_plugin/jobs/evaluate_agent.py +++ b/plugins/nemo-agents/src/nemo_agents_plugin/jobs/evaluate_agent.py @@ -182,6 +182,7 @@ async def compile( # type: ignore[override] PlatformJobStep( name="evaluate-agent", executor=SubprocessExecutionProviderSpec( + kind="subprocess", provider="cpu", command=["python", "-m", "nemo_agents_plugin.tasks.evaluate"], ), diff --git a/plugins/nemo-agents/src/nemo_agents_plugin/jobs/evaluate_suite.py b/plugins/nemo-agents/src/nemo_agents_plugin/jobs/evaluate_suite.py index 0f9d56c37a..12222e2525 100644 --- a/plugins/nemo-agents/src/nemo_agents_plugin/jobs/evaluate_suite.py +++ b/plugins/nemo-agents/src/nemo_agents_plugin/jobs/evaluate_suite.py @@ -201,6 +201,7 @@ async def compile( # type: ignore[override] PlatformJobStep( name="evaluate-suite", executor=SubprocessExecutionProviderSpec( + kind="subprocess", provider="cpu", command=["python", "-m", "nemo_agents_plugin.tasks.evaluate_suite"], ), diff --git a/plugins/nemo-agents/src/nemo_agents_plugin/jobs/optimize_agent.py b/plugins/nemo-agents/src/nemo_agents_plugin/jobs/optimize_agent.py index 5c2b1ea79d..5c4a0f41b7 100644 --- a/plugins/nemo-agents/src/nemo_agents_plugin/jobs/optimize_agent.py +++ b/plugins/nemo-agents/src/nemo_agents_plugin/jobs/optimize_agent.py @@ -164,6 +164,7 @@ async def compile( # type: ignore[override] PlatformJobStep( name="optimize-agent", executor=SubprocessExecutionProviderSpec( + kind="subprocess", provider="cpu", command=["python", "-m", "nemo_agents_plugin.tasks.optimize"], ), diff --git 
a/plugins/nemo-agents/src/nemo_agents_plugin/jobs/optimize_skills.py b/plugins/nemo-agents/src/nemo_agents_plugin/jobs/optimize_skills.py index a5118457d7..df52feb4ae 100644 --- a/plugins/nemo-agents/src/nemo_agents_plugin/jobs/optimize_skills.py +++ b/plugins/nemo-agents/src/nemo_agents_plugin/jobs/optimize_skills.py @@ -147,6 +147,7 @@ async def compile( # type: ignore[override] PlatformJobStep( name="optimize-skills", executor=SubprocessExecutionProviderSpec( + kind="subprocess", provider="cpu", command=["python", "-m", "nemo_agents_plugin.tasks.optimize_skills"], ), diff --git a/plugins/nemo-anonymizer/src/nemo_anonymizer_plugin/jobs/run.py b/plugins/nemo-anonymizer/src/nemo_anonymizer_plugin/jobs/run.py index 78905bbbe6..5dcb54f472 100644 --- a/plugins/nemo-anonymizer/src/nemo_anonymizer_plugin/jobs/run.py +++ b/plugins/nemo-anonymizer/src/nemo_anonymizer_plugin/jobs/run.py @@ -26,8 +26,8 @@ from nemo_platform_plugin.job import NemoJob from nemo_platform_plugin.job_context import JobContext from nemo_platform_plugin.jobs.api_factory import ( + ContainerExecutionProviderSpec, ContainerSpec, - CPUExecutionProviderSpec, EnvironmentVariable, PlatformJobSpec, PlatformJobStep, @@ -114,7 +114,8 @@ async def compile( steps=[ PlatformJobStep( name="anonymizer-job", - executor=CPUExecutionProviderSpec( + executor=ContainerExecutionProviderSpec( + kind="container", profile=profile or "default", provider="cpu", container=ContainerSpec( diff --git a/plugins/nemo-data-designer/openapi/openapi.yaml b/plugins/nemo-data-designer/openapi/openapi.yaml index 6da475527a..b175ada9e4 100644 --- a/plugins/nemo-data-designer/openapi/openapi.yaml +++ b/plugins/nemo-data-designer/openapi/openapi.yaml @@ -834,6 +834,13 @@ components: title: Custom Fields additionalProperties: true type: object + profile: + title: Profile + type: string + options: + title: Options + additionalProperties: true + type: object type: object required: - spec diff --git 
a/plugins/nemo-data-designer/src/nemo_data_designer_plugin/jobs/create.py b/plugins/nemo-data-designer/src/nemo_data_designer_plugin/jobs/create.py index 6deb3aa05d..6cccb39630 100644 --- a/plugins/nemo-data-designer/src/nemo_data_designer_plugin/jobs/create.py +++ b/plugins/nemo-data-designer/src/nemo_data_designer_plugin/jobs/create.py @@ -16,8 +16,8 @@ from nemo_platform_plugin.job import NemoJob from nemo_platform_plugin.job_context import JobContext from nemo_platform_plugin.jobs.api_factory import ( + ContainerExecutionProviderSpec, ContainerSpec, - CPUExecutionProviderSpec, PlatformJobSpec, PlatformJobStep, ) @@ -73,7 +73,8 @@ async def compile( steps=[ PlatformJobStep( name="data-designer-job", - executor=CPUExecutionProviderSpec( + executor=ContainerExecutionProviderSpec( + kind="container", profile=profile or "default", provider="cpu", container=ContainerSpec( diff --git a/plugins/nemo-evaluator/openapi/openapi.yaml b/plugins/nemo-evaluator/openapi/openapi.yaml index ecbed63932..27887f8588 100644 --- a/plugins/nemo-evaluator/openapi/openapi.yaml +++ b/plugins/nemo-evaluator/openapi/openapi.yaml @@ -630,6 +630,13 @@ components: title: Custom Fields additionalProperties: true type: object + profile: + title: Profile + type: string + options: + title: Options + additionalProperties: true + type: object type: object required: - spec diff --git a/plugins/nemo-evaluator/src/nemo_evaluator/jobs/compiler.py b/plugins/nemo-evaluator/src/nemo_evaluator/jobs/compiler.py index 79f9fd557d..57dce42839 100644 --- a/plugins/nemo-evaluator/src/nemo_evaluator/jobs/compiler.py +++ b/plugins/nemo-evaluator/src/nemo_evaluator/jobs/compiler.py @@ -8,8 +8,8 @@ from nemo_evaluator.jobs.evaluate import EvaluateSpec from nemo_evaluator_sdk.values import Agent, Model, RunConfig, RunConfigOnline, RunConfigOnlineModel from nemo_platform_plugin.jobs.api_factory import ( + ContainerExecutionProviderSpec, ContainerSpec, - CPUExecutionProviderSpec, EnvironmentVariable, 
EnvironmentVariableFromSecret, PlatformJobSpec, @@ -80,7 +80,8 @@ def _secret_environment(spec: EvaluateSpec) -> list[EnvironmentVariable]: def _evaluate_step(spec: EvaluateSpec, profile: str | None) -> PlatformJobStep: return PlatformJobStep( name=EVALUATE_STEP_NAME, - executor=CPUExecutionProviderSpec( + executor=ContainerExecutionProviderSpec( + kind="container", profile=profile or "default", provider="cpu", container=ContainerSpec( diff --git a/plugins/nemo-safe-synthesizer/openapi/openapi.yaml b/plugins/nemo-safe-synthesizer/openapi/openapi.yaml index 1f8097ffe2..583589a967 100644 --- a/plugins/nemo-safe-synthesizer/openapi/openapi.yaml +++ b/plugins/nemo-safe-synthesizer/openapi/openapi.yaml @@ -1502,6 +1502,13 @@ components: title: Custom Fields additionalProperties: true type: object + profile: + title: Profile + type: string + options: + title: Options + additionalProperties: true + type: object type: object required: - spec diff --git a/plugins/nemo-safe-synthesizer/src/nemo_safe_synthesizer_plugin/api/v2/jobs/endpoints.py b/plugins/nemo-safe-synthesizer/src/nemo_safe_synthesizer_plugin/api/v2/jobs/endpoints.py index 18466d8dbb..e75682576c 100644 --- a/plugins/nemo-safe-synthesizer/src/nemo_safe_synthesizer_plugin/api/v2/jobs/endpoints.py +++ b/plugins/nemo-safe-synthesizer/src/nemo_safe_synthesizer_plugin/api/v2/jobs/endpoints.py @@ -15,11 +15,11 @@ from nemo_platform.filesets import FilesetPathError, parse_fileset_ref from nemo_platform_plugin.entities import EntityClient from nemo_platform_plugin.jobs.api_factory import ( + ContainerExecutionProviderSpec, ContainerSpec, EnvironmentVariable, EnvironmentVariableFromSecret, FileResultSerializer, - GPUExecutionProviderSpec, PlatformJobResultRoute, PlatformJobSpec, PlatformJobStep, @@ -62,6 +62,7 @@ def _create_job_step(job_config: SafeSynthesizerJobConfig, environment: list[Env return PlatformJobStep( name="safe-synthesizer", executor=SubprocessExecutionProviderSpec( + kind="subprocess", provider="cpu", 
profile=config.job_executor_profile, command=command, @@ -85,7 +86,8 @@ def _create_job_step(job_config: SafeSynthesizerJobConfig, environment: list[Env ) return PlatformJobStep( name="safe-synthesizer", - executor=GPUExecutionProviderSpec( + executor=ContainerExecutionProviderSpec( + kind="container", provider="gpu", profile=config.job_executor_profile, container=ContainerSpec( diff --git a/sdk/python/nemo-platform/.nmpcontext/openapi.yaml b/sdk/python/nemo-platform/.nmpcontext/openapi.yaml index 386598e5cf..f387108325 100644 --- a/sdk/python/nemo-platform/.nmpcontext/openapi.yaml +++ b/sdk/python/nemo-platform/.nmpcontext/openapi.yaml @@ -8028,60 +8028,6 @@ components: title: Name title: BaseModelFilter type: object - CPUExecutionProviderInput: - properties: - provider: - type: string - const: cpu - title: Provider - default: cpu - profile: - type: string - title: Profile - default: default - container: - $ref: '#/components/schemas/ContainerSpec' - resources: - allOf: - - $ref: '#/components/schemas/ComputeResources' - description: Resource requests and limits for CPU execution. - type: object - required: - - container - title: CPUExecutionProviderInput - description: 'CPU-based execution provider. - - - Provides configuration for running jobs on CPU resources with - - resource requests and limits.' - CPUExecutionProviderOutput: - properties: - provider: - type: string - const: cpu - title: Provider - default: cpu - profile: - type: string - title: Profile - default: default - container: - $ref: '#/components/schemas/ContainerSpec' - resources: - allOf: - - $ref: '#/components/schemas/ComputeResources' - description: Resource requests and limits for CPU execution. - type: object - required: - - container - title: CPUExecutionProviderOutput - description: 'CPU-based execution provider. - - - Provides configuration for running jobs on CPU resources with - - resource requests and limits.' 
CacheStatsConfig: properties: enabled: @@ -8523,6 +8469,80 @@ components: type: object title: ComputeResources description: Resource requirements matching k8s ResourceRequirements format. + ContainerExecutionProviderInput: + properties: + kind: + type: string + const: container + title: Kind + default: container + provider: + type: string + enum: + - cpu + - gpu + - gpu_distributed + title: Provider + default: cpu + profile: + type: string + title: Profile + default: default + container: + $ref: '#/components/schemas/ContainerSpec' + resources: + allOf: + - $ref: '#/components/schemas/ComputeResources' + description: Resource requests and limits. + type: object + required: + - container + title: ContainerExecutionProviderInput + description: 'Container-based execution provider. + + + Runs a job step inside a container image. The ``provider`` field + + expresses compute intent (cpu, gpu, gpu_distributed) while ``kind`` + + identifies the payload shape.' + ContainerExecutionProviderOutput: + properties: + kind: + type: string + const: container + title: Kind + default: container + provider: + type: string + enum: + - cpu + - gpu + - gpu_distributed + title: Provider + default: cpu + profile: + type: string + title: Profile + default: default + container: + $ref: '#/components/schemas/ContainerSpec' + resources: + allOf: + - $ref: '#/components/schemas/ComputeResources' + description: Resource requests and limits. + type: object + required: + - container + title: ContainerExecutionProviderOutput + description: 'Container-based execution provider. + + + Runs a job step inside a container image. The ``provider`` field + + expresses compute intent (cpu, gpu, gpu_distributed) while ``kind`` + + identifies the payload shape.' ContainerExecutorConfig: properties: gpu: @@ -9246,60 +9266,6 @@ components: type: object title: DialogRails description: Configuration of topical rails. 
- DistributedGPUExecutionProviderInput: - properties: - provider: - type: string - const: gpu_distributed - title: Provider - default: gpu_distributed - profile: - type: string - title: Profile - default: default - container: - $ref: '#/components/schemas/ContainerSpec' - resources: - allOf: - - $ref: '#/components/schemas/ComputeResources' - description: Resource requests and limits for distributed GPU execution. - type: object - required: - - container - title: DistributedGPUExecutionProviderInput - description: 'GPU-based execution provider. - - - Provides configuration for running jobs on GPU resources with - - resource requests and limits.' - DistributedGPUExecutionProviderOutput: - properties: - provider: - type: string - const: gpu_distributed - title: Provider - default: gpu_distributed - profile: - type: string - title: Profile - default: default - container: - $ref: '#/components/schemas/ContainerSpec' - resources: - allOf: - - $ref: '#/components/schemas/ComputeResources' - description: Resource requests and limits for distributed GPU execution. - type: object - required: - - container - title: DistributedGPUExecutionProviderOutput - description: 'GPU-based execution provider. - - - Provides configuration for running jobs on GPU resources with - - resource requests and limits.' DockerJobExecutionProfile: properties: provider: @@ -10753,60 +10719,6 @@ components: type: object title: GLiNERDetectionOptions description: Configuration options for GLiNER. - GPUExecutionProviderInput: - properties: - provider: - type: string - const: gpu - title: Provider - default: gpu - profile: - type: string - title: Profile - default: default - container: - $ref: '#/components/schemas/ContainerSpec' - resources: - allOf: - - $ref: '#/components/schemas/ComputeResources' - description: Resource requests and limits for GPU execution. - type: object - required: - - container - title: GPUExecutionProviderInput - description: 'GPU-based execution provider. 
- - - Provides configuration for running jobs on GPU resources with - - resource requests and limits.' - GPUExecutionProviderOutput: - properties: - provider: - type: string - const: gpu - title: Provider - default: gpu - profile: - type: string - title: Profile - default: default - container: - $ref: '#/components/schemas/ContainerSpec' - resources: - allOf: - - $ref: '#/components/schemas/ComputeResources' - description: Resource requests and limits for GPU execution. - type: object - required: - - container - title: GPUExecutionProviderOutput - description: 'GPU-based execution provider. - - - Provides configuration for running jobs on GPU resources with - - resource requests and limits.' GenerationLog: properties: activated_rails: @@ -14576,18 +14488,14 @@ components: type: array executor: oneOf: - - $ref: '#/components/schemas/CPUExecutionProviderInput' - - $ref: '#/components/schemas/GPUExecutionProviderInput' - - $ref: '#/components/schemas/DistributedGPUExecutionProviderInput' + - $ref: '#/components/schemas/ContainerExecutionProviderInput' - $ref: '#/components/schemas/SubprocessExecutionProvider' title: Executor description: The executor for the step discriminator: - propertyName: provider + propertyName: kind mapping: - cpu: '#/components/schemas/CPUExecutionProviderInput' - gpu: '#/components/schemas/GPUExecutionProviderInput' - gpu_distributed: '#/components/schemas/DistributedGPUExecutionProviderInput' + container: '#/components/schemas/ContainerExecutionProviderInput' subprocess: '#/components/schemas/SubprocessExecutionProvider' config: additionalProperties: true @@ -14626,18 +14534,14 @@ components: type: array executor: oneOf: - - $ref: '#/components/schemas/CPUExecutionProviderOutput' - - $ref: '#/components/schemas/GPUExecutionProviderOutput' - - $ref: '#/components/schemas/DistributedGPUExecutionProviderOutput' + - $ref: '#/components/schemas/ContainerExecutionProviderOutput' - $ref: '#/components/schemas/SubprocessExecutionProvider' title: 
Executor description: The executor for the step discriminator: - propertyName: provider + propertyName: kind mapping: - cpu: '#/components/schemas/CPUExecutionProviderOutput' - gpu: '#/components/schemas/GPUExecutionProviderOutput' - gpu_distributed: '#/components/schemas/DistributedGPUExecutionProviderOutput' + container: '#/components/schemas/ContainerExecutionProviderOutput' subprocess: '#/components/schemas/SubprocessExecutionProvider' config: additionalProperties: true @@ -16469,15 +16373,22 @@ components: type: object SubprocessExecutionProvider: properties: - provider: + kind: type: string const: subprocess - title: Provider + title: Kind default: subprocess + provider: + type: string + enum: + - cpu + - gpu + title: Provider + default: cpu profile: type: string title: Profile - default: default + default: subprocess command: items: type: string @@ -16487,14 +16398,19 @@ components: required: - command title: SubprocessExecutionProvider - description: Host subprocess execution provider. + description: 'Host subprocess execution provider. + + + Runs a job step as a local OS process. The ``provider`` field + + expresses compute intent while ``kind`` identifies the payload shape.' 
SubprocessJobExecutionProfile: properties: provider: type: string - const: subprocess + const: cpu title: Provider - default: subprocess + default: cpu profile: type: string title: Profile diff --git a/sdk/python/nemo-platform/.nmpcontext/stainless.yaml b/sdk/python/nemo-platform/.nmpcontext/stainless.yaml index 473c2c6a45..24e7837c72 100644 --- a/sdk/python/nemo-platform/.nmpcontext/stainless.yaml +++ b/sdk/python/nemo-platform/.nmpcontext/stainless.yaml @@ -501,20 +501,16 @@ resources: models: compute_resource_spec: ComputeResourceSpec compute_resources: ComputeResources + container_execution_provider: ContainerExecutionProviderOutput + container_execution_provider_param: ContainerExecutionProviderInput container_spec: ContainerSpec - cpu_execution_provider: CPUExecutionProviderOutput - cpu_execution_provider_param: CPUExecutionProviderInput create_platform_job_request: CreatePlatformJobRequest - distributed_gpu_execution_provider: DistributedGPUExecutionProviderOutput - distributed_gpu_execution_provider_param: DistributedGPUExecutionProviderInput docker_job_execution_profile: DockerJobExecutionProfile docker_job_execution_profile_config: DockerJobExecutionProfileConfig docker_job_network_config: DockerJobNetworkConfig docker_job_storage_config: DockerJobStorageConfig docker_volume_mount: DockerVolumeMount e2e_job_execution_profile: E2EJobExecutionProfile - gpu_execution_provider: GPUExecutionProviderOutput - gpu_execution_provider_param: GPUExecutionProviderInput image_pull_secret: ImagePullSecret job_execution_profile_config: JobExecutionProfileConfig kubernetes_empty_dir_volume: KubernetesEmptyDirVolume diff --git a/sdk/python/nemo-platform/src/nemo_platform/resources/jobs/api.md b/sdk/python/nemo-platform/src/nemo_platform/resources/jobs/api.md index f5bfd41e0c..954ef664c9 100644 --- a/sdk/python/nemo-platform/src/nemo_platform/resources/jobs/api.md +++ b/sdk/python/nemo-platform/src/nemo_platform/resources/jobs/api.md @@ -6,20 +6,16 @@ Types: from 
nemo_platform.types.jobs import ( ComputeResourceSpec, ComputeResources, + ContainerExecutionProvider, + ContainerExecutionProviderParam, ContainerSpec, - CPUExecutionProvider, - CPUExecutionProviderParam, CreatePlatformJobRequest, - DistributedGPUExecutionProvider, - DistributedGPUExecutionProviderParam, DockerJobExecutionProfile, DockerJobExecutionProfileConfig, DockerJobNetworkConfig, DockerJobStorageConfig, DockerVolumeMount, E2EJobExecutionProfile, - GPUExecutionProvider, - GPUExecutionProviderParam, ImagePullSecret, JobExecutionProfileConfig, KubernetesEmptyDirVolume, diff --git a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/__init__.py b/sdk/python/nemo-platform/src/nemo_platform/types/jobs/__init__.py index 6c6a8d01c1..34c9777606 100644 --- a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/__init__.py +++ b/sdk/python/nemo-platform/src/nemo_platform/types/jobs/__init__.py @@ -36,8 +36,6 @@ from .step_lifecycle_param import StepLifecycleParam as StepLifecycleParam from .compute_resource_spec import ComputeResourceSpec as ComputeResourceSpec from .platform_job_response import PlatformJobResponse as PlatformJobResponse -from .cpu_execution_provider import CPUExecutionProvider as CPUExecutionProvider -from .gpu_execution_provider import GPUExecutionProvider as GPUExecutionProvider from .platform_job_step_spec import PlatformJobStepSpec as PlatformJobStepSpec from .compute_resources_param import ComputeResourcesParam as ComputeResourcesParam from .kubernetes_volume_mount import KubernetesVolumeMount as KubernetesVolumeMount @@ -51,9 +49,8 @@ from .compute_resource_spec_param import ComputeResourceSpecParam as ComputeResourceSpecParam from .kubernetes_empty_dir_volume import KubernetesEmptyDirVolume as KubernetesEmptyDirVolume from .platform_job_responses_page import PlatformJobResponsesPage as PlatformJobResponsesPage -from .cpu_execution_provider_param import CPUExecutionProviderParam as CPUExecutionProviderParam +from 
.container_execution_provider import ContainerExecutionProvider as ContainerExecutionProvider from .docker_job_execution_profile import DockerJobExecutionProfile as DockerJobExecutionProfile -from .gpu_execution_provider_param import GPUExecutionProviderParam as GPUExecutionProviderParam from .job_execution_profile_config import JobExecutionProfileConfig as JobExecutionProfileConfig from .platform_job_step_spec_param import PlatformJobStepSpecParam as PlatformJobStepSpecParam from .task_create_or_update_params import TaskCreateOrUpdateParams as TaskCreateOrUpdateParams @@ -67,7 +64,7 @@ from .kubernetes_job_execution_profile import KubernetesJobExecutionProfile as KubernetesJobExecutionProfile from .subprocess_job_execution_profile import SubprocessJobExecutionProfile as SubprocessJobExecutionProfile from .platform_job_environment_variable import PlatformJobEnvironmentVariable as PlatformJobEnvironmentVariable -from .distributed_gpu_execution_provider import DistributedGPUExecutionProvider as DistributedGPUExecutionProvider +from .container_execution_provider_param import ContainerExecutionProviderParam as ContainerExecutionProviderParam from .kubernetes_persistent_volume_claim import KubernetesPersistentVolumeClaim as KubernetesPersistentVolumeClaim from .docker_job_execution_profile_config import DockerJobExecutionProfileConfig as DockerJobExecutionProfileConfig from .subprocess_execution_provider_param import SubprocessExecutionProviderParam as SubprocessExecutionProviderParam @@ -84,9 +81,6 @@ from .subprocess_job_execution_profile_config import ( SubprocessJobExecutionProfileConfig as SubprocessJobExecutionProfileConfig, ) -from .distributed_gpu_execution_provider_param import ( - DistributedGPUExecutionProviderParam as DistributedGPUExecutionProviderParam, -) from .platform_job_secret_environment_variable_ref import ( PlatformJobSecretEnvironmentVariableRef as PlatformJobSecretEnvironmentVariableRef, ) diff --git 
a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/cpu_execution_provider.py b/sdk/python/nemo-platform/src/nemo_platform/types/jobs/container_execution_provider.py similarity index 75% rename from sdk/python/nemo-platform/src/nemo_platform/types/jobs/cpu_execution_provider.py rename to sdk/python/nemo-platform/src/nemo_platform/types/jobs/container_execution_provider.py index d890dbb9e9..f18f5c41b7 100644 --- a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/cpu_execution_provider.py +++ b/sdk/python/nemo-platform/src/nemo_platform/types/jobs/container_execution_provider.py @@ -22,14 +22,15 @@ from .container_spec import ContainerSpec from .compute_resources import ComputeResources -__all__ = ["CPUExecutionProvider"] +__all__ = ["ContainerExecutionProvider"] -class CPUExecutionProvider(BaseModel): - """CPU-based execution provider. +class ContainerExecutionProvider(BaseModel): + """Container-based execution provider. - Provides configuration for running jobs on CPU resources with - resource requests and limits. + Runs a job step inside a container image. The ``provider`` field + expresses compute intent (cpu, gpu, gpu_distributed) while ``kind`` + identifies the payload shape. """ container: ContainerSpec @@ -38,9 +39,11 @@ class CPUExecutionProvider(BaseModel): Defines the container image and related configuration for job execution. 
""" + kind: Optional[Literal["container"]] = None + profile: Optional[str] = None - provider: Optional[Literal["cpu"]] = None + provider: Optional[Literal["cpu", "gpu", "gpu_distributed"]] = None resources: Optional[ComputeResources] = None """Resource requirements matching k8s ResourceRequirements format.""" diff --git a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/gpu_execution_provider_param.py b/sdk/python/nemo-platform/src/nemo_platform/types/jobs/container_execution_provider_param.py similarity index 75% rename from sdk/python/nemo-platform/src/nemo_platform/types/jobs/gpu_execution_provider_param.py rename to sdk/python/nemo-platform/src/nemo_platform/types/jobs/container_execution_provider_param.py index 4471def866..83971919e7 100644 --- a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/gpu_execution_provider_param.py +++ b/sdk/python/nemo-platform/src/nemo_platform/types/jobs/container_execution_provider_param.py @@ -22,14 +22,15 @@ from .container_spec_param import ContainerSpecParam from .compute_resources_param import ComputeResourcesParam -__all__ = ["GPUExecutionProviderParam"] +__all__ = ["ContainerExecutionProviderParam"] -class GPUExecutionProviderParam(TypedDict, total=False): - """GPU-based execution provider. +class ContainerExecutionProviderParam(TypedDict, total=False): + """Container-based execution provider. - Provides configuration for running jobs on GPU resources with - resource requests and limits. + Runs a job step inside a container image. The ``provider`` field + expresses compute intent (cpu, gpu, gpu_distributed) while ``kind`` + identifies the payload shape. """ container: Required[ContainerSpecParam] @@ -38,9 +39,11 @@ class GPUExecutionProviderParam(TypedDict, total=False): Defines the container image and related configuration for job execution. 
""" + kind: Literal["container"] + profile: str - provider: Literal["gpu"] + provider: Literal["cpu", "gpu", "gpu_distributed"] resources: ComputeResourcesParam """Resource requirements matching k8s ResourceRequirements format.""" diff --git a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/cpu_execution_provider_param.py b/sdk/python/nemo-platform/src/nemo_platform/types/jobs/cpu_execution_provider_param.py deleted file mode 100644 index 02eac5b152..0000000000 --- a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/cpu_execution_provider_param.py +++ /dev/null @@ -1,46 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. - -from __future__ import annotations - -from typing_extensions import Literal, Required, TypedDict - -from .container_spec_param import ContainerSpecParam -from .compute_resources_param import ComputeResourcesParam - -__all__ = ["CPUExecutionProviderParam"] - - -class CPUExecutionProviderParam(TypedDict, total=False): - """CPU-based execution provider. - - Provides configuration for running jobs on CPU resources with - resource requests and limits. - """ - - container: Required[ContainerSpecParam] - """Specification for a container configuration. 
- - Defines the container image and related configuration for job execution. - """ - - profile: str - - provider: Literal["cpu"] - - resources: ComputeResourcesParam - """Resource requirements matching k8s ResourceRequirements format.""" diff --git a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/distributed_gpu_execution_provider.py b/sdk/python/nemo-platform/src/nemo_platform/types/jobs/distributed_gpu_execution_provider.py deleted file mode 100644 index 5594c9b31c..0000000000 --- a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/distributed_gpu_execution_provider.py +++ /dev/null @@ -1,46 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. - -from typing import Optional -from typing_extensions import Literal - -from ..._models import BaseModel -from .container_spec import ContainerSpec -from .compute_resources import ComputeResources - -__all__ = ["DistributedGPUExecutionProvider"] - - -class DistributedGPUExecutionProvider(BaseModel): - """GPU-based execution provider. - - Provides configuration for running jobs on GPU resources with - resource requests and limits. - """ - - container: ContainerSpec - """Specification for a container configuration. - - Defines the container image and related configuration for job execution. 
- """ - - profile: Optional[str] = None - - provider: Optional[Literal["gpu_distributed"]] = None - - resources: Optional[ComputeResources] = None - """Resource requirements matching k8s ResourceRequirements format.""" diff --git a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/distributed_gpu_execution_provider_param.py b/sdk/python/nemo-platform/src/nemo_platform/types/jobs/distributed_gpu_execution_provider_param.py deleted file mode 100644 index 2ad674ab22..0000000000 --- a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/distributed_gpu_execution_provider_param.py +++ /dev/null @@ -1,46 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. - -from __future__ import annotations - -from typing_extensions import Literal, Required, TypedDict - -from .container_spec_param import ContainerSpecParam -from .compute_resources_param import ComputeResourcesParam - -__all__ = ["DistributedGPUExecutionProviderParam"] - - -class DistributedGPUExecutionProviderParam(TypedDict, total=False): - """GPU-based execution provider. - - Provides configuration for running jobs on GPU resources with - resource requests and limits. - """ - - container: Required[ContainerSpecParam] - """Specification for a container configuration. 
- - Defines the container image and related configuration for job execution. - """ - - profile: str - - provider: Literal["gpu_distributed"] - - resources: ComputeResourcesParam - """Resource requirements matching k8s ResourceRequirements format.""" diff --git a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/gpu_execution_provider.py b/sdk/python/nemo-platform/src/nemo_platform/types/jobs/gpu_execution_provider.py deleted file mode 100644 index 8df8dc74d4..0000000000 --- a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/gpu_execution_provider.py +++ /dev/null @@ -1,46 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. - -from typing import Optional -from typing_extensions import Literal - -from ..._models import BaseModel -from .container_spec import ContainerSpec -from .compute_resources import ComputeResources - -__all__ = ["GPUExecutionProvider"] - - -class GPUExecutionProvider(BaseModel): - """GPU-based execution provider. - - Provides configuration for running jobs on GPU resources with - resource requests and limits. - """ - - container: ContainerSpec - """Specification for a container configuration. - - Defines the container image and related configuration for job execution. 
- """ - - profile: Optional[str] = None - - provider: Optional[Literal["gpu"]] = None - - resources: Optional[ComputeResources] = None - """Resource requirements matching k8s ResourceRequirements format.""" diff --git a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/platform_job_step_spec.py b/sdk/python/nemo-platform/src/nemo_platform/types/jobs/platform_job_step_spec.py index c23f1ec802..52267abf3d 100644 --- a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/platform_job_step_spec.py +++ b/sdk/python/nemo-platform/src/nemo_platform/types/jobs/platform_job_step_spec.py @@ -21,17 +21,14 @@ from ..._utils import PropertyInfo from ..._models import BaseModel from .step_lifecycle import StepLifecycle -from .cpu_execution_provider import CPUExecutionProvider -from .gpu_execution_provider import GPUExecutionProvider +from .container_execution_provider import ContainerExecutionProvider from .subprocess_execution_provider import SubprocessExecutionProvider from .platform_job_environment_variable import PlatformJobEnvironmentVariable -from .distributed_gpu_execution_provider import DistributedGPUExecutionProvider __all__ = ["PlatformJobStepSpec", "Executor"] Executor: TypeAlias = Annotated[ - Union[CPUExecutionProvider, GPUExecutionProvider, DistributedGPUExecutionProvider, SubprocessExecutionProvider], - PropertyInfo(discriminator="provider"), + Union[ContainerExecutionProvider, SubprocessExecutionProvider], PropertyInfo(discriminator="kind") ] diff --git a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/platform_job_step_spec_param.py b/sdk/python/nemo-platform/src/nemo_platform/types/jobs/platform_job_step_spec_param.py index d0577d2eb7..0c7397010a 100644 --- a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/platform_job_step_spec_param.py +++ b/sdk/python/nemo-platform/src/nemo_platform/types/jobs/platform_job_step_spec_param.py @@ -21,20 +21,13 @@ from typing_extensions import Required, TypeAlias, TypedDict from .step_lifecycle_param import 
StepLifecycleParam -from .cpu_execution_provider_param import CPUExecutionProviderParam -from .gpu_execution_provider_param import GPUExecutionProviderParam +from .container_execution_provider_param import ContainerExecutionProviderParam from .subprocess_execution_provider_param import SubprocessExecutionProviderParam from .platform_job_environment_variable_param import PlatformJobEnvironmentVariableParam -from .distributed_gpu_execution_provider_param import DistributedGPUExecutionProviderParam __all__ = ["PlatformJobStepSpecParam", "Executor"] -Executor: TypeAlias = Union[ - CPUExecutionProviderParam, - GPUExecutionProviderParam, - DistributedGPUExecutionProviderParam, - SubprocessExecutionProviderParam, -] +Executor: TypeAlias = Union[ContainerExecutionProviderParam, SubprocessExecutionProviderParam] class PlatformJobStepSpecParam(TypedDict, total=False): diff --git a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/subprocess_execution_provider.py b/sdk/python/nemo-platform/src/nemo_platform/types/jobs/subprocess_execution_provider.py index 8b5536767b..278b24063e 100644 --- a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/subprocess_execution_provider.py +++ b/sdk/python/nemo-platform/src/nemo_platform/types/jobs/subprocess_execution_provider.py @@ -24,10 +24,16 @@ class SubprocessExecutionProvider(BaseModel): - """Host subprocess execution provider.""" + """Host subprocess execution provider. + + Runs a job step as a local OS process. The ``provider`` field + expresses compute intent while ``kind`` identifies the payload shape. 
+ """ command: List[str] + kind: Optional[Literal["subprocess"]] = None + profile: Optional[str] = None - provider: Optional[Literal["subprocess"]] = None + provider: Optional[Literal["cpu", "gpu"]] = None diff --git a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/subprocess_execution_provider_param.py b/sdk/python/nemo-platform/src/nemo_platform/types/jobs/subprocess_execution_provider_param.py index 979c4ab4f5..f46f456f27 100644 --- a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/subprocess_execution_provider_param.py +++ b/sdk/python/nemo-platform/src/nemo_platform/types/jobs/subprocess_execution_provider_param.py @@ -25,10 +25,16 @@ class SubprocessExecutionProviderParam(TypedDict, total=False): - """Host subprocess execution provider.""" + """Host subprocess execution provider. + + Runs a job step as a local OS process. The ``provider`` field + expresses compute intent while ``kind`` identifies the payload shape. + """ command: Required[SequenceNotStr[str]] + kind: Literal["subprocess"] + profile: str - provider: Literal["subprocess"] + provider: Literal["cpu", "gpu"] diff --git a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/subprocess_job_execution_profile.py b/sdk/python/nemo-platform/src/nemo_platform/types/jobs/subprocess_job_execution_profile.py index a83d6acedc..1cdbb5d38d 100644 --- a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/subprocess_job_execution_profile.py +++ b/sdk/python/nemo-platform/src/nemo_platform/types/jobs/subprocess_job_execution_profile.py @@ -33,4 +33,4 @@ class SubprocessJobExecutionProfile(BaseModel): profile: Optional[str] = None """The profile name for the executor, e.g., high_priority_a100, low_priority, etc.""" - provider: Optional[Literal["subprocess"]] = None + provider: Optional[Literal["cpu"]] = None diff --git a/sdk/python/nemo-platform/tests/api_resources/test_jobs.py b/sdk/python/nemo-platform/tests/api_resources/test_jobs.py index dce491ed23..334883f4db 100644 --- 
a/sdk/python/nemo-platform/tests/api_resources/test_jobs.py +++ b/sdk/python/nemo-platform/tests/api_resources/test_jobs.py @@ -53,7 +53,7 @@ def test_method_create(self, client: NeMoPlatform) -> None: { "executor": { "container": {"image": "image"}, - "provider": "cpu", + "kind": "container", }, "name": "preprocess", } @@ -78,6 +78,7 @@ def test_method_create_with_all_params(self, client: NeMoPlatform) -> None: "command": ["string"], "entrypoint": ["string"], }, + "kind": "container", "profile": "profile", "provider": "cpu", "resources": { @@ -127,7 +128,7 @@ def test_raw_response_create(self, client: NeMoPlatform) -> None: { "executor": { "container": {"image": "image"}, - "provider": "cpu", + "kind": "container", }, "name": "preprocess", } @@ -152,7 +153,7 @@ def test_streaming_response_create(self, client: NeMoPlatform) -> None: { "executor": { "container": {"image": "image"}, - "provider": "cpu", + "kind": "container", }, "name": "preprocess", } @@ -180,7 +181,7 @@ def test_path_params_create(self, client: NeMoPlatform) -> None: { "executor": { "container": {"image": "image"}, - "provider": "cpu", + "kind": "container", }, "name": "preprocess", } @@ -747,7 +748,7 @@ async def test_method_create(self, async_client: AsyncNeMoPlatform) -> None: { "executor": { "container": {"image": "image"}, - "provider": "cpu", + "kind": "container", }, "name": "preprocess", } @@ -772,6 +773,7 @@ async def test_method_create_with_all_params(self, async_client: AsyncNeMoPlatfo "command": ["string"], "entrypoint": ["string"], }, + "kind": "container", "profile": "profile", "provider": "cpu", "resources": { @@ -821,7 +823,7 @@ async def test_raw_response_create(self, async_client: AsyncNeMoPlatform) -> Non { "executor": { "container": {"image": "image"}, - "provider": "cpu", + "kind": "container", }, "name": "preprocess", } @@ -846,7 +848,7 @@ async def test_streaming_response_create(self, async_client: AsyncNeMoPlatform) { "executor": { "container": {"image": "image"}, - 
"provider": "cpu", + "kind": "container", }, "name": "preprocess", } @@ -874,7 +876,7 @@ async def test_path_params_create(self, async_client: AsyncNeMoPlatform) -> None { "executor": { "container": {"image": "image"}, - "provider": "cpu", + "kind": "container", }, "name": "preprocess", } diff --git a/sdk/stainless.yaml b/sdk/stainless.yaml index 473c2c6a45..24e7837c72 100644 --- a/sdk/stainless.yaml +++ b/sdk/stainless.yaml @@ -501,20 +501,16 @@ resources: models: compute_resource_spec: ComputeResourceSpec compute_resources: ComputeResources + container_execution_provider: ContainerExecutionProviderOutput + container_execution_provider_param: ContainerExecutionProviderInput container_spec: ContainerSpec - cpu_execution_provider: CPUExecutionProviderOutput - cpu_execution_provider_param: CPUExecutionProviderInput create_platform_job_request: CreatePlatformJobRequest - distributed_gpu_execution_provider: DistributedGPUExecutionProviderOutput - distributed_gpu_execution_provider_param: DistributedGPUExecutionProviderInput docker_job_execution_profile: DockerJobExecutionProfile docker_job_execution_profile_config: DockerJobExecutionProfileConfig docker_job_network_config: DockerJobNetworkConfig docker_job_storage_config: DockerJobStorageConfig docker_volume_mount: DockerVolumeMount e2e_job_execution_profile: E2EJobExecutionProfile - gpu_execution_provider: GPUExecutionProviderOutput - gpu_execution_provider_param: GPUExecutionProviderInput image_pull_secret: ImagePullSecret job_execution_profile_config: JobExecutionProfileConfig kubernetes_empty_dir_volume: KubernetesEmptyDirVolume diff --git a/services/automodel/src/nmp/automodel/app/jobs/compiler.py b/services/automodel/src/nmp/automodel/app/jobs/compiler.py index 5b3b0625f9..55201aa35c 100644 --- a/services/automodel/src/nmp/automodel/app/jobs/compiler.py +++ b/services/automodel/src/nmp/automodel/app/jobs/compiler.py @@ -8,8 +8,8 @@ from nemo_platform import AsyncNeMoPlatform, NotFoundError from 
nemo_platform.types.models.model_entity import ModelEntity from nemo_platform_plugin.jobs.api_factory import ( + ContainerExecutionProviderSpec, ContainerSpec, - CPUExecutionProviderSpec, EnvironmentVariable, PlatformJobSpec, PlatformJobStep, @@ -447,7 +447,8 @@ async def platform_job_config_compiler( # Step 1: Download model and dataset files from Files service PlatformJobStep( name="model-and-dataset-download", - executor=CPUExecutionProviderSpec( + executor=ContainerExecutionProviderSpec( + kind="container", provider="cpu", container=ContainerSpec( image=get_tasks_image(), @@ -469,7 +470,8 @@ async def platform_job_config_compiler( # Step 3: Upload customized model PlatformJobStep( name="model-upload", - executor=CPUExecutionProviderSpec( + executor=ContainerExecutionProviderSpec( + kind="container", provider="cpu", container=ContainerSpec( image=get_tasks_image(), @@ -484,7 +486,8 @@ async def platform_job_config_compiler( # Step 4: Create model entity PlatformJobStep( name="model-entity-creation", - executor=CPUExecutionProviderSpec( + executor=ContainerExecutionProviderSpec( + kind="container", provider="cpu", container=ContainerSpec( image=get_tasks_image(), diff --git a/services/automodel/src/nmp/automodel/app/jobs/training/compiler.py b/services/automodel/src/nmp/automodel/app/jobs/training/compiler.py index 5533baae6e..7ca7b7b582 100644 --- a/services/automodel/src/nmp/automodel/app/jobs/training/compiler.py +++ b/services/automodel/src/nmp/automodel/app/jobs/training/compiler.py @@ -7,11 +7,10 @@ from nemo_platform.types.models.model_entity import ModelEntity from nemo_platform_plugin.jobs.api_factory import ( + ContainerExecutionProviderSpec, ContainerSpec, - DistributedGPUExecutionProviderSpec, EnvironmentVariable, EnvironmentVariableFromSecret, - GPUExecutionProviderSpec, PlatformJobStep, ResourcesSpec, StepLifecycle, @@ -194,7 +193,8 @@ def compile_training_step( if p.num_nodes > 1: logger.debug(f"Using distributed GPU executor: 
num_nodes={p.num_nodes}, num_gpus_per_node={num_gpus_per_node}") - executor = DistributedGPUExecutionProviderSpec( + executor = ContainerExecutionProviderSpec( + kind="container", provider="gpu_distributed", profile=profile, container=container, @@ -205,7 +205,8 @@ def compile_training_step( ) else: logger.debug(f"Using single-node GPU executor: num_gpus={num_gpus_per_node}") - executor = GPUExecutionProviderSpec( + executor = ContainerExecutionProviderSpec( + kind="container", provider="gpu", profile=profile, container=container, diff --git a/services/core/jobs/src/nmp/core/jobs/app/providers.py b/services/core/jobs/src/nmp/core/jobs/app/providers.py index 4a5f00a9ce..f64e7fb5ef 100644 --- a/services/core/jobs/src/nmp/core/jobs/app/providers.py +++ b/services/core/jobs/src/nmp/core/jobs/app/providers.py @@ -2,9 +2,9 @@ # SPDX-License-Identifier: Apache-2.0 import re -from typing import Annotated, Any, Literal, Union +from typing import Annotated, Literal, Union -from pydantic import BaseModel, BeforeValidator, Field, field_validator, model_validator +from pydantic import BaseModel, Field, field_validator, model_validator # SHM: megabyte/gigabyte scale only — Mi, Gi (binary) or M, G (decimal SI). # Ki / Ti / Pi / Ei and other suffixes are not accepted for /dev/shm. @@ -134,28 +134,9 @@ def validate_command(self) -> "SubprocessExecutionProvider": return self -def _infer_executor_kind(v: Any) -> Any: - """Infer ``kind`` from payload shape when absent. - - The SDK types haven't been regenerated yet, so incoming requests may - omit ``kind``. Infer it from the presence of ``container`` (→ container) - vs ``command`` (→ subprocess), or from the legacy ``provider="subprocess"``. 
- """ - if not isinstance(v, dict) or "kind" in v: - return v - if v.get("provider") == "subprocess" or "command" in v: - v = {**v, "kind": "subprocess"} - if v.get("provider") == "subprocess": - v["provider"] = "cpu" - else: - v = {**v, "kind": "container"} - return v - - # Discriminated union type for execution providers. # Uses ``kind`` to distinguish container vs subprocess payload shapes. Provider = Annotated[ Union[ContainerExecutionProvider, SubprocessExecutionProvider], - BeforeValidator(_infer_executor_kind), Field(discriminator="kind"), ] diff --git a/services/core/jobs/tests/conftest.py b/services/core/jobs/tests/conftest.py index c50df7f6ff..18f45d983a 100644 --- a/services/core/jobs/tests/conftest.py +++ b/services/core/jobs/tests/conftest.py @@ -13,7 +13,7 @@ from httpx import ASGITransport, AsyncClient from nemo_platform import AsyncNeMoPlatform from nemo_platform_plugin.jobs.api_factory import ContainerSpec as FactoryContainerSpec -from nemo_platform_plugin.jobs.api_factory import CPUExecutionProviderSpec as FactoryCPUExecutionProviderSpec +from nemo_platform_plugin.jobs.api_factory import ContainerExecutionProviderSpec as FactoryCPUExecutionProviderSpec from nemo_platform_plugin.jobs.api_factory import PlatformJobEnvironmentVariableParam, job_route_factory from nemo_platform_plugin.jobs.api_factory import PlatformJobSpec as FactoryPlatformJobSpec from nemo_platform_plugin.jobs.api_factory import PlatformJobStep as FactoryPlatformJobStep @@ -491,6 +491,7 @@ def hello_world_job_config( FactoryPlatformJobStep( name="hello-world-step-1", executor=FactoryCPUExecutionProviderSpec( + kind="container", provider="cpu", profile="default", container=FactoryContainerSpec( diff --git a/services/core/jobs/tests/test_jobs_api.py b/services/core/jobs/tests/test_jobs_api.py index 2bd8a4c076..627dc6f4b2 100644 --- a/services/core/jobs/tests/test_jobs_api.py +++ b/services/core/jobs/tests/test_jobs_api.py @@ -69,6 +69,7 @@ async def 
test_create_job_using_sdk(test_sdk: AsyncNeMoPlatform): { "name": "basic", "executor": { + "kind": "container", "provider": "cpu", "profile": "default", "container": { diff --git a/services/core/models/src/nmp/core/models/api/v2/models.py b/services/core/models/src/nmp/core/models/api/v2/models.py index aa84d33ffd..b023c7681b 100644 --- a/services/core/models/src/nmp/core/models/api/v2/models.py +++ b/services/core/models/src/nmp/core/models/api/v2/models.py @@ -6,8 +6,8 @@ from fastapi import APIRouter, Depends, HTTPException, Query, status from nemo_platform import APIError, AsyncNeMoPlatform from nemo_platform_plugin.jobs.api_factory import ( + ContainerExecutionProviderSpec, ContainerSpec, - CPUExecutionProviderSpec, EnvironmentVariable, PlatformJobSpec, PlatformJobStep, @@ -267,7 +267,8 @@ async def start_update_model_spec_job(model_entity: ModelEntity): # Step 1: Download model and dataset files from Files service PlatformJobStep( name="model-spec-analysis", - executor=CPUExecutionProviderSpec( + executor=ContainerExecutionProviderSpec( + kind="container", provider="cpu", container=ContainerSpec( image=get_qualified_image("nmp-automodel-tasks"), diff --git a/services/hello-world/src/nmp/hello_world/api/v2/jobs/endpoints.py b/services/hello-world/src/nmp/hello_world/api/v2/jobs/endpoints.py index 711a80e9df..6747f52bd3 100644 --- a/services/hello-world/src/nmp/hello_world/api/v2/jobs/endpoints.py +++ b/services/hello-world/src/nmp/hello_world/api/v2/jobs/endpoints.py @@ -7,8 +7,8 @@ from nemo_platform import AsyncNeMoPlatform from nemo_platform_plugin.entities import EntityClient from nemo_platform_plugin.jobs.api_factory import ( + ContainerExecutionProviderSpec, ContainerSpec, - CPUExecutionProviderSpec, PlatformJobSpec, PlatformJobStep, job_route_factory, @@ -41,7 +41,8 @@ def compile_hello_world_job( steps=[ PlatformJobStep( name="hello-world", - executor=CPUExecutionProviderSpec( + executor=ContainerExecutionProviderSpec( + kind="container", 
provider="cpu", profile="default", container=ContainerSpec( diff --git a/services/unsloth/src/nmp/unsloth/app/jobs/compiler.py b/services/unsloth/src/nmp/unsloth/app/jobs/compiler.py index b4b3083986..b741b4293e 100644 --- a/services/unsloth/src/nmp/unsloth/app/jobs/compiler.py +++ b/services/unsloth/src/nmp/unsloth/app/jobs/compiler.py @@ -19,8 +19,8 @@ from nemo_platform import AsyncNeMoPlatform from nemo_platform.types.models.model_entity import ModelEntity from nemo_platform_plugin.jobs.api_factory import ( + ContainerExecutionProviderSpec, ContainerSpec, - CPUExecutionProviderSpec, EnvironmentVariable, PlatformJobSpec, PlatformJobStep, @@ -240,7 +240,8 @@ async def platform_job_config_compiler( steps: list[PlatformJobStep] = [ PlatformJobStep( name="model-and-dataset-download", - executor=CPUExecutionProviderSpec( + executor=ContainerExecutionProviderSpec( + kind="container", provider="cpu", container=ContainerSpec( image=get_tasks_image(), @@ -260,7 +261,8 @@ async def platform_job_config_compiler( ), PlatformJobStep( name="model-upload", - executor=CPUExecutionProviderSpec( + executor=ContainerExecutionProviderSpec( + kind="container", provider="cpu", container=ContainerSpec( image=get_tasks_image(), @@ -274,7 +276,8 @@ async def platform_job_config_compiler( ), PlatformJobStep( name="model-entity-creation", - executor=CPUExecutionProviderSpec( + executor=ContainerExecutionProviderSpec( + kind="container", provider="cpu", container=ContainerSpec( image=get_tasks_image(), diff --git a/services/unsloth/src/nmp/unsloth/app/jobs/training/compiler.py b/services/unsloth/src/nmp/unsloth/app/jobs/training/compiler.py index 9439c175f9..c5f020efbc 100644 --- a/services/unsloth/src/nmp/unsloth/app/jobs/training/compiler.py +++ b/services/unsloth/src/nmp/unsloth/app/jobs/training/compiler.py @@ -13,9 +13,9 @@ import logging from nemo_platform_plugin.jobs.api_factory import ( + ContainerExecutionProviderSpec, ContainerSpec, EnvironmentVariable, - 
GPUExecutionProviderSpec, PlatformJobStep, ResourcesSpec, ) @@ -62,7 +62,8 @@ def compile_training_step( output_path=DEFAULT_OUTPUT_MODEL_PATH, ) - executor: GPUExecutionProviderSpec = { + executor: ContainerExecutionProviderSpec = { + "kind": "container", "provider": "gpu", "container": ContainerSpec( image=get_training_image(), From af07f83272288602868d4fbea08083974d5886c4 Mon Sep 17 00:00:00 2001 From: Matthew Grossman Date: Mon, 15 Jun 2026 14:15:37 -0700 Subject: [PATCH 03/17] fixes Signed-off-by: Matthew Grossman --- openapi/ga/individual/platform.openapi.yaml | 41 +++++++++++++++++++ openapi/ga/openapi.yaml | 41 +++++++++++++++++++ openapi/openapi.yaml | 41 +++++++++++++++++++ .../src/nemo_platform_plugin/job.py | 8 ++-- .../nemo-platform/.nmpcontext/openapi.yaml | 41 +++++++++++++++++++ .../jobs/docker_job_execution_profile.py | 3 ++ .../types/jobs/e2e_job_execution_profile.py | 3 ++ .../jobs/kubernetes_job_execution_profile.py | 3 ++ .../jobs/subprocess_job_execution_profile.py | 2 + .../jobs/volcano_job_execution_profile.py | 3 ++ .../jobs/src/nmp/core/jobs/app/providers.py | 4 ++ .../jobs/src/nmp/core/jobs/app/schemas.py | 6 ++- .../jobs/controllers/backends/subprocess.py | 1 + 13 files changed, 193 insertions(+), 4 deletions(-) diff --git a/openapi/ga/individual/platform.openapi.yaml b/openapi/ga/individual/platform.openapi.yaml index f387108325..def92d98c3 100644 --- a/openapi/ga/individual/platform.openapi.yaml +++ b/openapi/ga/individual/platform.openapi.yaml @@ -9279,6 +9279,15 @@ components: description: The profile name for the executor, e.g., high_priority_a100, low_priority, etc. default: default + kind: + type: string + enum: + - container + - subprocess + title: Kind + description: 'The executor payload shape this profile expects: ''container'' + or ''subprocess''.' 
+ default: container backend: type: string const: docker @@ -9419,6 +9428,15 @@ components: description: The profile name for the executor, e.g., high_priority_a100, low_priority, etc. default: default + kind: + type: string + enum: + - container + - subprocess + title: Kind + description: 'The executor payload shape this profile expects: ''container'' + or ''subprocess''.' + default: container backend: type: string const: e2e @@ -11758,6 +11776,15 @@ components: description: The profile name for the executor, e.g., high_priority_a100, low_priority, etc. default: default + kind: + type: string + enum: + - container + - subprocess + title: Kind + description: 'The executor payload shape this profile expects: ''container'' + or ''subprocess''.' + default: container backend: type: string const: kubernetes_job @@ -16417,6 +16444,11 @@ components: description: The profile name for the executor, e.g., high_priority_a100, low_priority, etc. default: default + kind: + type: string + const: subprocess + title: Kind + default: subprocess backend: type: string const: subprocess @@ -17432,6 +17464,15 @@ components: description: The profile name for the executor, e.g., high_priority_a100, low_priority, etc. default: default + kind: + type: string + enum: + - container + - subprocess + title: Kind + description: 'The executor payload shape this profile expects: ''container'' + or ''subprocess''.' + default: container backend: type: string const: volcano_job diff --git a/openapi/ga/openapi.yaml b/openapi/ga/openapi.yaml index f387108325..def92d98c3 100644 --- a/openapi/ga/openapi.yaml +++ b/openapi/ga/openapi.yaml @@ -9279,6 +9279,15 @@ components: description: The profile name for the executor, e.g., high_priority_a100, low_priority, etc. default: default + kind: + type: string + enum: + - container + - subprocess + title: Kind + description: 'The executor payload shape this profile expects: ''container'' + or ''subprocess''.' 
+ default: container backend: type: string const: docker @@ -9419,6 +9428,15 @@ components: description: The profile name for the executor, e.g., high_priority_a100, low_priority, etc. default: default + kind: + type: string + enum: + - container + - subprocess + title: Kind + description: 'The executor payload shape this profile expects: ''container'' + or ''subprocess''.' + default: container backend: type: string const: e2e @@ -11758,6 +11776,15 @@ components: description: The profile name for the executor, e.g., high_priority_a100, low_priority, etc. default: default + kind: + type: string + enum: + - container + - subprocess + title: Kind + description: 'The executor payload shape this profile expects: ''container'' + or ''subprocess''.' + default: container backend: type: string const: kubernetes_job @@ -16417,6 +16444,11 @@ components: description: The profile name for the executor, e.g., high_priority_a100, low_priority, etc. default: default + kind: + type: string + const: subprocess + title: Kind + default: subprocess backend: type: string const: subprocess @@ -17432,6 +17464,15 @@ components: description: The profile name for the executor, e.g., high_priority_a100, low_priority, etc. default: default + kind: + type: string + enum: + - container + - subprocess + title: Kind + description: 'The executor payload shape this profile expects: ''container'' + or ''subprocess''.' + default: container backend: type: string const: volcano_job diff --git a/openapi/openapi.yaml b/openapi/openapi.yaml index f387108325..def92d98c3 100644 --- a/openapi/openapi.yaml +++ b/openapi/openapi.yaml @@ -9279,6 +9279,15 @@ components: description: The profile name for the executor, e.g., high_priority_a100, low_priority, etc. default: default + kind: + type: string + enum: + - container + - subprocess + title: Kind + description: 'The executor payload shape this profile expects: ''container'' + or ''subprocess''.' 
+ default: container backend: type: string const: docker @@ -9419,6 +9428,15 @@ components: description: The profile name for the executor, e.g., high_priority_a100, low_priority, etc. default: default + kind: + type: string + enum: + - container + - subprocess + title: Kind + description: 'The executor payload shape this profile expects: ''container'' + or ''subprocess''.' + default: container backend: type: string const: e2e @@ -11758,6 +11776,15 @@ components: description: The profile name for the executor, e.g., high_priority_a100, low_priority, etc. default: default + kind: + type: string + enum: + - container + - subprocess + title: Kind + description: 'The executor payload shape this profile expects: ''container'' + or ''subprocess''.' + default: container backend: type: string const: kubernetes_job @@ -16417,6 +16444,11 @@ components: description: The profile name for the executor, e.g., high_priority_a100, low_priority, etc. default: default + kind: + type: string + const: subprocess + title: Kind + default: subprocess backend: type: string const: subprocess @@ -17432,6 +17464,15 @@ components: description: The profile name for the executor, e.g., high_priority_a100, low_priority, etc. default: default + kind: + type: string + enum: + - container + - subprocess + title: Kind + description: 'The executor payload shape this profile expects: ''container'' + or ''subprocess''.' + default: container backend: type: string const: volcano_job diff --git a/packages/nemo_platform_plugin/src/nemo_platform_plugin/job.py b/packages/nemo_platform_plugin/src/nemo_platform_plugin/job.py index 265a56dcfb..24976c19da 100644 --- a/packages/nemo_platform_plugin/src/nemo_platform_plugin/job.py +++ b/packages/nemo_platform_plugin/src/nemo_platform_plugin/job.py @@ -172,9 +172,6 @@ class NemoJob(_NamedPlugin): description: ClassVar[str] = "" container: ClassVar[str] = "cpu-tasks" execution_provider: ClassVar[str] = "cpu" - # Execution kind: "container" (default) or "subprocess". 
- # Subprocess jobs override this to "subprocess". - execution_kind: ClassVar[str] = "container" # ------------------------------------------------------------------ # # Spec schemas — canonical ``spec_schema``; optional ``input_spec_schema`` @@ -288,6 +285,11 @@ async def compile( must override this method; the plugin service produces the ``PlatformJobSpec`` the Jobs service expects by invoking it. + Compilers that need to support both container and subprocess + backends can use :func:`~nemo_platform_plugin.jobs.profiles.resolve_profile_kind` + to determine the executor kind for the given profile, rather + than hardcoding profile names. + Args: workspace: Workspace scope. spec: Canonical :attr:`spec_schema` instance. diff --git a/sdk/python/nemo-platform/.nmpcontext/openapi.yaml b/sdk/python/nemo-platform/.nmpcontext/openapi.yaml index f387108325..def92d98c3 100644 --- a/sdk/python/nemo-platform/.nmpcontext/openapi.yaml +++ b/sdk/python/nemo-platform/.nmpcontext/openapi.yaml @@ -9279,6 +9279,15 @@ components: description: The profile name for the executor, e.g., high_priority_a100, low_priority, etc. default: default + kind: + type: string + enum: + - container + - subprocess + title: Kind + description: 'The executor payload shape this profile expects: ''container'' + or ''subprocess''.' + default: container backend: type: string const: docker @@ -9419,6 +9428,15 @@ components: description: The profile name for the executor, e.g., high_priority_a100, low_priority, etc. default: default + kind: + type: string + enum: + - container + - subprocess + title: Kind + description: 'The executor payload shape this profile expects: ''container'' + or ''subprocess''.' + default: container backend: type: string const: e2e @@ -11758,6 +11776,15 @@ components: description: The profile name for the executor, e.g., high_priority_a100, low_priority, etc. 
default: default + kind: + type: string + enum: + - container + - subprocess + title: Kind + description: 'The executor payload shape this profile expects: ''container'' + or ''subprocess''.' + default: container backend: type: string const: kubernetes_job @@ -16417,6 +16444,11 @@ components: description: The profile name for the executor, e.g., high_priority_a100, low_priority, etc. default: default + kind: + type: string + const: subprocess + title: Kind + default: subprocess backend: type: string const: subprocess @@ -17432,6 +17464,15 @@ components: description: The profile name for the executor, e.g., high_priority_a100, low_priority, etc. default: default + kind: + type: string + enum: + - container + - subprocess + title: Kind + description: 'The executor payload shape this profile expects: ''container'' + or ''subprocess''.' + default: container backend: type: string const: volcano_job diff --git a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/docker_job_execution_profile.py b/sdk/python/nemo-platform/src/nemo_platform/types/jobs/docker_job_execution_profile.py index db6bfb344a..19b52806dc 100644 --- a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/docker_job_execution_profile.py +++ b/sdk/python/nemo-platform/src/nemo_platform/types/jobs/docker_job_execution_profile.py @@ -36,6 +36,9 @@ class DockerJobExecutionProfile(BaseModel): backend: Optional[Literal["docker"]] = None + kind: Optional[Literal["container", "subprocess"]] = None + """The executor payload shape this profile expects: 'container' or 'subprocess'.""" + profile: Optional[str] = None """The profile name for the executor, e.g., high_priority_a100, low_priority, etc.""" diff --git a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/e2e_job_execution_profile.py b/sdk/python/nemo-platform/src/nemo_platform/types/jobs/e2e_job_execution_profile.py index c7a8d0f4df..04f77975c3 100644 --- a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/e2e_job_execution_profile.py +++ 
b/sdk/python/nemo-platform/src/nemo_platform/types/jobs/e2e_job_execution_profile.py @@ -36,6 +36,9 @@ class E2EJobExecutionProfile(BaseModel): config: Optional[JobExecutionProfileConfig] = None """Configuration for the e2e test executor""" + kind: Optional[Literal["container", "subprocess"]] = None + """The executor payload shape this profile expects: 'container' or 'subprocess'.""" + profile: Optional[str] = None """The profile name for the executor, e.g., high_priority_a100, low_priority, etc.""" diff --git a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/kubernetes_job_execution_profile.py b/sdk/python/nemo-platform/src/nemo_platform/types/jobs/kubernetes_job_execution_profile.py index 81aeec0675..d64092149c 100644 --- a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/kubernetes_job_execution_profile.py +++ b/sdk/python/nemo-platform/src/nemo_platform/types/jobs/kubernetes_job_execution_profile.py @@ -36,6 +36,9 @@ class KubernetesJobExecutionProfile(BaseModel): backend: Optional[Literal["kubernetes_job"]] = None + kind: Optional[Literal["container", "subprocess"]] = None + """The executor payload shape this profile expects: 'container' or 'subprocess'.""" + profile: Optional[str] = None """The profile name for the executor, e.g., high_priority_a100, low_priority, etc.""" diff --git a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/subprocess_job_execution_profile.py b/sdk/python/nemo-platform/src/nemo_platform/types/jobs/subprocess_job_execution_profile.py index 1cdbb5d38d..c5154eaf6d 100644 --- a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/subprocess_job_execution_profile.py +++ b/sdk/python/nemo-platform/src/nemo_platform/types/jobs/subprocess_job_execution_profile.py @@ -30,6 +30,8 @@ class SubprocessJobExecutionProfile(BaseModel): config: Optional[SubprocessJobExecutionProfileConfig] = None """Additional configuration for the subprocess executor""" + kind: Optional[Literal["subprocess"]] = None + profile: Optional[str] = None 
"""The profile name for the executor, e.g., high_priority_a100, low_priority, etc.""" diff --git a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/volcano_job_execution_profile.py b/sdk/python/nemo-platform/src/nemo_platform/types/jobs/volcano_job_execution_profile.py index f4e1456ff2..0954f4563a 100644 --- a/sdk/python/nemo-platform/src/nemo_platform/types/jobs/volcano_job_execution_profile.py +++ b/sdk/python/nemo-platform/src/nemo_platform/types/jobs/volcano_job_execution_profile.py @@ -32,6 +32,9 @@ class VolcanoJobExecutionProfile(BaseModel): backend: Optional[Literal["volcano_job"]] = None + kind: Optional[Literal["container", "subprocess"]] = None + """The executor payload shape this profile expects: 'container' or 'subprocess'.""" + profile: Optional[str] = None """The profile name for the executor, e.g., high_priority_a100, low_priority, etc.""" diff --git a/services/core/jobs/src/nmp/core/jobs/app/providers.py b/services/core/jobs/src/nmp/core/jobs/app/providers.py index f64e7fb5ef..61894c285e 100644 --- a/services/core/jobs/src/nmp/core/jobs/app/providers.py +++ b/services/core/jobs/src/nmp/core/jobs/app/providers.py @@ -85,6 +85,10 @@ class TaskSpec(BaseModel): """Arguments to pass to the command. Can be a list of strings or a single string.""" +ExecutorKind = Literal["container", "subprocess"] +"""Executor payload shape: ``"container"`` for image-backed work, ``"subprocess"`` for host commands.""" + + class ContainerExecutionProvider(BaseModel): """Container-based execution provider. 
diff --git a/services/core/jobs/src/nmp/core/jobs/app/schemas.py b/services/core/jobs/src/nmp/core/jobs/app/schemas.py index d5e8b75992..c3f15b19d1 100644 --- a/services/core/jobs/src/nmp/core/jobs/app/schemas.py +++ b/services/core/jobs/src/nmp/core/jobs/app/schemas.py @@ -7,7 +7,7 @@ from nmp.common.entities.constants import NAME_PATTERN, NAME_PATTERN_DESCRIPTION from nmp.common.jobs.constants import PERSISTENT_JOB_STORAGE_PATH_ENVVAR -from nmp.core.jobs.app.providers import Provider +from nmp.core.jobs.app.providers import ExecutorKind, Provider from pydantic import BaseModel, ConfigDict, Field, model_validator # ============================================================================= @@ -130,6 +130,10 @@ class BaseExecutionProfile(BaseModel): default="default", description="The profile name for the executor, e.g., high_priority_a100, low_priority, etc.", ) + kind: ExecutorKind = Field( + default="container", + description="The executor payload shape this profile expects: 'container' or 'subprocess'.", + ) @property def supports_persistent_storage(self) -> bool: diff --git a/services/core/jobs/src/nmp/core/jobs/controllers/backends/subprocess.py b/services/core/jobs/src/nmp/core/jobs/controllers/backends/subprocess.py index 5405176fcd..7b69ab4b22 100644 --- a/services/core/jobs/src/nmp/core/jobs/controllers/backends/subprocess.py +++ b/services/core/jobs/src/nmp/core/jobs/controllers/backends/subprocess.py @@ -85,6 +85,7 @@ class SubprocessJobExecutionProfileConfig(JobExecutionProfileConfig): class SubprocessJobExecutionProfile(BaseExecutionProfile): provider: Literal["cpu"] = "cpu" backend: Literal["subprocess"] = "subprocess" + kind: Literal["subprocess"] = "subprocess" config: SubprocessJobExecutionProfileConfig = Field( default_factory=SubprocessJobExecutionProfileConfig, description="Additional configuration for the subprocess executor", From 78191cbf10dc78600beabbbe90a86faccd0fa868 Mon Sep 17 00:00:00 2001 From: Matthew Grossman Date: Mon, 15 Jun 2026 
14:23:37 -0700 Subject: [PATCH 04/17] make update-sdk Signed-off-by: Matthew Grossman --- .../src/nemo_platform/resources/files/api.md | 2 +- .../nemo_platform/resources/files/filesets.py | 1 - .../src/nemo_platform/types/__init__.py | 1 + .../src/nemo_platform/types/files/__init__.py | 2 - .../src/nemo_platform/types/files/fileset.py | 2 +- .../types/files/fileset_create_params.py | 1 - .../types/files/fileset_metadata_param.py | 47 ------------------- .../nemo_platform/types/shared/__init__.py | 1 + .../{files => shared}/fileset_metadata.py | 4 +- .../shared_params/fileset_metadata_param.py | 4 +- 10 files changed, 8 insertions(+), 57 deletions(-) delete mode 100644 sdk/python/nemo-platform/src/nemo_platform/types/files/fileset_metadata_param.py rename sdk/python/nemo-platform/src/nemo_platform/types/{files => shared}/fileset_metadata.py (91%) diff --git a/sdk/python/nemo-platform/src/nemo_platform/resources/files/api.md b/sdk/python/nemo-platform/src/nemo_platform/resources/files/api.md index 882f649add..72e7b5ca66 100644 --- a/sdk/python/nemo-platform/src/nemo_platform/resources/files/api.md +++ b/sdk/python/nemo-platform/src/nemo_platform/resources/files/api.md @@ -33,7 +33,7 @@ Methods: Types: ```python -from nemo_platform.types.files import FilesetFilter, FilesetMetadata, FilesetMetadataParam +from nemo_platform.types.files import FilesetFilter ``` Methods: diff --git a/sdk/python/nemo-platform/src/nemo_platform/resources/files/filesets.py b/sdk/python/nemo-platform/src/nemo_platform/resources/files/filesets.py index 2fbd9935dc..9b7afcb95f 100644 --- a/sdk/python/nemo-platform/src/nemo_platform/resources/files/filesets.py +++ b/sdk/python/nemo-platform/src/nemo_platform/resources/files/filesets.py @@ -34,7 +34,6 @@ from ...pagination import SyncDefaultPagination, AsyncDefaultPagination from ...types.files import ( FilesetPurpose, - FilesetMetadataParam, fileset_list_params, fileset_create_params, fileset_update_params, diff --git 
a/sdk/python/nemo-platform/src/nemo_platform/types/__init__.py b/sdk/python/nemo-platform/src/nemo_platform/types/__init__.py index 571b87927b..eb3af5c4f7 100644 --- a/sdk/python/nemo-platform/src/nemo_platform/types/__init__.py +++ b/sdk/python/nemo-platform/src/nemo_platform/types/__init__.py @@ -32,6 +32,7 @@ PlatformJobLog as PlatformJobLog, ToolCallConfig as ToolCallConfig, APIEndpointData as APIEndpointData, + FilesetMetadata as FilesetMetadata, FileStorageType as FileStorageType, InferenceParams as InferenceParams, LinearLayerSpec as LinearLayerSpec, diff --git a/sdk/python/nemo-platform/src/nemo_platform/types/files/__init__.py b/sdk/python/nemo-platform/src/nemo_platform/types/files/__init__.py index b76dd4a694..3833c1d785 100644 --- a/sdk/python/nemo-platform/src/nemo_platform/types/files/__init__.py +++ b/sdk/python/nemo-platform/src/nemo_platform/types/files/__init__.py @@ -22,7 +22,6 @@ from .cache_status import CacheStatus as CacheStatus from .fileset_file import FilesetFile as FilesetFile from .fileset_purpose import FilesetPurpose as FilesetPurpose -from .fileset_metadata import FilesetMetadata as FilesetMetadata from .s3_storage_config import S3StorageConfig as S3StorageConfig from .ngc_storage_config import NGCStorageConfig as NGCStorageConfig from .fileset_list_params import FilesetListParams as FilesetListParams @@ -33,7 +32,6 @@ from .fileset_create_params import FilesetCreateParams as FilesetCreateParams from .fileset_update_params import FilesetUpdateParams as FilesetUpdateParams from .file_list_files_params import FileListFilesParams as FileListFilesParams -from .fileset_metadata_param import FilesetMetadataParam as FilesetMetadataParam from .file_upload_file_params import FileUploadFileParams as FileUploadFileParams from .s3_storage_config_param import S3StorageConfigParam as S3StorageConfigParam from .ngc_storage_config_param import NGCStorageConfigParam as NGCStorageConfigParam diff --git 
a/sdk/python/nemo-platform/src/nemo_platform/types/files/fileset.py b/sdk/python/nemo-platform/src/nemo_platform/types/files/fileset.py index e6d9642b7a..810d5ce990 100644 --- a/sdk/python/nemo-platform/src/nemo_platform/types/files/fileset.py +++ b/sdk/python/nemo-platform/src/nemo_platform/types/files/fileset.py @@ -20,10 +20,10 @@ from ..._models import BaseModel from .fileset_purpose import FilesetPurpose -from .fileset_metadata import FilesetMetadata from .s3_storage_config import S3StorageConfig from .ngc_storage_config import NGCStorageConfig from .local_storage_config import LocalStorageConfig +from ..shared.fileset_metadata import FilesetMetadata from .huggingface_storage_config import HuggingfaceStorageConfig __all__ = ["Fileset", "Storage"] diff --git a/sdk/python/nemo-platform/src/nemo_platform/types/files/fileset_create_params.py b/sdk/python/nemo-platform/src/nemo_platform/types/files/fileset_create_params.py index ea3cb763f7..9836fcb477 100644 --- a/sdk/python/nemo-platform/src/nemo_platform/types/files/fileset_create_params.py +++ b/sdk/python/nemo-platform/src/nemo_platform/types/files/fileset_create_params.py @@ -21,7 +21,6 @@ from typing_extensions import Required, TypeAlias, TypedDict from .fileset_purpose import FilesetPurpose -from .fileset_metadata_param import FilesetMetadataParam from .s3_storage_config_param import S3StorageConfigParam from .ngc_storage_config_param import NGCStorageConfigParam from .local_storage_config_param import LocalStorageConfigParam diff --git a/sdk/python/nemo-platform/src/nemo_platform/types/files/fileset_metadata_param.py b/sdk/python/nemo-platform/src/nemo_platform/types/files/fileset_metadata_param.py deleted file mode 100644 index 66f37de921..0000000000 --- a/sdk/python/nemo-platform/src/nemo_platform/types/files/fileset_metadata_param.py +++ /dev/null @@ -1,47 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. - -from __future__ import annotations - -from typing_extensions import TypedDict - -from ..shared_params.model_metadata_content import ModelMetadataContent -from ..shared_params.dataset_metadata_content import DatasetMetadataContent - -__all__ = ["FilesetMetadataParam"] - - -class FilesetMetadataParam(TypedDict, total=False): - """Tagged metadata container - the key indicates the type. - - Example: - metadata = FilesetMetadata( - dataset=DatasetMetadataContent( - schema={"columns": ["id", "name"]}, - ) - ) - """ - - dataset: DatasetMetadataContent - """Content for dataset-type filesets.""" - - model: ModelMetadataContent - """Content for model-type filesets. - - Contains tool calling configuration that is merged into the ModelSpec during - checkpoint analysis. 
- """ diff --git a/sdk/python/nemo-platform/src/nemo_platform/types/shared/__init__.py b/sdk/python/nemo-platform/src/nemo_platform/types/shared/__init__.py index 70ea9bdc92..d16fead87f 100644 --- a/sdk/python/nemo-platform/src/nemo_platform/types/shared/__init__.py +++ b/sdk/python/nemo-platform/src/nemo_platform/types/shared/__init__.py @@ -26,6 +26,7 @@ from .delete_response import DeleteResponse as DeleteResponse from .finetuning_type import FinetuningType as FinetuningType from .pagination_data import PaginationData as PaginationData +from .fileset_metadata import FilesetMetadata as FilesetMetadata from .inference_params import InferenceParams as InferenceParams from .platform_job_log import PlatformJobLog as PlatformJobLog from .tool_call_config import ToolCallConfig as ToolCallConfig diff --git a/sdk/python/nemo-platform/src/nemo_platform/types/files/fileset_metadata.py b/sdk/python/nemo-platform/src/nemo_platform/types/shared/fileset_metadata.py similarity index 91% rename from sdk/python/nemo-platform/src/nemo_platform/types/files/fileset_metadata.py rename to sdk/python/nemo-platform/src/nemo_platform/types/shared/fileset_metadata.py index 36573bd374..b35b6d8ecc 100644 --- a/sdk/python/nemo-platform/src/nemo_platform/types/files/fileset_metadata.py +++ b/sdk/python/nemo-platform/src/nemo_platform/types/shared/fileset_metadata.py @@ -18,8 +18,8 @@ from typing import Optional from ..._models import BaseModel -from ..shared.model_metadata_content import ModelMetadataContent -from ..shared.dataset_metadata_content import DatasetMetadataContent +from .model_metadata_content import ModelMetadataContent +from .dataset_metadata_content import DatasetMetadataContent __all__ = ["FilesetMetadata"] diff --git a/sdk/python/nemo-platform/src/nemo_platform/types/shared_params/fileset_metadata_param.py b/sdk/python/nemo-platform/src/nemo_platform/types/shared_params/fileset_metadata_param.py index 66f37de921..e3f510ca6e 100644 --- 
a/sdk/python/nemo-platform/src/nemo_platform/types/shared_params/fileset_metadata_param.py +++ b/sdk/python/nemo-platform/src/nemo_platform/types/shared_params/fileset_metadata_param.py @@ -19,8 +19,8 @@ from typing_extensions import TypedDict -from ..shared_params.model_metadata_content import ModelMetadataContent -from ..shared_params.dataset_metadata_content import DatasetMetadataContent +from .model_metadata_content import ModelMetadataContent +from .dataset_metadata_content import DatasetMetadataContent __all__ = ["FilesetMetadataParam"] From 03120ab152bab243daf691627cbe20c0ecc8de04 Mon Sep 17 00:00:00 2001 From: Matthew Grossman Date: Mon, 15 Jun 2026 14:24:04 -0700 Subject: [PATCH 05/17] add profiles Signed-off-by: Matthew Grossman --- .../src/nemo_platform_plugin/jobs/profiles.py | 101 ++++++++++++++++++ 1 file changed, 101 insertions(+) create mode 100644 packages/nemo_platform_plugin/src/nemo_platform_plugin/jobs/profiles.py diff --git a/packages/nemo_platform_plugin/src/nemo_platform_plugin/jobs/profiles.py b/packages/nemo_platform_plugin/src/nemo_platform_plugin/jobs/profiles.py new file mode 100644 index 0000000000..96aee31c87 --- /dev/null +++ b/packages/nemo_platform_plugin/src/nemo_platform_plugin/jobs/profiles.py @@ -0,0 +1,101 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Execution profile resolution for plugin compilers. + +Provides :func:`resolve_profile_kind` which queries the Jobs service's +execution profiles endpoint to determine what executor payload shape +(``"container"`` or ``"subprocess"``) a given ``(provider, profile)`` +pair expects. 
+ +Plugin compilers use this to emit the correct executor type without +hardcoding profile-name-to-kind mappings:: + + from nemo_platform_plugin.jobs.profiles import resolve_profile_kind + + kind = await resolve_profile_kind(async_sdk, "cpu", profile or "default") + if kind == "subprocess": + executor = SubprocessExecutionProviderSpec(...) + else: + executor = ContainerExecutionProviderSpec(...) + +.. note:: + + **Not the long-term strategy.** This client-side resolution is a + pragmatic bridge. The end-state (Razvan's ``compile_default`` design + from AIRCORE-397) moves compilation to the Jobs service backend + itself — the backend knows its own kind and constructs the executor + server-side. When that lands, plugins will post a ``PluginJobSpec`` + (just the domain payload + metadata) and this helper becomes + unnecessary. See ``plan-default-compilation.md`` in the AIRCORE-397 + architecture plans. +""" + +from __future__ import annotations + +import logging +import time +from typing import Any, Literal + +from nemo_platform import AsyncNeMoPlatform +from nemo_platform_plugin.jobs.exceptions import PlatformJobCompilationError + +ExecutorKind = Literal["container", "subprocess"] +"""Executor payload shape: ``"container"`` for image-backed work, ``"subprocess"`` for host commands.""" + +logger = logging.getLogger(__name__) + +# TODO(AIRCORE-397): Remove this module when compile_default() lands on +# the backend classes. At that point the Jobs service resolves the +# profile kind server-side and plugin compilers no longer need to query +# execution profiles. 
+ +_CACHE_TTL_SECONDS = 300 # 5 minutes +_cache: dict[str, Any] = {"profiles": None, "fetched_at": 0.0} + + +async def _fetch_execution_profiles(sdk: AsyncNeMoPlatform) -> list[Any]: + """Fetch execution profiles from the Jobs service, with caching.""" + now = time.monotonic() + if _cache["profiles"] is not None and (now - _cache["fetched_at"]) < _CACHE_TTL_SECONDS: + return _cache["profiles"] + + profiles = await sdk.jobs.list_execution_profiles() + _cache["profiles"] = profiles + _cache["fetched_at"] = now + return profiles + + +async def resolve_profile_kind( + sdk: AsyncNeMoPlatform, + provider: str, + profile: str, +) -> ExecutorKind: + """Resolve the executor payload kind for a ``(provider, profile)`` pair. + + Queries the Jobs service's ``GET /v2/execution-profiles`` endpoint + (cached for 5 minutes) and returns the profile's ``kind`` field + (``"container"`` or ``"subprocess"``). + + Args: + sdk: Async platform SDK client. + provider: Compute provider (``"cpu"``, ``"gpu"``, ``"gpu_distributed"``). + profile: Execution profile name (``"default"``, ``"subprocess"``, etc.). + + Returns: + ``"container"`` or ``"subprocess"``. + + Raises: + PlatformJobCompilationError: If no matching execution profile is found. + """ + profiles = await _fetch_execution_profiles(sdk) + for p in profiles: + if getattr(p, "provider", None) == provider and getattr(p, "profile", None) == profile: + kind = getattr(p, "kind", None) + if kind is not None: + return kind + + raise PlatformJobCompilationError( + f"Execution profile '{provider}/{profile}' not found. " + f"Check that the Jobs service has a profile registered for provider='{provider}', profile='{profile}'." 
+ ) From 91e6e65458d366b1c0c7dde64c9b813524e7ec9f Mon Sep 17 00:00:00 2001 From: Matthew Grossman Date: Mon, 15 Jun 2026 14:30:25 -0700 Subject: [PATCH 06/17] fixes Signed-off-by: Matthew Grossman --- services/core/jobs/tests/conftest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/services/core/jobs/tests/conftest.py b/services/core/jobs/tests/conftest.py index 18f45d983a..7d54aeb22c 100644 --- a/services/core/jobs/tests/conftest.py +++ b/services/core/jobs/tests/conftest.py @@ -13,7 +13,7 @@ from httpx import ASGITransport, AsyncClient from nemo_platform import AsyncNeMoPlatform from nemo_platform_plugin.jobs.api_factory import ContainerSpec as FactoryContainerSpec -from nemo_platform_plugin.jobs.api_factory import ContainerExecutionProviderSpec as FactoryCPUExecutionProviderSpec +from nemo_platform_plugin.jobs.api_factory import ContainerExecutionProviderSpec as FactoryContainerExecutionProviderSpec from nemo_platform_plugin.jobs.api_factory import PlatformJobEnvironmentVariableParam, job_route_factory from nemo_platform_plugin.jobs.api_factory import PlatformJobSpec as FactoryPlatformJobSpec from nemo_platform_plugin.jobs.api_factory import PlatformJobStep as FactoryPlatformJobStep @@ -490,7 +490,7 @@ def hello_world_job_config( steps=[ FactoryPlatformJobStep( name="hello-world-step-1", - executor=FactoryCPUExecutionProviderSpec( + executor=FactoryContainerExecutionProviderSpec( kind="container", provider="cpu", profile="default", From 47c25165c89787650b1485896e6bc77909a8f640 Mon Sep 17 00:00:00 2001 From: Matthew Grossman Date: Mon, 15 Jun 2026 15:26:54 -0700 Subject: [PATCH 07/17] self code review Signed-off-by: Matthew Grossman --- .../nemo_platform_plugin/jobs/api_factory.py | 1 - .../src/nemo_platform_plugin/jobs/profiles.py | 23 ++++++++++--------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/packages/nemo_platform_plugin/src/nemo_platform_plugin/jobs/api_factory.py 
b/packages/nemo_platform_plugin/src/nemo_platform_plugin/jobs/api_factory.py index f5080b80a0..9e6c615487 100644 --- a/packages/nemo_platform_plugin/src/nemo_platform_plugin/jobs/api_factory.py +++ b/packages/nemo_platform_plugin/src/nemo_platform_plugin/jobs/api_factory.py @@ -99,7 +99,6 @@ class BaseJobRequest(BaseModel, Generic[JobConfigT]): ownership: dict | None = None custom_fields: dict | None = None profile: str | None = None - options: dict | None = None class BaseJob(BaseModel, Generic[JobConfigT]): diff --git a/packages/nemo_platform_plugin/src/nemo_platform_plugin/jobs/profiles.py b/packages/nemo_platform_plugin/src/nemo_platform_plugin/jobs/profiles.py index 96aee31c87..86982fd459 100644 --- a/packages/nemo_platform_plugin/src/nemo_platform_plugin/jobs/profiles.py +++ b/packages/nemo_platform_plugin/src/nemo_platform_plugin/jobs/profiles.py @@ -35,9 +35,10 @@ import logging import time -from typing import Any, Literal +from typing import Literal from nemo_platform import AsyncNeMoPlatform +from nemo_platform.types.jobs.job_list_execution_profiles_response import JobListExecutionProfilesResponseItem from nemo_platform_plugin.jobs.exceptions import PlatformJobCompilationError ExecutorKind = Literal["container", "subprocess"] @@ -51,18 +52,20 @@ # execution profiles. 
_CACHE_TTL_SECONDS = 300 # 5 minutes -_cache: dict[str, Any] = {"profiles": None, "fetched_at": 0.0} +_cached_profiles: list[JobListExecutionProfilesResponseItem] | None = None +_cached_at: float = 0.0 -async def _fetch_execution_profiles(sdk: AsyncNeMoPlatform) -> list[Any]: +async def _fetch_execution_profiles(sdk: AsyncNeMoPlatform) -> list[JobListExecutionProfilesResponseItem]: """Fetch execution profiles from the Jobs service, with caching.""" + global _cached_profiles, _cached_at now = time.monotonic() - if _cache["profiles"] is not None and (now - _cache["fetched_at"]) < _CACHE_TTL_SECONDS: - return _cache["profiles"] + if _cached_profiles is not None and (now - _cached_at) < _CACHE_TTL_SECONDS: + return _cached_profiles profiles = await sdk.jobs.list_execution_profiles() - _cache["profiles"] = profiles - _cache["fetched_at"] = now + _cached_profiles = profiles + _cached_at = now return profiles @@ -90,10 +93,8 @@ async def resolve_profile_kind( """ profiles = await _fetch_execution_profiles(sdk) for p in profiles: - if getattr(p, "provider", None) == provider and getattr(p, "profile", None) == profile: - kind = getattr(p, "kind", None) - if kind is not None: - return kind + if p.provider == provider and p.profile == profile and p.kind is not None: + return p.kind raise PlatformJobCompilationError( f"Execution profile '{provider}/{profile}' not found. 
" From 3b920b84873f211bed6c711cc7c20485f8de83d6 Mon Sep 17 00:00:00 2001 From: Matthew Grossman Date: Mon, 15 Jun 2026 16:07:45 -0700 Subject: [PATCH 08/17] lint Signed-off-by: Matthew Grossman --- plugins/nemo-agents/openapi/openapi.yaml | 20 ------------------- .../nemo-data-designer/openapi/openapi.yaml | 4 ---- plugins/nemo-evaluator/openapi/openapi.yaml | 4 ---- .../openapi/openapi.yaml | 4 ---- 4 files changed, 32 deletions(-) diff --git a/plugins/nemo-agents/openapi/openapi.yaml b/plugins/nemo-agents/openapi/openapi.yaml index 5e52cecc7f..4b00e75c7b 100644 --- a/plugins/nemo-agents/openapi/openapi.yaml +++ b/plugins/nemo-agents/openapi/openapi.yaml @@ -2552,10 +2552,6 @@ components: profile: title: Profile type: string - options: - title: Options - additionalProperties: true - type: object type: object required: - spec @@ -2848,10 +2844,6 @@ components: profile: title: Profile type: string - options: - title: Options - additionalProperties: true - type: object type: object required: - spec @@ -3070,10 +3062,6 @@ components: profile: title: Profile type: string - options: - title: Options - additionalProperties: true - type: object type: object required: - spec @@ -3432,10 +3420,6 @@ components: profile: title: Profile type: string - options: - title: Options - additionalProperties: true - type: object type: object required: - spec @@ -3655,10 +3639,6 @@ components: profile: title: Profile type: string - options: - title: Options - additionalProperties: true - type: object type: object required: - spec diff --git a/plugins/nemo-data-designer/openapi/openapi.yaml b/plugins/nemo-data-designer/openapi/openapi.yaml index b175ada9e4..509f111f17 100644 --- a/plugins/nemo-data-designer/openapi/openapi.yaml +++ b/plugins/nemo-data-designer/openapi/openapi.yaml @@ -837,10 +837,6 @@ components: profile: title: Profile type: string - options: - title: Options - additionalProperties: true - type: object type: object required: - spec diff --git 
a/plugins/nemo-evaluator/openapi/openapi.yaml b/plugins/nemo-evaluator/openapi/openapi.yaml index 27887f8588..2fada6a0c6 100644 --- a/plugins/nemo-evaluator/openapi/openapi.yaml +++ b/plugins/nemo-evaluator/openapi/openapi.yaml @@ -633,10 +633,6 @@ components: profile: title: Profile type: string - options: - title: Options - additionalProperties: true - type: object type: object required: - spec diff --git a/plugins/nemo-safe-synthesizer/openapi/openapi.yaml b/plugins/nemo-safe-synthesizer/openapi/openapi.yaml index 583589a967..1694156aef 100644 --- a/plugins/nemo-safe-synthesizer/openapi/openapi.yaml +++ b/plugins/nemo-safe-synthesizer/openapi/openapi.yaml @@ -1505,10 +1505,6 @@ components: profile: title: Profile type: string - options: - title: Options - additionalProperties: true - type: object type: object required: - spec From ca3bebc41e14d7bb3b93ebfdac37ce4c0c680b5f Mon Sep 17 00:00:00 2001 From: Matthew Grossman Date: Mon, 15 Jun 2026 16:13:03 -0700 Subject: [PATCH 09/17] fix(jobs): update config files and clean up PR review feedback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Fix local.yaml configs: provider=subprocess → provider=cpu, profile=subprocess - Remove unused `options` field from BaseJobRequest - Type the execution profiles cache properly (no more dict[str, Any]) - Rename stale FactoryCPUExecutionProviderSpec alias in conftest Co-Authored-By: Claude Opus 4.6 (1M context) Signed-off-by: Matthew Grossman --- packages/nmp_platform/config/local.yaml | 4 ++-- .../src/nmp/platform_runner/config/local.yaml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/packages/nmp_platform/config/local.yaml b/packages/nmp_platform/config/local.yaml index 7b7542da0c..9c67656982 100644 --- a/packages/nmp_platform/config/local.yaml +++ b/packages/nmp_platform/config/local.yaml @@ -62,8 +62,8 @@ jobs: # keys; `merge_executor_profiles` keys on (provider, profile) so subprocess # and Docker entries 
coexist. executors: - - provider: subprocess - profile: default + - provider: cpu + profile: subprocess backend: subprocess config: working_directory: /tmp/nmp-subprocess-jobs diff --git a/packages/nmp_platform_runner/src/nmp/platform_runner/config/local.yaml b/packages/nmp_platform_runner/src/nmp/platform_runner/config/local.yaml index bcb5a60a55..ad351abadd 100644 --- a/packages/nmp_platform_runner/src/nmp/platform_runner/config/local.yaml +++ b/packages/nmp_platform_runner/src/nmp/platform_runner/config/local.yaml @@ -19,8 +19,8 @@ entities: {} jobs: executors: - - provider: subprocess - profile: default + - provider: cpu + profile: subprocess backend: subprocess config: working_directory: /tmp/nmp-subprocess-jobs From aed8e5dca603f1657db7e0ef07d18d5879002a2b Mon Sep 17 00:00:00 2001 From: Matthew Grossman Date: Mon, 15 Jun 2026 16:23:14 -0700 Subject: [PATCH 10/17] fix(jobs): fix lint errors and re-export FilesetMetadata from types.files - Fix import sorting (I001) in test files - Remove unused SubprocessExecutionProvider import (F401) in test_jobs_api - Re-export FilesetMetadata from types.files (moved to types.shared by SDK regen) Co-Authored-By: Claude Opus 4.6 (1M context) Signed-off-by: Matthew Grossman --- .../nemo-platform/src/nemo_platform/types/files/__init__.py | 1 + services/core/jobs/tests/conftest.py | 4 +++- services/core/jobs/tests/controllers/test_base.py | 2 +- services/core/jobs/tests/controllers/test_docker_backend.py | 2 +- .../core/jobs/tests/controllers/test_kubernetes_backend.py | 2 +- services/core/jobs/tests/controllers/test_volcano_backend.py | 2 +- services/core/jobs/tests/test_jobs_api.py | 2 +- 7 files changed, 9 insertions(+), 6 deletions(-) diff --git a/sdk/python/nemo-platform/src/nemo_platform/types/files/__init__.py b/sdk/python/nemo-platform/src/nemo_platform/types/files/__init__.py index 3833c1d785..920d1b889d 100644 --- a/sdk/python/nemo-platform/src/nemo_platform/types/files/__init__.py +++ 
b/sdk/python/nemo-platform/src/nemo_platform/types/files/__init__.py @@ -22,6 +22,7 @@ from .cache_status import CacheStatus as CacheStatus from .fileset_file import FilesetFile as FilesetFile from .fileset_purpose import FilesetPurpose as FilesetPurpose +from ..shared.fileset_metadata import FilesetMetadata as FilesetMetadata from .s3_storage_config import S3StorageConfig as S3StorageConfig from .ngc_storage_config import NGCStorageConfig as NGCStorageConfig from .fileset_list_params import FilesetListParams as FilesetListParams diff --git a/services/core/jobs/tests/conftest.py b/services/core/jobs/tests/conftest.py index 7d54aeb22c..1ad85342fe 100644 --- a/services/core/jobs/tests/conftest.py +++ b/services/core/jobs/tests/conftest.py @@ -12,8 +12,10 @@ from fastapi import FastAPI from httpx import ASGITransport, AsyncClient from nemo_platform import AsyncNeMoPlatform +from nemo_platform_plugin.jobs.api_factory import ( + ContainerExecutionProviderSpec as FactoryContainerExecutionProviderSpec, +) from nemo_platform_plugin.jobs.api_factory import ContainerSpec as FactoryContainerSpec -from nemo_platform_plugin.jobs.api_factory import ContainerExecutionProviderSpec as FactoryContainerExecutionProviderSpec from nemo_platform_plugin.jobs.api_factory import PlatformJobEnvironmentVariableParam, job_route_factory from nemo_platform_plugin.jobs.api_factory import PlatformJobSpec as FactoryPlatformJobSpec from nemo_platform_plugin.jobs.api_factory import PlatformJobStep as FactoryPlatformJobStep diff --git a/services/core/jobs/tests/controllers/test_base.py b/services/core/jobs/tests/controllers/test_base.py index 3b1f4c7bf8..1a3721860b 100644 --- a/services/core/jobs/tests/controllers/test_base.py +++ b/services/core/jobs/tests/controllers/test_base.py @@ -9,7 +9,7 @@ from nmp.common.config import PlatformConfig from nmp.common.jobs.schemas import PlatformJobStatus from nmp.core.jobs.api.v2.jobs.schemas import PlatformJobStepWithContext -from nmp.core.jobs.app.providers 
import ContainerSpec, ContainerExecutionProvider +from nmp.core.jobs.app.providers import ContainerExecutionProvider, ContainerSpec from nmp.core.jobs.app.schemas import PlatformJobStepSpec, StepLifecycle from nmp.core.jobs.controllers.backends.base import get_logs_endpoint_from_fileset, resolve_task_image from nmp.core.jobs.controllers.backends.test import MockKubernetesCPUJobBackend diff --git a/services/core/jobs/tests/controllers/test_docker_backend.py b/services/core/jobs/tests/controllers/test_docker_backend.py index 7f8dfbc19f..542b5d396a 100644 --- a/services/core/jobs/tests/controllers/test_docker_backend.py +++ b/services/core/jobs/tests/controllers/test_docker_backend.py @@ -38,8 +38,8 @@ from nmp.core.jobs.app.providers import ( ComputeResources, ComputeResourceSpec, - ContainerSpec, ContainerExecutionProvider, + ContainerSpec, ) from nmp.core.jobs.app.schemas import ( PlatformJobEnvironmentVariable, diff --git a/services/core/jobs/tests/controllers/test_kubernetes_backend.py b/services/core/jobs/tests/controllers/test_kubernetes_backend.py index a54b473b6d..8641138626 100644 --- a/services/core/jobs/tests/controllers/test_kubernetes_backend.py +++ b/services/core/jobs/tests/controllers/test_kubernetes_backend.py @@ -28,7 +28,7 @@ JOB_WORKSPACE_ID_LABEL, KUBE_JOB_SELECTOR_LABELS, ) -from nmp.core.jobs.app.providers import ContainerSpec, ContainerExecutionProvider +from nmp.core.jobs.app.providers import ContainerExecutionProvider, ContainerSpec from nmp.core.jobs.app.schemas import ( PlatformJobEnvironmentVariable, PlatformJobSecretEnvironmentVariableRef, diff --git a/services/core/jobs/tests/controllers/test_volcano_backend.py b/services/core/jobs/tests/controllers/test_volcano_backend.py index 17c034c7f2..317fa08dc9 100644 --- a/services/core/jobs/tests/controllers/test_volcano_backend.py +++ b/services/core/jobs/tests/controllers/test_volcano_backend.py @@ -29,7 +29,7 @@ JOB_WORKSPACE_ID_LABEL, KUBE_JOB_SELECTOR_LABELS, ) -from 
nmp.core.jobs.app.providers import ComputeResources, ContainerSpec, ContainerExecutionProvider +from nmp.core.jobs.app.providers import ComputeResources, ContainerExecutionProvider, ContainerSpec from nmp.core.jobs.app.schemas import ( PlatformJobEnvironmentVariable, PlatformJobSecretEnvironmentVariableRef, diff --git a/services/core/jobs/tests/test_jobs_api.py b/services/core/jobs/tests/test_jobs_api.py index 627dc6f4b2..0109b6b29b 100644 --- a/services/core/jobs/tests/test_jobs_api.py +++ b/services/core/jobs/tests/test_jobs_api.py @@ -25,7 +25,7 @@ PlatformJobStepsListFilter, ) from nmp.core.jobs.app.dispatcher import JobDispatcher -from nmp.core.jobs.app.providers import ContainerExecutionProvider, ContainerSpec, SubprocessExecutionProvider +from nmp.core.jobs.app.providers import ContainerExecutionProvider, ContainerSpec from nmp.core.jobs.app.schemas import ( PlatformJobSpec, PlatformJobStepSpec, From 6552ef1b7524c56f06c9bdfd2382d6d19968027b Mon Sep 17 00:00:00 2001 From: Matthew Grossman Date: Mon, 15 Jun 2026 16:35:12 -0700 Subject: [PATCH 11/17] style: format test files Co-Authored-By: Claude Opus 4.6 (1M context) Signed-off-by: Matthew Grossman --- packages/nmp_common/tests/api_factory/test_api_factory.py | 4 +++- services/core/jobs/tests/controllers/test_base.py | 4 +++- .../core/jobs/tests/controllers/test_subprocess_backend.py | 4 +--- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/packages/nmp_common/tests/api_factory/test_api_factory.py b/packages/nmp_common/tests/api_factory/test_api_factory.py index 0d11c2d21e..ff9dcb718e 100644 --- a/packages/nmp_common/tests/api_factory/test_api_factory.py +++ b/packages/nmp_common/tests/api_factory/test_api_factory.py @@ -122,7 +122,9 @@ def test_api_factory_routes(): def test_validate_job_spec(): - executor = ContainerExecutionProviderSpec(kind="container", provider="cpu", profile="default", container=ContainerSpec(image="foo_image")) + executor = ContainerExecutionProviderSpec( + 
kind="container", provider="cpu", profile="default", container=ContainerSpec(image="foo_image") + ) valid_job = PlatformJobSpec( steps=[ PlatformJobStep( diff --git a/services/core/jobs/tests/controllers/test_base.py b/services/core/jobs/tests/controllers/test_base.py index 1a3721860b..c3ef2d37a2 100644 --- a/services/core/jobs/tests/controllers/test_base.py +++ b/services/core/jobs/tests/controllers/test_base.py @@ -164,7 +164,9 @@ def _make_step( if step_spec is ...: step_spec = PlatformJobStepSpec( name="test-step", - executor=ContainerExecutionProvider(provider="cpu", profile="default", container=ContainerSpec(image="img")), + executor=ContainerExecutionProvider( + provider="cpu", profile="default", container=ContainerSpec(image="img") + ), config={}, lifecycle=StepLifecycle(staleness_timeout_seconds=staleness_timeout), ) diff --git a/services/core/jobs/tests/controllers/test_subprocess_backend.py b/services/core/jobs/tests/controllers/test_subprocess_backend.py index 0862a4beb2..50446eb69d 100644 --- a/services/core/jobs/tests/controllers/test_subprocess_backend.py +++ b/services/core/jobs/tests/controllers/test_subprocess_backend.py @@ -26,9 +26,7 @@ def _subprocess_backend(mock_nmp_client, tmp_path, mock_platform_config) -> Subp def _step_with_command(step, command: list[str]): updated_step = step.model_copy(deep=True) - updated_step.step_spec.executor = SubprocessExecutionProvider( - provider="cpu", profile="subprocess", command=command - ) + updated_step.step_spec.executor = SubprocessExecutionProvider(provider="cpu", profile="subprocess", command=command) return updated_step From ba8ec9912f9f9974dc28d9e9ea9ff99968d74714 Mon Sep 17 00:00:00 2001 From: Matthew Grossman Date: Mon, 15 Jun 2026 16:41:56 -0700 Subject: [PATCH 12/17] fix(lint): add unused-type-ignore-comment to ty ignore list SDK type regeneration changed type signatures, making some existing `# type: ignore` comments unnecessary. 
Add the correct ty rule name to the CI ignore list alongside the existing unused-ignore-comment. Co-Authored-By: Claude Opus 4.6 (1M context) Signed-off-by: Matthew Grossman --- tools/lint/lint-python-types.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/lint/lint-python-types.sh b/tools/lint/lint-python-types.sh index ae6077aef3..d132eb5b96 100755 --- a/tools/lint/lint-python-types.sh +++ b/tools/lint/lint-python-types.sh @@ -9,7 +9,8 @@ set -euo pipefail # Counts reflect the violation count at the time of suppression. ci_ignored_rules=( invalid-argument-type # 148 - unused-ignore-comment # 14 + unused-ignore-comment # 14 + unused-type-ignore-comment # triggered by SDK type changes unresolved-attribute # 141 not-subscriptable # 19 invalid-assignment # 9 From a4f2afc656bbd87c37fc5e63a67457ef060bdbb2 Mon Sep 17 00:00:00 2001 From: Matthew Grossman Date: Mon, 15 Jun 2026 16:53:52 -0700 Subject: [PATCH 13/17] fix(tests): add kind to e2e executor dicts, suppress ty invalid-key in agent tests - Add kind="container" to all 15 executor dicts in e2e/test_jobs.py - Add type: ignore[invalid-key] for subprocess command assertions in agent tests (ty can't narrow the Executor union from dict key access) Co-Authored-By: Claude Opus 4.6 (1M context) Signed-off-by: Matthew Grossman --- e2e/test_jobs.py | 14 ++++++++++++++ .../tests/unit/test_evaluate_agent_job.py | 2 +- .../tests/unit/test_improvement_jobs.py | 2 +- 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/e2e/test_jobs.py b/e2e/test_jobs.py index fa8dcb416c..9940f75ccf 100644 --- a/e2e/test_jobs.py +++ b/e2e/test_jobs.py @@ -55,6 +55,7 @@ def test_basic_platform_job_lifecycle(sdk: NeMoPlatform, workspace: str): { "name": "echo-step", "executor": { + "kind": "container", "provider": "cpu", "container": { "command": ["echo", "Hello from e2e test!"], @@ -98,6 +99,7 @@ def test_job_logs_across_multiple_batches(sdk: NeMoPlatform, workspace: str): { "name": "multi-log-step", 
"executor": { + "kind": "container", "provider": "cpu", "container": { "command": ["sh", "-c", log_command], @@ -137,6 +139,7 @@ def test_job_config_is_readable(sdk: NeMoPlatform, workspace: str): { "name": "config-step", "executor": { + "kind": "container", "provider": "cpu", "container": { "command": ["sh", "-c", "echo 'Step config:'; cat $NEMO_JOB_STEP_CONFIG_FILE_PATH;"], @@ -171,6 +174,7 @@ def test_job_passing_data_between_steps(sdk: NeMoPlatform, workspace: str): { "name": "generate-data-step", "executor": { + "kind": "container", "provider": "cpu", "container": { "command": [ @@ -184,6 +188,7 @@ def test_job_passing_data_between_steps(sdk: NeMoPlatform, workspace: str): { "name": "consume-data-step", "executor": { + "kind": "container", "provider": "cpu", "container": { "command": [ @@ -225,6 +230,7 @@ def test_job_using_secret_environment_variable(sdk: NeMoPlatform, workspace: str { "name": "secret-envvar-step", "executor": { + "kind": "container", "provider": "cpu", "container": { "command": ["sh", "-c", 'echo "Secret value is: $SECRET_ENV_VAR"'], @@ -262,6 +268,7 @@ def test_job_with_expected_failure(sdk: NeMoPlatform, workspace: str): { "name": "failing-step", "executor": { + "kind": "container", "provider": "cpu", "container": { "command": ["sh", "-c", "echo 'This step will fail'; exit 1;"], @@ -291,6 +298,7 @@ def test_job_cancel_immediately(sdk: NeMoPlatform, workspace: str): { "name": "long-running-step", "executor": { + "kind": "container", "provider": "cpu", "container": { "command": ["sh", "-c", "sleep 60"], @@ -320,6 +328,7 @@ def test_job_cancel_once_active(sdk: NeMoPlatform, workspace: str): { "name": "long-running-step", "executor": { + "kind": "container", "provider": "cpu", "container": { "command": ["sh", "-c", "sleep 300"], @@ -360,6 +369,7 @@ def test_job_pause_resume(sdk: NeMoPlatform, workspace: str): { "name": "long-running-step-pause-resume", "executor": { + "kind": "container", "provider": "cpu", "container": { "command": ["sh", 
"-c", "sleep 300"], @@ -401,6 +411,7 @@ def test_job_pause_and_cancel(sdk: NeMoPlatform, workspace: str): { "name": "long-running-step-pause-cancel", "executor": { + "kind": "container", "provider": "cpu", "container": { "command": ["sh", "-c", "sleep 300"], @@ -437,6 +448,7 @@ def test_job_using_additional_volume(sdk: NeMoPlatform, workspace: str): { "name": "write-data", "executor": { + "kind": "container", "provider": "cpu", "container": { "command": [ @@ -451,6 +463,7 @@ def test_job_using_additional_volume(sdk: NeMoPlatform, workspace: str): { "name": "read-data", "executor": { + "kind": "container", "provider": "cpu", "container": { "command": [ @@ -492,6 +505,7 @@ def test_job_invalid_image_format(sdk: NeMoPlatform, workspace: str, bad_image: { "name": "bad-image-step", "executor": { + "kind": "container", "provider": "cpu", "container": { "image": bad_image, diff --git a/plugins/nemo-agents/tests/unit/test_evaluate_agent_job.py b/plugins/nemo-agents/tests/unit/test_evaluate_agent_job.py index f626e71cb4..3747e4966f 100644 --- a/plugins/nemo-agents/tests/unit/test_evaluate_agent_job.py +++ b/plugins/nemo-agents/tests/unit/test_evaluate_agent_job.py @@ -37,7 +37,7 @@ async def test_compile_produces_single_cpu_step() -> None: step = steps[0] assert step["name"] == "evaluate-agent" assert step["executor"]["provider"] == "cpu" - assert step["executor"]["command"] == ["python", "-m", "nemo_agents_plugin.tasks.evaluate"] + assert step["executor"]["command"] == ["python", "-m", "nemo_agents_plugin.tasks.evaluate"] # type: ignore[invalid-key] assert step["config"]["agent"] == "calc" assert step["config"]["eval_config"] == "config.yml" assert step["config"]["eval_config_fileset"] == "nemo-agent-eval-calc" diff --git a/plugins/nemo-agents/tests/unit/test_improvement_jobs.py b/plugins/nemo-agents/tests/unit/test_improvement_jobs.py index 011d472fd2..1221f448c8 100644 --- a/plugins/nemo-agents/tests/unit/test_improvement_jobs.py +++ 
b/plugins/nemo-agents/tests/unit/test_improvement_jobs.py @@ -87,7 +87,7 @@ async def test_evaluate_suite_compile_produces_single_subprocess_step() -> None: step = steps[0] assert step["name"] == "evaluate-suite" assert step["executor"]["provider"] == "cpu" - assert step["executor"]["command"] == ["python", "-m", "nemo_agents_plugin.tasks.evaluate_suite"] + assert step["executor"]["command"] == ["python", "-m", "nemo_agents_plugin.tasks.evaluate_suite"] # type: ignore[invalid-key] assert step["config"]["evals"] == "/abs/evals" assert step["config"]["agent"] == "/abs/agent" From ebee99c237435df871622a96f0ab5d810f6580aa Mon Sep 17 00:00:00 2001 From: Matthew Grossman Date: Mon, 15 Jun 2026 17:16:59 -0700 Subject: [PATCH 14/17] fix(tests): convert e2e tests to subprocess executors, add kind to integration/search tests - e2e/test_jobs.py: convert all executor blocks from container to subprocess format (these run host commands, not containers) - integration tests: add kind="container" to executor dicts - test_job_search.py: add kind="container" to all 25+ executor dicts - test_jobs_api.py: add kind="container" to secrets test executor Co-Authored-By: Claude Opus 4.6 (1M context) Signed-off-by: Matthew Grossman --- e2e/test_jobs.py | 127 +++++++----------- .../integration/test_jobs_auth_propagation.py | 2 + .../integration/test_jobs_secrets_access.py | 1 + services/core/jobs/tests/test_job_search.py | 48 +++---- services/core/jobs/tests/test_jobs_api.py | 1 + 5 files changed, 77 insertions(+), 102 deletions(-) diff --git a/e2e/test_jobs.py b/e2e/test_jobs.py index 9940f75ccf..87ed787948 100644 --- a/e2e/test_jobs.py +++ b/e2e/test_jobs.py @@ -1,8 +1,7 @@ """E2E tests for platform jobs. -These tests submit jobs with ContainerExecutionProviderSpec (container + command). -The container image is omitted so that the execution profile's default_task_image -is used on Kubernetes/Docker backends. +These tests submit jobs with SubprocessExecutionProviderSpec (host command). 
+The e2e test environment runs against the subprocess backend. Ported from Platform-Deploy e2e/test_jobs.py, adapted for the SDK's TypedDict param types and filtered to tests that work without Docker. @@ -55,11 +54,9 @@ def test_basic_platform_job_lifecycle(sdk: NeMoPlatform, workspace: str): { "name": "echo-step", "executor": { - "kind": "container", + "kind": "subprocess", "provider": "cpu", - "container": { - "command": ["echo", "Hello from e2e test!"], - }, + "command": ["echo", "Hello from e2e test!"], }, }, ], @@ -99,11 +96,9 @@ def test_job_logs_across_multiple_batches(sdk: NeMoPlatform, workspace: str): { "name": "multi-log-step", "executor": { - "kind": "container", + "kind": "subprocess", "provider": "cpu", - "container": { - "command": ["sh", "-c", log_command], - }, + "command": ["sh", "-c", log_command], }, }, ], @@ -139,11 +134,9 @@ def test_job_config_is_readable(sdk: NeMoPlatform, workspace: str): { "name": "config-step", "executor": { - "kind": "container", + "kind": "subprocess", "provider": "cpu", - "container": { - "command": ["sh", "-c", "echo 'Step config:'; cat $NEMO_JOB_STEP_CONFIG_FILE_PATH;"], - }, + "command": ["sh", "-c", "echo 'Step config:'; cat $NEMO_JOB_STEP_CONFIG_FILE_PATH;"], }, "config": { "message": "Hello from job config!", @@ -174,29 +167,25 @@ def test_job_passing_data_between_steps(sdk: NeMoPlatform, workspace: str): { "name": "generate-data-step", "executor": { - "kind": "container", + "kind": "subprocess", "provider": "cpu", - "container": { - "command": [ - "sh", - "-c", - "echo 'Data from first step' > $NEMO_JOB_PERSISTENT_JOB_STORAGE_PATH/data.txt", - ], - }, + "command": [ + "sh", + "-c", + "echo 'Data from first step' > $NEMO_JOB_PERSISTENT_JOB_STORAGE_PATH/data.txt", + ], }, }, { "name": "consume-data-step", "executor": { - "kind": "container", + "kind": "subprocess", "provider": "cpu", - "container": { - "command": [ - "sh", - "-c", - "echo 'Consuming data:'; cat $NEMO_JOB_PERSISTENT_JOB_STORAGE_PATH/data.txt", - ], 
- }, + "command": [ + "sh", + "-c", + "echo 'Consuming data:'; cat $NEMO_JOB_PERSISTENT_JOB_STORAGE_PATH/data.txt", + ], }, }, ], @@ -230,11 +219,9 @@ def test_job_using_secret_environment_variable(sdk: NeMoPlatform, workspace: str { "name": "secret-envvar-step", "executor": { - "kind": "container", + "kind": "subprocess", "provider": "cpu", - "container": { - "command": ["sh", "-c", 'echo "Secret value is: $SECRET_ENV_VAR"'], - }, + "command": ["sh", "-c", 'echo "Secret value is: $SECRET_ENV_VAR"'], }, "environment": [ { @@ -268,11 +255,9 @@ def test_job_with_expected_failure(sdk: NeMoPlatform, workspace: str): { "name": "failing-step", "executor": { - "kind": "container", + "kind": "subprocess", "provider": "cpu", - "container": { - "command": ["sh", "-c", "echo 'This step will fail'; exit 1;"], - }, + "command": ["sh", "-c", "echo 'This step will fail'; exit 1;"], }, }, ], @@ -298,11 +283,9 @@ def test_job_cancel_immediately(sdk: NeMoPlatform, workspace: str): { "name": "long-running-step", "executor": { - "kind": "container", + "kind": "subprocess", "provider": "cpu", - "container": { - "command": ["sh", "-c", "sleep 60"], - }, + "command": ["sh", "-c", "sleep 60"], }, }, ], @@ -328,11 +311,9 @@ def test_job_cancel_once_active(sdk: NeMoPlatform, workspace: str): { "name": "long-running-step", "executor": { - "kind": "container", + "kind": "subprocess", "provider": "cpu", - "container": { - "command": ["sh", "-c", "sleep 300"], - }, + "command": ["sh", "-c", "sleep 300"], }, }, ], @@ -369,11 +350,9 @@ def test_job_pause_resume(sdk: NeMoPlatform, workspace: str): { "name": "long-running-step-pause-resume", "executor": { - "kind": "container", + "kind": "subprocess", "provider": "cpu", - "container": { - "command": ["sh", "-c", "sleep 300"], - }, + "command": ["sh", "-c", "sleep 300"], }, }, ], @@ -411,11 +390,9 @@ def test_job_pause_and_cancel(sdk: NeMoPlatform, workspace: str): { "name": "long-running-step-pause-cancel", "executor": { - "kind": "container", + 
"kind": "subprocess", "provider": "cpu", - "container": { - "command": ["sh", "-c", "sleep 300"], - }, + "command": ["sh", "-c", "sleep 300"], }, }, ], @@ -448,31 +425,27 @@ def test_job_using_additional_volume(sdk: NeMoPlatform, workspace: str): { "name": "write-data", "executor": { - "kind": "container", + "kind": "subprocess", "provider": "cpu", - "container": { - "command": [ - "sh", - "-c", - "echo 'Hello, World!' > /mnt/additional_storage/shared_data.txt; " - "echo 'Successfully wrote data to persistent storage';", - ], - }, + "command": [ + "sh", + "-c", + "echo 'Hello, World!' > /mnt/additional_storage/shared_data.txt; " + "echo 'Successfully wrote data to persistent storage';", + ], }, }, { "name": "read-data", "executor": { - "kind": "container", + "kind": "subprocess", "provider": "cpu", - "container": { - "command": [ - "sh", - "-c", - "cat /mnt/additional_storage/shared_data.txt; " - "echo 'Successfully read data from persistent storage';", - ], - }, + "command": [ + "sh", + "-c", + "cat /mnt/additional_storage/shared_data.txt; " + "echo 'Successfully read data from persistent storage';", + ], }, }, ], @@ -505,12 +478,10 @@ def test_job_invalid_image_format(sdk: NeMoPlatform, workspace: str, bad_image: { "name": "bad-image-step", "executor": { - "kind": "container", + "kind": "subprocess", "provider": "cpu", - "container": { - "image": bad_image, - "command": ["echo", "This should not run"], - }, + "image": bad_image, + "command": ["echo", "This should not run"], }, }, ], diff --git a/services/core/jobs/tests/integration/test_jobs_auth_propagation.py b/services/core/jobs/tests/integration/test_jobs_auth_propagation.py index 1efbc6cab7..d501d8e042 100644 --- a/services/core/jobs/tests/integration/test_jobs_auth_propagation.py +++ b/services/core/jobs/tests/integration/test_jobs_auth_propagation.py @@ -61,6 +61,7 @@ def test_auth_context_stripped_for_regular_user(self, sdk: NeMoPlatform): { "name": "test-step", "executor": { + "kind": "container", 
"provider": "cpu", "profile": "default", "container": { @@ -97,6 +98,7 @@ def test_auth_context_visible_to_service_principal(self, sdk: NeMoPlatform): { "name": "test-step", "executor": { + "kind": "container", "provider": "cpu", "profile": "default", "container": { diff --git a/services/core/jobs/tests/integration/test_jobs_secrets_access.py b/services/core/jobs/tests/integration/test_jobs_secrets_access.py index 5ed5e411f7..cdb4412a83 100644 --- a/services/core/jobs/tests/integration/test_jobs_secrets_access.py +++ b/services/core/jobs/tests/integration/test_jobs_secrets_access.py @@ -49,6 +49,7 @@ def _platform_spec_with_secret(secret_ref: str, env_var_name: str = "MY_SECRET") { "name": "step-with-secret", "executor": { + "kind": "container", "provider": "cpu", "profile": "default", "container": { diff --git a/services/core/jobs/tests/test_job_search.py b/services/core/jobs/tests/test_job_search.py index 370a8fdad4..f2764bf308 100644 --- a/services/core/jobs/tests/test_job_search.py +++ b/services/core/jobs/tests/test_job_search.py @@ -20,7 +20,7 @@ async def test_search_jobs_by_name(test_sdk: AsyncNeMoPlatform): spec={}, platform_spec={ "steps": [ - {"name": "step1", "executor": {"provider": "cpu", "profile": "default", "container": {"image": "test"}}} + {"name": "step1", "executor": {"kind": "container", "provider": "cpu", "profile": "default", "container": {"image": "test"}}} ] }, ) @@ -31,7 +31,7 @@ async def test_search_jobs_by_name(test_sdk: AsyncNeMoPlatform): spec={}, platform_spec={ "steps": [ - {"name": "step1", "executor": {"provider": "cpu", "profile": "default", "container": {"image": "test"}}} + {"name": "step1", "executor": {"kind": "container", "provider": "cpu", "profile": "default", "container": {"image": "test"}}} ] }, ) @@ -53,7 +53,7 @@ async def test_search_jobs_by_project(test_sdk: AsyncNeMoPlatform): spec={}, platform_spec={ "steps": [ - {"name": "step1", "executor": {"provider": "cpu", "profile": "default", "container": {"image": 
"test"}}} + {"name": "step1", "executor": {"kind": "container", "provider": "cpu", "profile": "default", "container": {"image": "test"}}} ] }, ) @@ -65,7 +65,7 @@ async def test_search_jobs_by_project(test_sdk: AsyncNeMoPlatform): spec={}, platform_spec={ "steps": [ - {"name": "step1", "executor": {"provider": "cpu", "profile": "default", "container": {"image": "test"}}} + {"name": "step1", "executor": {"kind": "container", "provider": "cpu", "profile": "default", "container": {"image": "test"}}} ] }, ) @@ -85,7 +85,7 @@ async def test_search_jobs_multiple_values_or_logic(test_sdk: AsyncNeMoPlatform) spec={}, platform_spec={ "steps": [ - {"name": "step1", "executor": {"provider": "cpu", "profile": "default", "container": {"image": "test"}}} + {"name": "step1", "executor": {"kind": "container", "provider": "cpu", "profile": "default", "container": {"image": "test"}}} ] }, ) @@ -96,7 +96,7 @@ async def test_search_jobs_multiple_values_or_logic(test_sdk: AsyncNeMoPlatform) spec={}, platform_spec={ "steps": [ - {"name": "step1", "executor": {"provider": "cpu", "profile": "default", "container": {"image": "test"}}} + {"name": "step1", "executor": {"kind": "container", "provider": "cpu", "profile": "default", "container": {"image": "test"}}} ] }, ) @@ -107,7 +107,7 @@ async def test_search_jobs_multiple_values_or_logic(test_sdk: AsyncNeMoPlatform) spec={}, platform_spec={ "steps": [ - {"name": "step1", "executor": {"provider": "cpu", "profile": "default", "container": {"image": "test"}}} + {"name": "step1", "executor": {"kind": "container", "provider": "cpu", "profile": "default", "container": {"image": "test"}}} ] }, ) @@ -133,7 +133,7 @@ async def test_search_jobs_multiple_fields_and_logic(test_sdk: AsyncNeMoPlatform spec={}, platform_spec={ "steps": [ - {"name": "step1", "executor": {"provider": "cpu", "profile": "default", "container": {"image": "test"}}} + {"name": "step1", "executor": {"kind": "container", "provider": "cpu", "profile": "default", "container": 
{"image": "test"}}} ] }, ) @@ -145,7 +145,7 @@ async def test_search_jobs_multiple_fields_and_logic(test_sdk: AsyncNeMoPlatform spec={}, platform_spec={ "steps": [ - {"name": "step1", "executor": {"provider": "cpu", "profile": "default", "container": {"image": "test"}}} + {"name": "step1", "executor": {"kind": "container", "provider": "cpu", "profile": "default", "container": {"image": "test"}}} ] }, ) @@ -157,7 +157,7 @@ async def test_search_jobs_multiple_fields_and_logic(test_sdk: AsyncNeMoPlatform spec={}, platform_spec={ "steps": [ - {"name": "step1", "executor": {"provider": "cpu", "profile": "default", "container": {"image": "test"}}} + {"name": "step1", "executor": {"kind": "container", "provider": "cpu", "profile": "default", "container": {"image": "test"}}} ] }, ) @@ -181,7 +181,7 @@ async def test_search_jobs_case_insensitive(test_sdk: AsyncNeMoPlatform): spec={}, platform_spec={ "steps": [ - {"name": "step1", "executor": {"provider": "cpu", "profile": "default", "container": {"image": "test"}}} + {"name": "step1", "executor": {"kind": "container", "provider": "cpu", "profile": "default", "container": {"image": "test"}}} ] }, ) @@ -204,7 +204,7 @@ async def test_search_jobs_partial_match(test_sdk: AsyncNeMoPlatform): spec={}, platform_spec={ "steps": [ - {"name": "step1", "executor": {"provider": "cpu", "profile": "default", "container": {"image": "test"}}} + {"name": "step1", "executor": {"kind": "container", "provider": "cpu", "profile": "default", "container": {"image": "test"}}} ] }, ) @@ -224,7 +224,7 @@ async def test_search_combined_with_filter(test_sdk: AsyncNeMoPlatform): spec={}, platform_spec={ "steps": [ - {"name": "step1", "executor": {"provider": "cpu", "profile": "default", "container": {"image": "test"}}} + {"name": "step1", "executor": {"kind": "container", "provider": "cpu", "profile": "default", "container": {"image": "test"}}} ] }, ) @@ -235,7 +235,7 @@ async def test_search_combined_with_filter(test_sdk: AsyncNeMoPlatform): spec={}, 
platform_spec={ "steps": [ - {"name": "step1", "executor": {"provider": "cpu", "profile": "default", "container": {"image": "test"}}} + {"name": "step1", "executor": {"kind": "container", "provider": "cpu", "profile": "default", "container": {"image": "test"}}} ] }, ) @@ -259,7 +259,7 @@ async def test_search_no_results(test_sdk: AsyncNeMoPlatform): spec={}, platform_spec={ "steps": [ - {"name": "step1", "executor": {"provider": "cpu", "profile": "default", "container": {"image": "test"}}} + {"name": "step1", "executor": {"kind": "container", "provider": "cpu", "profile": "default", "container": {"image": "test"}}} ] }, ) @@ -278,7 +278,7 @@ async def test_search_empty_string(test_sdk: AsyncNeMoPlatform): spec={}, platform_spec={ "steps": [ - {"name": "step1", "executor": {"provider": "cpu", "profile": "default", "container": {"image": "test"}}} + {"name": "step1", "executor": {"kind": "container", "provider": "cpu", "profile": "default", "container": {"image": "test"}}} ] }, ) @@ -289,7 +289,7 @@ async def test_search_empty_string(test_sdk: AsyncNeMoPlatform): spec={}, platform_spec={ "steps": [ - {"name": "step1", "executor": {"provider": "cpu", "profile": "default", "container": {"image": "test"}}} + {"name": "step1", "executor": {"kind": "container", "provider": "cpu", "profile": "default", "container": {"image": "test"}}} ] }, ) @@ -330,7 +330,7 @@ async def test_search_pagination(test_sdk: AsyncNeMoPlatform): "steps": [ { "name": "step1", - "executor": {"provider": "cpu", "profile": "default", "container": {"image": "test"}}, + "executor": {"kind": "container", "provider": "cpu", "profile": "default", "container": {"image": "test"}}, } ] }, @@ -359,7 +359,7 @@ async def test_search_underscore_behavior(test_sdk: AsyncNeMoPlatform): spec={}, platform_spec={ "steps": [ - {"name": "step1", "executor": {"provider": "cpu", "profile": "default", "container": {"image": "test"}}} + {"name": "step1", "executor": {"kind": "container", "provider": "cpu", "profile": 
"default", "container": {"image": "test"}}} ] }, ) @@ -370,7 +370,7 @@ async def test_search_underscore_behavior(test_sdk: AsyncNeMoPlatform): spec={}, platform_spec={ "steps": [ - {"name": "step1", "executor": {"provider": "cpu", "profile": "default", "container": {"image": "test"}}} + {"name": "step1", "executor": {"kind": "container", "provider": "cpu", "profile": "default", "container": {"image": "test"}}} ] }, ) @@ -388,7 +388,7 @@ async def test_search_underscore_behavior(test_sdk: AsyncNeMoPlatform): spec={}, platform_spec={ "steps": [ - {"name": "step1", "executor": {"provider": "cpu", "profile": "default", "container": {"image": "test"}}} + {"name": "step1", "executor": {"kind": "container", "provider": "cpu", "profile": "default", "container": {"image": "test"}}} ] }, ) @@ -410,7 +410,7 @@ async def test_search_long_string(test_sdk: AsyncNeMoPlatform): spec={}, platform_spec={ "steps": [ - {"name": "step1", "executor": {"provider": "cpu", "profile": "default", "container": {"image": "test"}}} + {"name": "step1", "executor": {"kind": "container", "provider": "cpu", "profile": "default", "container": {"image": "test"}}} ] }, ) @@ -434,7 +434,7 @@ async def test_search_result_limit(test_sdk: AsyncNeMoPlatform): "steps": [ { "name": "step1", - "executor": {"provider": "cpu", "profile": "default", "container": {"image": "test"}}, + "executor": {"kind": "container", "provider": "cpu", "profile": "default", "container": {"image": "test"}}, } ] }, @@ -473,7 +473,7 @@ async def test_search_special_characters(test_sdk: AsyncNeMoPlatform): spec={}, platform_spec={ "steps": [ - {"name": "step1", "executor": {"provider": "cpu", "profile": "default", "container": {"image": "test"}}} + {"name": "step1", "executor": {"kind": "container", "provider": "cpu", "profile": "default", "container": {"image": "test"}}} ] }, ) diff --git a/services/core/jobs/tests/test_jobs_api.py b/services/core/jobs/tests/test_jobs_api.py index 0109b6b29b..27b6d7dbd7 100644 --- 
a/services/core/jobs/tests/test_jobs_api.py +++ b/services/core/jobs/tests/test_jobs_api.py @@ -165,6 +165,7 @@ async def test_create_job_with_secrets(test_sdk: AsyncNeMoPlatform): { "name": "basic", "executor": { + "kind": "container", "provider": "cpu", "profile": "default", "container": {"image": "test-image"}, From 1cbe6a5601fce5fe8710634e098b89f67e1feabc Mon Sep 17 00:00:00 2001 From: Matthew Grossman Date: Mon, 15 Jun 2026 17:32:17 -0700 Subject: [PATCH 15/17] style: format test_job_search.py Co-Authored-By: Claude Opus 4.6 (1M context) Signed-off-by: Matthew Grossman --- services/core/jobs/tests/test_job_search.py | 234 ++++++++++++++++++-- 1 file changed, 210 insertions(+), 24 deletions(-) diff --git a/services/core/jobs/tests/test_job_search.py b/services/core/jobs/tests/test_job_search.py index f2764bf308..9cd0741575 100644 --- a/services/core/jobs/tests/test_job_search.py +++ b/services/core/jobs/tests/test_job_search.py @@ -20,7 +20,15 @@ async def test_search_jobs_by_name(test_sdk: AsyncNeMoPlatform): spec={}, platform_spec={ "steps": [ - {"name": "step1", "executor": {"kind": "container", "provider": "cpu", "profile": "default", "container": {"image": "test"}}} + { + "name": "step1", + "executor": { + "kind": "container", + "provider": "cpu", + "profile": "default", + "container": {"image": "test"}, + }, + } ] }, ) @@ -31,7 +39,15 @@ async def test_search_jobs_by_name(test_sdk: AsyncNeMoPlatform): spec={}, platform_spec={ "steps": [ - {"name": "step1", "executor": {"kind": "container", "provider": "cpu", "profile": "default", "container": {"image": "test"}}} + { + "name": "step1", + "executor": { + "kind": "container", + "provider": "cpu", + "profile": "default", + "container": {"image": "test"}, + }, + } ] }, ) @@ -53,7 +69,15 @@ async def test_search_jobs_by_project(test_sdk: AsyncNeMoPlatform): spec={}, platform_spec={ "steps": [ - {"name": "step1", "executor": {"kind": "container", "provider": "cpu", "profile": "default", "container": {"image": 
"test"}}} + { + "name": "step1", + "executor": { + "kind": "container", + "provider": "cpu", + "profile": "default", + "container": {"image": "test"}, + }, + } ] }, ) @@ -65,7 +89,15 @@ async def test_search_jobs_by_project(test_sdk: AsyncNeMoPlatform): spec={}, platform_spec={ "steps": [ - {"name": "step1", "executor": {"kind": "container", "provider": "cpu", "profile": "default", "container": {"image": "test"}}} + { + "name": "step1", + "executor": { + "kind": "container", + "provider": "cpu", + "profile": "default", + "container": {"image": "test"}, + }, + } ] }, ) @@ -85,7 +117,15 @@ async def test_search_jobs_multiple_values_or_logic(test_sdk: AsyncNeMoPlatform) spec={}, platform_spec={ "steps": [ - {"name": "step1", "executor": {"kind": "container", "provider": "cpu", "profile": "default", "container": {"image": "test"}}} + { + "name": "step1", + "executor": { + "kind": "container", + "provider": "cpu", + "profile": "default", + "container": {"image": "test"}, + }, + } ] }, ) @@ -96,7 +136,15 @@ async def test_search_jobs_multiple_values_or_logic(test_sdk: AsyncNeMoPlatform) spec={}, platform_spec={ "steps": [ - {"name": "step1", "executor": {"kind": "container", "provider": "cpu", "profile": "default", "container": {"image": "test"}}} + { + "name": "step1", + "executor": { + "kind": "container", + "provider": "cpu", + "profile": "default", + "container": {"image": "test"}, + }, + } ] }, ) @@ -107,7 +155,15 @@ async def test_search_jobs_multiple_values_or_logic(test_sdk: AsyncNeMoPlatform) spec={}, platform_spec={ "steps": [ - {"name": "step1", "executor": {"kind": "container", "provider": "cpu", "profile": "default", "container": {"image": "test"}}} + { + "name": "step1", + "executor": { + "kind": "container", + "provider": "cpu", + "profile": "default", + "container": {"image": "test"}, + }, + } ] }, ) @@ -133,7 +189,15 @@ async def test_search_jobs_multiple_fields_and_logic(test_sdk: AsyncNeMoPlatform spec={}, platform_spec={ "steps": [ - {"name": "step1", 
"executor": {"kind": "container", "provider": "cpu", "profile": "default", "container": {"image": "test"}}} + { + "name": "step1", + "executor": { + "kind": "container", + "provider": "cpu", + "profile": "default", + "container": {"image": "test"}, + }, + } ] }, ) @@ -145,7 +209,15 @@ async def test_search_jobs_multiple_fields_and_logic(test_sdk: AsyncNeMoPlatform spec={}, platform_spec={ "steps": [ - {"name": "step1", "executor": {"kind": "container", "provider": "cpu", "profile": "default", "container": {"image": "test"}}} + { + "name": "step1", + "executor": { + "kind": "container", + "provider": "cpu", + "profile": "default", + "container": {"image": "test"}, + }, + } ] }, ) @@ -157,7 +229,15 @@ async def test_search_jobs_multiple_fields_and_logic(test_sdk: AsyncNeMoPlatform spec={}, platform_spec={ "steps": [ - {"name": "step1", "executor": {"kind": "container", "provider": "cpu", "profile": "default", "container": {"image": "test"}}} + { + "name": "step1", + "executor": { + "kind": "container", + "provider": "cpu", + "profile": "default", + "container": {"image": "test"}, + }, + } ] }, ) @@ -181,7 +261,15 @@ async def test_search_jobs_case_insensitive(test_sdk: AsyncNeMoPlatform): spec={}, platform_spec={ "steps": [ - {"name": "step1", "executor": {"kind": "container", "provider": "cpu", "profile": "default", "container": {"image": "test"}}} + { + "name": "step1", + "executor": { + "kind": "container", + "provider": "cpu", + "profile": "default", + "container": {"image": "test"}, + }, + } ] }, ) @@ -204,7 +292,15 @@ async def test_search_jobs_partial_match(test_sdk: AsyncNeMoPlatform): spec={}, platform_spec={ "steps": [ - {"name": "step1", "executor": {"kind": "container", "provider": "cpu", "profile": "default", "container": {"image": "test"}}} + { + "name": "step1", + "executor": { + "kind": "container", + "provider": "cpu", + "profile": "default", + "container": {"image": "test"}, + }, + } ] }, ) @@ -224,7 +320,15 @@ async def 
test_search_combined_with_filter(test_sdk: AsyncNeMoPlatform): spec={}, platform_spec={ "steps": [ - {"name": "step1", "executor": {"kind": "container", "provider": "cpu", "profile": "default", "container": {"image": "test"}}} + { + "name": "step1", + "executor": { + "kind": "container", + "provider": "cpu", + "profile": "default", + "container": {"image": "test"}, + }, + } ] }, ) @@ -235,7 +339,15 @@ async def test_search_combined_with_filter(test_sdk: AsyncNeMoPlatform): spec={}, platform_spec={ "steps": [ - {"name": "step1", "executor": {"kind": "container", "provider": "cpu", "profile": "default", "container": {"image": "test"}}} + { + "name": "step1", + "executor": { + "kind": "container", + "provider": "cpu", + "profile": "default", + "container": {"image": "test"}, + }, + } ] }, ) @@ -259,7 +371,15 @@ async def test_search_no_results(test_sdk: AsyncNeMoPlatform): spec={}, platform_spec={ "steps": [ - {"name": "step1", "executor": {"kind": "container", "provider": "cpu", "profile": "default", "container": {"image": "test"}}} + { + "name": "step1", + "executor": { + "kind": "container", + "provider": "cpu", + "profile": "default", + "container": {"image": "test"}, + }, + } ] }, ) @@ -278,7 +398,15 @@ async def test_search_empty_string(test_sdk: AsyncNeMoPlatform): spec={}, platform_spec={ "steps": [ - {"name": "step1", "executor": {"kind": "container", "provider": "cpu", "profile": "default", "container": {"image": "test"}}} + { + "name": "step1", + "executor": { + "kind": "container", + "provider": "cpu", + "profile": "default", + "container": {"image": "test"}, + }, + } ] }, ) @@ -289,7 +417,15 @@ async def test_search_empty_string(test_sdk: AsyncNeMoPlatform): spec={}, platform_spec={ "steps": [ - {"name": "step1", "executor": {"kind": "container", "provider": "cpu", "profile": "default", "container": {"image": "test"}}} + { + "name": "step1", + "executor": { + "kind": "container", + "provider": "cpu", + "profile": "default", + "container": {"image": 
"test"}, + }, + } ] }, ) @@ -330,7 +466,12 @@ async def test_search_pagination(test_sdk: AsyncNeMoPlatform): "steps": [ { "name": "step1", - "executor": {"kind": "container", "provider": "cpu", "profile": "default", "container": {"image": "test"}}, + "executor": { + "kind": "container", + "provider": "cpu", + "profile": "default", + "container": {"image": "test"}, + }, } ] }, @@ -359,7 +500,15 @@ async def test_search_underscore_behavior(test_sdk: AsyncNeMoPlatform): spec={}, platform_spec={ "steps": [ - {"name": "step1", "executor": {"kind": "container", "provider": "cpu", "profile": "default", "container": {"image": "test"}}} + { + "name": "step1", + "executor": { + "kind": "container", + "provider": "cpu", + "profile": "default", + "container": {"image": "test"}, + }, + } ] }, ) @@ -370,7 +519,15 @@ async def test_search_underscore_behavior(test_sdk: AsyncNeMoPlatform): spec={}, platform_spec={ "steps": [ - {"name": "step1", "executor": {"kind": "container", "provider": "cpu", "profile": "default", "container": {"image": "test"}}} + { + "name": "step1", + "executor": { + "kind": "container", + "provider": "cpu", + "profile": "default", + "container": {"image": "test"}, + }, + } ] }, ) @@ -388,7 +545,15 @@ async def test_search_underscore_behavior(test_sdk: AsyncNeMoPlatform): spec={}, platform_spec={ "steps": [ - {"name": "step1", "executor": {"kind": "container", "provider": "cpu", "profile": "default", "container": {"image": "test"}}} + { + "name": "step1", + "executor": { + "kind": "container", + "provider": "cpu", + "profile": "default", + "container": {"image": "test"}, + }, + } ] }, ) @@ -410,7 +575,15 @@ async def test_search_long_string(test_sdk: AsyncNeMoPlatform): spec={}, platform_spec={ "steps": [ - {"name": "step1", "executor": {"kind": "container", "provider": "cpu", "profile": "default", "container": {"image": "test"}}} + { + "name": "step1", + "executor": { + "kind": "container", + "provider": "cpu", + "profile": "default", + "container": 
{"image": "test"}, + }, + } ] }, ) @@ -434,7 +607,12 @@ async def test_search_result_limit(test_sdk: AsyncNeMoPlatform): "steps": [ { "name": "step1", - "executor": {"kind": "container", "provider": "cpu", "profile": "default", "container": {"image": "test"}}, + "executor": { + "kind": "container", + "provider": "cpu", + "profile": "default", + "container": {"image": "test"}, + }, } ] }, @@ -473,7 +651,15 @@ async def test_search_special_characters(test_sdk: AsyncNeMoPlatform): spec={}, platform_spec={ "steps": [ - {"name": "step1", "executor": {"kind": "container", "provider": "cpu", "profile": "default", "container": {"image": "test"}}} + { + "name": "step1", + "executor": { + "kind": "container", + "provider": "cpu", + "profile": "default", + "container": {"image": "test"}, + }, + } ] }, ) From 0ff5d89b8864b4e7e00256db0047d1034a48ed94 Mon Sep 17 00:00:00 2001 From: Matthew Grossman Date: Tue, 16 Jun 2026 12:37:42 -0700 Subject: [PATCH 16/17] feat(jobs): thread resolved kind+profile through compile pipeline The framework now resolves the executor kind ("container" or "subprocess") from the submitter's profile before calling compile(). Compilers receive both `kind` and `profile` as parameters, so they can emit the correct executor shape without querying execution profiles themselves. 
- _compile_platform_spec resolves kind via resolve_profile_kind() once - Compiler signature: added kind and profile as last two positional args - NemoJob.compile() receives kind and profile from the framework - stamp_profile() moved from _adapt_compile into _compile_platform_spec - Data designer compiler uses kind to support both container/subprocess - All plugin compilers and test compilers updated for new signature Co-Authored-By: Claude Opus 4.6 (1M context) Signed-off-by: Matthew Grossman --- .../src/nemo_platform_plugin/job.py | 20 ++++++---- .../nemo_platform_plugin/jobs/api_factory.py | 40 +++++++++++++++++-- .../src/nemo_platform_plugin/jobs/routes.py | 18 +++++---- .../tests/test_jobs_filter.py | 2 +- .../tests/test_jobs_routes.py | 15 +++---- .../tests/api_factory/test_api_factory.py | 30 +++++++++++--- .../nemo_agents_plugin/jobs/analyze_batch.py | 1 + .../nemo_agents_plugin/jobs/evaluate_agent.py | 1 + .../nemo_agents_plugin/jobs/evaluate_suite.py | 1 + .../nemo_agents_plugin/jobs/optimize_agent.py | 1 + .../jobs/optimize_skills.py | 1 + .../nemo_data_designer_plugin/jobs/create.py | 37 ++++++++++++----- .../src/nemo_evaluator/jobs/evaluate.py | 1 + .../api/v2/jobs/endpoints.py | 2 + services/core/jobs/tests/conftest.py | 2 + .../nmp/hello_world/api/v2/jobs/endpoints.py | 2 + 16 files changed, 131 insertions(+), 43 deletions(-) diff --git a/packages/nemo_platform_plugin/src/nemo_platform_plugin/job.py b/packages/nemo_platform_plugin/src/nemo_platform_plugin/job.py index 24976c19da..d9a54bff4f 100644 --- a/packages/nemo_platform_plugin/src/nemo_platform_plugin/job.py +++ b/packages/nemo_platform_plugin/src/nemo_platform_plugin/job.py @@ -273,6 +273,7 @@ async def compile( entity_client: object, job_name: str | None, async_sdk: AsyncNeMoPlatform, + kind: str | None = None, profile: str | None = None, options: dict | None = None, ) -> object: @@ -285,10 +286,12 @@ async def compile( must override this method; the plugin service produces the 
``PlatformJobSpec`` the Jobs service expects by invoking it. - Compilers that need to support both container and subprocess - backends can use :func:`~nemo_platform_plugin.jobs.profiles.resolve_profile_kind` - to determine the executor kind for the given profile, rather - than hardcoding profile names. + The ``kind`` parameter is the resolved executor payload shape + (``"container"`` or ``"subprocess"``), resolved by the framework + from the submitter's profile before ``compile()`` is called. + Compilers use this to decide which executor type to emit without + querying execution profiles themselves. ``profile`` is also + provided for compilers that need to stamp it on specific steps. Args: workspace: Workspace scope. @@ -298,9 +301,12 @@ async def compile( async_sdk: ``AsyncNeMoPlatform`` handle. Same contract as :meth:`to_spec`: this runs in the API process so only the async client is offered. - profile: Submitter-selected profile. The factory applies - ``stamp_profile(spec, profile)`` after this method - returns; per-step overrides set here take precedence. + kind: Resolved executor payload shape — ``"container"`` or + ``"subprocess"``. ``None`` when no profile was specified + (compilers should default to ``"container"``). + profile: The submitter-selected execution profile name + (e.g. ``"subprocess"``, ``"default"``). ``None`` when + no profile was specified. options: Opaque wire ``{"": {...}}`` bag; read keys defensively. 
diff --git a/packages/nemo_platform_plugin/src/nemo_platform_plugin/jobs/api_factory.py b/packages/nemo_platform_plugin/src/nemo_platform_plugin/jobs/api_factory.py index 9e6c615487..f997214f5c 100644 --- a/packages/nemo_platform_plugin/src/nemo_platform_plugin/jobs/api_factory.py +++ b/packages/nemo_platform_plugin/src/nemo_platform_plugin/jobs/api_factory.py @@ -439,11 +439,15 @@ class PlatformJobResultRoute(BaseModel): # Signature: (workspace, original_spec, transformed_spec, entity_client, job_name, sdk) -> PlatformJobSpec # job_name is the resolved name (user-provided or auto-generated), None when no name is available # sdk is always provided for accessing secrets, files, and models with user context +# kind is the resolved executor payload shape ("container" or "subprocess") +# profile is the submitter-selected execution profile name PlatformJobSpecCompiler = Callable[ - [str, JobInputT, JobOutputT, EntityClient, str | None, AsyncNeMoPlatform], PlatformJobSpec + [str, JobInputT, JobOutputT, EntityClient, str | None, AsyncNeMoPlatform, str | None, str | None], + PlatformJobSpec, ] PlatformJobSpecCompilerAsync = Callable[ - [str, JobInputT, JobOutputT, EntityClient, str | None, AsyncNeMoPlatform], Awaitable[PlatformJobSpec] + [str, JobInputT, JobOutputT, EntityClient, str | None, AsyncNeMoPlatform, str | None, str | None], + Awaitable[PlatformJobSpec], ] # Input-to-output transformer types: receives job_name to use for related fields (e.g., output) @@ -625,6 +629,8 @@ async def _compile_platform_spec( job_name: str | None, service_name: str, sdk: AsyncNeMoPlatform, + profile: str | None = None, + default_provider: str = "cpu", ) -> PlatformJobSpec: """Compile input and output specs into a PlatformJobSpec for execution. @@ -632,6 +638,10 @@ async def _compile_platform_spec( (with auto-generated fields), allowing it to distinguish between user intent and system-generated values. 
+ The ``kind`` (executor payload shape) is resolved here from the + ``(provider, profile)`` pair before being passed to the compiler, so + individual compilers never need to query execution profiles themselves. + Supports both sync and async compiler callables. Validates the resulting spec for common misconfigurations. @@ -639,15 +649,36 @@ async def _compile_platform_spec( HTTPException(422): If the compiler raises PlatformJobCompilationError. PermissionError: If the compiler raises a PermissionError. """ + from nemo_platform_plugin.jobs.profiles import resolve_profile_kind + + kind: str | None = None + if profile is not None: + try: + kind = await resolve_profile_kind(sdk, default_provider, profile) + except PlatformJobCompilationError: + logger.warning( + "Could not resolve kind for profile '%s/%s', defaulting to container", default_provider, profile + ) + kind = "container" + try: if inspect.iscoroutinefunction(compiler): - platform_spec = await compiler(workspace, original_spec, transformed_spec, entity_client, job_name, sdk) + platform_spec = await compiler( + workspace, original_spec, transformed_spec, entity_client, job_name, sdk, kind, profile + ) else: # Run sync compilers in a thread pool to avoid blocking the event loop. platform_spec = await to_thread.run_sync( - partial(compiler, workspace, original_spec, transformed_spec, entity_client, job_name, sdk) + partial( + compiler, workspace, original_spec, transformed_spec, entity_client, job_name, sdk, kind, profile + ) ) + if profile is not None: + from nemo_platform_plugin.jobs.profile import stamp_profile + + stamp_profile(platform_spec, profile) + _validate_job_spec(platform_spec) return platform_spec except PermissionError as e: @@ -816,6 +847,7 @@ async def create_job( job_name, service_name, sdk, + profile=request.profile, ) # Create the job using the SDK pointed to the platform jobs microservice. 
diff --git a/packages/nemo_platform_plugin/src/nemo_platform_plugin/jobs/routes.py b/packages/nemo_platform_plugin/src/nemo_platform_plugin/jobs/routes.py index 4a921a613d..530faef308 100644 --- a/packages/nemo_platform_plugin/src/nemo_platform_plugin/jobs/routes.py +++ b/packages/nemo_platform_plugin/src/nemo_platform_plugin/jobs/routes.py @@ -58,7 +58,6 @@ job_route_factory, ) from nemo_platform_plugin.jobs.exceptions import PlatformJobCompilationError -from nemo_platform_plugin.jobs.profile import stamp_profile if TYPE_CHECKING: from collections.abc import Callable @@ -258,11 +257,12 @@ def _adapt_compile( """Bridge ``NemoJob.compile`` to the factory's ``platform_job_config_compiler`` shape. The factory calls ``compiler(workspace, original_spec, transformed_spec, - entity_client, job_name, sdk)``. :meth:`NemoJob.compile` is an - ``async classmethod`` that uses kwargs and also accepts - ``profile`` / ``options`` — phase 1 MR 1.1b passes ``None`` for - both (body-field wiring is a follow-up). After ``compile`` returns, - the adapter applies :func:`stamp_profile` with ``default_profile``. + entity_client, job_name, sdk, kind, profile)`` where ``kind`` is the resolved + executor payload shape (``"container"`` or ``"subprocess"``), already + resolved by ``_compile_platform_spec`` from the submitter's profile. + + ``stamp_profile`` is applied by ``_compile_platform_spec`` after this + adapter returns — the adapter only handles the NemoJob.compile bridge.
Missing-override errors from the ``NemoJob.compile`` base marker become :class:`PlatformJobCompilationError` so the factory's @@ -276,6 +276,8 @@ async def compile_adapter( entity_client: Any, job_name: str | None, sdk: Any, + kind: str | None = None, + profile: str | None = None, ) -> Any: del original_spec # NemoJob.compile only needs the canonical (transformed) spec try: @@ -285,13 +287,13 @@ async def compile_adapter( entity_client=entity_client, job_name=job_name, async_sdk=sdk, - profile=None, + kind=kind, + profile=profile, options=None, ) except NotImplementedError as exc: raise PlatformJobCompilationError(str(exc)) from exc - stamp_profile(result, default_profile) return result return compile_adapter diff --git a/packages/nemo_platform_plugin/tests/test_jobs_filter.py b/packages/nemo_platform_plugin/tests/test_jobs_filter.py index 1fafe7d9e5..15c30017f3 100644 --- a/packages/nemo_platform_plugin/tests/test_jobs_filter.py +++ b/packages/nemo_platform_plugin/tests/test_jobs_filter.py @@ -42,7 +42,7 @@ class _Spec(BaseModel): foo: str = "bar" -def _fake_compiler(workspace, original_spec, transformed_spec, entity_client, job_name, sdk): +def _fake_compiler(workspace, original_spec, transformed_spec, entity_client, job_name, sdk, kind=None, profile=None): return {"steps": []} diff --git a/packages/nemo_platform_plugin/tests/test_jobs_routes.py b/packages/nemo_platform_plugin/tests/test_jobs_routes.py index e114a4a32d..26e7b4ff12 100644 --- a/packages/nemo_platform_plugin/tests/test_jobs_routes.py +++ b/packages/nemo_platform_plugin/tests/test_jobs_routes.py @@ -73,6 +73,7 @@ async def compile( entity_client, job_name, async_sdk, + kind=None, profile=None, options=None, ): @@ -115,6 +116,7 @@ async def compile( entity_client, job_name, async_sdk, + kind=None, profile=None, options=None, ): @@ -274,15 +276,14 @@ def run(self, config: dict) -> dict: @pytest.mark.asyncio -async def test_compile_adapter_invokes_nemo_compile_and_stamps_default_profile() -> None: 
+async def test_compile_adapter_invokes_nemo_compile() -> None: adapter = _adapt_compile(_WidgetJob, default_profile="research") spec = _WidgetSpec(name="w") - platform_spec = await adapter("ws", spec, spec, "entity_client", "job-1", "sdk") + # Adapter receives kind and profile from _compile_platform_spec. + # Profile stamping is now done by _compile_platform_spec, not the adapter. + platform_spec = await adapter("ws", spec, spec, "entity_client", "job-1", "sdk", "container", "research") assert isinstance(platform_spec, _FakePlatformSpec) - # Profile stamped on every step since the compiler didn't set one. - for step in platform_spec.steps: - assert step.executor.profile == "research" @pytest.mark.asyncio @@ -299,7 +300,7 @@ async def compile(cls, **kwargs): return _FakePlatformSpec(steps=[_FakeStep(profile="explicit")]) adapter = _adapt_compile(CompileSetsProfile, default_profile="default") - platform_spec = await adapter("ws", None, _WidgetSpec(name="x"), "ec", None, "sdk") + platform_spec = await adapter("ws", None, _WidgetSpec(name="x"), "ec", None, "sdk", None, None) assert platform_spec.steps[0].executor.profile == "explicit" @@ -308,7 +309,7 @@ async def compile(cls, **kwargs): async def test_compile_adapter_converts_not_implemented_to_compilation_error() -> None: adapter = _adapt_compile(_NoCompileJob, default_profile="default") with pytest.raises(PlatformJobCompilationError, match="must override compile"): - await adapter("ws", None, _WidgetSpec(name="x"), "ec", None, "sdk") + await adapter("ws", None, _WidgetSpec(name="x"), "ec", None, "sdk", None, None) # --------------------------------------------------------------------------- diff --git a/packages/nmp_common/tests/api_factory/test_api_factory.py b/packages/nmp_common/tests/api_factory/test_api_factory.py index ff9dcb718e..b3b6120775 100644 --- a/packages/nmp_common/tests/api_factory/test_api_factory.py +++ b/packages/nmp_common/tests/api_factory/test_api_factory.py @@ -64,6 +64,8 @@ def 
foo_job_config_compiler( entity_client: EntityClient, job_name: str | None, sdk, + kind: str | None = None, + profile: str | None = None, ) -> PlatformJobSpec: return PlatformJobSpec( steps=[ @@ -1389,7 +1391,14 @@ def test_create_job_injects_workspace_and_entity_client(): received_entity_client = None def compiler( - workspace: str, input_spec: FooJobConfig, output_spec: FooJobConfig, entity_client, job_name: str | None, sdk + workspace: str, + input_spec: FooJobConfig, + output_spec: FooJobConfig, + entity_client, + job_name: str | None, + sdk, + kind: str | None = None, + profile: str | None = None, ) -> PlatformJobSpec: nonlocal received_workspace received_workspace = workspace @@ -1439,7 +1448,14 @@ def test_sync_compiler_is_called_correctly(): compiler_called = False def sync_compiler( - workspace: str, input_spec: FooJobConfig, output_spec: FooJobConfig, entity_client, job_name: str | None, sdk + workspace: str, + input_spec: FooJobConfig, + output_spec: FooJobConfig, + entity_client, + job_name: str | None, + sdk, + kind: str | None = None, + profile: str | None = None, ) -> PlatformJobSpec: nonlocal compiler_called compiler_called = True @@ -1681,7 +1697,7 @@ async def test_sync_compiler(self): spec = FooJobConfig(foo="a", bar=1) expected = self._make_platform_spec(spec) - def compiler(workspace, input_spec, output_spec, entity_client, job_name, sdk): + def compiler(workspace, input_spec, output_spec, entity_client, job_name, sdk, kind=None, profile=None): return expected result = await _compile_platform_spec(compiler, "ws", spec, spec, MagicMock(), "name", "svc", MagicMock()) @@ -1693,7 +1709,7 @@ async def test_async_compiler(self): spec = FooJobConfig(foo="a", bar=1) expected = self._make_platform_spec(spec) - async def compiler(workspace, input_spec, output_spec, entity_client, job_name, sdk): + async def compiler(workspace, input_spec, output_spec, entity_client, job_name, sdk, kind=None, profile=None): return expected result = await 
_compile_platform_spec(compiler, "ws", spec, spec, MagicMock(), "name", "svc", MagicMock()) @@ -1704,7 +1720,7 @@ async def test_compilation_error_becomes_422(self): """PlatformJobCompilationError is wrapped in HTTPException 422.""" from fastapi import HTTPException - def bad_compiler(workspace, input_spec, output_spec, entity_client, job_name, sdk): + def bad_compiler(workspace, input_spec, output_spec, entity_client, job_name, sdk, kind=None, profile=None): raise PlatformJobCompilationError("missing field") spec = FooJobConfig(foo="a", bar=1) @@ -1719,7 +1735,9 @@ async def test_validate_job_spec_is_called(self): """_validate_job_spec is invoked on the compiled result (catches non-serializable config).""" from fastapi import HTTPException - def compiler_bad_config(workspace, input_spec, output_spec, entity_client, job_name, sdk): + def compiler_bad_config( + workspace, input_spec, output_spec, entity_client, job_name, sdk, kind=None, profile=None + ): # Return a spec whose step config is not JSON serializable return PlatformJobSpec( steps=[ diff --git a/plugins/nemo-agents/src/nemo_agents_plugin/jobs/analyze_batch.py b/plugins/nemo-agents/src/nemo_agents_plugin/jobs/analyze_batch.py index 9141a21865..09025e1d52 100644 --- a/plugins/nemo-agents/src/nemo_agents_plugin/jobs/analyze_batch.py +++ b/plugins/nemo-agents/src/nemo_agents_plugin/jobs/analyze_batch.py @@ -64,6 +64,7 @@ async def compile( # type: ignore[override] entity_client: object, job_name: str | None, async_sdk: object, + kind: str | None = None, profile: str | None = None, options: dict | None = None, ) -> PlatformJobSpec: diff --git a/plugins/nemo-agents/src/nemo_agents_plugin/jobs/evaluate_agent.py b/plugins/nemo-agents/src/nemo_agents_plugin/jobs/evaluate_agent.py index ee9ca7007d..36b760f557 100644 --- a/plugins/nemo-agents/src/nemo_agents_plugin/jobs/evaluate_agent.py +++ b/plugins/nemo-agents/src/nemo_agents_plugin/jobs/evaluate_agent.py @@ -158,6 +158,7 @@ async def compile( # type: 
ignore[override] entity_client: object, job_name: str | None, async_sdk: object, + kind: str | None = None, profile: str | None = None, options: dict | None = None, ) -> PlatformJobSpec: diff --git a/plugins/nemo-agents/src/nemo_agents_plugin/jobs/evaluate_suite.py b/plugins/nemo-agents/src/nemo_agents_plugin/jobs/evaluate_suite.py index 12222e2525..666f649435 100644 --- a/plugins/nemo-agents/src/nemo_agents_plugin/jobs/evaluate_suite.py +++ b/plugins/nemo-agents/src/nemo_agents_plugin/jobs/evaluate_suite.py @@ -137,6 +137,7 @@ async def compile( # type: ignore[override] entity_client: object, job_name: str | None, async_sdk: object, + kind: str | None = None, profile: str | None = None, options: dict | None = None, ) -> PlatformJobSpec: diff --git a/plugins/nemo-agents/src/nemo_agents_plugin/jobs/optimize_agent.py b/plugins/nemo-agents/src/nemo_agents_plugin/jobs/optimize_agent.py index 5c4a0f41b7..3562dd2792 100644 --- a/plugins/nemo-agents/src/nemo_agents_plugin/jobs/optimize_agent.py +++ b/plugins/nemo-agents/src/nemo_agents_plugin/jobs/optimize_agent.py @@ -138,6 +138,7 @@ async def compile( # type: ignore[override] entity_client: object, job_name: str | None, async_sdk: object, + kind: str | None = None, profile: str | None = None, options: dict | None = None, ) -> PlatformJobSpec: diff --git a/plugins/nemo-agents/src/nemo_agents_plugin/jobs/optimize_skills.py b/plugins/nemo-agents/src/nemo_agents_plugin/jobs/optimize_skills.py index df52feb4ae..95b7aadb2e 100644 --- a/plugins/nemo-agents/src/nemo_agents_plugin/jobs/optimize_skills.py +++ b/plugins/nemo-agents/src/nemo_agents_plugin/jobs/optimize_skills.py @@ -87,6 +87,7 @@ async def compile( # type: ignore[override] entity_client: object, job_name: str | None, async_sdk: object, + kind: str | None = None, profile: str | None = None, options: dict | None = None, ) -> PlatformJobSpec: diff --git a/plugins/nemo-data-designer/src/nemo_data_designer_plugin/jobs/create.py 
b/plugins/nemo-data-designer/src/nemo_data_designer_plugin/jobs/create.py index 6cccb39630..10a5953a97 100644 --- a/plugins/nemo-data-designer/src/nemo_data_designer_plugin/jobs/create.py +++ b/plugins/nemo-data-designer/src/nemo_data_designer_plugin/jobs/create.py @@ -20,6 +20,7 @@ ContainerSpec, PlatformJobSpec, PlatformJobStep, + SubprocessExecutionProviderSpec, ) from nemo_platform_plugin.jobs.image import get_qualified_image from pydantic import BaseModel @@ -66,23 +67,39 @@ async def compile( entity_client: object, job_name: str | None, async_sdk: object, + kind: str | None = None, profile: str | None = None, options: dict | None = None, ) -> PlatformJobSpec: + resolved_profile = profile or "default" + + # Use the kind parameter directly; default to container when not provided. + resolved_kind = kind or "container" + + if resolved_kind == "subprocess": + executor = SubprocessExecutionProviderSpec( + kind="subprocess", + provider="cpu", + profile=resolved_profile, + command=["python", "-m", "nemo_data_designer_plugin.jobs.bridge"], + ) + else: + executor = ContainerExecutionProviderSpec( + kind="container", + provider="cpu", + profile=resolved_profile, + container=ContainerSpec( + image=get_qualified_image("nmp-cpu-tasks"), + entrypoint=["python", "-m"], + command=["nemo_data_designer_plugin.jobs.bridge"], + ), + ) + return PlatformJobSpec( steps=[ PlatformJobStep( name="data-designer-job", - executor=ContainerExecutionProviderSpec( - kind="container", - profile=profile or "default", - provider="cpu", - container=ContainerSpec( - image=get_qualified_image("nmp-cpu-tasks"), - entrypoint=["python", "-m"], - command=["nemo_data_designer_plugin.jobs.bridge"], - ), - ), + executor=executor, config=spec.model_dump(), environment=[], ) diff --git a/plugins/nemo-evaluator/src/nemo_evaluator/jobs/evaluate.py b/plugins/nemo-evaluator/src/nemo_evaluator/jobs/evaluate.py index bade6c835d..698a6374d4 100644 --- a/plugins/nemo-evaluator/src/nemo_evaluator/jobs/evaluate.py 
+++ b/plugins/nemo-evaluator/src/nemo_evaluator/jobs/evaluate.py @@ -169,6 +169,7 @@ async def compile( entity_client: object, job_name: str | None, async_sdk: object, + kind: str | None = None, profile: str | None = None, options: dict | None = None, ) -> PlatformJobSpec: diff --git a/plugins/nemo-safe-synthesizer/src/nemo_safe_synthesizer_plugin/api/v2/jobs/endpoints.py b/plugins/nemo-safe-synthesizer/src/nemo_safe_synthesizer_plugin/api/v2/jobs/endpoints.py index e75682576c..8aeaf1bd91 100644 --- a/plugins/nemo-safe-synthesizer/src/nemo_safe_synthesizer_plugin/api/v2/jobs/endpoints.py +++ b/plugins/nemo-safe-synthesizer/src/nemo_safe_synthesizer_plugin/api/v2/jobs/endpoints.py @@ -108,6 +108,8 @@ async def job_config_compiler( entity_client: EntityClient, job_name: str | None, sdk: AsyncNeMoPlatform, + kind: str | None = None, + profile: str | None = None, ) -> PlatformJobSpec: """Compile Safe Synthesizer job config into a platform job.""" del original_spec, entity_client, job_name diff --git a/services/core/jobs/tests/conftest.py b/services/core/jobs/tests/conftest.py index 1ad85342fe..1120177bab 100644 --- a/services/core/jobs/tests/conftest.py +++ b/services/core/jobs/tests/conftest.py @@ -487,6 +487,8 @@ def hello_world_job_config( entity_client: EntityClient, job_name: str | None, sdk, + kind: str | None = None, + profile: str | None = None, ) -> FactoryPlatformJobSpec: return FactoryPlatformJobSpec( steps=[ diff --git a/services/hello-world/src/nmp/hello_world/api/v2/jobs/endpoints.py b/services/hello-world/src/nmp/hello_world/api/v2/jobs/endpoints.py index 6747f52bd3..502e8361de 100644 --- a/services/hello-world/src/nmp/hello_world/api/v2/jobs/endpoints.py +++ b/services/hello-world/src/nmp/hello_world/api/v2/jobs/endpoints.py @@ -24,6 +24,8 @@ def compile_hello_world_job( entity_client: EntityClient, job_name: str | None, sdk: AsyncNeMoPlatform, + kind: str | None = None, + profile: str | None = None, ) -> PlatformJobSpec: """Compile a hello world job 
config into a platform job spec. From 85c29de730cc3c27eccaf7258af0e7bede31d02d Mon Sep 17 00:00:00 2001 From: Matthew Grossman Date: Tue, 16 Jun 2026 13:40:37 -0700 Subject: [PATCH 17/17] fix(jobs): default kind to "container" in compile signatures MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit kind should never be None — container is the natural default. Updated all compile() signatures and compiler functions to use kind: str = "container" instead of kind: str | None = None. Co-Authored-By: Claude Opus 4.6 (1M context) Signed-off-by: Matthew Grossman --- .../src/nemo_platform_plugin/job.py | 5 ++--- .../src/nemo_platform_plugin/jobs/api_factory.py | 2 +- .../src/nemo_platform_plugin/jobs/routes.py | 2 +- .../nemo_platform_plugin/tests/test_jobs_filter.py | 2 +- .../nemo_platform_plugin/tests/test_jobs_routes.py | 4 ++-- .../tests/api_factory/test_api_factory.py | 14 +++++++------- .../src/nemo_agents_plugin/jobs/analyze_batch.py | 2 +- .../src/nemo_agents_plugin/jobs/evaluate_agent.py | 2 +- .../src/nemo_agents_plugin/jobs/evaluate_suite.py | 2 +- .../src/nemo_agents_plugin/jobs/optimize_agent.py | 2 +- .../src/nemo_agents_plugin/jobs/optimize_skills.py | 2 +- .../src/nemo_data_designer_plugin/jobs/create.py | 7 ++----- .../src/nemo_evaluator/jobs/evaluate.py | 2 +- .../api/v2/jobs/endpoints.py | 2 +- services/core/jobs/tests/conftest.py | 2 +- .../src/nmp/hello_world/api/v2/jobs/endpoints.py | 2 +- 16 files changed, 25 insertions(+), 29 deletions(-) diff --git a/packages/nemo_platform_plugin/src/nemo_platform_plugin/job.py b/packages/nemo_platform_plugin/src/nemo_platform_plugin/job.py index d9a54bff4f..0bcc7747ba 100644 --- a/packages/nemo_platform_plugin/src/nemo_platform_plugin/job.py +++ b/packages/nemo_platform_plugin/src/nemo_platform_plugin/job.py @@ -273,7 +273,7 @@ async def compile( entity_client: object, job_name: str | None, async_sdk: AsyncNeMoPlatform, - kind: str | None = None, + kind: str = "container", 
profile: str | None = None, options: dict | None = None, ) -> object: @@ -302,8 +302,7 @@ async def compile( :meth:`to_spec`: this runs in the API process so only the async client is offered. kind: Resolved executor payload shape — ``"container"`` or - ``"subprocess"``. ``None`` when no profile was specified - (compilers should default to ``"container"``). + ``"subprocess"``. Defaults to ``"container"``. profile: The submitter-selected execution profile name (e.g. ``"subprocess"``, ``"default"``). ``None`` when no profile was specified. diff --git a/packages/nemo_platform_plugin/src/nemo_platform_plugin/jobs/api_factory.py b/packages/nemo_platform_plugin/src/nemo_platform_plugin/jobs/api_factory.py index f997214f5c..20c22f00d4 100644 --- a/packages/nemo_platform_plugin/src/nemo_platform_plugin/jobs/api_factory.py +++ b/packages/nemo_platform_plugin/src/nemo_platform_plugin/jobs/api_factory.py @@ -651,7 +651,7 @@ async def _compile_platform_spec( """ from nemo_platform_plugin.jobs.profiles import resolve_profile_kind - kind: str | None = None + kind: str = "container" if profile is not None: try: kind = await resolve_profile_kind(sdk, default_provider, profile) diff --git a/packages/nemo_platform_plugin/src/nemo_platform_plugin/jobs/routes.py b/packages/nemo_platform_plugin/src/nemo_platform_plugin/jobs/routes.py index 530faef308..caa2df6fbc 100644 --- a/packages/nemo_platform_plugin/src/nemo_platform_plugin/jobs/routes.py +++ b/packages/nemo_platform_plugin/src/nemo_platform_plugin/jobs/routes.py @@ -276,7 +276,7 @@ async def compile_adapter( entity_client: Any, job_name: str | None, sdk: Any, - kind: str | None = None, + kind: str = "container", profile: str | None = None, ) -> Any: del original_spec # NemoJob.compile only needs the canonical (transformed) spec diff --git a/packages/nemo_platform_plugin/tests/test_jobs_filter.py b/packages/nemo_platform_plugin/tests/test_jobs_filter.py index 15c30017f3..e14d4d83ab 100644 --- 
a/packages/nemo_platform_plugin/tests/test_jobs_filter.py +++ b/packages/nemo_platform_plugin/tests/test_jobs_filter.py @@ -42,7 +42,7 @@ class _Spec(BaseModel): foo: str = "bar" -def _fake_compiler(workspace, original_spec, transformed_spec, entity_client, job_name, sdk, kind=None, profile=None): +def _fake_compiler(workspace, original_spec, transformed_spec, entity_client, job_name, sdk, kind="container", profile=None): return {"steps": []} diff --git a/packages/nemo_platform_plugin/tests/test_jobs_routes.py b/packages/nemo_platform_plugin/tests/test_jobs_routes.py index 26e7b4ff12..51193d2ebc 100644 --- a/packages/nemo_platform_plugin/tests/test_jobs_routes.py +++ b/packages/nemo_platform_plugin/tests/test_jobs_routes.py @@ -73,7 +73,7 @@ async def compile( entity_client, job_name, async_sdk, - kind=None, + kind="container", profile=None, options=None, ): @@ -116,7 +116,7 @@ async def compile( entity_client, job_name, async_sdk, - kind=None, + kind="container", profile=None, options=None, ): diff --git a/packages/nmp_common/tests/api_factory/test_api_factory.py b/packages/nmp_common/tests/api_factory/test_api_factory.py index b3b6120775..17a33ee14e 100644 --- a/packages/nmp_common/tests/api_factory/test_api_factory.py +++ b/packages/nmp_common/tests/api_factory/test_api_factory.py @@ -64,7 +64,7 @@ def foo_job_config_compiler( entity_client: EntityClient, job_name: str | None, sdk, - kind: str | None = None, + kind: str = "container", profile: str | None = None, ) -> PlatformJobSpec: return PlatformJobSpec( @@ -1397,7 +1397,7 @@ def compiler( entity_client, job_name: str | None, sdk, - kind: str | None = None, + kind: str = "container", profile: str | None = None, ) -> PlatformJobSpec: nonlocal received_workspace @@ -1454,7 +1454,7 @@ def sync_compiler( entity_client, job_name: str | None, sdk, - kind: str | None = None, + kind: str = "container", profile: str | None = None, ) -> PlatformJobSpec: nonlocal compiler_called @@ -1697,7 +1697,7 @@ async def 
test_sync_compiler(self): spec = FooJobConfig(foo="a", bar=1) expected = self._make_platform_spec(spec) - def compiler(workspace, input_spec, output_spec, entity_client, job_name, sdk, kind=None, profile=None): + def compiler(workspace, input_spec, output_spec, entity_client, job_name, sdk, kind="container", profile=None): return expected result = await _compile_platform_spec(compiler, "ws", spec, spec, MagicMock(), "name", "svc", MagicMock()) @@ -1709,7 +1709,7 @@ async def test_async_compiler(self): spec = FooJobConfig(foo="a", bar=1) expected = self._make_platform_spec(spec) - async def compiler(workspace, input_spec, output_spec, entity_client, job_name, sdk, kind=None, profile=None): + async def compiler(workspace, input_spec, output_spec, entity_client, job_name, sdk, kind="container", profile=None): return expected result = await _compile_platform_spec(compiler, "ws", spec, spec, MagicMock(), "name", "svc", MagicMock()) @@ -1720,7 +1720,7 @@ async def test_compilation_error_becomes_422(self): """PlatformJobCompilationError is wrapped in HTTPException 422.""" from fastapi import HTTPException - def bad_compiler(workspace, input_spec, output_spec, entity_client, job_name, sdk, kind=None, profile=None): + def bad_compiler(workspace, input_spec, output_spec, entity_client, job_name, sdk, kind="container", profile=None): raise PlatformJobCompilationError("missing field") spec = FooJobConfig(foo="a", bar=1) @@ -1736,7 +1736,7 @@ async def test_validate_job_spec_is_called(self): from fastapi import HTTPException def compiler_bad_config( - workspace, input_spec, output_spec, entity_client, job_name, sdk, kind=None, profile=None + workspace, input_spec, output_spec, entity_client, job_name, sdk, kind="container", profile=None ): # Return a spec whose step config is not JSON serializable return PlatformJobSpec( diff --git a/plugins/nemo-agents/src/nemo_agents_plugin/jobs/analyze_batch.py b/plugins/nemo-agents/src/nemo_agents_plugin/jobs/analyze_batch.py index 
09025e1d52..6cb1a51dff 100644 --- a/plugins/nemo-agents/src/nemo_agents_plugin/jobs/analyze_batch.py +++ b/plugins/nemo-agents/src/nemo_agents_plugin/jobs/analyze_batch.py @@ -64,7 +64,7 @@ async def compile( # type: ignore[override] entity_client: object, job_name: str | None, async_sdk: object, - kind: str | None = None, + kind: str = "container", profile: str | None = None, options: dict | None = None, ) -> PlatformJobSpec: diff --git a/plugins/nemo-agents/src/nemo_agents_plugin/jobs/evaluate_agent.py b/plugins/nemo-agents/src/nemo_agents_plugin/jobs/evaluate_agent.py index 36b760f557..9a7af615ea 100644 --- a/plugins/nemo-agents/src/nemo_agents_plugin/jobs/evaluate_agent.py +++ b/plugins/nemo-agents/src/nemo_agents_plugin/jobs/evaluate_agent.py @@ -158,7 +158,7 @@ async def compile( # type: ignore[override] entity_client: object, job_name: str | None, async_sdk: object, - kind: str | None = None, + kind: str = "container", profile: str | None = None, options: dict | None = None, ) -> PlatformJobSpec: diff --git a/plugins/nemo-agents/src/nemo_agents_plugin/jobs/evaluate_suite.py b/plugins/nemo-agents/src/nemo_agents_plugin/jobs/evaluate_suite.py index 666f649435..d955efe9c5 100644 --- a/plugins/nemo-agents/src/nemo_agents_plugin/jobs/evaluate_suite.py +++ b/plugins/nemo-agents/src/nemo_agents_plugin/jobs/evaluate_suite.py @@ -137,7 +137,7 @@ async def compile( # type: ignore[override] entity_client: object, job_name: str | None, async_sdk: object, - kind: str | None = None, + kind: str = "container", profile: str | None = None, options: dict | None = None, ) -> PlatformJobSpec: diff --git a/plugins/nemo-agents/src/nemo_agents_plugin/jobs/optimize_agent.py b/plugins/nemo-agents/src/nemo_agents_plugin/jobs/optimize_agent.py index 3562dd2792..e337e98dde 100644 --- a/plugins/nemo-agents/src/nemo_agents_plugin/jobs/optimize_agent.py +++ b/plugins/nemo-agents/src/nemo_agents_plugin/jobs/optimize_agent.py @@ -138,7 +138,7 @@ async def compile( # type: ignore[override] 
entity_client: object, job_name: str | None, async_sdk: object, - kind: str | None = None, + kind: str = "container", profile: str | None = None, options: dict | None = None, ) -> PlatformJobSpec: diff --git a/plugins/nemo-agents/src/nemo_agents_plugin/jobs/optimize_skills.py b/plugins/nemo-agents/src/nemo_agents_plugin/jobs/optimize_skills.py index 95b7aadb2e..9006951762 100644 --- a/plugins/nemo-agents/src/nemo_agents_plugin/jobs/optimize_skills.py +++ b/plugins/nemo-agents/src/nemo_agents_plugin/jobs/optimize_skills.py @@ -87,7 +87,7 @@ async def compile( # type: ignore[override] entity_client: object, job_name: str | None, async_sdk: object, - kind: str | None = None, + kind: str = "container", profile: str | None = None, options: dict | None = None, ) -> PlatformJobSpec: diff --git a/plugins/nemo-data-designer/src/nemo_data_designer_plugin/jobs/create.py b/plugins/nemo-data-designer/src/nemo_data_designer_plugin/jobs/create.py index 10a5953a97..04db71e5e5 100644 --- a/plugins/nemo-data-designer/src/nemo_data_designer_plugin/jobs/create.py +++ b/plugins/nemo-data-designer/src/nemo_data_designer_plugin/jobs/create.py @@ -67,16 +67,13 @@ async def compile( entity_client: object, job_name: str | None, async_sdk: object, - kind: str | None = None, + kind: str = "container", profile: str | None = None, options: dict | None = None, ) -> PlatformJobSpec: resolved_profile = profile or "default" - # Use the kind parameter directly; default to container when not provided. 
- resolved_kind = kind or "container" - - if resolved_kind == "subprocess": + if kind == "subprocess": executor = SubprocessExecutionProviderSpec( kind="subprocess", provider="cpu", diff --git a/plugins/nemo-evaluator/src/nemo_evaluator/jobs/evaluate.py b/plugins/nemo-evaluator/src/nemo_evaluator/jobs/evaluate.py index 698a6374d4..620cd79fa6 100644 --- a/plugins/nemo-evaluator/src/nemo_evaluator/jobs/evaluate.py +++ b/plugins/nemo-evaluator/src/nemo_evaluator/jobs/evaluate.py @@ -169,7 +169,7 @@ async def compile( entity_client: object, job_name: str | None, async_sdk: object, - kind: str | None = None, + kind: str = "container", profile: str | None = None, options: dict | None = None, ) -> PlatformJobSpec: diff --git a/plugins/nemo-safe-synthesizer/src/nemo_safe_synthesizer_plugin/api/v2/jobs/endpoints.py b/plugins/nemo-safe-synthesizer/src/nemo_safe_synthesizer_plugin/api/v2/jobs/endpoints.py index 8aeaf1bd91..5925cae185 100644 --- a/plugins/nemo-safe-synthesizer/src/nemo_safe_synthesizer_plugin/api/v2/jobs/endpoints.py +++ b/plugins/nemo-safe-synthesizer/src/nemo_safe_synthesizer_plugin/api/v2/jobs/endpoints.py @@ -108,7 +108,7 @@ async def job_config_compiler( entity_client: EntityClient, job_name: str | None, sdk: AsyncNeMoPlatform, - kind: str | None = None, + kind: str = "container", profile: str | None = None, ) -> PlatformJobSpec: """Compile Safe Synthesizer job config into a platform job.""" diff --git a/services/core/jobs/tests/conftest.py b/services/core/jobs/tests/conftest.py index 1120177bab..30f43f0c87 100644 --- a/services/core/jobs/tests/conftest.py +++ b/services/core/jobs/tests/conftest.py @@ -487,7 +487,7 @@ def hello_world_job_config( entity_client: EntityClient, job_name: str | None, sdk, - kind: str | None = None, + kind: str = "container", profile: str | None = None, ) -> FactoryPlatformJobSpec: return FactoryPlatformJobSpec( diff --git a/services/hello-world/src/nmp/hello_world/api/v2/jobs/endpoints.py 
b/services/hello-world/src/nmp/hello_world/api/v2/jobs/endpoints.py index 502e8361de..a118b6b868 100644 --- a/services/hello-world/src/nmp/hello_world/api/v2/jobs/endpoints.py +++ b/services/hello-world/src/nmp/hello_world/api/v2/jobs/endpoints.py @@ -24,7 +24,7 @@ def compile_hello_world_job( entity_client: EntityClient, job_name: str | None, sdk: AsyncNeMoPlatform, - kind: str | None = None, + kind: str = "container", profile: str | None = None, ) -> PlatformJobSpec: """Compile a hello world job config into a platform job spec.