diff --git a/README.md b/README.md index 3475c828..84c9ca20 100644 --- a/README.md +++ b/README.md @@ -60,6 +60,23 @@ docker-git apply-all --active - `apply` применяет конфиг к одному проекту. `--no-up` только обновляет файлы без `docker compose up`. - `apply-all` применяет конфиг ко всем проектам. `--active` только к запущенным контейнерам. +## GPU режим + +По умолчанию проекты запускаются без GPU (`gpu: "none"`), поэтому Docker не +требует NVIDIA runtime на обычных CPU-хостах. + +GPU включается только явно через `--gpu all` или сохранённое значение +`"gpu": "all"` в `docker-git.json`. Если Docker возвращает ошибку NVIDIA +prestart hook вида `nvidia-container-cli` / `libnvidia-ml.so.1`, `docker-git` +перезаписывает managed-файлы проекта с `gpu: "none"` и повторяет +`docker compose up`, чтобы среда оставалась запускаемой на хосте без рабочей +NVIDIA userspace-части. + +Если проекту действительно нужен GPU, установите драйвер NVIDIA и NVIDIA +Container Toolkit на хосте, затем снова примените конфигурацию с `--gpu all`. +GPU для controller-контейнера включается отдельно через +`DOCKER_GIT_CONTROLLER_GPU=all`; значение по умолчанию для controller тоже +`none`. Для запуска WEB версии: ```bash diff --git a/bun.lock b/bun.lock index e0e2c218..0593a0df 100644 --- a/bun.lock +++ b/bun.lock @@ -163,6 +163,7 @@ "eslint-plugin-sonarjs": "^4.0.3", "eslint-plugin-sort-destructure-keys": "^3.0.0", "eslint-plugin-unicorn": "^64.0.0", + "fast-check": "^3.23.2", "globals": "^17.6.0", "jscpd": "^4.1.1", "typescript": "^6.0.3", diff --git a/packages/app/src/lib/core/gpu.ts b/packages/app/src/lib/core/gpu.ts new file mode 100644 index 00000000..4dfbb580 --- /dev/null +++ b/packages/app/src/lib/core/gpu.ts @@ -0,0 +1,39 @@ +/* jscpd:ignore-start */ +import type { GpuMode } from "./domain.js" + +const nvidiaFailureMarkers = [ + "nvidia-container-cli", + "libnvidia-ml.so.1", + "could not select device driver" +] + +// CHANGE: classify Docker/NVIDIA runtime failures from compose output. +// WHY: GPU device requests fail before the container entrypoint can run, so recovery must happen outside Docker. +// QUOTE(ТЗ): "nvidia-container-cli: initialization error: load library failed: libnvidia-ml.so.1" +// REF: issue-291 +// SOURCE: n/a +// FORMAT THEOREM: forall s: contains_nvidia_runtime_marker(s) -> nvidia_runtime_failure(s) +// PURITY: CORE +// EFFECT: n/a +// INVARIANT: detection is monotonic over output text; adding unrelated text cannot flip true to false +// COMPLEXITY: O(n * m) where n = |details| and m = marker count +export const isNvidiaRuntimeFailure = (details: string | undefined): boolean => { + const normalized = details?.toLowerCase() ?? "" + return nvidiaFailureMarkers.some((marker) => normalized.includes(marker)) +} + +// CHANGE: derive the safe GPU fallback mode after a Docker runtime failure. +// WHY: non-GPU containers must remain startable on hosts without a working NVIDIA userspace stack. +// QUOTE(ТЗ): "load library failed: libnvidia-ml.so.1" +// REF: issue-291 +// SOURCE: n/a +// FORMAT THEOREM: forall g,e: fallback(g,e) = none iff g = all and nvidia_runtime_failure(e) +// PURITY: CORE +// EFFECT: n/a +// INVARIANT: gpu=none is idempotent and never escalates to gpu=all +// COMPLEXITY: O(n * m) where n = |details| and m = marker count +export const gpuModeAfterDockerFailure = ( + gpu: GpuMode, + details: string | undefined +): GpuMode => gpu === "all" && isNvidiaRuntimeFailure(details) ? "none" : gpu +/* jscpd:ignore-end */ diff --git a/packages/app/src/lib/shell/docker-compose.ts b/packages/app/src/lib/shell/docker-compose.ts index c5d5f8cb..ca6e2c57 100644 --- a/packages/app/src/lib/shell/docker-compose.ts +++ b/packages/app/src/lib/shell/docker-compose.ts @@ -3,6 +3,7 @@ import type * as CommandExecutor from "@effect/platform/CommandExecutor" import type { PlatformError } from "@effect/platform/Error" import { Duration, Effect, pipe, Schedule } from "effect" +import { isNvidiaRuntimeFailure } from "../core/gpu.js" import { runCommandCapture, runCommandWithStreamingOutput } from "./command-runner.js" import { composeSpec, resolveDockerComposeEnv } from "./docker-compose-env.js" import { DockerCommandError } from "./errors.js" @@ -53,17 +54,40 @@ const dockerComposeUpRetrySchedule = Schedule.addDelay( () => Duration.seconds(2) ) +// CHANGE: classify compose-up failures that are worth retrying. +// WHY: host NVIDIA runtime misconfiguration is deterministic, so repeated compose attempts only delay fallback. +// QUOTE(ТЗ): "nvidia-container-cli: initialization error: load library failed: libnvidia-ml.so.1" +// REF: issue-291 +// SOURCE: n/a +// FORMAT THEOREM: forall e: nvidia_runtime_failure(e) -> retryable(e)=false +// PURITY: SHELL +// EFFECT: n/a +// INVARIANT: non-Docker platform errors keep the existing retry behavior +// COMPLEXITY: O(n * m) where n = |details| and m = marker count +const isRetryableDockerComposeUpError = (error: DockerCommandError | PlatformError): boolean => { + if (error._tag !== "DockerCommandError") { + return true + } + + return !isNvidiaRuntimeFailure(error.details) +} + const retryDockerComposeUp = ( cwd: string, effect: Effect.Effect ): Effect.Effect => effect.pipe( - Effect.tapError(() => - Effect.logWarning( - `docker compose up failed in ${cwd}; retrying (possible transient Docker Hub/DNS issue)...` - ) + Effect.tapError((error) => + isRetryableDockerComposeUpError(error) + ? Effect.logWarning( + `docker compose up failed in ${cwd}; retrying (possible transient Docker Hub/DNS issue)...` + ) + : Effect.void ), - Effect.retry(dockerComposeUpRetrySchedule) + Effect.retry({ + schedule: dockerComposeUpRetrySchedule, + while: isRetryableDockerComposeUpError + }) ) export const runDockerComposeUp = ( diff --git a/packages/app/src/lib/usecases/errors.ts b/packages/app/src/lib/usecases/errors.ts index 44c8b4ee..22dca9df 100644 --- a/packages/app/src/lib/usecases/errors.ts +++ b/packages/app/src/lib/usecases/errors.ts @@ -2,6 +2,7 @@ import type { PlatformError } from "@effect/platform/Error" import { Match } from "effect" import { type ParseError } from "../core/domain.js" +import { isNvidiaRuntimeFailure } from "../core/gpu.js" import { formatParseError } from "../core/parse-errors.js" import type { AgentFailedError, @@ -86,6 +87,11 @@ const renderDockerCommandError = ({ details, exitCode }: DockerCommandError): st "Hint: ensure Docker daemon is running and current user can access /var/run/docker.sock (for example via the docker group).", "Hint: if output above contains 'port is already allocated', retry with a free SSH port via --ssh-port (for example --ssh-port 2235), or stop the conflicting project/container.", "Hint: if output above contains 'all predefined address pools have been fully subnetted', run `docker network prune -f`, configure Docker `default-address-pools`, or use shared network mode (`--network-mode shared`).", + ...(isNvidiaRuntimeFailure(details) + ? [ + "Hint: NVIDIA GPU access is enabled but Docker cannot load the host NVIDIA runtime; run with GPU disabled (`--gpu none`) or install the NVIDIA driver and NVIDIA Container Toolkit." + ] + : []), "Hint: if output above contains 'lookup auth.docker.io' or 'read udp ... [::1]:53 ... connection refused', fix Docker DNS resolver (set working DNS in host/daemon config) and retry." ].join("\n") diff --git a/packages/app/src/lib/usecases/projects-up.ts b/packages/app/src/lib/usecases/projects-up.ts index 789c4cae..b844d904 100644 --- a/packages/app/src/lib/usecases/projects-up.ts +++ b/packages/app/src/lib/usecases/projects-up.ts @@ -6,6 +6,7 @@ import type { Path } from "@effect/platform/Path" import { Effect, pipe } from "effect" import type { ProjectConfig, TemplateConfig } from "../core/domain.js" +import { gpuModeAfterDockerFailure } from "../core/gpu.js" import { readProjectConfig } from "../shell/config.js" import { runDockerComposePsFormatted, @@ -31,6 +32,9 @@ import { ensureSharedCodexVolumeReady } from "./shared-volume-seed.js" const maxPortAttempts = 25 +type ProjectComposeUpError = DockerCommandError | FileExistsError | PlatformError +type ProjectComposeUpRequirements = FileSystem | Path | CommandExecutor + const syncManagedProjectFiles = ( projectDir: string, template: TemplateConfig @@ -122,6 +126,44 @@ const ensureClaudeCliReady = ( }) ) +// CHANGE: recover from host NVIDIA runtime failures by disabling per-project GPU access. +// WHY: Docker rejects gpus: all before the container starts when the host NVIDIA runtime is unavailable. +// QUOTE(ТЗ): "nvidia-container-cli: initialization error: load library failed: libnvidia-ml.so.1" +// REF: issue-291 +// SOURCE: n/a +// FORMAT THEOREM: forall t,e: gpu(t)=all and nvidia_runtime_failure(e) -> gpu(retry(t,e))=none +// PURITY: SHELL +// EFFECT: Effect +// INVARIANT: fallback never escalates GPU access and terminates after at most one all -> none downgrade +// COMPLEXITY: O(1) compose attempts plus O(file-size) template rewrite +const runProjectComposeUp = ( + projectDir: string, + template: TemplateConfig +): Effect.Effect => + runDockerComposeUp(projectDir).pipe( + Effect.as(template), + Effect.catchTag("DockerCommandError", (error) => { + const fallbackGpu = gpuModeAfterDockerFailure(template.gpu, error.details) + if (fallbackGpu === template.gpu) { + // Idempotence witness: non-NVIDIA errors and gpu=none cannot produce a lower GPU mode. + return Effect.fail(error) + } + + const fallbackTemplate: TemplateConfig = { ...template, gpu: fallbackGpu } + return Effect.gen(function*(_) { + yield* _( + Effect.logWarning( + `NVIDIA runtime failed while GPU access was enabled (${ + error.details ?? "no docker output" + }); rewriting project with GPU access disabled and retrying docker compose up.` + ) + ) + yield* _(syncManagedProjectFiles(projectDir, fallbackTemplate)) + return yield* _(runProjectComposeUp(projectDir, fallbackTemplate)) + }) + }) + ) + // CHANGE: update template port when the preferred SSH port is reserved or busy // WHY: keep each project on a unique port even across restarts // QUOTE(ТЗ): "Почему контейнер пытается подниматься на существующий порт?" @@ -194,8 +236,8 @@ export const runDockerComposeUpWithPortCheck = ( yield* _(syncManagedProjectFiles(projectDir, resolvedTemplate)) yield* _(ensureComposeNetworkReady(projectDir, resolvedTemplate)) yield* _(ensureSharedCodexVolumeReady(projectDir, resolvedTemplate)) - yield* _(runDockerComposeUp(projectDir)) - yield* _(ensureClaudeCliReady(projectDir, resolvedTemplate.containerName)) + const startedTemplate = yield* _(runProjectComposeUp(projectDir, resolvedTemplate)) + yield* _(ensureClaudeCliReady(projectDir, startedTemplate.containerName)) const ensureBridgeAccess = (containerName: string) => runDockerInspectContainerBridgeIp(projectDir, containerName).pipe( @@ -215,11 +257,11 @@ export const runDockerComposeUpWithPortCheck = ( }) ) - yield* _(ensureBridgeAccess(resolvedTemplate.containerName)) - if (resolvedTemplate.enableMcpPlaywright) { - yield* _(ensureBridgeAccess(`${resolvedTemplate.containerName}-browser`)) + yield* _(ensureBridgeAccess(startedTemplate.containerName)) + if (startedTemplate.enableMcpPlaywright) { + yield* _(ensureBridgeAccess(`${startedTemplate.containerName}-browser`)) } - return resolvedTemplate + return startedTemplate }) /* jscpd:ignore-end */ diff --git a/packages/lib/package.json b/packages/lib/package.json index a88df06e..3b30b658 100644 --- a/packages/lib/package.json +++ b/packages/lib/package.json @@ -74,6 +74,7 @@ "eslint-plugin-sonarjs": "^4.0.3", "eslint-plugin-sort-destructure-keys": "^3.0.0", "eslint-plugin-unicorn": "^64.0.0", + "fast-check": "^3.23.2", "@vitest/eslint-plugin": "^1.6.17", "globals": "^17.6.0", "jscpd": "^4.1.1", diff --git a/packages/lib/src/core/gpu.ts b/packages/lib/src/core/gpu.ts new file mode 100644 index 00000000..b2d90c68 --- /dev/null +++ b/packages/lib/src/core/gpu.ts @@ -0,0 +1,37 @@ +import type { GpuMode } from "./domain.js" + +const nvidiaFailureMarkers = [ + "nvidia-container-cli", + "libnvidia-ml.so.1", + "could not select device driver" +] + +// CHANGE: classify Docker/NVIDIA runtime failures from compose output. +// WHY: GPU device requests fail before the container entrypoint can run, so recovery must happen outside Docker. +// QUOTE(ТЗ): "nvidia-container-cli: initialization error: load library failed: libnvidia-ml.so.1" +// REF: issue-291 +// SOURCE: n/a +// FORMAT THEOREM: forall s: contains_nvidia_runtime_marker(s) -> nvidia_runtime_failure(s) +// PURITY: CORE +// EFFECT: n/a +// INVARIANT: detection is monotonic over output text; adding unrelated text cannot flip true to false +// COMPLEXITY: O(n * m) where n = |details| and m = marker count +export const isNvidiaRuntimeFailure = (details: string | undefined): boolean => { + const normalized = details?.toLowerCase() ?? "" + return nvidiaFailureMarkers.some((marker) => normalized.includes(marker)) +} + +// CHANGE: derive the safe GPU fallback mode after a Docker runtime failure. +// WHY: non-GPU containers must remain startable on hosts without a working NVIDIA userspace stack. +// QUOTE(ТЗ): "load library failed: libnvidia-ml.so.1" +// REF: issue-291 +// SOURCE: n/a +// FORMAT THEOREM: forall g,e: fallback(g,e) = none iff g = all and nvidia_runtime_failure(e) +// PURITY: CORE +// EFFECT: n/a +// INVARIANT: gpu=none is idempotent and never escalates to gpu=all +// COMPLEXITY: O(n * m) where n = |details| and m = marker count +export const gpuModeAfterDockerFailure = ( + gpu: GpuMode, + details: string | undefined +): GpuMode => gpu === "all" && isNvidiaRuntimeFailure(details) ? "none" : gpu diff --git a/packages/lib/src/shell/docker-compose.ts b/packages/lib/src/shell/docker-compose.ts index 4c60c6f5..78419e54 100644 --- a/packages/lib/src/shell/docker-compose.ts +++ b/packages/lib/src/shell/docker-compose.ts @@ -3,6 +3,7 @@ import type * as CommandExecutor from "@effect/platform/CommandExecutor" import type { PlatformError } from "@effect/platform/Error" import { Duration, Effect, pipe, Schedule } from "effect" +import { isNvidiaRuntimeFailure } from "../core/gpu.js" import { runCommandCapture, runCommandWithStreamingOutput } from "./command-runner.js" import { composeSpec, resolveDockerComposeEnv } from "./docker-compose-env.js" import { DockerCommandError } from "./errors.js" @@ -53,17 +54,40 @@ const dockerComposeUpRetrySchedule = Schedule.addDelay( () => Duration.seconds(2) ) +// CHANGE: classify compose-up failures that are worth retrying. +// WHY: host NVIDIA runtime misconfiguration is deterministic, so repeated compose attempts only delay fallback. +// QUOTE(ТЗ): "nvidia-container-cli: initialization error: load library failed: libnvidia-ml.so.1" +// REF: issue-291 +// SOURCE: n/a +// FORMAT THEOREM: forall e: nvidia_runtime_failure(e) -> retryable(e)=false +// PURITY: SHELL +// EFFECT: n/a +// INVARIANT: non-Docker platform errors keep the existing retry behavior +// COMPLEXITY: O(n * m) where n = |details| and m = marker count +const isRetryableDockerComposeUpError = (error: DockerCommandError | PlatformError): boolean => { + if (error._tag !== "DockerCommandError") { + return true + } + + return !isNvidiaRuntimeFailure(error.details) +} + const retryDockerComposeUp = ( cwd: string, effect: Effect.Effect ): Effect.Effect => effect.pipe( - Effect.tapError(() => - Effect.logWarning( - `docker compose up failed in ${cwd}; retrying (possible transient Docker Hub/DNS issue)...` - ) + Effect.tapError((error) => + isRetryableDockerComposeUpError(error) + ? Effect.logWarning( + `docker compose up failed in ${cwd}; retrying (possible transient Docker Hub/DNS issue)...` + ) + : Effect.void ), - Effect.retry(dockerComposeUpRetrySchedule) + Effect.retry({ + schedule: dockerComposeUpRetrySchedule, + while: isRetryableDockerComposeUpError + }) ) export type DockerComposeUpBuildMode = "build" | "reuse" diff --git a/packages/lib/src/usecases/errors.ts b/packages/lib/src/usecases/errors.ts index a8966dae..0fde7c65 100644 --- a/packages/lib/src/usecases/errors.ts +++ b/packages/lib/src/usecases/errors.ts @@ -1,6 +1,7 @@ import type { PlatformError } from "@effect/platform/Error" import { Match } from "effect" import { type ParseError } from "../core/domain.js" +import { isNvidiaRuntimeFailure } from "../core/gpu.js" import { formatParseError } from "../core/parse-errors.js" import type { AgentFailedError, @@ -85,6 +86,11 @@ const renderDockerCommandError = ({ details, exitCode }: DockerCommandError): st "Hint: ensure Docker daemon is running and current user can access /var/run/docker.sock (for example via the docker group).", "Hint: if output above contains 'port is already allocated', retry with a free SSH port via --ssh-port (for example --ssh-port 2235), or stop the conflicting project/container.", "Hint: if output above contains 'all predefined address pools have been fully subnetted', run `docker network prune -f`, configure Docker `default-address-pools`, or use shared network mode (`--network-mode shared`).", + ...(isNvidiaRuntimeFailure(details) + ? [ + "Hint: NVIDIA GPU access is enabled but Docker cannot load the host NVIDIA runtime; run with GPU disabled (`--gpu none`) or install the NVIDIA driver and NVIDIA Container Toolkit." + ] + : []), "Hint: if output above contains 'lookup auth.docker.io' or 'read udp ... [::1]:53 ... connection refused', fix Docker DNS resolver (set working DNS in host/daemon config) and retry." ].join("\n") diff --git a/packages/lib/src/usecases/projects-up.ts b/packages/lib/src/usecases/projects-up.ts index c9938eeb..e9d28c71 100644 --- a/packages/lib/src/usecases/projects-up.ts +++ b/packages/lib/src/usecases/projects-up.ts @@ -5,6 +5,7 @@ import type { Path } from "@effect/platform/Path" import { Effect, pipe } from "effect" import type { ProjectConfig, TemplateConfig } from "../core/domain.js" +import { gpuModeAfterDockerFailure } from "../core/gpu.js" import { readProjectConfig } from "../shell/config.js" import { runDockerComposePsFormatted, @@ -35,6 +36,10 @@ export type RunDockerComposeUpWithPortCheckOptions = { readonly waitForPostStart?: boolean } +type ProjectComposeUpAttemptError = DockerCommandError | PlatformError +type ProjectComposeUpError = DockerCommandError | FileExistsError | PlatformError +type ProjectComposeUpRequirements = FileSystem | Path | CommandExecutor + const syncManagedProjectFiles = ( projectDir: string, template: TemplateConfig @@ -168,22 +173,68 @@ const startProjectPostStartSelfHealInBackground = ( yield* _(Effect.forkDaemon(runProjectPostStartSelfHeal(projectDir, template))) }) -const runProjectComposeUp = ( +// CHANGE: recover from host NVIDIA runtime failures by disabling per-project GPU access. +// WHY: Docker rejects gpus: all before the container starts when the host NVIDIA runtime is unavailable. +// QUOTE(ТЗ): "nvidia-container-cli: initialization error: load library failed: libnvidia-ml.so.1" +// REF: issue-291 +// SOURCE: n/a +// FORMAT THEOREM: forall t,e: gpu(t)=all and nvidia_runtime_failure(e) -> gpu(retry(t,e))=none +// PURITY: SHELL +// EFFECT: Effect +// INVARIANT: fallback never escalates GPU access and terminates after at most one all -> none downgrade +// COMPLEXITY: O(1) compose attempts plus O(file-size) template rewrite +const recoverProjectComposeGpuFailure = ( projectDir: string, + template: TemplateConfig, + effect: Effect.Effect, buildMode: "build" | "reuse" -): Effect.Effect => { - if (buildMode === "build") { - return runDockerComposeUp(projectDir) - } +): Effect.Effect => + effect.pipe( + Effect.as(template), + Effect.catchTag("DockerCommandError", (error) => { + const fallbackGpu = gpuModeAfterDockerFailure(template.gpu, error.details) + if (fallbackGpu === template.gpu) { + // Idempotence witness: non-NVIDIA errors and gpu=none cannot produce a lower GPU mode. + return Effect.fail(error) + } - return runDockerComposeUp(projectDir, { buildMode: "reuse" }).pipe( - Effect.catchTag("DockerCommandError", () => - Effect.logWarning( - `docker compose up -d failed in ${projectDir}; falling back to docker compose up -d --build.` - ).pipe( - Effect.zipRight(runDockerComposeUp(projectDir)) - )) + const fallbackTemplate: TemplateConfig = { ...template, gpu: fallbackGpu } + return Effect.gen(function*(_) { + yield* _( + Effect.logWarning( + `NVIDIA runtime failed while GPU access was enabled (${ + error.details ?? "no docker output" + }); rewriting project with GPU access disabled and retrying docker compose up.` + ) + ) + yield* _(syncManagedProjectFiles(projectDir, fallbackTemplate)) + return yield* _(runProjectComposeUp(projectDir, fallbackTemplate, buildMode)) + }) + }) ) + +const runProjectComposeUp = ( + projectDir: string, + template: TemplateConfig, + buildMode: "build" | "reuse" +): Effect.Effect => { + const composeUp = buildMode === "build" + ? runDockerComposeUp(projectDir) + : runDockerComposeUp(projectDir, { buildMode: "reuse" }).pipe( + Effect.catchTag("DockerCommandError", (error) => { + if (gpuModeAfterDockerFailure(template.gpu, error.details) !== template.gpu) { + return Effect.fail(error) + } + + return Effect.logWarning( + `docker compose up -d failed in ${projectDir}; falling back to docker compose up -d --build.` + ).pipe( + Effect.zipRight(runDockerComposeUp(projectDir)) + ) + }) + ) + + return recoverProjectComposeGpuFailure(projectDir, template, composeUp, buildMode) } // CHANGE: update template port when the preferred SSH port is reserved or busy @@ -259,10 +310,10 @@ export const runDockerComposeUpWithPortCheck = ( yield* _(syncManagedProjectFiles(projectDir, resolvedTemplate)) yield* _(ensureComposeNetworkReady(projectDir, resolvedTemplate)) yield* _(ensureSharedCodexVolumeReady(projectDir, resolvedTemplate)) - yield* _(runProjectComposeUp(projectDir, options.buildMode ?? "build")) + const startedTemplate = yield* _(runProjectComposeUp(projectDir, resolvedTemplate, options.buildMode ?? "build")) yield* (options.waitForPostStart === false - ? _(startProjectPostStartSelfHealInBackground(projectDir, resolvedTemplate)) - : _(runProjectPostStartSelfHeal(projectDir, resolvedTemplate))) + ? _(startProjectPostStartSelfHealInBackground(projectDir, startedTemplate)) + : _(runProjectPostStartSelfHeal(projectDir, startedTemplate))) - return resolvedTemplate + return startedTemplate }) diff --git a/packages/lib/tests/usecases/errors.test.ts b/packages/lib/tests/usecases/errors.test.ts index 354943c7..81ac3dd9 100644 --- a/packages/lib/tests/usecases/errors.test.ts +++ b/packages/lib/tests/usecases/errors.test.ts @@ -1,4 +1,5 @@ import { describe, expect, it } from "@effect/vitest" +import fc from "fast-check" import { DockerAccessError, @@ -27,6 +28,65 @@ describe("renderError", () => { expect(message).toContain("auth.docker.io") }) + it("includes NVIDIA runtime recovery hint for DockerCommandError", () => { + const message = renderError( + new DockerCommandError({ + exitCode: 1, + details: + "nvidia-container-cli: initialization error: load library failed: libnvidia-ml.so.1: cannot open shared object file" + }) + ) + + expect(message).toContain("NVIDIA GPU access is enabled") + expect(message).toContain("--gpu none") + expect(message).toContain("NVIDIA Container Toolkit") + }) + + it("shows NVIDIA hint iff docker output contains NVIDIA runtime markers", () => { + const nvidiaContainerCliMarker = "nvidia-container-cli" + const libNvidiaMlMarker = "libnvidia-ml.so.1" + const missingDeviceDriverMarker = "could not select device driver" + const markers: ReadonlyArray = [ + nvidiaContainerCliMarker, + libNvidiaMlMarker, + missingDeviceDriverMarker + ] + const includesMarker = (details: string): boolean => markers.some((marker) => details.includes(marker)) + + fc.assert( + fc.property( + fc.string(), + fc.constantFrom(nvidiaContainerCliMarker, libNvidiaMlMarker, missingDeviceDriverMarker), + fc.string(), + (left, marker, right) => { + const message = renderError( + new DockerCommandError({ + exitCode: 1, + details: `${left}${marker}${right}` + }) + ) + + expect(message).toContain("--gpu none") + } + ), + { numRuns: 50 } + ) + + fc.assert( + fc.property(fc.string().filter((details) => !includesMarker(details)), (details) => { + const message = renderError( + new DockerCommandError({ + exitCode: 1, + details + }) + ) + + expect(message).not.toContain("--gpu none") + }), + { numRuns: 50 } + ) + }) + it("renders actionable recovery for DockerAccessError", () => { const message = renderError( new DockerAccessError({ diff --git a/packages/lib/tests/usecases/projects-up.test.ts b/packages/lib/tests/usecases/projects-up.test.ts index 9b3d5a41..22d31ec1 100644 --- a/packages/lib/tests/usecases/projects-up.test.ts +++ b/packages/lib/tests/usecases/projects-up.test.ts @@ -4,18 +4,22 @@ import * as FileSystem from "@effect/platform/FileSystem" import * as Path from "@effect/platform/Path" import { NodeContext } from "@effect/platform-node" import { describe, expect, it } from "@effect/vitest" -import { Effect } from "effect" +import { Effect, Logger } from "effect" import * as Inspectable from "effect/Inspectable" +import * as Option from "effect/Option" import * as Sink from "effect/Sink" import * as Stream from "effect/Stream" +import fc from "fast-check" import type { TemplateConfig } from "../../src/core/domain.js" +import { gpuModeAfterDockerFailure } from "../../src/core/gpu.js" import { prepareProjectFiles } from "../../src/usecases/actions/prepare-files.js" import { runDockerComposeUpWithPortCheck } from "../../src/usecases/projects-up.js" type RecordedCommand = { readonly command: string readonly args: ReadonlyArray + readonly cwd?: string | undefined } const encode = (value: string): Uint8Array => new TextEncoder().encode(value) @@ -88,26 +92,83 @@ const decideStdout = (cmd: RecordedCommand): string => { return "" } -const makeFakeExecutor = (recorded: Array): CommandExecutor.CommandExecutor => { +const nvidiaContainerCliMarker = "nvidia-container-cli" +const libNvidiaMlMarker = "libnvidia-ml.so.1" +const missingDeviceDriverMarker = "could not select device driver" + +const nvidiaRuntimeFailure = `Error response from daemon: failed to create task for container: ${nvidiaContainerCliMarker}: initialization error: load library failed: ${libNvidiaMlMarker}` + +const nvidiaMissingDeviceDriverFailure = + `Error response from daemon: ${missingDeviceDriverMarker} "" with capabilities: [[gpu]]` + +const arbitraryComposeFailure = + "Error response from daemon: network sandbox setup failed" + +const gpuAllComposeYamlPattern = /(^|\s)gpus:\s*["']?all["']?(\s|$)/m + +const nvidiaFailureMarkers: ReadonlyArray = [ + nvidiaContainerCliMarker, + libNvidiaMlMarker, + missingDeviceDriverMarker +] + +const containsNvidiaFailureMarker = (details: string): boolean => { + const normalized = details.toLowerCase() + return nvidiaFailureMarkers.some((marker) => normalized.includes(marker)) +} + +const hasNvidiaFallbackWarning = (logs: ReadonlyArray, expectedDetail: string): boolean => + logs.some((entry) => + entry.includes("NVIDIA runtime failed") && + entry.includes(expectedDetail) && + entry.includes("GPU access disabled") + ) + +const isDockerComposeUpAttempt = (cmd: RecordedCommand): boolean => + isDockerComposeUpWithBuild(cmd) || isDockerComposeUpReuse(cmd) + +type FakeExecutorOptions = { + readonly failGpuComposeUp?: boolean + readonly gpuFailureStderr?: string +} + +const makeFakeExecutor = ( + recorded: Array, + options: FakeExecutorOptions = {} +): CommandExecutor.CommandExecutor => { + let shouldFailGpuComposeUp = options.failGpuComposeUp === true + const gpuFailureStderr = options.gpuFailureStderr ?? nvidiaRuntimeFailure + const start = (command: Command.Command): Effect.Effect => Effect.gen(function*(_) { const flattened = Command.flatten(command) for (const entry of flattened) { - recorded.push({ command: entry.command, args: entry.args }) + recorded.push({ + command: entry.command, + args: entry.args, + cwd: Option.getOrUndefined(entry.cwd) + }) } const last = flattened[flattened.length - 1]! - const invocation: RecordedCommand = { command: last.command, args: last.args } + const invocation: RecordedCommand = { + command: last.command, + args: last.args, + cwd: Option.getOrUndefined(last.cwd) + } const stdoutText = decideStdout(invocation) const stdout = stdoutText.length === 0 ? Stream.empty : Stream.succeed(encode(stdoutText)) + const failed = shouldFailGpuComposeUp && isDockerComposeUpAttempt(invocation) + shouldFailGpuComposeUp = shouldFailGpuComposeUp && !failed + const stderr = failed ? Stream.succeed(encode(gpuFailureStderr)) : Stream.empty const process: CommandExecutor.Process = { [CommandExecutor.ProcessTypeId]: CommandExecutor.ProcessTypeId, pid: CommandExecutor.ProcessId(1), - exitCode: Effect.succeed(CommandExecutor.ExitCode(0)), + exitCode: Effect.succeed(CommandExecutor.ExitCode(failed ? 1 : 0)), isRunning: Effect.succeed(false), kill: (_signal) => Effect.void, - stderr: Stream.empty, + stderr, stdin: Sink.drain, stdout, toJSON: () => ({ _tag: "ProjectsUpTestProcess", command: invocation.command, args: invocation.args }), @@ -223,6 +284,176 @@ describe("runDockerComposeUpWithPortCheck", () => { }) ).pipe(Effect.provide(NodeContext.layer))) + it.effect("falls back to GPU none when the host NVIDIA runtime is unavailable", () => + withTempDir((root) => + Effect.gen(function*(_) { + const fs = yield* _(FileSystem.FileSystem) + const path = yield* _(Path.Path) + const outDir = path.join(root, "project") + const targetDir = "/home/dev/workspaces/org/repo" + const globalConfig = makeTemplateConfig(root, outDir, path, targetDir) + const projectConfig: TemplateConfig = { + ...makeTemplateConfig(root, outDir, path, targetDir), + gpu: "all" + } + const recorded: Array = [] + const logs: Array = [] + const executor = makeFakeExecutor(recorded, { failGpuComposeUp: true }) + const logger = Logger.make(({ message }) => { + logs.push(String(message)) + }) + + yield* _( + prepareProjectFiles(outDir, root, globalConfig, projectConfig, { + force: false, + forceEnv: false + }) + ) + + const started = yield* _( + runDockerComposeUpWithPortCheck(outDir).pipe( + Effect.provideService(CommandExecutor.CommandExecutor, executor), + Effect.provide(Logger.replace(Logger.defaultLogger, logger)) + ) + ) + + expect(started.gpu).toBe("none") + + const composeAfter = yield* _(fs.readFileString(path.join(outDir, "docker-compose.yml"))) + expect(composeAfter).not.toMatch(gpuAllComposeYamlPattern) + + const configAfter = yield* _(fs.readFileString(path.join(outDir, "docker-git.json"))) + expect(configAfter).toContain('"gpu": "none"') + expect(recorded.filter((entry) => isDockerComposeUpWithBuild(entry)).length).toBe(2) + expect(hasNvidiaFallbackWarning(logs, "libnvidia-ml.so.1")).toBe(true) + }) + ).pipe(Effect.provide(NodeContext.layer))) + + it.effect("falls back to GPU none on missing-device-driver NVIDIA runtime failure", () => + withTempDir((root) => + Effect.gen(function*(_) { + const fs = yield* _(FileSystem.FileSystem) + const path = yield* _(Path.Path) + const outDir = path.join(root, "project") + const targetDir = "/home/dev/workspaces/org/repo" + const globalConfig = makeTemplateConfig(root, outDir, path, targetDir) + const projectConfig: TemplateConfig = { + ...makeTemplateConfig(root, outDir, path, targetDir), + gpu: "all" + } + const recorded: Array = [] + const logs: Array = [] + const executor = makeFakeExecutor(recorded, { + failGpuComposeUp: true, + gpuFailureStderr: nvidiaMissingDeviceDriverFailure + }) + const logger = Logger.make(({ message }) => { + logs.push(String(message)) + }) + + yield* _( + prepareProjectFiles(outDir, root, globalConfig, projectConfig, { + force: false, + forceEnv: false + }) + ) + + const started = yield* _( + runDockerComposeUpWithPortCheck(outDir).pipe( + Effect.provideService(CommandExecutor.CommandExecutor, executor), + Effect.provide(Logger.replace(Logger.defaultLogger, logger)) + ) + ) + + expect(started.gpu).toBe("none") + + const composeAfter = yield* _(fs.readFileString(path.join(outDir, "docker-compose.yml"))) + expect(composeAfter).not.toMatch(gpuAllComposeYamlPattern) + + const configAfter = yield* _(fs.readFileString(path.join(outDir, "docker-git.json"))) + expect(configAfter).toContain('"gpu": "none"') + expect(recorded.filter((entry) => isDockerComposeUpWithBuild(entry)).length).toBe(2) + expect(hasNvidiaFallbackWarning(logs, "could not select device driver")).toBe(true) + }) + ).pipe(Effect.provide(NodeContext.layer))) + + it("keeps GPU access unchanged for arbitrary docker compose up failures", () => { + expect(gpuModeAfterDockerFailure("all", arbitraryComposeFailure)).toBe("all") + expect(gpuModeAfterDockerFailure("none", arbitraryComposeFailure)).toBe("none") + }) + + it("satisfies the GPU fallback classifier invariant", () => { + const dockerFailureDetails = fc.oneof( + fc.string(), + fc + .tuple( + fc.string(), + fc.constantFrom(nvidiaContainerCliMarker, libNvidiaMlMarker, missingDeviceDriverMarker), + fc.string() + ) + .map(([left, marker, right]) => `${left}${marker}${right}`) + ) + + fc.assert( + fc.property(dockerFailureDetails, (details) => { + const expectedGpu = containsNvidiaFailureMarker(details) ? "none" : "all" + + expect(gpuModeAfterDockerFailure("all", details)).toBe(expectedGpu) + expect(gpuModeAfterDockerFailure("none", details)).toBe("none") + }), + { numRuns: 50 } + ) + }) + + it.effect("falls back to GPU none before retrying reuse mode when the host NVIDIA runtime is unavailable", () => + withTempDir((root) => + Effect.gen(function*(_) { + const fs = yield* _(FileSystem.FileSystem) + const path = yield* _(Path.Path) + const outDir = path.join(root, "project") + const targetDir = "/home/dev/workspaces/org/repo" + const globalConfig = makeTemplateConfig(root, outDir, path, targetDir) + const projectConfig: TemplateConfig = { + ...makeTemplateConfig(root, outDir, path, targetDir), + gpu: "all" + } + const recorded: Array = [] + const logs: Array = [] + const executor = makeFakeExecutor(recorded, { failGpuComposeUp: true }) + const logger = Logger.make(({ message }) => { + logs.push(String(message)) + }) + + yield* _( + prepareProjectFiles(outDir, root, globalConfig, projectConfig, { + force: false, + forceEnv: false + }) + ) + + const started = yield* _( + runDockerComposeUpWithPortCheck(outDir, { + buildMode: "reuse", + waitForPostStart: false + }).pipe( + Effect.provideService(CommandExecutor.CommandExecutor, executor), + Effect.provide(Logger.replace(Logger.defaultLogger, logger)) + ) + ) + + expect(started.gpu).toBe("none") + + const composeAfter = yield* _(fs.readFileString(path.join(outDir, "docker-compose.yml"))) + expect(composeAfter).not.toMatch(gpuAllComposeYamlPattern) + + const configAfter = yield* _(fs.readFileString(path.join(outDir, "docker-git.json"))) + expect(configAfter).toContain('"gpu": "none"') + expect(recorded.filter((entry) => isDockerComposeUpReuse(entry)).length).toBe(2) + expect(recorded.filter((entry) => isDockerComposeUpWithBuild(entry)).length).toBe(0) + expect(hasNvidiaFallbackWarning(logs, "libnvidia-ml.so.1")).toBe(true) + }) + ).pipe(Effect.provide(NodeContext.layer))) + it.effect("can reuse the existing image path for SSH-open cold start", () => withTempDir((root) => Effect.gen(function*(_) {