From 73f5f07549bade6cb881ca0f85c22a35c6329f24 Mon Sep 17 00:00:00 2001 From: Brooke Storm Date: Mon, 15 Jun 2026 16:41:12 -0700 Subject: [PATCH 01/13] ci: add docker build in the pipeline to use with a basic k8s install test Signed-off-by: Brooke Storm --- .github/workflows/docker-cpu-smoketest.yaml | 129 ++++++++++++++++++++ 1 file changed, 129 insertions(+) create mode 100644 .github/workflows/docker-cpu-smoketest.yaml diff --git a/.github/workflows/docker-cpu-smoketest.yaml b/.github/workflows/docker-cpu-smoketest.yaml new file mode 100644 index 0000000000..5967a54540 --- /dev/null +++ b/.github/workflows/docker-cpu-smoketest.yaml @@ -0,0 +1,129 @@ +name: Docker CPU smoke-test images + +on: + push: + branches: [main] + paths: + - ".github/workflows/docker-cpu-smoketest.yaml" + - "docker-bake.hcl" + - "docker/**" + - "Makefile" + - "packages/**" + - "plugins/**" + - "sdk/**" + - "services/**" + - "src/**" + - "uv.lock" + - "pyproject.toml" + pull_request: + branches: [main] + paths: + - ".github/workflows/docker-cpu-smoketest.yaml" + - "docker-bake.hcl" + - "docker/**" + - "Makefile" + - "packages/**" + - "plugins/**" + - "sdk/**" + - "services/**" + - "src/**" + - "uv.lock" + - "pyproject.toml" + workflow_dispatch: + inputs: + image_tag: + description: Optional image tag. Defaults to the commit SHA. + required: false + type: string + default: "" + publish: + description: Publish images to GHCR. + required: false + type: boolean + default: true + +permissions: + contents: read + packages: write + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref || github.run_id }} + cancel-in-progress: ${{ github.event_name == 'pull_request' }} + +jobs: + build-cpu-images: + name: Build CPU images + runs-on: ubuntu-latest + timeout-minutes: 90 + steps: + - name: Checkout code + uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 + + - name: Free disk space + uses: ./.github/actions/free-disk-space + with: + disable_swap: "true" + remove_haskell: "true" + remove_java: "true" + remove_ruby: "true" + remove_swift: "true" + prune_docker: "true" + + - name: Set up Docker Buildx + shell: bash + run: | + set -euo pipefail + docker buildx create --name nmp-builder --driver docker-container --use + docker buildx inspect --bootstrap + + - name: Configure bake variables + shell: bash + env: + INPUT_IMAGE_TAG: ${{ github.event_name == 'workflow_dispatch' && inputs.image_tag || '' }} + INPUT_PUBLISH: ${{ inputs.publish }} + run: | + set -euo pipefail + + image_registry="ghcr.io/${GITHUB_REPOSITORY,,}" + bake_tag="${INPUT_IMAGE_TAG:-$GITHUB_SHA}" + publish_images="false" + + if [ "$GITHUB_EVENT_NAME" = "push" ] && [ "$GITHUB_REF" = "refs/heads/main" ]; then + publish_images="true" + fi + + if [ "$GITHUB_EVENT_NAME" = "workflow_dispatch" ] && [ "$INPUT_PUBLISH" = "true" ]; then + publish_images="true" + fi + + { + printf 'IMAGE_REGISTRY=%s\n' "$image_registry" + printf 'BASE_REGISTRY=%s\n' "$image_registry" + printf 'CACHE_REGISTRY=%s\n' "$image_registry" + printf 'BAKE_TAG=%s\n' "$bake_tag" + printf 'CI_COMMIT_SHA=%s\n' "$GITHUB_SHA" + printf 'PUBLISH_IMAGES=%s\n' "$publish_images" + } >> "$GITHUB_ENV" + + - name: Log in to GHCR + if: env.PUBLISH_IMAGES == 'true' + shell: bash + env: + GHCR_TOKEN: ${{ github.token }} + run: | + set -euo pipefail + echo "$GHCR_TOKEN" | docker login ghcr.io -u "$GITHUB_ACTOR" --password-stdin + + - name: Print Docker bake graph + shell: bash + run: make docker-print TARGET=docker-cpu + + - name: Build CPU images + if: env.PUBLISH_IMAGES != 'true' + shell: bash + run: make docker-load TARGET=docker-cpu + + - name: Build and publish CPU images + if: env.PUBLISH_IMAGES == 'true' + shell: bash + run: make docker-push TARGET=docker-cpu From b082f8bac1335a03ab254aac5386cc1c86c7c2ed Mon Sep 17 00:00:00 2001 From: Brooke Storm Date: Mon, 15 Jun 2026 17:07:11 -0700 Subject: [PATCH 02/13] ci: add kind install smoke test Signed-off-by: Brooke Storm --- .github/workflows/docker-cpu-smoketest.yaml | 180 +++++++++++++++++++- 1 file changed, 178 insertions(+), 2 deletions(-) diff --git a/.github/workflows/docker-cpu-smoketest.yaml b/.github/workflows/docker-cpu-smoketest.yaml index 5967a54540..2107e3f065 100644 --- a/.github/workflows/docker-cpu-smoketest.yaml +++ b/.github/workflows/docker-cpu-smoketest.yaml @@ -7,6 +7,8 @@ on: - ".github/workflows/docker-cpu-smoketest.yaml" - "docker-bake.hcl" - "docker/**" + - "e2e/k8s/**" + - "k8s/helm/**" - "Makefile" - "packages/**" - "plugins/**" @@ -21,6 +23,8 @@ on: - ".github/workflows/docker-cpu-smoketest.yaml" - "docker-bake.hcl" - "docker/**" + - "e2e/k8s/**" + - "k8s/helm/**" - "Makefile" - "packages/**" - "plugins/**" @@ -55,6 +59,10 @@ jobs: name: Build CPU images runs-on: ubuntu-latest timeout-minutes: 90 + outputs: + image_registry: ${{ steps.bake-vars.outputs.image_registry }} + image_tag: ${{ steps.bake-vars.outputs.image_tag }} + publish_images: ${{ steps.bake-vars.outputs.publish_images }} steps: - name: Checkout code uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 @@ -77,21 +85,29 @@ jobs: docker buildx inspect --bootstrap - name: Configure bake variables + id: bake-vars shell: bash env: + HEAD_REPOSITORY: ${{ github.event.pull_request.head.repo.full_name || github.repository }} INPUT_IMAGE_TAG: ${{ github.event_name == 'workflow_dispatch' && inputs.image_tag || '' }} INPUT_PUBLISH: ${{ inputs.publish }} + SOURCE_SHA: ${{ github.event.pull_request.head.sha || github.sha }} run: | set -euo pipefail image_registry="ghcr.io/${GITHUB_REPOSITORY,,}" - bake_tag="${INPUT_IMAGE_TAG:-$GITHUB_SHA}" + source_sha="${SOURCE_SHA:-$GITHUB_SHA}" + bake_tag="${INPUT_IMAGE_TAG:-$source_sha}" publish_images="false" if [ "$GITHUB_EVENT_NAME" = "push" ] && [ "$GITHUB_REF" = "refs/heads/main" ]; then publish_images="true" fi + if [ "$GITHUB_EVENT_NAME" = "pull_request" ] && [ "$HEAD_REPOSITORY" = "$GITHUB_REPOSITORY" ]; then + publish_images="true" + fi + if [ "$GITHUB_EVENT_NAME" = "workflow_dispatch" ] && [ "$INPUT_PUBLISH" = "true" ]; then publish_images="true" fi @@ -101,9 +117,14 @@ jobs: printf 'BASE_REGISTRY=%s\n' "$image_registry" printf 'CACHE_REGISTRY=%s\n' "$image_registry" printf 'BAKE_TAG=%s\n' "$bake_tag" - printf 'CI_COMMIT_SHA=%s\n' "$GITHUB_SHA" + printf 'CI_COMMIT_SHA=%s\n' "$source_sha" printf 'PUBLISH_IMAGES=%s\n' "$publish_images" } >> "$GITHUB_ENV" + { + printf 'image_registry=%s\n' "$image_registry" + printf 'image_tag=%s\n' "$bake_tag" + printf 'publish_images=%s\n' "$publish_images" + } >> "$GITHUB_OUTPUT" - name: Log in to GHCR if: env.PUBLISH_IMAGES == 'true' @@ -127,3 +148,158 @@ jobs: if: env.PUBLISH_IMAGES == 'true' shell: bash run: make docker-push TARGET=docker-cpu + + kind-smoke: + name: Set up kind CPU environment + needs: [build-cpu-images] + if: needs.build-cpu-images.outputs.publish_images == 'true' + runs-on: ubuntu-latest + timeout-minutes: 45 + env: + BUSYBOX_IMAGE: docker.io/library/busybox + HELM_CHART: k8s/helm + K8S_E2E_SCRIPTS: e2e/k8s/scripts + K8S_E2E_VALUES: e2e/k8s/values + KIND_CLUSTER_NAME: gha-${{ github.run_id }}-${{ github.run_attempt }}-kind-smoke + KUBE_GATEWAY_NAME: nmp-e2e-gateway + KUBE_NAMESPACE: nemo-platform + NAMESPACE: nemo-platform + NMP_E2E_INTERNAL_HOST: nemo-platform-api:8080 + NMP_E2E_REGISTRY: ${{ needs.build-cpu-images.outputs.image_registry }} + NMP_E2E_TAG: ${{ needs.build-cpu-images.outputs.image_tag }} + POSTGRES_IMAGE: docker.io/library/postgres + steps: + - name: Checkout code + uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 + + - name: Free disk space + uses: ./.github/actions/free-disk-space + with: + disable_swap: "true" + remove_haskell: "true" + remove_java: "true" + remove_ruby: "true" + remove_swift: "true" + prune_docker: "true" + + - name: Install kind + shell: bash + env: + KIND_VERSION: v0.32.0 + run: | + set -euo pipefail + + case "$(uname -m)" in + x86_64) arch=amd64 ;; + aarch64|arm64) arch=arm64 ;; + *) + echo "Unsupported architecture: $(uname -m)" >&2 + exit 1 + ;; + esac + + kind_url="https://github.com/kubernetes-sigs/kind/releases/download/${KIND_VERSION}/kind-linux-${arch}" + curl -fsSLo "${RUNNER_TEMP}/kind" "${kind_url}" + curl -fsSLo "${RUNNER_TEMP}/kind.sha256sum" "${kind_url}.sha256sum" + sed "s# kind-linux-${arch}# ${RUNNER_TEMP}/kind#" "${RUNNER_TEMP}/kind.sha256sum" | sha256sum -c - + sudo install -m 0755 "${RUNNER_TEMP}/kind" /usr/local/bin/kind + + - name: Install kubectl + shell: bash + env: + KUBECTL_VERSION: v1.33.7 + run: | + set -euo pipefail + + case "$(uname -m)" in + x86_64) arch=amd64 ;; + aarch64|arm64) arch=arm64 ;; + *) + echo "Unsupported architecture: $(uname -m)" >&2 + exit 1 + ;; + esac + + kubectl_url="https://dl.k8s.io/release/${KUBECTL_VERSION}/bin/linux/${arch}/kubectl" + curl -fsSLo "${RUNNER_TEMP}/kubectl" "${kubectl_url}" + curl -fsSLo "${RUNNER_TEMP}/kubectl.sha256" "${kubectl_url}.sha256" + echo "$(cat "${RUNNER_TEMP}/kubectl.sha256") ${RUNNER_TEMP}/kubectl" | sha256sum -c - + sudo install -m 0755 "${RUNNER_TEMP}/kubectl" /usr/local/bin/kubectl + + - name: Install Helm + uses: azure/setup-helm@dda3372f752e03dde6b3237bc9431cdc2f7a02a2 # v5.0.0 + + - name: Start kind cluster + shell: bash + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + NGC_API_KEY: not-used-for-ghcr-cpu-smoke + run: bash "${K8S_E2E_SCRIPTS}/setup_local_kind_cpu.sh" + + - name: Set default kubectl namespace + shell: bash + run: kubectl config set-context --current --namespace="${NAMESPACE}" + + - name: Pre-pull GHCR images into kind + shell: bash + env: + KIND_IMAGE_PULL_TOKEN: ${{ github.token }} + KIND_IMAGE_PULL_USER: ${{ github.actor }} + run: | + "${K8S_E2E_SCRIPTS}/prepull_kind_images.sh" \ + "${NMP_E2E_REGISTRY}/nmp-api:${NMP_E2E_TAG}" \ + "${NMP_E2E_REGISTRY}/nmp-core:${NMP_E2E_TAG}" \ + "${NMP_E2E_REGISTRY}/nmp-cpu-tasks:${NMP_E2E_TAG}" + + - name: Build Helm dependencies + shell: bash + run: | + helm repo add nvidia https://helm.ngc.nvidia.com/nvidia + helm repo update + helm dependency build "${HELM_CHART}" + + - name: Install NeMo Platform + shell: bash + run: HELM_VALUES="${K8S_E2E_VALUES}/kind.yaml" "${K8S_E2E_SCRIPTS}/install_nmp_e2e.sh" + + - name: Wait for API + shell: bash + run: | + test -n "${NMP_E2E_CLUSTER_URL}" + "${K8S_E2E_SCRIPTS}/wait_for_api.sh" "${NMP_E2E_CLUSTER_URL}/cluster-info" 120 + + - name: Collect Kubernetes logs + if: always() + shell: bash + run: | + "${K8S_E2E_SCRIPTS}/collect_k8s_logs.sh" + + - name: Disk usage summary + if: always() + shell: bash + run: | + echo "=== Host disk ===" + df -h / + echo "=== Docker system ===" + docker system df + echo "=== kind node storage ===" + for node in $(kind get nodes --name "${KIND_CLUSTER_NAME}" 2>/dev/null); do + echo "--- ${node} ---" + docker exec "${node}" sh -c "du -sh /var/lib/containerd /var/lib/kubelet /var/log 2>/dev/null | sort -h" || true + done + + - name: Upload Kubernetes artifacts + if: always() + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + with: + name: kind-smoke-kubernetes-artifacts + retention-days: 7 + if-no-files-found: ignore + path: k8s-logs/ + + - name: Delete kind cluster + if: always() + shell: bash + run: | + docker rm -f "cloud-provider-kind-${KIND_CLUSTER_NAME}" || true + kind delete cluster --name "${KIND_CLUSTER_NAME}" || true From 740ab60c086a407b07d4fe2f5e671be0f748122b Mon Sep 17 00:00:00 2001 From: Brooke Storm Date: Mon, 15 Jun 2026 17:36:10 -0700 Subject: [PATCH 03/13] ci: sort perms better and troubleshoot Signed-off-by: Brooke Storm --- .github/workflows/docker-cpu-smoketest.yaml | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/.github/workflows/docker-cpu-smoketest.yaml b/.github/workflows/docker-cpu-smoketest.yaml index 2107e3f065..6e73748b31 100644 --- a/.github/workflows/docker-cpu-smoketest.yaml +++ b/.github/workflows/docker-cpu-smoketest.yaml @@ -48,7 +48,7 @@ on: permissions: contents: read - packages: write + packages: read concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref || github.run_id }} @@ -59,6 +59,9 @@ jobs: name: Build CPU images runs-on: ubuntu-latest timeout-minutes: 90 + permissions: + contents: read + packages: write outputs: image_registry: ${{ steps.bake-vars.outputs.image_registry }} image_tag: ${{ steps.bake-vars.outputs.image_tag }} @@ -155,6 +158,9 @@ jobs: if: needs.build-cpu-images.outputs.publish_images == 'true' runs-on: ubuntu-latest timeout-minutes: 45 + permissions: + contents: read + packages: read env: BUSYBOX_IMAGE: docker.io/library/busybox HELM_CHART: k8s/helm @@ -233,13 +239,21 @@ jobs: shell: bash env: HF_TOKEN: ${{ secrets.HF_TOKEN }} - NGC_API_KEY: not-used-for-ghcr-cpu-smoke run: bash "${K8S_E2E_SCRIPTS}/setup_local_kind_cpu.sh" - name: Set default kubectl namespace shell: bash run: kubectl config set-context --current --namespace="${NAMESPACE}" + - name: Verify Gateway API setup + shell: bash + run: | + set -euo pipefail + kubectl wait --for=condition=Established crd/gateways.gateway.networking.k8s.io --timeout=2m + kubectl wait --for=condition=Established crd/httproutes.gateway.networking.k8s.io --timeout=2m + kubectl get gatewayclass cloud-provider-kind + kubectl -n "${NAMESPACE}" get gateway "${KUBE_GATEWAY_NAME}" + - name: Pre-pull GHCR images into kind shell: bash env: From 986859a858a299d63a0022a32f9b5356517273aa Mon Sep 17 00:00:00 2001 From: Brooke Storm Date: Mon, 15 Jun 2026 17:44:22 -0700 Subject: [PATCH 04/13] ci: digging a little more Signed-off-by: Brooke Storm --- .github/workflows/docker-cpu-smoketest.yaml | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/.github/workflows/docker-cpu-smoketest.yaml b/.github/workflows/docker-cpu-smoketest.yaml index 6e73748b31..4f82219873 100644 --- a/.github/workflows/docker-cpu-smoketest.yaml +++ b/.github/workflows/docker-cpu-smoketest.yaml @@ -170,6 +170,8 @@ jobs: KUBE_GATEWAY_NAME: nmp-e2e-gateway KUBE_NAMESPACE: nemo-platform NAMESPACE: nemo-platform + CORE_STORAGE_BINDER_ENABLED: "false" + NMP_E2E_CLUSTER_URL: "" NMP_E2E_INTERNAL_HOST: nemo-platform-api:8080 NMP_E2E_REGISTRY: ${{ needs.build-cpu-images.outputs.image_registry }} NMP_E2E_TAG: ${{ needs.build-cpu-images.outputs.image_tag }} @@ -239,6 +241,7 @@ jobs: shell: bash env: HF_TOKEN: ${{ secrets.HF_TOKEN }} + NGC_API_KEY: not-used-for-ghcr-cpu-smoke run: bash "${K8S_E2E_SCRIPTS}/setup_local_kind_cpu.sh" - name: Set default kubectl namespace @@ -274,7 +277,16 @@ jobs: - name: Install NeMo Platform shell: bash - run: HELM_VALUES="${K8S_E2E_VALUES}/kind.yaml" "${K8S_E2E_SCRIPTS}/install_nmp_e2e.sh" + run: | + if ! HELM_VALUES="${K8S_E2E_VALUES}/kind.yaml" "${K8S_E2E_SCRIPTS}/install_nmp_e2e.sh"; then + echo "--- helm list -A ---" + helm list -A || true + echo "--- helm status ${NAMESPACE}/nemo-platform ---" + helm status -n "${NAMESPACE}" nemo-platform || true + echo "--- kubectl get all -n ${NAMESPACE} ---" + kubectl get all -n "${NAMESPACE}" || true + exit 1 + fi - name: Wait for API shell: bash From 68b64220c9ae18a55c9aa1b4ca751882975fc500 Mon Sep 17 00:00:00 2001 From: Brooke Storm Date: Mon, 15 Jun 2026 18:12:20 -0700 Subject: [PATCH 05/13] ci: what is going on Signed-off-by: Brooke Storm --- .github/workflows/docker-cpu-smoketest.yaml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/.github/workflows/docker-cpu-smoketest.yaml b/.github/workflows/docker-cpu-smoketest.yaml index 4f82219873..08454aabe3 100644 --- a/.github/workflows/docker-cpu-smoketest.yaml +++ b/.github/workflows/docker-cpu-smoketest.yaml @@ -278,6 +278,22 @@ jobs: - name: Install NeMo Platform shell: bash run: | + watch_pods() { + while true; do + echo "--- kubectl get pods -n ${NAMESPACE} $(date -u +%Y-%m-%dT%H:%M:%SZ) ---" + kubectl get pods -n "${NAMESPACE}" -o wide || true + sleep 5 + done + } + + watch_pods & + watch_pods_pid="$!" + cleanup_pod_watch() { + kill "${watch_pods_pid}" 2>/dev/null || true + wait "${watch_pods_pid}" 2>/dev/null || true + } + trap cleanup_pod_watch EXIT + if ! HELM_VALUES="${K8S_E2E_VALUES}/kind.yaml" "${K8S_E2E_SCRIPTS}/install_nmp_e2e.sh"; then echo "--- helm list -A ---" helm list -A || true From c9dcb98348ee66500fb966bf9ec8a3abc4290848 Mon Sep 17 00:00:00 2001 From: Brooke Storm Date: Mon, 15 Jun 2026 18:42:34 -0700 Subject: [PATCH 06/13] chore: troubleshooting what is happening in ci Signed-off-by: Brooke Storm --- .github/workflows/docker-cpu-smoketest.yaml | 63 +++++++++++++++------ e2e/k8s/scripts/install_nmp_e2e.sh | 54 +++++++++++++++++- 2 files changed, 97 insertions(+), 20 deletions(-) diff --git a/.github/workflows/docker-cpu-smoketest.yaml b/.github/workflows/docker-cpu-smoketest.yaml index 08454aabe3..3aaccde3aa 100644 --- a/.github/workflows/docker-cpu-smoketest.yaml +++ b/.github/workflows/docker-cpu-smoketest.yaml @@ -170,7 +170,6 @@ jobs: KUBE_GATEWAY_NAME: nmp-e2e-gateway KUBE_NAMESPACE: nemo-platform NAMESPACE: nemo-platform - CORE_STORAGE_BINDER_ENABLED: "false" NMP_E2E_CLUSTER_URL: "" NMP_E2E_INTERNAL_HOST: nemo-platform-api:8080 NMP_E2E_REGISTRY: ${{ needs.build-cpu-images.outputs.image_registry }} @@ -275,25 +274,55 @@ jobs: helm repo update helm dependency build "${HELM_CHART}" - - name: Install NeMo Platform + - name: Verify Helm install inputs shell: bash run: | - watch_pods() { - while true; do - echo "--- kubectl get pods -n ${NAMESPACE} $(date -u +%Y-%m-%dT%H:%M:%SZ) ---" - kubectl get pods -n "${NAMESPACE}" -o wide || true - sleep 5 - done - } - - watch_pods & - watch_pods_pid="$!" - cleanup_pod_watch() { - kill "${watch_pods_pid}" 2>/dev/null || true - wait "${watch_pods_pid}" 2>/dev/null || true - } - trap cleanup_pod_watch EXIT + set -euo pipefail + + required_vars=( + BUSYBOX_IMAGE + HELM_CHART + K8S_E2E_SCRIPTS + K8S_E2E_VALUES + KUBE_GATEWAY_NAME + NAMESPACE + NMP_E2E_REGISTRY + NMP_E2E_TAG + POSTGRES_IMAGE + ) + + for var_name in "${required_vars[@]}"; do + if [ -z "${!var_name:-}" ]; then + echo "${var_name} must be set before installing the Helm chart" >&2 + exit 1 + fi + done + test -d "${HELM_CHART}" + test -f "${K8S_E2E_VALUES}/kind.yaml" + test -x "${K8S_E2E_SCRIPTS}/install_nmp_e2e.sh" + test -x "${K8S_E2E_SCRIPTS}/wait_for_release_ready.sh" + + echo "Helm install context:" + printf ' kubectl context: ' + kubectl config current-context + printf ' namespace: %s\n' "${NAMESPACE}" + printf ' chart: %s\n' "${HELM_CHART}" + printf ' values: %s\n' "${K8S_E2E_VALUES}/kind.yaml" + printf ' image registry: %s\n' "${NMP_E2E_REGISTRY}" + printf ' image tag: %s\n' "${NMP_E2E_TAG}" + printf ' api image: %s/nmp-api:%s\n' "${NMP_E2E_REGISTRY}" "${NMP_E2E_TAG}" + printf ' core image: %s/nmp-api:%s\n' "${NMP_E2E_REGISTRY}" "${NMP_E2E_TAG}" + printf ' cpu tasks image: %s/nmp-cpu-tasks:%s\n' "${NMP_E2E_REGISTRY}" "${NMP_E2E_TAG}" + printf ' postgres image: %s\n' "${POSTGRES_IMAGE}" + printf ' busybox image: %s\n' "${BUSYBOX_IMAGE}" + helm version + kubectl get namespace "${NAMESPACE}" + kubectl -n "${NAMESPACE}" get gateway "${KUBE_GATEWAY_NAME}" + + - name: Install NeMo Platform + shell: bash + run: | if ! HELM_VALUES="${K8S_E2E_VALUES}/kind.yaml" "${K8S_E2E_SCRIPTS}/install_nmp_e2e.sh"; then echo "--- helm list -A ---" helm list -A || true diff --git a/e2e/k8s/scripts/install_nmp_e2e.sh b/e2e/k8s/scripts/install_nmp_e2e.sh index 275fd6a3a9..bd466947a6 100755 --- a/e2e/k8s/scripts/install_nmp_e2e.sh +++ b/e2e/k8s/scripts/install_nmp_e2e.sh @@ -1,13 +1,13 @@ #!/usr/bin/env bash -set -e +set -euo pipefail REPO_ROOT=$(git rev-parse --show-toplevel) NAMESPACE="${NAMESPACE:-default}" HELM_RELEASE_NAME="${HELM_RELEASE_NAME:-nemo-platform}" -NMP_E2E_REGISTRY="${NMP_E2E_REGISTRY:-${NMP_E2E_REGISTRY}}" -NMP_E2E_TAG="${NMP_E2E_TAG:-${NMP_E2E_TAG}}" +NMP_E2E_REGISTRY="${NMP_E2E_REGISTRY:-}" +NMP_E2E_TAG="${NMP_E2E_TAG:-}" HELM_EXTRA_ARGS="${HELM_EXTRA_ARGS:-}" HELM_CHART="${HELM_CHART:-${REPO_ROOT}/k8s/helm}" HELM_VALUES="${HELM_VALUES:-${HELM_VALUES_FILE:-${REPO_ROOT}/e2e/k8s/values/default.yaml}}" @@ -16,6 +16,39 @@ BUSYBOX_IMAGE="${BUSYBOX_IMAGE:-docker.io/library/busybox}" RELEASE_READY_SCRIPT="${RELEASE_READY_SCRIPT:-${REPO_ROOT}/e2e/k8s/scripts/wait_for_release_ready.sh}" EXTRA_HELM_ARGS=() +require_non_empty() { + local name="$1" + if [ -z "${!name:-}" ]; then + echo "${name} is required for ${HELM_RELEASE_NAME} Helm install" >&2 + exit 1 + fi +} + +require_non_empty NAMESPACE +require_non_empty HELM_RELEASE_NAME +require_non_empty NMP_E2E_REGISTRY +require_non_empty NMP_E2E_TAG +require_non_empty HELM_CHART +require_non_empty HELM_VALUES +require_non_empty POSTGRES_IMAGE +require_non_empty BUSYBOX_IMAGE +require_non_empty RELEASE_READY_SCRIPT + +if [ ! -d "${HELM_CHART}" ]; then + echo "HELM_CHART does not exist or is not a directory: ${HELM_CHART}" >&2 + exit 1 +fi + +if [ ! -f "${HELM_VALUES}" ]; then + echo "HELM_VALUES does not exist or is not a file: ${HELM_VALUES}" >&2 + exit 1 +fi + +if [ ! -x "${RELEASE_READY_SCRIPT}" ]; then + echo "RELEASE_READY_SCRIPT does not exist or is not executable: ${RELEASE_READY_SCRIPT}" >&2 + exit 1 +fi + if [ -n "${HELM_EXTRA_ARGS}" ]; then read -r -a EXTRA_HELM_ARGS <<< "${HELM_EXTRA_ARGS}" fi @@ -39,6 +72,21 @@ HELM_ARGS=( --wait ) +echo "Helm install inputs:" +printf ' release: %s\n' "${HELM_RELEASE_NAME}" +printf ' namespace: %s\n' "${NAMESPACE}" +printf ' chart: %s\n' "${HELM_CHART}" +printf ' values: %s\n' "${HELM_VALUES}" +printf ' api image: %s/nmp-api:%s\n' "${NMP_E2E_REGISTRY}" "${NMP_E2E_TAG}" +printf ' core image: %s/nmp-api:%s\n' "${NMP_E2E_REGISTRY}" "${NMP_E2E_TAG}" +printf ' platform image registry: %s\n' "${NMP_E2E_REGISTRY}" +printf ' platform image tag: %s\n' "${NMP_E2E_TAG}" +printf ' postgres image: %s\n' "${POSTGRES_IMAGE}" +printf ' core storage volume permissions image: %s\n' "${BUSYBOX_IMAGE}" +if [ -n "${HELM_EXTRA_ARGS}" ]; then + printf ' extra Helm args: %s\n' "${HELM_EXTRA_ARGS}" +fi + run_helm_with_release_monitor() { local helm_pid local monitor_pid From 75e5681d521b9fb903bf785ad04c7fc2ad781109 Mon Sep 17 00:00:00 2001 From: Brooke Storm Date: Mon, 15 Jun 2026 19:49:01 -0700 Subject: [PATCH 07/13] chore: simplify script a bit Signed-off-by: Brooke Storm --- e2e/k8s/scripts/install_nmp_e2e.sh | 92 ++++++------------------------ 1 file changed, 17 insertions(+), 75 deletions(-) diff --git a/e2e/k8s/scripts/install_nmp_e2e.sh b/e2e/k8s/scripts/install_nmp_e2e.sh index bd466947a6..55a529f0c0 100755 --- a/e2e/k8s/scripts/install_nmp_e2e.sh +++ b/e2e/k8s/scripts/install_nmp_e2e.sh @@ -69,7 +69,6 @@ HELM_ARGS=( --set core.storage.volumePermissionsImage="${BUSYBOX_IMAGE}" --create-namespace --timeout 15m - --wait ) echo "Helm install inputs:" @@ -87,81 +86,24 @@ if [ -n "${HELM_EXTRA_ARGS}" ]; then printf ' extra Helm args: %s\n' "${HELM_EXTRA_ARGS}" fi -run_helm_with_release_monitor() { - local helm_pid - local monitor_pid - local helm_status - local monitor_status - local helm_done=false - local monitor_done=false - local completed_pid - local completed_status - local wait_pids - - "${RELEASE_READY_SCRIPT}" & - monitor_pid="$!" - - helm upgrade -i "${HELM_ARGS[@]}" & - helm_pid="$!" - - while true; do - wait_pids=() - if [ "${helm_done}" = "false" ]; then - wait_pids+=("${helm_pid}") - fi - if [ "${monitor_done}" = "false" ]; then - wait_pids+=("${monitor_pid}") - fi - - if [ "${#wait_pids[@]}" -eq 0 ]; then - break - fi - - completed_pid="" - set +e - wait -n -p completed_pid "${wait_pids[@]}" - completed_status="$?" - set -e - - if [ "${completed_status}" -eq 127 ]; then - break - fi - - case "${completed_pid}" in - "${helm_pid}") - helm_done=true - helm_status="${completed_status}" - ;; - "${monitor_pid}") - monitor_done=true - monitor_status="${completed_status}" - ;; - esac - - if [ "${monitor_done}" = "true" ] && [ "${monitor_status}" -ne 0 ]; then - if [ "${helm_done}" = "false" ]; then - echo "Release readiness monitor failed; stopping Helm install" >&2 - kill "${helm_pid}" 2>/dev/null || true - wait "${helm_pid}" 2>/dev/null || true - fi - return "${monitor_status}" - fi - - if [ "${helm_done}" = "true" ] && [ "${helm_status}" -ne 0 ]; then - if [ "${monitor_done}" = "false" ]; then - echo "Helm install failed; stopping release readiness monitor" >&2 - kill "${monitor_pid}" 2>/dev/null || true - wait "${monitor_pid}" 2>/dev/null || true - fi - return "${helm_status}" - fi - done - - return 0 -} - # Install NMP platform -if ! run_helm_with_release_monitor; then +if ! helm upgrade -i "${HELM_ARGS[@]}"; then + echo "--- helm list -A ---" + helm list -A || true + echo "--- helm status ${NAMESPACE}/${HELM_RELEASE_NAME} ---" + helm status -n "${NAMESPACE}" "${HELM_RELEASE_NAME}" || true + echo "--- kubectl get pods -A ---" + kubectl get pods -A + echo "--- kubectl describe pods -n ${NAMESPACE} ---" + kubectl describe pods -n "${NAMESPACE}" + exit 1 +fi + +if ! "${RELEASE_READY_SCRIPT}"; then + echo "--- helm list -A ---" + helm list -A || true + echo "--- helm status ${NAMESPACE}/${HELM_RELEASE_NAME} ---" + helm status -n "${NAMESPACE}" "${HELM_RELEASE_NAME}" || true echo "--- kubectl get pods -A ---" kubectl get pods -A echo "--- kubectl describe pods -n ${NAMESPACE} ---" From 101c38d123cd336771b03b0a06635f6ca48e7488 Mon Sep 17 00:00:00 2001 From: Brooke Storm Date: Mon, 15 Jun 2026 20:39:02 -0700 Subject: [PATCH 08/13] ci: add an e2e test Signed-off-by: Brooke Storm --- .github/workflows/docker-cpu-smoketest.yaml | 75 ++++++++------------- e2e/test_jobs.py | 71 +++++++++++-------- 2 files changed, 70 insertions(+), 76 deletions(-) diff --git a/.github/workflows/docker-cpu-smoketest.yaml b/.github/workflows/docker-cpu-smoketest.yaml index 3aaccde3aa..e44ade9a63 100644 --- a/.github/workflows/docker-cpu-smoketest.yaml +++ b/.github/workflows/docker-cpu-smoketest.yaml @@ -236,6 +236,14 @@ jobs: - name: Install Helm uses: azure/setup-helm@dda3372f752e03dde6b3237bc9431cdc2f7a02a2 # v5.0.0 + - name: Install uv + uses: astral-sh/setup-uv@37802adc94f370d6bfd71619e3f0bf239e1f3b78 # v7.6.0 + with: + python-version: "3.13" + enable-cache: true + version-file: pyproject.toml + cache-dependency-glob: uv.lock + - name: Start kind cluster shell: bash env: @@ -274,52 +282,6 @@ jobs: helm repo update helm dependency build "${HELM_CHART}" - - name: Verify Helm install inputs - shell: bash - run: | - set -euo pipefail - - required_vars=( - BUSYBOX_IMAGE - HELM_CHART - K8S_E2E_SCRIPTS - K8S_E2E_VALUES - KUBE_GATEWAY_NAME - NAMESPACE - NMP_E2E_REGISTRY - NMP_E2E_TAG - POSTGRES_IMAGE - ) - - for var_name in "${required_vars[@]}"; do - if [ -z "${!var_name:-}" ]; then - echo "${var_name} must be set before installing the Helm chart" >&2 - exit 1 - fi - done - - test -d "${HELM_CHART}" - test -f "${K8S_E2E_VALUES}/kind.yaml" - test -x "${K8S_E2E_SCRIPTS}/install_nmp_e2e.sh" - test -x "${K8S_E2E_SCRIPTS}/wait_for_release_ready.sh" - - echo "Helm install context:" - printf ' kubectl context: ' - kubectl config current-context - printf ' namespace: %s\n' "${NAMESPACE}" - printf ' chart: %s\n' "${HELM_CHART}" - printf ' values: %s\n' "${K8S_E2E_VALUES}/kind.yaml" - printf ' image registry: %s\n' "${NMP_E2E_REGISTRY}" - printf ' image tag: %s\n' "${NMP_E2E_TAG}" - printf ' api image: %s/nmp-api:%s\n' "${NMP_E2E_REGISTRY}" "${NMP_E2E_TAG}" - printf ' core image: %s/nmp-api:%s\n' "${NMP_E2E_REGISTRY}" "${NMP_E2E_TAG}" - printf ' cpu tasks image: %s/nmp-cpu-tasks:%s\n' "${NMP_E2E_REGISTRY}" "${NMP_E2E_TAG}" - printf ' postgres image: %s\n' "${POSTGRES_IMAGE}" - printf ' busybox image: %s\n' "${BUSYBOX_IMAGE}" - helm version - kubectl get namespace "${NAMESPACE}" - kubectl -n "${NAMESPACE}" get gateway "${KUBE_GATEWAY_NAME}" - - name: Install NeMo Platform shell: bash run: | @@ -339,6 +301,22 @@ jobs: test -n "${NMP_E2E_CLUSTER_URL}" "${K8S_E2E_SCRIPTS}/wait_for_api.sh" "${NMP_E2E_CLUSTER_URL}/cluster-info" 120 + - name: Run CPU job e2e smoke test + shell: bash + env: + _TYPER_FORCE_DISABLE_TERMINAL: "1" + E2E_SERVICES_LOG_DIR: ${{ runner.temp }}/e2e-services-logs + NGC_API_KEY: not-used-for-ghcr-cpu-smoke + run: | + test -n "${NMP_E2E_CLUSTER_URL}" + export NMP_BASE_URL="${NMP_E2E_CLUSTER_URL}" + uv run --frozen pytest \ + e2e/test_jobs.py::test_job_using_secret_environment_variable \ + -v \ + --run-e2e \ + --no-cov \ + --junitxml=report-kubernetes-smoke.xml + - name: Collect Kubernetes logs if: always() shell: bash @@ -366,7 +344,10 @@ jobs: name: kind-smoke-kubernetes-artifacts retention-days: 7 if-no-files-found: ignore - path: k8s-logs/ + path: | + k8s-logs/ + report-kubernetes-smoke.xml + ${{ runner.temp }}/e2e-services-logs/ - name: Delete kind cluster if: always() diff --git a/e2e/test_jobs.py b/e2e/test_jobs.py index 2bffe386c4..fae1bb4d74 100644 --- a/e2e/test_jobs.py +++ b/e2e/test_jobs.py @@ -217,39 +217,52 @@ def test_job_using_secret_environment_variable(sdk: NeMoPlatform, workspace: str secret = sdk.secrets.create(workspace=workspace, name=secret_name, value=secret_value) assert secret.name is not None, "Failed to create platform secret" - job = sdk.jobs.create( - workspace=workspace, - source=JOB_SOURCE, - spec={"test": "value"}, - platform_spec={ - "steps": [ - { - "name": "secret-envvar-step", - "executor": { - "provider": "cpu", - "container": { - "command": ["sh", "-c", 'echo "Secret value is: $SECRET_ENV_VAR"'], + secret_deleted = False + try: + job = sdk.jobs.create( + workspace=workspace, + source=JOB_SOURCE, + spec={"test": "value"}, + platform_spec={ + "steps": [ + { + "name": "secret-envvar-step", + "executor": { + "provider": "cpu", + "container": { + "command": ["sh", "-c", 'echo "Secret value is: $SECRET_ENV_VAR"'], + }, }, + "environment": [ + { + "name": "SECRET_ENV_VAR", + "from_secret": {"name": secret.name}, + }, + ], }, - "environment": [ - { - "name": "SECRET_ENV_VAR", - "from_secret": {"name": secret.name}, - }, - ], - }, - ], - }, - ) + ], + }, + ) - completed_job = wait_for_platform_job(sdk, job.name, workspace) - assert completed_job.status == "completed", _job_diagnostic_message( - sdk, completed_job, workspace, f"Job failed with status: {completed_job.status}" - ) + completed_job = wait_for_platform_job(sdk, job.name, workspace) + assert completed_job.status == "completed", _job_diagnostic_message( + sdk, completed_job, workspace, f"Job failed with status: {completed_job.status}" + ) - step_logs = wait_for_job_logs(sdk, job.name, workspace, min_log_count=1, timeout=120) - all_messages = " ".join(log.message for log in step_logs.data) - assert secret_value in all_messages, "Step logs do not show secret environment variable was used" + step_logs = wait_for_job_logs(sdk, job.name, workspace, min_log_count=1, timeout=120) + all_messages = " ".join(log.message for log in step_logs.data) + assert secret_value in all_messages, "Step logs do not show secret environment variable was used" + + sdk.secrets.delete(workspace=workspace, name=secret_name) + secret_deleted = True + secret_names = [listed_secret.name for listed_secret in sdk.secrets.list(workspace=workspace).data] + assert secret_name not in secret_names, "Secret should not appear in list after deletion" + finally: + if not secret_deleted: + try: + sdk.secrets.delete(workspace=workspace, name=secret_name) + except Exception: + pass def test_job_with_expected_failure(sdk: NeMoPlatform, workspace: str): From 4c1508aaa2f90250302378d5bd725178476b3d55 Mon Sep 17 00:00:00 2001 From: Brooke Storm Date: Tue, 16 Jun 2026 08:35:07 -0700 Subject: [PATCH 09/13] ci: dedupe tests a bit and make things consistent Signed-off-by: Brooke Storm --- .github/workflows/docker-cpu-smoketest.yaml | 11 +- e2e/k8s/scripts/install_helm_e2e.sh | 302 +++++++++++++------- e2e/k8s/scripts/install_nmp_auth_e2e.sh | 4 +- e2e/k8s/scripts/install_nmp_e2e.sh | 112 -------- e2e/k8s/scripts/install_rustfs.sh | 26 -- 5 files changed, 203 insertions(+), 252 deletions(-) delete mode 100755 e2e/k8s/scripts/install_nmp_e2e.sh delete mode 100755 e2e/k8s/scripts/install_rustfs.sh diff --git a/.github/workflows/docker-cpu-smoketest.yaml b/.github/workflows/docker-cpu-smoketest.yaml index e44ade9a63..911d42f96c 100644 --- a/.github/workflows/docker-cpu-smoketest.yaml +++ b/.github/workflows/docker-cpu-smoketest.yaml @@ -275,17 +275,12 @@ jobs: "${NMP_E2E_REGISTRY}/nmp-core:${NMP_E2E_TAG}" \ "${NMP_E2E_REGISTRY}/nmp-cpu-tasks:${NMP_E2E_TAG}" - - name: Build Helm dependencies - shell: bash - run: | - helm repo add nvidia https://helm.ngc.nvidia.com/nvidia - helm repo update - helm dependency build "${HELM_CHART}" - - name: Install NeMo Platform shell: bash + env: + REQUIRE_NMP_E2E_IMAGES: "true" run: | - if ! HELM_VALUES="${K8S_E2E_VALUES}/kind.yaml" "${K8S_E2E_SCRIPTS}/install_nmp_e2e.sh"; then + if ! HELM_VALUES="${K8S_E2E_VALUES}/kind.yaml" "${K8S_E2E_SCRIPTS}/install_helm_e2e.sh"; then echo "--- helm list -A ---" helm list -A || true echo "--- helm status ${NAMESPACE}/nemo-platform ---" diff --git a/e2e/k8s/scripts/install_helm_e2e.sh b/e2e/k8s/scripts/install_helm_e2e.sh index 04a5a608c2..c5c0869dc0 100755 --- a/e2e/k8s/scripts/install_helm_e2e.sh +++ b/e2e/k8s/scripts/install_helm_e2e.sh @@ -1,28 +1,39 @@ #!/usr/bin/env bash -# Script: install_helm_e2e.sh -# Description: Installs NeMo Platform via Helm using e2e.yaml values (expects minikube cluster and secrets already set up) -# Usage: ./install_helm_e2e.sh # -# Prerequisites: Run setup_local_minikube_gpu.sh first (or ensure minikube is running with ingress and secrets). +# Install NeMo Platform through Helm for local and CI Kubernetes E2E runs. # -# Environment Variables: -# MINIKUBE_PROFILE (optional) - Minikube profile name (defaults to minikube) -# NMP_E2E_REGISTRY (optional) - Container image registry for NMP services (e.g. ghcr.io/nvidia-nemo/platform) -# NMP_E2E_TAG (optional) - Container image tag for NMP services (e.g. a commit SHA) -# HELM_CHART (optional) - Override the helm chart source (default: local k8s/helm) -# HELM_VALUES_FILE (optional) - Override the default helm values file (default: e2e/k8s/values/default.yaml) -# HELM_EXTRA_ARGS (optional) - Additional helm install/upgrade arguments (e.g. --set api.image.repository=...) -# NGC_API_KEY (optional) - Required for helm dependency build (chart depends on NGC nvidia repo) +# This script intentionally handles both kind and minikube installs. Cluster +# setup remains in the setup_local_* scripts; this script owns Helm values, +# optional RustFS setup, chart install, and release readiness. set -euo pipefail +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(git -C "${SCRIPT_DIR}" rev-parse --show-toplevel)" + +NAMESPACE="${NAMESPACE:-${KUBE_NAMESPACE:-default}}" +HELM_RELEASE_NAME="${HELM_RELEASE_NAME:-nemo-platform}" +HELM_CHART="${HELM_CHART:-${REPO_ROOT}/k8s/helm}" +HELM_VALUES="${HELM_VALUES:-${HELM_VALUES_FILE:-${REPO_ROOT}/e2e/k8s/values/default.yaml}}" +HELM_EXTRA_ARGS="${HELM_EXTRA_ARGS:-}" +NMP_E2E_REGISTRY="${NMP_E2E_REGISTRY:-}" +NMP_E2E_TAG="${NMP_E2E_TAG:-}" +REQUIRE_NMP_E2E_IMAGES="${REQUIRE_NMP_E2E_IMAGES:-false}" +POSTGRES_IMAGE="${POSTGRES_IMAGE:-docker.io/library/postgres}" +BUSYBOX_IMAGE="${BUSYBOX_IMAGE:-docker.io/library/busybox}" +RELEASE_READY_SCRIPT="${RELEASE_READY_SCRIPT:-${SCRIPT_DIR}/wait_for_release_ready.sh}" +INSTALL_RUSTFS="${INSTALL_RUSTFS:-false}" +RUSTFS_STORAGECLASS="${RUSTFS_STORAGECLASS:-standard}" +RUSTFS_BUCKET="${RUSTFS_BUCKET:-e2e-k8s-test}" +RUSTFS_ACCESS_KEY="${RUSTFS_ACCESS_KEY:-rustfsadmin}" +RUSTFS_SECRET_KEY="${RUSTFS_SECRET_KEY:-rustfsadmin}" MINIKUBE_PROFILE="${MINIKUBE_PROFILE:-minikube}" -REPO_ROOT=$(git rev-parse --show-toplevel) +EXTRA_HELM_ARGS=() GREEN='\033[0;32m' YELLOW='\033[1;33m' RED='\033[0;31m' -NC='\033[0m' # No Color +NC='\033[0m' log_info() { echo -e "${GREEN}[INFO]${NC} $*" @@ -36,150 +47,231 @@ log_error() { echo -e "${RED}[ERROR]${NC} $*" } -log_info "Validating environment..." +require_non_empty() { + local name="$1" + if [ -z "${!name:-}" ]; then + log_error "${name} is required for ${HELM_RELEASE_NAME} Helm install" + exit 1 + fi +} + +validate_file_inputs() { + require_non_empty NAMESPACE + require_non_empty HELM_RELEASE_NAME + require_non_empty HELM_CHART + require_non_empty HELM_VALUES + require_non_empty POSTGRES_IMAGE + require_non_empty BUSYBOX_IMAGE + require_non_empty RELEASE_READY_SCRIPT -for tool in kubectl helm curl; do - if ! command -v $tool &> /dev/null; then - log_error "$tool is not installed. Please install it first." + if [ ! -d "${HELM_CHART}" ]; then + log_error "HELM_CHART does not exist or is not a directory: ${HELM_CHART}" exit 1 fi -done -# Install RustFS for S3 storage scenarios + if [ ! -f "${HELM_VALUES}" ]; then + log_error "HELM_VALUES does not exist or is not a file: ${HELM_VALUES}" + exit 1 + fi + + if [ ! -x "${RELEASE_READY_SCRIPT}" ]; then + log_error "RELEASE_READY_SCRIPT does not exist or is not executable: ${RELEASE_READY_SCRIPT}" + exit 1 + fi +} + +validate_image_inputs() { + if [ "${REQUIRE_NMP_E2E_IMAGES}" = "true" ]; then + require_non_empty NMP_E2E_REGISTRY + require_non_empty NMP_E2E_TAG + fi + + if [ -n "${NMP_E2E_REGISTRY}" ] && [ -z "${NMP_E2E_TAG}" ]; then + log_error "NMP_E2E_TAG is required when NMP_E2E_REGISTRY is set" + exit 1 + fi + + if [ -z "${NMP_E2E_REGISTRY}" ] && [ -n "${NMP_E2E_TAG}" ]; then + log_error "NMP_E2E_REGISTRY is required when NMP_E2E_TAG is set" + exit 1 + fi +} + install_rustfs() { - log_info "Installing RustFS for S3-compat storage..." + log_info "Installing RustFS for S3-compatible E2E storage in namespace ${NAMESPACE}" - # Add RustFS helm repo if not already added - if ! helm repo list | grep -q "rustfs"; then + if ! helm repo list 2>/dev/null | awk '{print $1}' | grep -Fxq "rustfs"; then helm repo add rustfs https://charts.rustfs.com fi helm repo update rustfs - # Install RustFS in standalone mode for E2E testing - # See https://github.com/rustfs/helm for parameter reference - # Uses default credentials (rustfsadmin/rustfsadmin) helm upgrade -i rustfs rustfs/rustfs \ + -n "${NAMESPACE}" \ + --create-namespace \ --version 0.0.85 \ --set mode.standalone.enabled=true \ --set mode.distributed.enabled=false \ - --set ingress.className=nginx \ - --set storageclass.name=standard \ + --set storageclass.name="${RUSTFS_STORAGECLASS}" \ --timeout 5m - # Wait for RustFS to be ready (needs time for image pull + 30s readiness delay) - log_info "Waiting for RustFS pod to be ready..." - if ! kubectl wait --for=condition=ready pod -l app.kubernetes.io/name=rustfs --timeout=300s; then + log_info "Waiting for RustFS pod to become ready" + if ! kubectl -n "${NAMESPACE}" wait --for=condition=ready pod -l app.kubernetes.io/name=rustfs --timeout=300s; then log_error "RustFS pod failed to become ready" - kubectl describe pods -l app.kubernetes.io/name=rustfs || true - kubectl logs -l app.kubernetes.io/name=rustfs --tail=50 || true + kubectl -n "${NAMESPACE}" describe pods -l app.kubernetes.io/name=rustfs || true + kubectl -n "${NAMESPACE}" logs -l app.kubernetes.io/name=rustfs --tail=50 || true return 1 fi - log_info "RustFS is ready" - # Create the test bucket using aws-cli - log_info "Creating E2E test bucket in RustFS..." - if ! kubectl run aws-cli --rm -i --restart=Never \ + log_info "Creating RustFS bucket ${RUSTFS_BUCKET}" + if ! kubectl -n "${NAMESPACE}" run aws-cli --rm -i --restart=Never \ --image=amazon/aws-cli:2.22.35 \ --pod-running-timeout=2m \ - --env="AWS_ACCESS_KEY_ID=rustfsadmin" \ - --env="AWS_SECRET_ACCESS_KEY=rustfsadmin" \ - -- --endpoint-url http://rustfs-svc:9000 s3 mb s3://e2e-k8s-test; then - log_error "Failed to create E2E test bucket in RustFS" + --env="AWS_ACCESS_KEY_ID=${RUSTFS_ACCESS_KEY}" \ + --env="AWS_SECRET_ACCESS_KEY=${RUSTFS_SECRET_KEY}" \ + -- --endpoint-url http://rustfs-svc:9000 s3 mb "s3://${RUSTFS_BUCKET}"; then + log_error "Failed to create RustFS bucket ${RUSTFS_BUCKET}" return 1 fi - log_info "E2E test bucket created successfully" } -if ! minikube status -p "${MINIKUBE_PROFILE}" &> /dev/null; then - log_error "Minikube cluster is not running. Run setup_local_minikube_gpu.sh first." - exit 1 +add_nvidia_helm_repo() { + if helm repo list 2>/dev/null | awk '{print $1}' | grep -Fxq "nvidia"; then + return 0 + fi + + log_info "Adding NVIDIA Helm repo for chart dependencies" + if [ -n "${NGC_API_KEY:-}" ]; then + helm repo add nvidia https://helm.ngc.nvidia.com/nvidia --username='$oauthtoken' --password="${NGC_API_KEY}" + else + helm repo add nvidia https://helm.ngc.nvidia.com/nvidia + fi +} + +collect_install_diagnostics() { + echo "--- helm list -A ---" + helm list -A || true + echo "--- helm status ${NAMESPACE}/${HELM_RELEASE_NAME} ---" + helm status -n "${NAMESPACE}" "${HELM_RELEASE_NAME}" || true + echo "--- kubectl get pods -A ---" + kubectl get pods -A || true + echo "--- kubectl describe pods -n ${NAMESPACE} ---" + kubectl describe pods -n "${NAMESPACE}" || true +} + +maybe_export_minikube_cluster_url() { + if [ -n "${NMP_E2E_CLUSTER_URL:-}" ]; then + return 0 + fi + + if ! command -v minikube >/dev/null 2>&1; then + return 0 + fi + + if ! minikube status -p "${MINIKUBE_PROFILE}" >/dev/null 2>&1; then + return 0 + fi + + local minikube_ip + minikube_ip="$(minikube ip -p "${MINIKUBE_PROFILE}")" + NMP_E2E_CLUSTER_URL="http://${minikube_ip}" + export NMP_E2E_CLUSTER_URL + + if [ -n "${GITHUB_ENV:-}" ]; then + echo "NMP_E2E_CLUSTER_URL=${NMP_E2E_CLUSTER_URL}" >> "${GITHUB_ENV}" + fi +} + +log_info "Validating Helm install environment" +for tool in kubectl helm; do + if ! command -v "${tool}" >/dev/null 2>&1; then + log_error "${tool} is not installed. Please install it first." + exit 1 + fi +done + +validate_file_inputs +validate_image_inputs + +if [ -n "${HELM_EXTRA_ARGS}" ]; then + read -r -a EXTRA_HELM_ARGS <<< "${HELM_EXTRA_ARGS}" + if echo "${HELM_EXTRA_ARGS}" | grep -q "s3-rustfs"; then + INSTALL_RUSTFS=true + fi fi -log_info "Installing NeMo Platform via Helm..." +if [ "${INSTALL_RUSTFS}" = "true" ]; then + install_rustfs +fi -HELM_CHART="${HELM_CHART:-${REPO_ROOT}/k8s/helm}" -HELM_VALUES="${HELM_VALUES_FILE:-${REPO_ROOT}/e2e/k8s/values/default.yaml}" HELM_ARGS=( - nemo-platform + "${HELM_RELEASE_NAME}" "${HELM_CHART}" + -n "${NAMESPACE}" -f "${HELM_VALUES}" + "${EXTRA_HELM_ARGS[@]}" + --set postgresql.image.repository="${POSTGRES_IMAGE}" + --set core.storage.volumePermissionsImage="${BUSYBOX_IMAGE}" + --create-namespace --timeout 15m - --wait ) -if [ -n "${NMP_E2E_REGISTRY:-}" ]; then - log_info "Using image registry: ${NMP_E2E_REGISTRY}" +if [ -n "${NMP_E2E_REGISTRY}" ]; then HELM_ARGS+=( - --set "api.image.repository=${NMP_E2E_REGISTRY}/nmp-api" - --set "core.image.repository=${NMP_E2E_REGISTRY}/nmp-api" - --set-string "platformConfig.platform.image_registry=${NMP_E2E_REGISTRY}" + --set api.image.repository="${NMP_E2E_REGISTRY}/nmp-api" + --set core.image.repository="${NMP_E2E_REGISTRY}/nmp-api" + --set-string platformConfig.platform.image_registry="${NMP_E2E_REGISTRY}" ) fi -if [ -n "${NMP_E2E_TAG:-}" ]; then - log_info "Using image tag: ${NMP_E2E_TAG}" +if [ -n "${NMP_E2E_TAG}" ]; then HELM_ARGS+=( - --set "api.image.tag=${NMP_E2E_TAG}" - --set "core.image.tag=${NMP_E2E_TAG}" - --set-string "platformConfig.platform.image_tag=${NMP_E2E_TAG}" + --set api.image.tag="${NMP_E2E_TAG}" + --set core.image.tag="${NMP_E2E_TAG}" + --set-string platformConfig.platform.image_tag="${NMP_E2E_TAG}" ) fi -# Append any extra helm args (applied last, so they can override anything above). -# Note: HELM_EXTRA_ARGS is word-split, so values must not contain spaces. -if [ -n "${HELM_EXTRA_ARGS:-}" ]; then - # shellcheck disable=SC2206 - HELM_ARGS+=(${HELM_EXTRA_ARGS}) - - # If using s3-rustfs scenario, install RustFS first - if echo "${HELM_EXTRA_ARGS}" | grep -q "s3-rustfs"; then - install_rustfs - fi +log_info "Helm install inputs:" +printf ' release: %s\n' "${HELM_RELEASE_NAME}" +printf ' namespace: %s\n' "${NAMESPACE}" +printf ' chart: %s\n' "${HELM_CHART}" +printf ' values: %s\n' "${HELM_VALUES}" +if [ -n "${NMP_E2E_REGISTRY}" ]; then + printf ' api image: %s/nmp-api:%s\n' "${NMP_E2E_REGISTRY}" "${NMP_E2E_TAG}" + printf ' core image: %s/nmp-api:%s\n' "${NMP_E2E_REGISTRY}" "${NMP_E2E_TAG}" + printf ' platform image registry: %s\n' "${NMP_E2E_REGISTRY}" + printf ' platform image tag: %s\n' "${NMP_E2E_TAG}" +else + printf ' image overrides: chart defaults\n' fi - -log_info "Helm chart: ${HELM_CHART}" -log_info "Helm values file: ${HELM_VALUES}" - -# Chart depends on k8s-nim-operator from NGC; add repo so dependency build can fetch it -if ! helm repo list 2>/dev/null | grep -q "helm.ngc.nvidia.com"; then - if [ -z "${NGC_API_KEY:-}" ]; then - log_error "NGC_API_KEY is required to add the NGC Helm repo (needed for chart dependencies). Export NGC_API_KEY and re-run." - exit 1 - fi - log_info "Adding NGC Helm repo for chart dependencies..." - helm repo add nvidia https://helm.ngc.nvidia.com/nvidia --username='$oauthtoken' --password="${NGC_API_KEY}" +printf ' postgres image: %s\n' "${POSTGRES_IMAGE}" +printf ' core storage volume permissions image: %s\n' "${BUSYBOX_IMAGE}" +if [ -n "${HELM_EXTRA_ARGS}" ]; then + printf ' extra Helm args: %s\n' "${HELM_EXTRA_ARGS}" fi -helm repo update nvidia 2>/dev/null || true +add_nvidia_helm_repo +helm repo update nvidia 2>/dev/null || true helm dependency build "${HELM_CHART}" if ! helm upgrade -i "${HELM_ARGS[@]}"; then - log_error "Helm install/upgrade failed (possible timeout). Collecting diagnostics..." - "$(dirname "$0")/collect_k8s_logs.sh" + log_error "Helm install/upgrade failed" + collect_install_diagnostics exit 1 fi -log_info "Helm values from chart (nemo-platform):" -helm get values nemo-platform +if ! "${RELEASE_READY_SCRIPT}"; then + log_error "Release readiness check failed" + collect_install_diagnostics + exit 1 +fi -log_info "Verifying deployment..." -kubectl get pods -o wide +maybe_export_minikube_cluster_url -MINIKUBE_IP=$(minikube ip -p "${MINIKUBE_PROFILE}") -CLUSTER_URL="http://${MINIKUBE_IP}" +log_info "Helm values from chart (${HELM_RELEASE_NAME}):" +helm get values -n "${NAMESPACE}" "${HELM_RELEASE_NAME}" || true -if curl -f -s --max-time 10 "${CLUSTER_URL}/cluster-info" > /dev/null 2>&1; then - log_info "Cluster info endpoint check passed" -else - log_warn "Could not reach cluster-info endpoint" +log_info "NeMo Platform Helm install complete" +if [ -n "${NMP_E2E_CLUSTER_URL:-}" ]; then + log_info "Cluster URL: ${NMP_E2E_CLUSTER_URL}" fi - -log_info "==========================================" -log_info "NeMo Platform Helm Install Complete!" -log_info "==========================================" -log_info "" -log_info "Cluster URL: ${CLUSTER_URL}" -log_info "" -log_info "To run e2e GPU tests:" -log_info " NMP_E2E_INTERNAL_HOST=nemo-platform-api:8080 uv run --project platform --frozen pytest e2e --kubernetes --feature gpu --cluster-url=\"${CLUSTER_URL}\" -v" -log_info "==========================================" diff --git a/e2e/k8s/scripts/install_nmp_auth_e2e.sh b/e2e/k8s/scripts/install_nmp_auth_e2e.sh index 2e53cffae7..ceff9c1488 100755 --- a/e2e/k8s/scripts/install_nmp_auth_e2e.sh +++ b/e2e/k8s/scripts/install_nmp_auth_e2e.sh @@ -14,4 +14,6 @@ export NMP_E2E_TAG="${NMP_E2E_TAG:-local}" export POSTGRES_IMAGE="${POSTGRES_IMAGE:-docker.io/library/postgres}" export BUSYBOX_IMAGE="${BUSYBOX_IMAGE:-busybox}" -exec "${SCRIPT_DIR}/install_nmp_e2e.sh" +export REQUIRE_NMP_E2E_IMAGES="${REQUIRE_NMP_E2E_IMAGES:-true}" + +exec "${SCRIPT_DIR}/install_helm_e2e.sh" diff --git a/e2e/k8s/scripts/install_nmp_e2e.sh b/e2e/k8s/scripts/install_nmp_e2e.sh deleted file mode 100755 index 55a529f0c0..0000000000 --- a/e2e/k8s/scripts/install_nmp_e2e.sh +++ /dev/null @@ -1,112 +0,0 @@ -#!/usr/bin/env bash - -set -euo pipefail - -REPO_ROOT=$(git rev-parse --show-toplevel) - -NAMESPACE="${NAMESPACE:-default}" -HELM_RELEASE_NAME="${HELM_RELEASE_NAME:-nemo-platform}" -NMP_E2E_REGISTRY="${NMP_E2E_REGISTRY:-}" -NMP_E2E_TAG="${NMP_E2E_TAG:-}" -HELM_EXTRA_ARGS="${HELM_EXTRA_ARGS:-}" -HELM_CHART="${HELM_CHART:-${REPO_ROOT}/k8s/helm}" -HELM_VALUES="${HELM_VALUES:-${HELM_VALUES_FILE:-${REPO_ROOT}/e2e/k8s/values/default.yaml}}" -POSTGRES_IMAGE="${POSTGRES_IMAGE:-docker.io/library/postgres}" -BUSYBOX_IMAGE="${BUSYBOX_IMAGE:-docker.io/library/busybox}" -RELEASE_READY_SCRIPT="${RELEASE_READY_SCRIPT:-${REPO_ROOT}/e2e/k8s/scripts/wait_for_release_ready.sh}" -EXTRA_HELM_ARGS=() - -require_non_empty() { - local name="$1" - if [ -z "${!name:-}" ]; then - echo "${name} is required for ${HELM_RELEASE_NAME} Helm install" >&2 - exit 1 - fi -} - -require_non_empty NAMESPACE -require_non_empty HELM_RELEASE_NAME -require_non_empty NMP_E2E_REGISTRY -require_non_empty NMP_E2E_TAG -require_non_empty HELM_CHART -require_non_empty HELM_VALUES -require_non_empty POSTGRES_IMAGE -require_non_empty BUSYBOX_IMAGE -require_non_empty RELEASE_READY_SCRIPT - -if [ ! -d "${HELM_CHART}" ]; then - echo "HELM_CHART does not exist or is not a directory: ${HELM_CHART}" >&2 - exit 1 -fi - -if [ ! -f "${HELM_VALUES}" ]; then - echo "HELM_VALUES does not exist or is not a file: ${HELM_VALUES}" >&2 - exit 1 -fi - -if [ ! -x "${RELEASE_READY_SCRIPT}" ]; then - echo "RELEASE_READY_SCRIPT does not exist or is not executable: ${RELEASE_READY_SCRIPT}" >&2 - exit 1 -fi - -if [ -n "${HELM_EXTRA_ARGS}" ]; then - read -r -a EXTRA_HELM_ARGS <<< "${HELM_EXTRA_ARGS}" -fi - -HELM_ARGS=( - "${HELM_RELEASE_NAME}" - "${HELM_CHART}" - -n "${NAMESPACE}" - -f "${HELM_VALUES}" - "${EXTRA_HELM_ARGS[@]}" - --set api.image.repository="${NMP_E2E_REGISTRY}/nmp-api" - --set api.image.tag="${NMP_E2E_TAG}" - --set core.image.repository="${NMP_E2E_REGISTRY}/nmp-api" - --set core.image.tag="${NMP_E2E_TAG}" - --set-string platformConfig.platform.image_registry="${NMP_E2E_REGISTRY}" - --set-string platformConfig.platform.image_tag="${NMP_E2E_TAG}" - --set postgresql.image.repository="${POSTGRES_IMAGE}" - --set core.storage.volumePermissionsImage="${BUSYBOX_IMAGE}" - --create-namespace - --timeout 15m -) - -echo "Helm install inputs:" -printf ' release: %s\n' "${HELM_RELEASE_NAME}" -printf ' namespace: %s\n' "${NAMESPACE}" -printf ' chart: %s\n' "${HELM_CHART}" -printf ' values: %s\n' "${HELM_VALUES}" -printf ' api image: %s/nmp-api:%s\n' "${NMP_E2E_REGISTRY}" "${NMP_E2E_TAG}" -printf ' core image: %s/nmp-api:%s\n' "${NMP_E2E_REGISTRY}" "${NMP_E2E_TAG}" -printf ' platform image registry: %s\n' "${NMP_E2E_REGISTRY}" -printf ' platform image tag: %s\n' "${NMP_E2E_TAG}" -printf ' postgres image: %s\n' "${POSTGRES_IMAGE}" -printf ' core storage volume permissions image: %s\n' "${BUSYBOX_IMAGE}" -if [ -n "${HELM_EXTRA_ARGS}" ]; then - printf ' extra Helm args: %s\n' "${HELM_EXTRA_ARGS}" -fi - -# Install NMP platform -if ! helm upgrade -i "${HELM_ARGS[@]}"; then - echo "--- helm list -A ---" - helm list -A || true - echo "--- helm status ${NAMESPACE}/${HELM_RELEASE_NAME} ---" - helm status -n "${NAMESPACE}" "${HELM_RELEASE_NAME}" || true - echo "--- kubectl get pods -A ---" - kubectl get pods -A - echo "--- kubectl describe pods -n ${NAMESPACE} ---" - kubectl describe pods -n "${NAMESPACE}" - exit 1 -fi - -if ! "${RELEASE_READY_SCRIPT}"; then - echo "--- helm list -A ---" - helm list -A || true - echo "--- helm status ${NAMESPACE}/${HELM_RELEASE_NAME} ---" - helm status -n "${NAMESPACE}" "${HELM_RELEASE_NAME}" || true - echo "--- kubectl get pods -A ---" - kubectl get pods -A - echo "--- kubectl describe pods -n ${NAMESPACE} ---" - kubectl describe pods -n "${NAMESPACE}" - exit 1 -fi diff --git a/e2e/k8s/scripts/install_rustfs.sh b/e2e/k8s/scripts/install_rustfs.sh deleted file mode 100755 index e6236af2c8..0000000000 --- a/e2e/k8s/scripts/install_rustfs.sh +++ /dev/null @@ -1,26 +0,0 @@ -#!/usr/bin/env bash -# -# Install RustFS in standalone mode and create the E2E test bucket. -# Expects NAMESPACE to be set (e.g. current kubectl context namespace). -# -# See https://github.com/rustfs/helm for parameter reference. -# Uses default credentials (rustfsadmin/rustfsadmin). -# -set -e - -STORAGECLASS="${STORAGECLASS:-standard}" - -if [ -z "${NAMESPACE}" ]; then - echo "NAMESPACE must be set" - exit 1 -fi - -helm repo add rustfs https://charts.rustfs.com -helm repo update rustfs - -helm upgrade -i -n "${NAMESPACE}" rustfs rustfs/rustfs \ - --version 0.0.85 \ - --set mode.standalone.enabled=true \ - --set mode.distributed.enabled=false \ - --set storageclass.name="${STORAGECLASS}" \ - --timeout 5m From db71c2ad99c8ad6b42bbe6e9f51bad756afa78a3 Mon Sep 17 00:00:00 2001 From: Brooke Storm Date: Tue, 16 Jun 2026 08:43:05 -0700 Subject: [PATCH 10/13] ci: integrate into main tests Signed-off-by: Brooke Storm --- .github/actions/changes/action.yaml | 8 + .github/workflows/ci.yaml | 326 ++++++++++++++++++ .github/workflows/docker-cpu-smoketest.yaml | 352 -------------------- 3 files changed, 334 insertions(+), 352 deletions(-) delete mode 100644 .github/workflows/docker-cpu-smoketest.yaml diff --git a/.github/actions/changes/action.yaml b/.github/actions/changes/action.yaml index d91f2d8715..ab8d42bf94 100644 --- a/.github/actions/changes/action.yaml +++ b/.github/actions/changes/action.yaml @@ -31,6 +31,9 @@ outputs: helm: description: "'true' if any Helm chart files changed" value: ${{ steps.filter.outputs.helm }} + k8s-smoke: + description: "'true' if Kubernetes smoke test support files changed" + value: ${{ steps.filter.outputs.k8s-smoke }} runs: using: "composite" @@ -72,3 +75,8 @@ runs: - '.github/workflows/ci.yaml' - '.github/actions/changes/action.yaml' - '.pre-commit-config.yaml' + k8s-smoke: + - 'e2e/k8s/scripts/**' + - 'e2e/k8s/values/**' + - 'e2e/test_jobs.py' + - '.github/actions/free-disk-space/action.yaml' diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 552c110268..97dbbfa688 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -8,10 +8,22 @@ on: merge_group: types: [checks_requested] workflow_dispatch: + inputs: + image_tag: + description: Optional CPU smoke image tag. Defaults to the commit SHA. + required: false + type: string + default: "" + publish: + description: Publish CPU smoke images to GHCR. + required: false + type: boolean + default: true permissions: actions: read contents: read + packages: read pull-requests: read concurrency: @@ -32,6 +44,7 @@ jobs: tools: ${{ steps.changes.outputs.tools }} docker: ${{ steps.changes.outputs.docker }} helm: ${{ steps.changes.outputs.helm }} + k8s-smoke: ${{ steps.changes.outputs.k8s-smoke }} steps: - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - uses: ./.github/actions/changes @@ -83,6 +96,317 @@ jobs: make docker-print TARGET=nmp-automodel make docker-print TARGET=nmp-unsloth + build-cpu-smoke-images: + name: Build CPU smoke images + needs: [changes] + if: > + !cancelled() && ( + github.event_name == 'workflow_dispatch' || + needs.changes.outputs.docker == 'true' || + needs.changes.outputs.helm == 'true' || + needs.changes.outputs.k8s-smoke == 'true' + ) + runs-on: ubuntu-latest + timeout-minutes: 90 + permissions: + contents: read + packages: write + outputs: + image_registry: ${{ steps.bake-vars.outputs.image_registry }} + image_tag: ${{ steps.bake-vars.outputs.image_tag }} + publish_images: ${{ steps.bake-vars.outputs.publish_images }} + steps: + - name: Checkout code + uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 + + - name: Free disk space + uses: ./.github/actions/free-disk-space + with: + disable_swap: "true" + remove_haskell: "true" + remove_java: "true" + remove_ruby: "true" + remove_swift: "true" + prune_docker: "true" + + - name: Set up Docker Buildx + shell: bash + run: | + set -euo pipefail + docker buildx create --name nmp-builder --driver docker-container --use + docker buildx inspect --bootstrap + + - name: Configure bake variables + id: bake-vars + shell: bash + env: + HEAD_REPOSITORY: ${{ github.event.pull_request.head.repo.full_name || github.repository }} + INPUT_IMAGE_TAG: ${{ github.event_name == 'workflow_dispatch' && inputs.image_tag || '' }} + INPUT_PUBLISH: ${{ inputs.publish }} + SOURCE_SHA: ${{ github.event.pull_request.head.sha || github.sha }} + run: | + set -euo pipefail + + image_registry="ghcr.io/${GITHUB_REPOSITORY,,}" + source_sha="${SOURCE_SHA:-$GITHUB_SHA}" + bake_tag="${INPUT_IMAGE_TAG:-$source_sha}" + publish_images="false" + + if [ "$GITHUB_EVENT_NAME" = "push" ] && [ "$GITHUB_REF" = "refs/heads/main" ]; then + publish_images="true" + fi + + if [ "$GITHUB_EVENT_NAME" = "pull_request" ] && [ "$HEAD_REPOSITORY" = "$GITHUB_REPOSITORY" ]; then + publish_images="true" + fi + + if [ "$GITHUB_EVENT_NAME" = "merge_group" ]; then + publish_images="true" + fi + + if [ "$GITHUB_EVENT_NAME" = "workflow_dispatch" ] && [ "$INPUT_PUBLISH" = "true" ]; then + publish_images="true" + fi + + { + printf 'IMAGE_REGISTRY=%s\n' "$image_registry" + printf 'BASE_REGISTRY=%s\n' "$image_registry" + printf 'CACHE_REGISTRY=%s\n' "$image_registry" + printf 'BAKE_TAG=%s\n' "$bake_tag" + printf 'CI_COMMIT_SHA=%s\n' "$source_sha" + printf 'PUBLISH_IMAGES=%s\n' "$publish_images" + } >> "$GITHUB_ENV" + { + printf 'image_registry=%s\n' "$image_registry" + printf 'image_tag=%s\n' "$bake_tag" + printf 'publish_images=%s\n' "$publish_images" + } >> "$GITHUB_OUTPUT" + + - name: Log in to GHCR + if: env.PUBLISH_IMAGES == 'true' + shell: bash + env: + GHCR_TOKEN: ${{ github.token }} + run: | + set -euo pipefail + echo "$GHCR_TOKEN" | docker login ghcr.io -u "$GITHUB_ACTOR" --password-stdin + + - name: Print Docker bake graph + shell: bash + run: make docker-print TARGET=docker-cpu + + - name: Build CPU images + if: env.PUBLISH_IMAGES != 'true' + shell: bash + run: make docker-load TARGET=docker-cpu + + - name: Build and publish CPU images + if: env.PUBLISH_IMAGES == 'true' + shell: bash + run: make docker-push TARGET=docker-cpu + + kind-cpu-smoke: + name: Kind CPU smoke test + needs: [changes, build-cpu-smoke-images] + if: > + !cancelled() && + needs.build-cpu-smoke-images.result == 'success' && + needs.build-cpu-smoke-images.outputs.publish_images == 'true' + runs-on: ubuntu-latest + timeout-minutes: 45 + permissions: + contents: read + packages: read + env: + BUSYBOX_IMAGE: docker.io/library/busybox + HELM_CHART: k8s/helm + K8S_E2E_SCRIPTS: e2e/k8s/scripts + K8S_E2E_VALUES: e2e/k8s/values + KIND_CLUSTER_NAME: gha-${{ github.run_id }}-${{ github.run_attempt }}-kind-smoke + KUBE_GATEWAY_NAME: nmp-e2e-gateway + KUBE_NAMESPACE: nemo-platform + NAMESPACE: nemo-platform + NMP_E2E_CLUSTER_URL: "" + NMP_E2E_INTERNAL_HOST: nemo-platform-api:8080 + NMP_E2E_REGISTRY: ${{ needs.build-cpu-smoke-images.outputs.image_registry }} + NMP_E2E_TAG: ${{ needs.build-cpu-smoke-images.outputs.image_tag }} + POSTGRES_IMAGE: docker.io/library/postgres + steps: + - name: Checkout code + uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 + + - name: Free disk space + uses: ./.github/actions/free-disk-space + with: + disable_swap: "true" + remove_haskell: "true" + remove_java: "true" + remove_ruby: "true" + remove_swift: "true" + prune_docker: "true" + + - name: Install kind + shell: bash + env: + KIND_VERSION: v0.32.0 + run: | + set -euo pipefail + + case "$(uname -m)" in + x86_64) arch=amd64 ;; + aarch64|arm64) arch=arm64 ;; + *) + echo "Unsupported architecture: $(uname -m)" >&2 + exit 1 + ;; + esac + + kind_url="https://github.com/kubernetes-sigs/kind/releases/download/${KIND_VERSION}/kind-linux-${arch}" + curl -fsSLo "${RUNNER_TEMP}/kind" "${kind_url}" + curl -fsSLo "${RUNNER_TEMP}/kind.sha256sum" "${kind_url}.sha256sum" + sed "s# kind-linux-${arch}# ${RUNNER_TEMP}/kind#" "${RUNNER_TEMP}/kind.sha256sum" | sha256sum -c - + sudo install -m 0755 "${RUNNER_TEMP}/kind" /usr/local/bin/kind + + - name: Install kubectl + shell: bash + env: + KUBECTL_VERSION: v1.33.7 + run: | + set -euo pipefail + + case "$(uname -m)" in + x86_64) arch=amd64 ;; + aarch64|arm64) arch=arm64 ;; + *) + echo "Unsupported architecture: $(uname -m)" >&2 + exit 1 + ;; + esac + + kubectl_url="https://dl.k8s.io/release/${KUBECTL_VERSION}/bin/linux/${arch}/kubectl" + curl -fsSLo "${RUNNER_TEMP}/kubectl" "${kubectl_url}" + curl -fsSLo "${RUNNER_TEMP}/kubectl.sha256" "${kubectl_url}.sha256" + echo "$(cat "${RUNNER_TEMP}/kubectl.sha256") ${RUNNER_TEMP}/kubectl" | sha256sum -c - + sudo install -m 0755 "${RUNNER_TEMP}/kubectl" /usr/local/bin/kubectl + + - name: Install Helm + uses: azure/setup-helm@dda3372f752e03dde6b3237bc9431cdc2f7a02a2 # v5.0.0 + + - name: Install uv + uses: astral-sh/setup-uv@37802adc94f370d6bfd71619e3f0bf239e1f3b78 # v7.6.0 + with: + python-version: "3.13" + enable-cache: true + version-file: pyproject.toml + cache-dependency-glob: uv.lock + + - name: Start kind cluster + shell: bash + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + NGC_API_KEY: not-used-for-ghcr-cpu-smoke + run: bash "${K8S_E2E_SCRIPTS}/setup_local_kind_cpu.sh" + + - name: Set default kubectl namespace + shell: bash + run: kubectl config set-context --current --namespace="${NAMESPACE}" + + - name: Verify Gateway API setup + shell: bash + run: | + set -euo pipefail + kubectl wait --for=condition=Established crd/gateways.gateway.networking.k8s.io --timeout=2m + kubectl wait --for=condition=Established crd/httproutes.gateway.networking.k8s.io --timeout=2m + kubectl get gatewayclass cloud-provider-kind + kubectl -n "${NAMESPACE}" get gateway "${KUBE_GATEWAY_NAME}" + + - name: Pre-pull GHCR images into kind + shell: bash + env: + KIND_IMAGE_PULL_TOKEN: ${{ github.token }} + KIND_IMAGE_PULL_USER: ${{ github.actor }} + run: | + "${K8S_E2E_SCRIPTS}/prepull_kind_images.sh" \ + "${NMP_E2E_REGISTRY}/nmp-api:${NMP_E2E_TAG}" \ + "${NMP_E2E_REGISTRY}/nmp-core:${NMP_E2E_TAG}" \ + "${NMP_E2E_REGISTRY}/nmp-cpu-tasks:${NMP_E2E_TAG}" + + - name: Install NeMo Platform + shell: bash + env: + REQUIRE_NMP_E2E_IMAGES: "true" + run: | + if ! HELM_VALUES="${K8S_E2E_VALUES}/kind.yaml" "${K8S_E2E_SCRIPTS}/install_helm_e2e.sh"; then + echo "--- helm list -A ---" + helm list -A || true + echo "--- helm status ${NAMESPACE}/nemo-platform ---" + helm status -n "${NAMESPACE}" nemo-platform || true + echo "--- kubectl get all -n ${NAMESPACE} ---" + kubectl get all -n "${NAMESPACE}" || true + exit 1 + fi + + - name: Wait for API + shell: bash + run: | + test -n "${NMP_E2E_CLUSTER_URL}" + "${K8S_E2E_SCRIPTS}/wait_for_api.sh" "${NMP_E2E_CLUSTER_URL}/cluster-info" 120 + + - name: Run CPU job e2e smoke test + shell: bash + env: + _TYPER_FORCE_DISABLE_TERMINAL: "1" + E2E_SERVICES_LOG_DIR: ${{ runner.temp }}/e2e-services-logs + NGC_API_KEY: not-used-for-ghcr-cpu-smoke + run: | + test -n "${NMP_E2E_CLUSTER_URL}" + export NMP_BASE_URL="${NMP_E2E_CLUSTER_URL}" + uv run --frozen pytest \ + e2e/test_jobs.py::test_job_using_secret_environment_variable \ + -v \ + --run-e2e \ + --no-cov \ + --junitxml=report-kubernetes-smoke.xml + + - name: Collect Kubernetes logs + if: always() + shell: bash + run: | + "${K8S_E2E_SCRIPTS}/collect_k8s_logs.sh" + + - name: Disk usage summary + if: always() + shell: bash + run: | + echo "=== Host disk ===" + df -h / + echo "=== Docker system ===" + docker system df + echo "=== kind node storage ===" + for node in $(kind get nodes --name "${KIND_CLUSTER_NAME}" 2>/dev/null); do + echo "--- ${node} ---" + docker exec "${node}" sh -c "du -sh /var/lib/containerd /var/lib/kubelet /var/log 2>/dev/null | sort -h" || true + done + + - name: Upload Kubernetes artifacts + if: always() + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + with: + name: kind-smoke-kubernetes-artifacts + retention-days: 7 + if-no-files-found: ignore + path: | + k8s-logs/ + report-kubernetes-smoke.xml + ${{ runner.temp }}/e2e-services-logs/ + + - name: Delete kind cluster + if: always() + shell: bash + run: | + docker rm -f "cloud-provider-kind-${KIND_CLUSTER_NAME}" || true + kind delete cluster --name "${KIND_CLUSTER_NAME}" || true + helm-lint: name: Helm lint needs: [changes] @@ -911,6 +1235,8 @@ jobs: - changes - actionlint - docker-bake-graph + - build-cpu-smoke-images + - kind-cpu-smoke - helm-lint - helm-chart-verifier - lint diff --git a/.github/workflows/docker-cpu-smoketest.yaml b/.github/workflows/docker-cpu-smoketest.yaml deleted file mode 100644 index 911d42f96c..0000000000 --- a/.github/workflows/docker-cpu-smoketest.yaml +++ /dev/null @@ -1,352 +0,0 @@ -name: Docker CPU smoke-test images - -on: - push: - branches: [main] - paths: - - ".github/workflows/docker-cpu-smoketest.yaml" - - "docker-bake.hcl" - - "docker/**" - - "e2e/k8s/**" - - "k8s/helm/**" - - "Makefile" - - "packages/**" - - "plugins/**" - - "sdk/**" - - "services/**" - - "src/**" - - "uv.lock" - - "pyproject.toml" - pull_request: - branches: [main] - paths: - - ".github/workflows/docker-cpu-smoketest.yaml" - - "docker-bake.hcl" - - "docker/**" - - "e2e/k8s/**" - - "k8s/helm/**" - - "Makefile" - - "packages/**" - - "plugins/**" - - "sdk/**" - - "services/**" - - "src/**" - - "uv.lock" - - "pyproject.toml" - workflow_dispatch: - inputs: - image_tag: - description: Optional image tag. Defaults to the commit SHA. - required: false - type: string - default: "" - publish: - description: Publish images to GHCR. - required: false - type: boolean - default: true - -permissions: - contents: read - packages: read - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref || github.run_id }} - cancel-in-progress: ${{ github.event_name == 'pull_request' }} - -jobs: - build-cpu-images: - name: Build CPU images - runs-on: ubuntu-latest - timeout-minutes: 90 - permissions: - contents: read - packages: write - outputs: - image_registry: ${{ steps.bake-vars.outputs.image_registry }} - image_tag: ${{ steps.bake-vars.outputs.image_tag }} - publish_images: ${{ steps.bake-vars.outputs.publish_images }} - steps: - - name: Checkout code - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - - - name: Free disk space - uses: ./.github/actions/free-disk-space - with: - disable_swap: "true" - remove_haskell: "true" - remove_java: "true" - remove_ruby: "true" - remove_swift: "true" - prune_docker: "true" - - - name: Set up Docker Buildx - shell: bash - run: | - set -euo pipefail - docker buildx create --name nmp-builder --driver docker-container --use - docker buildx inspect --bootstrap - - - name: Configure bake variables - id: bake-vars - shell: bash - env: - HEAD_REPOSITORY: ${{ github.event.pull_request.head.repo.full_name || github.repository }} - INPUT_IMAGE_TAG: ${{ github.event_name == 'workflow_dispatch' && inputs.image_tag || '' }} - INPUT_PUBLISH: ${{ inputs.publish }} - SOURCE_SHA: ${{ github.event.pull_request.head.sha || github.sha }} - run: | - set -euo pipefail - - image_registry="ghcr.io/${GITHUB_REPOSITORY,,}" - source_sha="${SOURCE_SHA:-$GITHUB_SHA}" - bake_tag="${INPUT_IMAGE_TAG:-$source_sha}" - publish_images="false" - - if [ "$GITHUB_EVENT_NAME" = "push" ] && [ "$GITHUB_REF" = "refs/heads/main" ]; then - publish_images="true" - fi - - if [ "$GITHUB_EVENT_NAME" = "pull_request" ] && [ "$HEAD_REPOSITORY" = "$GITHUB_REPOSITORY" ]; then - publish_images="true" - fi - - if [ "$GITHUB_EVENT_NAME" = "workflow_dispatch" ] && [ "$INPUT_PUBLISH" = "true" ]; then - publish_images="true" - fi - - { - printf 'IMAGE_REGISTRY=%s\n' "$image_registry" - printf 'BASE_REGISTRY=%s\n' "$image_registry" - printf 'CACHE_REGISTRY=%s\n' "$image_registry" - printf 'BAKE_TAG=%s\n' "$bake_tag" - printf 'CI_COMMIT_SHA=%s\n' "$source_sha" - printf 'PUBLISH_IMAGES=%s\n' "$publish_images" - } >> "$GITHUB_ENV" - { - printf 'image_registry=%s\n' "$image_registry" - printf 'image_tag=%s\n' "$bake_tag" - printf 'publish_images=%s\n' "$publish_images" - } >> "$GITHUB_OUTPUT" - - - name: Log in to GHCR - if: env.PUBLISH_IMAGES == 'true' - shell: bash - env: - GHCR_TOKEN: ${{ github.token }} - run: | - set -euo pipefail - echo "$GHCR_TOKEN" | docker login ghcr.io -u "$GITHUB_ACTOR" --password-stdin - - - name: Print Docker bake graph - shell: bash - run: make docker-print TARGET=docker-cpu - - - name: Build CPU images - if: env.PUBLISH_IMAGES != 'true' - shell: bash - run: make docker-load TARGET=docker-cpu - - - name: Build and publish CPU images - if: env.PUBLISH_IMAGES == 'true' - shell: bash - run: make docker-push TARGET=docker-cpu - - kind-smoke: - name: Set up kind CPU environment - needs: [build-cpu-images] - if: needs.build-cpu-images.outputs.publish_images == 'true' - runs-on: ubuntu-latest - timeout-minutes: 45 - permissions: - contents: read - packages: read - env: - BUSYBOX_IMAGE: docker.io/library/busybox - HELM_CHART: k8s/helm - K8S_E2E_SCRIPTS: e2e/k8s/scripts - K8S_E2E_VALUES: e2e/k8s/values - KIND_CLUSTER_NAME: gha-${{ github.run_id }}-${{ github.run_attempt }}-kind-smoke - KUBE_GATEWAY_NAME: nmp-e2e-gateway - KUBE_NAMESPACE: nemo-platform - NAMESPACE: nemo-platform - NMP_E2E_CLUSTER_URL: "" - NMP_E2E_INTERNAL_HOST: nemo-platform-api:8080 - NMP_E2E_REGISTRY: ${{ needs.build-cpu-images.outputs.image_registry }} - NMP_E2E_TAG: ${{ needs.build-cpu-images.outputs.image_tag }} - POSTGRES_IMAGE: docker.io/library/postgres - steps: - - name: Checkout code - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - - - name: Free disk space - uses: ./.github/actions/free-disk-space - with: - disable_swap: "true" - remove_haskell: "true" - remove_java: "true" - remove_ruby: "true" - remove_swift: "true" - prune_docker: "true" - - - name: Install kind - shell: bash - env: - KIND_VERSION: v0.32.0 - run: | - set -euo pipefail - - case "$(uname -m)" in - x86_64) arch=amd64 ;; - aarch64|arm64) arch=arm64 ;; - *) - echo "Unsupported architecture: $(uname -m)" >&2 - exit 1 - ;; - esac - - kind_url="https://github.com/kubernetes-sigs/kind/releases/download/${KIND_VERSION}/kind-linux-${arch}" - curl -fsSLo "${RUNNER_TEMP}/kind" "${kind_url}" - curl -fsSLo "${RUNNER_TEMP}/kind.sha256sum" "${kind_url}.sha256sum" - sed "s# kind-linux-${arch}# ${RUNNER_TEMP}/kind#" "${RUNNER_TEMP}/kind.sha256sum" | sha256sum -c - - sudo install -m 0755 "${RUNNER_TEMP}/kind" /usr/local/bin/kind - - - name: Install kubectl - shell: bash - env: - KUBECTL_VERSION: v1.33.7 - run: | - set -euo pipefail - - case "$(uname -m)" in - x86_64) arch=amd64 ;; - aarch64|arm64) arch=arm64 ;; - *) - echo "Unsupported architecture: $(uname -m)" >&2 - exit 1 - ;; - esac - - kubectl_url="https://dl.k8s.io/release/${KUBECTL_VERSION}/bin/linux/${arch}/kubectl" - curl -fsSLo "${RUNNER_TEMP}/kubectl" "${kubectl_url}" - curl -fsSLo "${RUNNER_TEMP}/kubectl.sha256" "${kubectl_url}.sha256" - echo "$(cat "${RUNNER_TEMP}/kubectl.sha256") ${RUNNER_TEMP}/kubectl" | sha256sum -c - - sudo install -m 0755 "${RUNNER_TEMP}/kubectl" /usr/local/bin/kubectl - - - name: Install Helm - uses: azure/setup-helm@dda3372f752e03dde6b3237bc9431cdc2f7a02a2 # v5.0.0 - - - name: Install uv - uses: astral-sh/setup-uv@37802adc94f370d6bfd71619e3f0bf239e1f3b78 # v7.6.0 - with: - python-version: "3.13" - enable-cache: true - version-file: pyproject.toml - cache-dependency-glob: uv.lock - - - name: Start kind cluster - shell: bash - env: - HF_TOKEN: ${{ secrets.HF_TOKEN }} - NGC_API_KEY: not-used-for-ghcr-cpu-smoke - run: bash "${K8S_E2E_SCRIPTS}/setup_local_kind_cpu.sh" - - - name: Set default kubectl namespace - shell: bash - run: kubectl config set-context --current --namespace="${NAMESPACE}" - - - name: Verify Gateway API setup - shell: bash - run: | - set -euo pipefail - kubectl wait --for=condition=Established crd/gateways.gateway.networking.k8s.io --timeout=2m - kubectl wait --for=condition=Established crd/httproutes.gateway.networking.k8s.io --timeout=2m - kubectl get gatewayclass cloud-provider-kind - kubectl -n "${NAMESPACE}" get gateway "${KUBE_GATEWAY_NAME}" - - - name: Pre-pull GHCR images into kind - shell: bash - env: - KIND_IMAGE_PULL_TOKEN: ${{ github.token }} - KIND_IMAGE_PULL_USER: ${{ github.actor }} - run: | - "${K8S_E2E_SCRIPTS}/prepull_kind_images.sh" \ - "${NMP_E2E_REGISTRY}/nmp-api:${NMP_E2E_TAG}" \ - "${NMP_E2E_REGISTRY}/nmp-core:${NMP_E2E_TAG}" \ - "${NMP_E2E_REGISTRY}/nmp-cpu-tasks:${NMP_E2E_TAG}" - - - name: Install NeMo Platform - shell: bash - env: - REQUIRE_NMP_E2E_IMAGES: "true" - run: | - if ! HELM_VALUES="${K8S_E2E_VALUES}/kind.yaml" "${K8S_E2E_SCRIPTS}/install_helm_e2e.sh"; then - echo "--- helm list -A ---" - helm list -A || true - echo "--- helm status ${NAMESPACE}/nemo-platform ---" - helm status -n "${NAMESPACE}" nemo-platform || true - echo "--- kubectl get all -n ${NAMESPACE} ---" - kubectl get all -n "${NAMESPACE}" || true - exit 1 - fi - - - name: Wait for API - shell: bash - run: | - test -n "${NMP_E2E_CLUSTER_URL}" - "${K8S_E2E_SCRIPTS}/wait_for_api.sh" "${NMP_E2E_CLUSTER_URL}/cluster-info" 120 - - - name: Run CPU job e2e smoke test - shell: bash - env: - _TYPER_FORCE_DISABLE_TERMINAL: "1" - E2E_SERVICES_LOG_DIR: ${{ runner.temp }}/e2e-services-logs - NGC_API_KEY: not-used-for-ghcr-cpu-smoke - run: | - test -n "${NMP_E2E_CLUSTER_URL}" - export NMP_BASE_URL="${NMP_E2E_CLUSTER_URL}" - uv run --frozen pytest \ - e2e/test_jobs.py::test_job_using_secret_environment_variable \ - -v \ - --run-e2e \ - --no-cov \ - --junitxml=report-kubernetes-smoke.xml - - - name: Collect Kubernetes logs - if: always() - shell: bash - run: | - "${K8S_E2E_SCRIPTS}/collect_k8s_logs.sh" - - - name: Disk usage summary - if: always() - shell: bash - run: | - echo "=== Host disk ===" - df -h / - echo "=== Docker system ===" - docker system df - echo "=== kind node storage ===" - for node in $(kind get nodes --name "${KIND_CLUSTER_NAME}" 2>/dev/null); do - echo "--- ${node} ---" - docker exec "${node}" sh -c "du -sh /var/lib/containerd /var/lib/kubelet /var/log 2>/dev/null | sort -h" || true - done - - - name: Upload Kubernetes artifacts - if: always() - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 - with: - name: kind-smoke-kubernetes-artifacts - retention-days: 7 - if-no-files-found: ignore - path: | - k8s-logs/ - report-kubernetes-smoke.xml - ${{ runner.temp }}/e2e-services-logs/ - - - name: Delete kind cluster - if: always() - shell: bash - run: | - docker rm -f "cloud-provider-kind-${KIND_CLUSTER_NAME}" || true - kind delete cluster --name "${KIND_CLUSTER_NAME}" || true From f31a17373c33e38b9d601376d6e7e0cbc7817977 Mon Sep 17 00:00:00 2001 From: Brooke Storm Date: Tue, 16 Jun 2026 09:38:56 -0700 Subject: [PATCH 11/13] ci: fix needless perms Signed-off-by: Brooke Storm --- .github/workflows/ci.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 97dbbfa688..6b4fba52de 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -23,7 +23,6 @@ on: permissions: actions: read contents: read - packages: read pull-requests: read concurrency: From d0cca1180606cfad732586aae7cc4d6dd509be02 Mon Sep 17 00:00:00 2001 From: Brooke Storm Date: Tue, 16 Jun 2026 09:42:27 -0700 Subject: [PATCH 12/13] ci: fix test and perms a bit more Signed-off-by: Brooke Storm --- .github/workflows/ci.yaml | 4 ++++ e2e/test_jobs.py | 6 +++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 6b4fba52de..b58a7a7694 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -117,6 +117,8 @@ jobs: steps: - name: Checkout code uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 + with: + persist-credentials: false - name: Free disk space uses: ./.github/actions/free-disk-space @@ -233,6 +235,8 @@ jobs: steps: - name: Checkout code uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 + with: + persist-credentials: false - name: Free disk space uses: ./.github/actions/free-disk-space diff --git a/e2e/test_jobs.py b/e2e/test_jobs.py index fae1bb4d74..12a857ac4e 100644 --- a/e2e/test_jobs.py +++ b/e2e/test_jobs.py @@ -12,7 +12,7 @@ import uuid import pytest -from nemo_platform import NeMoPlatform +from nemo_platform import NeMoPlatform, NotFoundError from nmp.testing.e2e import wait_for_job_logs, wait_for_platform_job JOB_SOURCE = "e2e-test-jobs" @@ -255,8 +255,8 @@ def test_job_using_secret_environment_variable(sdk: NeMoPlatform, workspace: str sdk.secrets.delete(workspace=workspace, name=secret_name) secret_deleted = True - secret_names = [listed_secret.name for listed_secret in sdk.secrets.list(workspace=workspace).data] - assert secret_name not in secret_names, "Secret should not appear in list after deletion" + with pytest.raises(NotFoundError): + sdk.secrets.retrieve(secret_name, workspace=workspace) finally: if not secret_deleted: try: From aca0a980867015cf243a371a1e5a571c45f5aa6b Mon Sep 17 00:00:00 2001 From: Brooke Storm Date: Tue, 16 Jun 2026 10:23:20 -0700 Subject: [PATCH 13/13] ci: add a derived output to simplify the condition for smoke test Signed-off-by: Brooke Storm --- .github/actions/changes/action.yaml | 17 +++++++++++++++++ .github/workflows/ci.yaml | 6 ++---- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/.github/actions/changes/action.yaml b/.github/actions/changes/action.yaml index ab8d42bf94..f6a7a27d2e 100644 --- a/.github/actions/changes/action.yaml +++ b/.github/actions/changes/action.yaml @@ -28,12 +28,21 @@ outputs: docker: description: "'true' if any Docker build files changed" value: ${{ steps.filter.outputs.docker }} + docker-scripts: + description: "'true' if scripts directly invoked by Docker builds changed" + value: ${{ steps.filter.outputs.docker-scripts }} helm: description: "'true' if any Helm chart files changed" value: ${{ steps.filter.outputs.helm }} + python-runtime: + description: "'true' if Python runtime packages, services, plugins, or SDK files changed" + value: ${{ steps.filter.outputs.python-runtime }} k8s-smoke: description: "'true' if Kubernetes smoke test support files changed" value: ${{ steps.filter.outputs.k8s-smoke }} + cpu-smoke: + description: "'true' if CPU smoke image or Kubernetes smoke test inputs changed" + value: ${{ steps.filter.outputs.deps == 'true' || steps.filter.outputs.docker == 'true' || steps.filter.outputs.docker-scripts == 'true' || steps.filter.outputs.helm == 'true' || steps.filter.outputs.openapi == 'true' || steps.filter.outputs.python-runtime == 'true' || steps.filter.outputs.web-studio == 'true' || steps.filter.outputs.k8s-smoke == 'true' }} runs: using: "composite" @@ -69,12 +78,20 @@ runs: - 'docker-bake.hcl' - 'docker/**' - 'Makefile' + docker-scripts: + - 'script/install_duckdb_extensions.sh' + - 'script/build_policy_wasm.sh' helm: - 'k8s/**' - 'tools/lint/lint-helm.sh' - '.github/workflows/ci.yaml' - '.github/actions/changes/action.yaml' - '.pre-commit-config.yaml' + python-runtime: + - 'packages/**' + - 'services/**' + - 'plugins/**' + - 'sdk/python/nemo-platform/**' k8s-smoke: - 'e2e/k8s/scripts/**' - 'e2e/k8s/values/**' diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index b58a7a7694..b2dca05e61 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -43,7 +43,7 @@ jobs: tools: ${{ steps.changes.outputs.tools }} docker: ${{ steps.changes.outputs.docker }} helm: ${{ steps.changes.outputs.helm }} - k8s-smoke: ${{ steps.changes.outputs.k8s-smoke }} + cpu-smoke: ${{ steps.changes.outputs.cpu-smoke }} steps: - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - uses: ./.github/actions/changes @@ -101,9 +101,7 @@ jobs: if: > !cancelled() && ( github.event_name == 'workflow_dispatch' || - needs.changes.outputs.docker == 'true' || - needs.changes.outputs.helm == 'true' || - needs.changes.outputs.k8s-smoke == 'true' + needs.changes.outputs.cpu-smoke == 'true' ) runs-on: ubuntu-latest timeout-minutes: 90