Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions .github/actions/changes/action.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,21 @@ outputs:
docker:
description: "'true' if any Docker build files changed"
value: ${{ steps.filter.outputs.docker }}
docker-scripts:
description: "'true' if scripts directly invoked by Docker builds changed"
value: ${{ steps.filter.outputs.docker-scripts }}
helm:
description: "'true' if any Helm chart files changed"
value: ${{ steps.filter.outputs.helm }}
python-runtime:
description: "'true' if Python runtime packages, services, plugins, or SDK files changed"
value: ${{ steps.filter.outputs.python-runtime }}
k8s-smoke:
description: "'true' if Kubernetes smoke test support files changed"
value: ${{ steps.filter.outputs.k8s-smoke }}
# Aggregate output: the expression ORs the individual filter outputs
# (deps, docker, docker-scripts, helm, openapi, python-runtime, web-studio,
# k8s-smoke), so it is true as soon as any one of them equals 'true'.
cpu-smoke:
description: "'true' if CPU smoke image or Kubernetes smoke test inputs changed"
value: ${{ steps.filter.outputs.deps == 'true' || steps.filter.outputs.docker == 'true' || steps.filter.outputs.docker-scripts == 'true' || steps.filter.outputs.helm == 'true' || steps.filter.outputs.openapi == 'true' || steps.filter.outputs.python-runtime == 'true' || steps.filter.outputs.web-studio == 'true' || steps.filter.outputs.k8s-smoke == 'true' }}

runs:
using: "composite"
Expand Down Expand Up @@ -66,9 +78,22 @@ runs:
- 'docker-bake.hcl'
- 'docker/**'
- 'Makefile'
docker-scripts:
- 'script/install_duckdb_extensions.sh'
- 'script/build_policy_wasm.sh'
helm:
- 'k8s/**'
- 'tools/lint/lint-helm.sh'
- '.github/workflows/ci.yaml'
- '.github/actions/changes/action.yaml'
- '.pre-commit-config.yaml'
python-runtime:
- 'packages/**'
- 'services/**'
- 'plugins/**'
- 'sdk/python/nemo-platform/**'
k8s-smoke:
- 'e2e/k8s/scripts/**'
- 'e2e/k8s/values/**'
- 'e2e/test_jobs.py'
- '.github/actions/free-disk-space/action.yaml'
327 changes: 327 additions & 0 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,17 @@ on:
merge_group:
types: [checks_requested]
workflow_dispatch:
  # Manual-run inputs: an optional image tag override and a publish switch
  # that is on by default.
  inputs:
    image_tag:
      description: 'Optional CPU smoke image tag. Defaults to the commit SHA.'
      type: string
      required: false
      default: ''
    publish:
      description: 'Publish CPU smoke images to GHCR.'
      type: boolean
      required: false
      default: true

permissions:
actions: read
Expand All @@ -32,6 +43,7 @@ jobs:
tools: ${{ steps.changes.outputs.tools }}
docker: ${{ steps.changes.outputs.docker }}
helm: ${{ steps.changes.outputs.helm }}
cpu-smoke: ${{ steps.changes.outputs.cpu-smoke }}
steps:
- uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
- uses: ./.github/actions/changes
Expand Down Expand Up @@ -83,6 +95,319 @@ jobs:
make docker-print TARGET=nmp-automodel
make docker-print TARGET=nmp-unsloth

build-cpu-smoke-images:
  name: Build CPU smoke images
  needs:
    - changes
  # Run on every manual dispatch, or when the changes job reports that
  # cpu-smoke inputs changed; never once the run has been cancelled.
  if: ${{ !cancelled() && (github.event_name == 'workflow_dispatch' || needs.changes.outputs.cpu-smoke == 'true') }}
  runs-on: ubuntu-latest
  timeout-minutes: 90
  permissions:
    contents: read
    # Write access is requested because the job may push images to GHCR.
    packages: write
  # Values computed by the bake-vars step, re-exported for downstream jobs.
  outputs:
    image_registry: ${{ steps.bake-vars.outputs.image_registry }}
    image_tag: ${{ steps.bake-vars.outputs.image_tag }}
    publish_images: ${{ steps.bake-vars.outputs.publish_images }}
  steps:
# Check out the repository at the pinned actions/checkout commit.
- name: Checkout code
  uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10  # v6.0.3
  with:
    # Do not keep the checkout token in the local git configuration.
    persist-credentials: false

# Invoke the repository-local free-disk-space action with every listed
# cleanup switch set to the string 'true'.
- name: Free disk space
  uses: ./.github/actions/free-disk-space
  with:
    disable_swap: 'true'
    remove_haskell: 'true'
    remove_java: 'true'
    remove_ruby: 'true'
    remove_swift: 'true'
    prune_docker: 'true'

# Create a docker-container Buildx builder named nmp-builder, select it,
# and bootstrap it before any build runs.
- name: Set up Docker Buildx
  shell: bash
  run: |
    set -euo pipefail
    docker buildx create \
      --name nmp-builder \
      --driver docker-container \
      --use
    docker buildx inspect --bootstrap

# Compute the registry, image tag, and publish decision for this run, then
# expose them to later steps (GITHUB_ENV) and to dependent jobs
# (GITHUB_OUTPUT).
- name: Configure bake variables
  id: bake-vars
  shell: bash
  env:
    HEAD_REPOSITORY: ${{ github.event.pull_request.head.repo.full_name || github.repository }}
    # Only manual dispatches may override the tag; other events get ''.
    INPUT_IMAGE_TAG: ${{ github.event_name == 'workflow_dispatch' && inputs.image_tag || '' }}
    INPUT_PUBLISH: ${{ inputs.publish }}
    SOURCE_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
  run: |
    set -euo pipefail

    # Registry path is the lower-cased repository name under ghcr.io.
    image_registry="ghcr.io/${GITHUB_REPOSITORY,,}"
    source_sha="${SOURCE_SHA:-$GITHUB_SHA}"
    bake_tag="${INPUT_IMAGE_TAG:-$source_sha}"

    # The tag can come from a manual input and is written verbatim to
    # GITHUB_ENV / GITHUB_OUTPUT below, so reject anything that is not a
    # valid Docker tag (this also rules out embedded newlines).
    tag_pattern='^[A-Za-z0-9_][A-Za-z0-9_.-]{0,127}$'
    if [[ ! "$bake_tag" =~ $tag_pattern ]]; then
      printf '::error::Invalid image tag %q: expected 1-128 characters from [A-Za-z0-9_.-], not starting with "." or "-".\n' "$bake_tag"
      exit 1
    fi

    # Publish for pushes to main, same-repository pull requests, merge
    # queue runs, and manual dispatches that ask for it.
    publish_images="false"
    case "$GITHUB_EVENT_NAME" in
      push)
        if [ "$GITHUB_REF" = "refs/heads/main" ]; then
          publish_images="true"
        fi
        ;;
      pull_request)
        if [ "$HEAD_REPOSITORY" = "$GITHUB_REPOSITORY" ]; then
          publish_images="true"
        fi
        ;;
      merge_group)
        publish_images="true"
        ;;
      workflow_dispatch)
        if [ "$INPUT_PUBLISH" = "true" ]; then
          publish_images="true"
        fi
        ;;
    esac

    {
      printf 'IMAGE_REGISTRY=%s\n' "$image_registry"
      printf 'BASE_REGISTRY=%s\n' "$image_registry"
      printf 'CACHE_REGISTRY=%s\n' "$image_registry"
      printf 'BAKE_TAG=%s\n' "$bake_tag"
      printf 'CI_COMMIT_SHA=%s\n' "$source_sha"
      printf 'PUBLISH_IMAGES=%s\n' "$publish_images"
    } >> "$GITHUB_ENV"
    {
      printf 'image_registry=%s\n' "$image_registry"
      printf 'image_tag=%s\n' "$bake_tag"
      printf 'publish_images=%s\n' "$publish_images"
    } >> "$GITHUB_OUTPUT"

# Authenticate to ghcr.io, but only when this run is going to publish.
- name: Log in to GHCR
  if: ${{ env.PUBLISH_IMAGES == 'true' }}
  shell: bash
  env:
    GHCR_TOKEN: ${{ github.token }}
  run: |
    set -euo pipefail
    # The token is fed on stdin so it never appears on the command line.
    printf '%s\n' "$GHCR_TOKEN" \
      | docker login ghcr.io --username "$GITHUB_ACTOR" --password-stdin

# Run the docker-print make target for the docker-cpu group.
- name: Print Docker bake graph
  shell: bash
  run: |
    make docker-print TARGET=docker-cpu

# Not publishing: build through the docker-load make target.
- name: Build CPU images
  if: ${{ env.PUBLISH_IMAGES != 'true' }}
  shell: bash
  run: |
    make docker-load TARGET=docker-cpu

# Publishing: build through the docker-push make target.
- name: Build and publish CPU images
  if: ${{ env.PUBLISH_IMAGES == 'true' }}
  shell: bash
  run: |
    make docker-push TARGET=docker-cpu

kind-cpu-smoke:
  name: Kind CPU smoke test
  needs: [changes, build-cpu-smoke-images]
  # Only run when the image build succeeded AND actually published images;
  # this job consumes the registry and tag that the build job exported.
  if: >
    !cancelled() &&
    needs.build-cpu-smoke-images.result == 'success' &&
    needs.build-cpu-smoke-images.outputs.publish_images == 'true'
  runs-on: ubuntu-latest
  timeout-minutes: 45
  permissions:
    contents: read
    packages: read
  env:
    BUSYBOX_IMAGE: docker.io/library/busybox
    HELM_CHART: k8s/helm
    K8S_E2E_SCRIPTS: e2e/k8s/scripts
    K8S_E2E_VALUES: e2e/k8s/values
    # Unique per run and attempt so reruns do not collide on cluster names.
    KIND_CLUSTER_NAME: gha-${{ github.run_id }}-${{ github.run_attempt }}-kind-smoke
    KUBE_GATEWAY_NAME: nmp-e2e-gateway
    KUBE_NAMESPACE: nemo-platform
    NAMESPACE: nemo-platform
    # Declared empty here; later steps assert it is non-empty.
    # NOTE(review): presumably the kind setup script exports the real URL
    # through GITHUB_ENV -- confirm against setup_local_kind_cpu.sh.
    NMP_E2E_CLUSTER_URL: ""
    NMP_E2E_INTERNAL_HOST: nemo-platform-api:8080
    NMP_E2E_REGISTRY: ${{ needs.build-cpu-smoke-images.outputs.image_registry }}
    NMP_E2E_TAG: ${{ needs.build-cpu-smoke-images.outputs.image_tag }}
    POSTGRES_IMAGE: docker.io/library/postgres
  steps:
# Check out the repository at the pinned actions/checkout commit.
- name: Checkout code
  uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10  # v6.0.3
  with:
    persist-credentials: false

# Invoke the repository-local free-disk-space action with every listed
# cleanup switch set to the string 'true'.
- name: Free disk space
  uses: ./.github/actions/free-disk-space
  with:
    disable_swap: 'true'
    remove_haskell: 'true'
    remove_java: 'true'
    remove_ruby: 'true'
    remove_swift: 'true'
    prune_docker: 'true'

# Download the pinned kind release binary for the runner's CPU architecture,
# verify it against the release's .sha256sum file, and install it into
# /usr/local/bin.
- name: Install kind
shell: bash
env:
# Release tag used to build the download URL.
KIND_VERSION: v0.32.0
run: |
set -euo pipefail

# Map `uname -m` output to the architecture suffix used in asset names;
# any other architecture aborts the step.
case "$(uname -m)" in
x86_64) arch=amd64 ;;
aarch64|arm64) arch=arm64 ;;
*)
echo "Unsupported architecture: $(uname -m)" >&2
exit 1
;;
esac

kind_url="https://github.com/kubernetes-sigs/kind/releases/download/${KIND_VERSION}/kind-linux-${arch}"
curl -fsSLo "${RUNNER_TEMP}/kind" "${kind_url}"
curl -fsSLo "${RUNNER_TEMP}/kind.sha256sum" "${kind_url}.sha256sum"
# Rewrite the asset name inside the checksum file to the downloaded path so
# `sha256sum -c` checks the copy in RUNNER_TEMP.
sed "s# kind-linux-${arch}# ${RUNNER_TEMP}/kind#" "${RUNNER_TEMP}/kind.sha256sum" | sha256sum -c -
sudo install -m 0755 "${RUNNER_TEMP}/kind" /usr/local/bin/kind

# Download the pinned kubectl binary for the runner's CPU architecture,
# verify it against the published .sha256 digest, and install it into
# /usr/local/bin.
- name: Install kubectl
shell: bash
env:
# Release version used to build the download URL.
KUBECTL_VERSION: v1.33.7
run: |
set -euo pipefail

# Map `uname -m` output to the architecture path segment used by
# dl.k8s.io; any other architecture aborts the step.
case "$(uname -m)" in
x86_64) arch=amd64 ;;
aarch64|arm64) arch=arm64 ;;
*)
echo "Unsupported architecture: $(uname -m)" >&2
exit 1
;;
esac

kubectl_url="https://dl.k8s.io/release/${KUBECTL_VERSION}/bin/linux/${arch}/kubectl"
curl -fsSLo "${RUNNER_TEMP}/kubectl" "${kubectl_url}"
curl -fsSLo "${RUNNER_TEMP}/kubectl.sha256" "${kubectl_url}.sha256"
# Build a "<digest> <path>" line from the downloaded digest and feed it to
# `sha256sum -c` on stdin.
echo "$(cat "${RUNNER_TEMP}/kubectl.sha256") ${RUNNER_TEMP}/kubectl" | sha256sum -c -
sudo install -m 0755 "${RUNNER_TEMP}/kubectl" /usr/local/bin/kubectl

# Helm CLI via the pinned azure/setup-helm commit.
- name: Install Helm
  uses: azure/setup-helm@dda3372f752e03dde6b3237bc9431cdc2f7a02a2  # v5.0.0

# uv via the pinned astral-sh/setup-uv commit; the uv version is read from
# pyproject.toml and the cache is keyed on uv.lock.
- name: Install uv
  uses: astral-sh/setup-uv@37802adc94f370d6bfd71619e3f0bf239e1f3b78  # v7.6.0
  with:
    version-file: pyproject.toml
    python-version: '3.13'
    enable-cache: true
    cache-dependency-glob: uv.lock

# Run the repository's kind CPU setup script with the Hugging Face token
# secret and a placeholder NGC key in its environment.
- name: Start kind cluster
  shell: bash
  env:
    HF_TOKEN: ${{ secrets.HF_TOKEN }}
    NGC_API_KEY: not-used-for-ghcr-cpu-smoke
  run: |
    bash "${K8S_E2E_SCRIPTS}/setup_local_kind_cpu.sh"

# Make NAMESPACE the default for later kubectl calls in the current context.
- name: Set default kubectl namespace
  shell: bash
  run: |
    kubectl config set-context --current --namespace="${NAMESPACE}"

# Confirm the Gateway API CRDs are established and that the expected
# GatewayClass and Gateway objects exist.
- name: Verify Gateway API setup
  shell: bash
  run: |
    set -euo pipefail
    for crd_kind in gateways httproutes; do
      kubectl wait --for=condition=Established "crd/${crd_kind}.gateway.networking.k8s.io" --timeout=2m
    done
    kubectl get gatewayclass cloud-provider-kind
    kubectl -n "${NAMESPACE}" get gateway "${KUBE_GATEWAY_NAME}"

# Hand the three freshly built image references to the pre-pull script,
# with the workflow token and actor supplied as pull credentials.
- name: Pre-pull GHCR images into kind
  shell: bash
  env:
    KIND_IMAGE_PULL_TOKEN: ${{ github.token }}
    KIND_IMAGE_PULL_USER: ${{ github.actor }}
  run: |
    image_refs=()
    for component in nmp-api nmp-core nmp-cpu-tasks; do
      image_refs+=("${NMP_E2E_REGISTRY}/${component}:${NMP_E2E_TAG}")
    done
    "${K8S_E2E_SCRIPTS}/prepull_kind_images.sh" "${image_refs[@]}"

# Run the Helm e2e install script with the kind values file. On failure,
# dump Helm and Kubernetes state for debugging and fail the step.
- name: Install NeMo Platform
  shell: bash
  env:
    REQUIRE_NMP_E2E_IMAGES: 'true'
  run: |
    if HELM_VALUES="${K8S_E2E_VALUES}/kind.yaml" "${K8S_E2E_SCRIPTS}/install_helm_e2e.sh"; then
      exit 0
    fi

    # Install failed: every diagnostic below is best-effort.
    echo "--- helm list -A ---"
    helm list -A || true
    echo "--- helm status ${NAMESPACE}/nemo-platform ---"
    helm status -n "${NAMESPACE}" nemo-platform || true
    echo "--- kubectl get all -n ${NAMESPACE} ---"
    kubectl get all -n "${NAMESPACE}" || true
    exit 1

# Poll the platform's cluster-info endpoint through the wait_for_api.sh
# helper (second argument: 120).
- name: Wait for API
  shell: bash
  run: |
    # Fail with an explicit message instead of a silent `test -n` exit when
    # no earlier step exported the cluster URL.
    if [ -z "${NMP_E2E_CLUSTER_URL:-}" ]; then
      echo "::error::NMP_E2E_CLUSTER_URL is empty; an earlier step was expected to export the cluster URL."
      exit 1
    fi
    "${K8S_E2E_SCRIPTS}/wait_for_api.sh" "${NMP_E2E_CLUSTER_URL}/cluster-info" 120

# Run the single secret-environment-variable job test against the kind
# cluster and write a JUnit report for the artifact upload.
- name: Run CPU job e2e smoke test
  shell: bash
  env:
    _TYPER_FORCE_DISABLE_TERMINAL: "1"
    E2E_SERVICES_LOG_DIR: ${{ runner.temp }}/e2e-services-logs
    NGC_API_KEY: not-used-for-ghcr-cpu-smoke
  run: |
    # Fail with an explicit message instead of a silent `test -n` exit when
    # no earlier step exported the cluster URL.
    if [ -z "${NMP_E2E_CLUSTER_URL:-}" ]; then
      echo "::error::NMP_E2E_CLUSTER_URL is empty; an earlier step was expected to export the cluster URL."
      exit 1
    fi
    export NMP_BASE_URL="${NMP_E2E_CLUSTER_URL}"
    uv run --frozen pytest \
      e2e/test_jobs.py::test_job_using_secret_environment_variable \
      -v \
      --run-e2e \
      --no-cov \
      --junitxml=report-kubernetes-smoke.xml

# Always run the log collection script, whatever earlier steps did.
- name: Collect Kubernetes logs
  if: ${{ always() }}
  shell: bash
  run: |-
    "${K8S_E2E_SCRIPTS}/collect_k8s_logs.sh"

# Diagnostics only: report host, Docker, and kind-node disk usage. Every
# probe is best-effort so a failing probe cannot fail the job by itself.
- name: Disk usage summary
  if: always()
  shell: bash
  run: |
    echo "=== Host disk ==="
    df -h / || true
    echo "=== Docker system ==="
    docker system df || true
    echo "=== kind node storage ==="
    # An empty node list (for example, the cluster was never created) just
    # skips the loop.
    for node in $(kind get nodes --name "${KIND_CLUSTER_NAME}" 2>/dev/null); do
      echo "--- ${node} ---"
      docker exec "${node}" sh -c "du -sh /var/lib/containerd /var/lib/kubelet /var/log 2>/dev/null | sort -h" || true
    done

# Always upload whatever logs and reports exist; missing paths are ignored
# and the artifact is kept for seven days.
- name: Upload Kubernetes artifacts
  if: ${{ always() }}
  uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a  # v7.0.1
  with:
    name: kind-smoke-kubernetes-artifacts
    path: |
      k8s-logs/
      report-kubernetes-smoke.xml
      ${{ runner.temp }}/e2e-services-logs/
    if-no-files-found: ignore
    retention-days: 7

# Always tear down the cloud-provider-kind container and the kind cluster;
# both removals tolerate failure.
- name: Delete kind cluster
  if: ${{ always() }}
  shell: bash
  run: |
    docker rm -f "cloud-provider-kind-${KIND_CLUSTER_NAME}" || true
    kind delete cluster --name "${KIND_CLUSTER_NAME}" || true

helm-lint:
name: Helm lint
needs: [changes]
Expand Down Expand Up @@ -911,6 +1236,8 @@ jobs:
- changes
- actionlint
- docker-bake-graph
- build-cpu-smoke-images
- kind-cpu-smoke
- helm-lint
- helm-chart-verifier
- lint
Expand Down
Loading
Loading