diff --git a/.gitignore b/.gitignore index 7950ab8d6..d7bb3220f 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,5 @@ # Emacs *~ + +gpu/install_gpu_driver.sh.d \ No newline at end of file diff --git a/gpu/README.md b/gpu/README.md index c4b2935eb..de050fc33 100644 --- a/gpu/README.md +++ b/gpu/README.md @@ -28,8 +28,8 @@ CUDA | Full Version | Driver | cuDNN | NCCL | Tested Dataproc Image Ver -----| ------------ | --------- | --------- | -------| --------------------------- 11.8 | 11.8.0 | 525.147.05| 9.5.1.17 | 2.21.5 | 2.0, 2.1 (Debian/Ubuntu/Rocky); 2.2 (Ubuntu 22.04) 12.0 | 12.0.1 | 525.147.05| 8.8.1.3 | 2.16.5 | 2.0, 2.1 (Debian/Ubuntu/Rocky); 2.2 (Rocky 9, Ubuntu 22.04) -12.4 | 12.4.1 | 550.135 | 9.1.0.70 | 2.23.4 | 2.1 (Ubuntu 20.04, Rocky 8); Dataproc 2.2+ -12.6 | 12.6.3 | 550.142 | 9.6.0.74 | 2.23.4 | 2.1 (Ubuntu 20.04, Rocky 8); Dataproc 2.2+ +12.4 | 12.4.1 | 590.48.01| 9.1.0.70 | 2.23.4 | 2.1 (Ubuntu 20.04, Rocky 8); Dataproc 2.2+ +12.6 | 12.6.3 | 590.48.01| 9.6.0.74 | 2.23.4 | 2.1 (Ubuntu 20.04, Rocky 8); Dataproc 2.2+ **Supported Operating Systems:** @@ -189,6 +189,7 @@ This script accepts the following metadata parameters: Determines preference for OS-provided vs. NVIDIA-direct drivers. The script often prioritizes `.run` files or source builds for reliability. * `cudnn-version`: (Optional) Specify cuDNN version (e.g., `8.9.7.29`). + * `cudnn-install-source`: (Optional) `tarball`|`package`. Default: `package` (except for `2.0-rocky8` and `2.1-rocky8` where it defaults to `tarball` to bypass CDN flakes). Determines whether cuDNN is installed via the OS package manager or extracted from the standalone NVIDIA tarball cached in GCS. * `nccl-version`: (Optional) Specify NCCL version. * `include-pytorch`: (Optional) `yes`|`no`. Default: `no`. If `yes`, installs PyTorch, TensorFlow, RAPIDS, and PySpark in a Conda @@ -289,6 +290,80 @@ handles metric creation and reporting. older versions of the `report_gpu_metrics.py` service. 
The current script and agent versions aim to mitigate this. If encountered, check agent logs. +## Development and Testing + +For instructions on how to manually test changes to this initialization action, including iterative development on a live cluster, please see the [TESTING.md](./TESTING.md) guide. + +If you are modifying this initialization action, you can use the provided test infrastructure to validate your changes locally before deploying them to production. + +### Local Integration Testing (Bazel / Podman) + +Before pushing any changes to GitHub, you **must** run the integration tests locally to validate your modifications against the full test matrix (`test_gpu.py`). These tests use `absl.testing.parameterized` and the `integration_tests.dataproc_test_case` framework to spin up ephemeral Dataproc clusters and validate GPU functionality (SINGLE, STANDARD, KERBEROS, MIG, etc.). + +We provide a Podman wrapper to execute the Bazel test suite locally, perfectly simulating the remote CI sandbox environment. + +1. **Credentials:** Ensure you have your Google Cloud Application Default Credentials (ADC) saved locally, typically at `~/.config/gcloud/application_default_credentials.json`, and copy it to `initialization-actions/key.json`. +2. **Environment:** You must have a configured `env.json` in the `gpu/` directory. + +To run the full suite in the Podman container (Unfiltered): + +> ⚠️ **WARNING: HIGH RESOURCE CONSUMPTION** +> An unfiltered run executes the entire test matrix (currently ~12 shards). Because the script is configured to run up to 10 jobs in parallel, this will concurrently provision up to 10 separate Dataproc clusters. This requires massive GCP quota (e.g., ~900 vCPUs and ~30 GPUs simultaneously if using `n1-standard-32` profiles) and will take 60-90 minutes. 
+ +```bash +cd initialization-actions +# Test a specific Dataproc image version against the full suite +./gpu/run-bazel-tests-with-podman.sh "2.2-ubuntu22" +``` + +To run a specific test filter to iterate quickly on a failure (Recommended): + +```bash +cd initialization-actions + +# Filter by a specific test function +./gpu/run-bazel-tests-with-podman.sh "2.2-ubuntu22" "--test_filter=test_gpu_allocation" + +# Filter by another specific test function +./gpu/run-bazel-tests-with-podman.sh "2.2-ubuntu22" "--test_filter=test_install_gpu_cuda_nvidia_with_spark_job" + +# Filter by the entire class +./gpu/run-bazel-tests-with-podman.sh "2.2-ubuntu22" "--test_filter=NvidiaGpuDriverTestCase" +``` + +### Manual Verification Scripts + +If you have already provisioned a Dataproc cluster (e.g., `my-cluster`) and want to verify its GPU configuration without running the full Bazel test suite, you can use the standalone verification scripts. + +```bash +# Verify using the local Python script +python3 gpu/verify_external_cluster.py \ + --cluster=my-cluster \ + --region=us-east4 \ + --zone=us-east4-b \ + --project=my-project \ + --tests smi agent spark torch tf numa + +# Or using the bash equivalent +export CLUSTER_NAME=my-cluster PROJECT_ID=my-project REGION=us-east4 ZONE=us-east4-b +./gpu/verify_external_gpu_cluster.sh +``` + +### Advanced Spark / ML Validation + +For comprehensive validation of Spark RAPIDS, PyTorch, and TensorFlow on a running cluster, an external testing script is available in the associated `cloud-dataproc/gcloud` repository. 
+ +```bash +# Configure the gcloud test environment +cd ../cloud-dataproc/gcloud +source lib/env.sh # Populates environment variables from env.json + +# Execute the comprehensive Spark GPU test suite against the configured cluster +./t/spark-gpu-test.sh +``` + +This script will remotely execute SSH commands to validate NUMA configurations, run PyTorch/TensorFlow isolated in their Conda environments, verify NVCC/cuDNN, and submit `SparkPi` and `JavaIndexToStringExample` Spark jobs configured to use the RAPIDS accelerator plugin. + ## Important notes * This initialization script will install NVIDIA GPU drivers in all nodes in diff --git a/gpu/TESTING.md b/gpu/TESTING.md new file mode 100644 index 000000000..67c604123 --- /dev/null +++ b/gpu/TESTING.md @@ -0,0 +1,172 @@ +# Testing the GPU Initialization Script + +This document details the recommended iterative development and testing process for the `install_gpu_driver.sh` script, bypassing the slow integration runner when developing and ensuring comprehensive testing when complete. + +## Fast Iterative Development (SSH/Manual) + +This initialization action is designed to be **idempotent**, meaning it can be run multiple times on the same node without breaking the environment. It achieves this by writing "completion sentinels" to `/opt/install-dpgce/complete/` after successfully finishing each phase (e.g., `build-dependencies`, `nccl`, `cuda`). + +To facilitate rapid iteration, we use the tooling provided in the companion `cloud-dataproc/gcloud` repository. This repo contains the test infrastructure, environment configuration (`env.json`), and lifecycle management scripts (`recreate-dpgce`, `ssh-m`, `scp-m`) necessary to provision and interact with test clusters efficiently. + +When making structural or execution logic changes, you want to avoid destroying and recreating the entire Dataproc cluster during each test cycle. Instead, follow this incremental workflow: + +### 1. 
Provision a "Bare" GPU Cluster +First, configure your target OS and versions in `cloud-dataproc/gcloud/env.json`. Then, use the `--no-init-action` flag on the recreation script to provision a cluster with GPUs attached, but *without* running any initialization actions during boot. + +```bash +cd cloud-dataproc/gcloud +./bin/recreate-dpgce --gpu --no-init-action +``` + +### 2. Compile and Stage the Script +The `install_gpu_driver.sh` script is built from fragments. First, compile the fragments, then use the optimized `scp-m` command to transfer your local changes to the -m node. This script stages the file in the GCS temp bucket and pulls it down to `/tmp/install_gpu_driver.sh` over SSH. + +```bash +cd initialization-actions +cat gpu/install_gpu_driver.sh.d/*.sh > gpu/install_gpu_driver.sh +cd ../cloud-dataproc/gcloud +./bin/scp-m ../../initialization-actions/gpu/install_gpu_driver.sh +``` + +### 3. Execute and Monitor (Incremental Testing) +Execute the script manually over SSH as root. Piping the output through `tee` captures the logs identically to how Dataproc normally records initialization scripts. + +**Crucially, when re-running the script to test a specific fix, you must purge the relevant completion sentinels** (and partial build directories like `nccl`) so the script doesn't skip the phase you are trying to test. + +* To run the *entire* script from scratch: `sudo rm -rf /opt/install-dpgce/complete` +* To re-test only the NCCL build: `sudo rm -f /opt/install-dpgce/complete/nccl && sudo rm -rf /opt/install-dpgce/nccl` + +```bash +cd cloud-dataproc/gcloud +./bin/ssh-m 'sudo rm -rf /opt/install-dpgce/complete' # Example: clear everything +cd ../../initialization-actions +./gpu/install-in-screen.sh +``` + +If your SSH connection drops, simply run `./gpu/install-in-screen.sh` again to instantly re-attach to the running session without losing context or interrupting the installation. + +### 4. 
Verify with the Test Suite +Once the installation script completes without errors, run the external testing suite to ensure all Conda environments (PyTorch, TensorFlow, RAPIDS) and Spark services correctly bind to the GPU. + +```bash +cd cloud-dataproc/gcloud +bash t/spark-gpu-test.sh +``` + +## Fast Iterative Development (SSH/Manual) + +This initialization action is designed to be **idempotent**, meaning it can be run multiple times on the same node without breaking the environment. It achieves this by writing "completion sentinels" to `/opt/install-dpgce/complete/` after successfully finishing each phase (e.g., `build-dependencies`, `nccl`, `cuda`). + +To facilitate rapid iteration, we use the tooling provided in the companion `cloud-dataproc/gcloud` repository. This repo contains the test infrastructure, environment configuration (`env.json`), and lifecycle management scripts (`recreate-dpgce`, `ssh-m`, `scp-m`) necessary to provision and interact with test clusters efficiently. + +When making structural or execution logic changes, you want to avoid destroying and recreating the entire Dataproc cluster during each test cycle. Instead, follow this incremental workflow: + +### 1. Provision a "Bare" GPU Cluster +First, configure your target OS and versions in `cloud-dataproc/gcloud/env.json`. Then, use the `--no-init-action` flag on the recreation script to provision a cluster with GPUs attached, but *without* running any initialization actions during boot. + +```bash +cd ../cloud-dataproc/gcloud +# Edit env.json to set IMAGE_VERSION, REGION, ZONE, ACCELERATOR_TYPE, etc. +./bin/recreate-dpgce --gpu --no-init-action +``` +*Note: `recreate-dpgce` will delete and recreate the cluster if it already exists.* + +### 2. Compile, Stage, and Execute in Screen +The `install-in-screen.sh` script automates compiling the fragments, staging the script to the -m node, and running it within a detached `screen` session. 
+ +```bash +cd ../initialization-actions/gpu +./install-in-screen.sh +``` + +This command will: +* Concatenate scripts from `install_gpu_driver.sh.d/` into `install_gpu_driver.sh`. +* Use `../cloud-dataproc/gcloud/bin/scp-m` to upload the script to `/tmp/install_gpu_driver.sh` on the -m node. +* SSH to the -m node and start the script in a `screen` session named `gpu_install`. If the session already exists, it reattaches. + +**Monitoring:** +* Logs are streamed to `/tmp/install_gpu_driver.log` on the -m node. You can tail this file via a separate SSH session: + ```bash + cd ../cloud-dataproc/gcloud + ./bin/ssh-m "tail -f /tmp/install_gpu_driver.log" + ``` +* Re-run `./install-in-screen.sh` to reattach to the screen session. + +### 3. Incremental Testing & Clearing Sentinels +To re-run specific parts of the script after making fixes, you MUST clear the completion sentinels for those parts on the -m node. + +* To run the *entire* script from scratch: + ```bash + cd ../cloud-dataproc/gcloud + ./bin/ssh-m 'sudo rm -rf /opt/install-dpgce/complete' + ``` +* To re-test only the NCCL build: + ```bash + cd ../cloud-dataproc/gcloud + ./bin/ssh-m 'sudo rm -f /opt/install-dpgce/complete/nccl && sudo rm -rf /opt/install-dpgce/nccl' + ``` +Then, run `./initialization-actions/gpu/install-in-screen.sh` again. + +### 4. Verify with the Test Suite +Once the installation script completes without errors in the screen session, run the external testing suite from the `cloud-dataproc/gcloud` repository to ensure all Conda environments (PyTorch, TensorFlow, RAPIDS) and Spark services correctly bind to the GPU. + +```bash +cd ../cloud-dataproc/gcloud +bash t/spark-gpu-test.sh +``` + +## Continuous Integration Testing (Bazel/Podman) + +Once the manual tests pass, you **must** verify the script behaves correctly within the isolated Python `absl` test harness (`test_gpu.py`) before pushing your changes to GitHub. 
This validates the full matrix of installation scenarios (SINGLE, STANDARD, KERBEROS, MIG, etc.). + +We use a Podman wrapper to execute the Bazel test suite locally, perfectly simulating the remote CI environment. + +1. **Credentials:** Ensure your Google Cloud Application Default Credentials (ADC) are saved locally (typically `~/.config/gcloud/application_default_credentials.json`). Copy them to the root of the repository: + ```bash + cp ~/.config/gcloud/application_default_credentials.json ./key.json + ``` + +2. **Execute Full Suite (Unfiltered):** To execute the entire parameterized test matrix, run the wrapper script without a test filter. + + > ⚠️ **WARNING: HIGH RESOURCE CONSUMPTION** + > An unfiltered run executes all ~12 active parameterized shards. Because the script runs with `--jobs=10`, this will concurrently provision up to 10 separate Dataproc clusters. This requires massive GCP quota (roughly 900 vCPUs and 30 GPUs simultaneously if using `n1-standard-32` profiles) and will take approximately 60 to 90 minutes to complete. Do not run this unless you are finalizing a major PR. + + ```bash + cd initialization-actions + ./gpu/run-bazel-tests-with-podman.sh "2.2-ubuntu22" + ``` + +3. **Execute Specific Tests (Recommended for Iteration):** When iterating on a specific feature or failure, always pass Bazel arguments to filter the test execution. This saves significant time and quota. You can filter by test function name or class. 
+ + *Filter by a specific test function:* + ```bash + cd initialization-actions + ./gpu/run-bazel-tests-with-podman.sh "2.2-ubuntu22" "--test_filter=test_gpu_allocation" + ``` + + *Filter by a specific test function that executes spark jobs:* + ```bash + cd initialization-actions + ./gpu/run-bazel-tests-with-podman.sh "2.2-ubuntu22" "--test_filter=test_install_gpu_cuda_nvidia_with_spark_job" + ``` + + *Filter by test class (runs all tests in the class):* + ```bash + cd initialization-actions + ./gpu/run-bazel-tests-with-podman.sh "2.2-ubuntu22" "--test_filter=NvidiaGpuDriverTestCase" + ``` + +## Compiling the AST Splitter Tool (`split.go`) + +If you need to re-split `install_gpu_driver.sh` into its `.d/` fragments (e.g. if the main script was modified instead of the fragments), we use a Go-based AST parsing tool (`split.go`) to accurately chunk the bash script. + +To compile the tool locally: + +```bash +cd initialization-actions/gpu +go mod init split +go get mvdan.cc/sh/v3/syntax +go build -o split_ast split.go +``` + +Once compiled, executing `./split_ast install_gpu_driver.sh` will parse the script and populate the `install_gpu_driver.sh.d/` directory with the chunked components. 
diff --git a/gpu/install_gpu_driver.sh b/gpu/install_gpu_driver.sh old mode 100644 new mode 100755 index 9a1ee94cd..62dac309c --- a/gpu/install_gpu_driver.sh +++ b/gpu/install_gpu_driver.sh @@ -38,7 +38,7 @@ if [[ "$(os_id)" == "rocky" ]]; else _os_version="$(os_version)" fi for os_id_val in 'rocky' 'ubuntu' 'debian' ; do - eval "function is_${os_id_val}() { [[ \"$(os_id)\" == '${os_id_val}' ]] ; }" + eval "function is_${os_id_val}() { [[ \"$(os_id)\" == \"${os_id_val}\" ]] ; }" for osver in $(echo "${supported_os["${os_id_val}"]}") ; do eval "function is_${os_id_val}${osver%%.*}() { is_${os_id_val} && [[ \"${_os_version}\" == \"${osver}\" ]] ; }" @@ -62,9 +62,9 @@ function repair_old_backports { # https://github.com/GoogleCloudDataproc/initialization-actions/issues/1157 debdists="https://deb.debian.org/debian/dists" - oldoldstable=$(curl ${curl_retry_args} "${debdists}/oldoldstable/Release" | awk '/^Codename/ {print $2}'); - oldstable=$( curl ${curl_retry_args} "${debdists}/oldstable/Release" | awk '/^Codename/ {print $2}'); - stable=$( curl ${curl_retry_args} "${debdists}/stable/Release" | awk '/^Codename/ {print $2}'); + oldoldstable=$(curl "${curl_retry_args[@]}" "${debdists}/oldoldstable/Release" | awk '/^Codename/ {print $2}'); + oldstable=$( curl "${curl_retry_args[@]}" "${debdists}/oldstable/Release" | awk '/^Codename/ {print $2}'); + stable=$( curl "${curl_retry_args[@]}" "${debdists}/stable/Release" 2>/dev/null | awk '/^Codename/ {print $2}'); matched_files=( $(test -d /etc/apt && grep -rsil '\-backports' /etc/apt/sources.list*||:) ) @@ -78,22 +78,22 @@ function repair_old_backports { function print_metadata_value() { local readonly tmpfile=$(mktemp) http_code=$(curl -f "${1}" -H "Metadata-Flavor: Google" -w "%{http_code}" \ - -s -o ${tmpfile} 2>/dev/null) + -s -o "${tmpfile}" 2>/dev/null) local readonly return_code=$? # If the command completed successfully, print the metadata value to stdout. 
- if [[ ${return_code} == 0 && ${http_code} == 200 ]]; then - cat ${tmpfile} + if [[ "${return_code}" == 0 && "${http_code}" == 200 ]]; then + cat "${tmpfile}" fi - rm -f ${tmpfile} - return ${return_code} + rm -f "${tmpfile}" + return "${return_code}" } function print_metadata_value_if_exists() { local return_code=1 - local readonly url=$1 - print_metadata_value ${url} + local readonly url="$1" + print_metadata_value "${url}" return_code=$? - return ${return_code} + return "${return_code}" } # replicates /usr/share/google/get_metadata_value @@ -101,14 +101,14 @@ function get_metadata_value() { local readonly varname=$1 local -r MDS_PREFIX=http://metadata.google.internal/computeMetadata/v1 # Print the instance metadata value. - print_metadata_value_if_exists ${MDS_PREFIX}/instance/${varname} + print_metadata_value_if_exists "${MDS_PREFIX}/instance/${varname}" return_code=$? # If the instance doesn't have the value, try the project. - if [[ ${return_code} != 0 ]]; then - print_metadata_value_if_exists ${MDS_PREFIX}/project/${varname} + if [[ "${return_code}" != 0 ]]; then + print_metadata_value_if_exists "${MDS_PREFIX}/project/${varname}" return_code=$? 
fi - return ${return_code} + return "${return_code}" } function get_metadata_attribute() { @@ -140,7 +140,9 @@ readonly -A DRIVER_FOR_CUDA=( ["11.4"]="470.256.02" ["11.5"]="495.46" ["11.6"]="510.108.03" ["11.7"]="515.65.01" ["11.8"]="525.147.05" ["12.0"]="525.147.05" ["12.1"]="530.30.02" ["12.2"]="535.216.01" ["12.3"]="545.29.06" - ["12.4"]="550.135" ["12.5"]="550.142" ["12.6"]="550.142" + ["12.4"]="590.48.01" ["12.5"]="590.48.01" ["12.6"]="590.48.01" + ["12.8"]="590.48.01" ["12.9"]="575.64.05" + ["13.0"]="580.126.20" ["13.1"]="590.48.01" ) readonly -A DRIVER_SUBVER=( ["410"]="410.104" ["415"]="415.27" ["418"]="418.113" @@ -150,7 +152,8 @@ readonly -A DRIVER_SUBVER=( ["510"]="510.108.03" ["515"]="515.48.07" ["520"]="525.147.05" ["525"]="525.147.05" ["535"]="535.216.01" ["545"]="545.29.06" ["550"]="550.142" ["555"]="555.58.02" ["560"]="560.35.03" - ["565"]="565.77" + ["565"]="565.77" ["570"]="570.211.01" ["575"]="575.64.05" + ["580"]="580.126.20" ["590"]="590.48.01" ) # https://developer.nvidia.com/cudnn-downloads readonly -A CUDNN_FOR_CUDA=( @@ -160,7 +163,8 @@ readonly -A CUDNN_FOR_CUDA=( ["11.6"]="8.4.0.27" ["11.7"]="8.9.7.29" ["11.8"]="9.5.1.17" ["12.0"]="8.8.1.3" ["12.1"]="8.9.3.28" ["12.2"]="8.9.5" ["12.3"]="9.0.0.306" ["12.4"]="9.1.0.70" ["12.5"]="9.2.1.18" - ["12.6"]="9.6.0.74" + ["12.6"]="9.6.0.74" ["12.8"]="9.8.0.87" ["12.9"]="9.10.2.21" + ["13.0"]="9.14.0.64" ["13.1"]="9.17.1.4" ) # https://developer.nvidia.com/nccl/nccl-download readonly -A NCCL_FOR_CUDA=( @@ -169,7 +173,8 @@ readonly -A NCCL_FOR_CUDA=( ["11.5"]="2.11.4" ["11.6"]="2.12.10" ["11.7"]="2.12.12" ["11.8"]="2.21.5" ["12.0"]="2.16.5" ["12.1"]="2.18.3" ["12.2"]="2.19.3" ["12.3"]="2.19.4" ["12.4"]="2.23.4" - ["12.5"]="2.22.3" ["12.6"]="2.23.4" + ["12.5"]="2.22.3" ["12.6"]="2.23.4" ["12.8"]="2.25.1" + ["12.9"]="2.27.3" ["13.0"]="2.27.7" ["13.1"]="2.29.2" ) readonly -A CUDA_SUBVER=( ["10.0"]="10.0.130" ["10.1"]="10.1.234" ["10.2"]="10.2.89" @@ -178,16 +183,16 @@ readonly -A CUDA_SUBVER=( 
["11.6"]="11.6.2" ["11.7"]="11.7.1" ["11.8"]="11.8.0" ["12.0"]="12.0.1" ["12.1"]="12.1.1" ["12.2"]="12.2.2" ["12.3"]="12.3.2" ["12.4"]="12.4.1" ["12.5"]="12.5.1" - ["12.6"]="12.6.3" + ["12.6"]="12.6.3" ["12.8"]="12.8.1" ["12.9"]="12.9.1" + ["13.0"]="13.0.2" ["13.1"]="13.1.1" ) - function set_cuda_version() { case "${DATAPROC_IMAGE_VERSION}" in - "1.5" ) DEFAULT_CUDA_VERSION="11.6.2" ;; - "2.0" ) DEFAULT_CUDA_VERSION="12.1.1" ;; # Cuda 12.1.1 - Driver v530.30.02 is the latest version supported by Ubuntu 18) - "2.1" ) DEFAULT_CUDA_VERSION="12.4.1" ;; - "2.2" ) DEFAULT_CUDA_VERSION="12.6.3" ;; - "2.3" ) DEFAULT_CUDA_VERSION="12.6.3" ;; + "1.5" ) local DEFAULT_CUDA_VERSION="11.6.2" ;; + "2.0" ) local DEFAULT_CUDA_VERSION="12.1.1" ;; # Cuda 12.1.1 - Driver v530.30.02 is the latest version supported by Ubuntu 18) + "2.1" ) local DEFAULT_CUDA_VERSION="12.4.1" ;; + "2.2" ) local DEFAULT_CUDA_VERSION="13.1.1" ;; + "2.3" ) local DEFAULT_CUDA_VERSION="13.1.1" ;; * ) echo "unrecognized Dataproc image version: ${DATAPROC_IMAGE_VERSION}" exit 1 @@ -205,7 +210,27 @@ function set_cuda_version() { fi readonly DEFAULT_CUDA_VERSION - CUDA_VERSION=$(get_metadata_attribute 'cuda-version' "${DEFAULT_CUDA_VERSION}") + local raw_cuda_version + raw_cuda_version=$(get_metadata_attribute 'cuda-version' '') # Get raw value, default to empty + + if [[ -n "${raw_cuda_version}" ]]; then + # Use metadata value only if it's not empty + CUDA_VERSION="${raw_cuda_version}" + echo "DEBUG: Using cuda-version from metadata: '${CUDA_VERSION}'" + else + # Fallback to DEFAULT_CUDA_VERSION if metadata is empty or not found + CUDA_VERSION="${DEFAULT_CUDA_VERSION}" + echo "DEBUG: cuda-version metadata not found or empty, using default: '${CUDA_VERSION}'" + fi + + # Validate the chosen CUDA_VERSION + if ! test -n "$(echo "${CUDA_VERSION}" | perl -ne 'print if /\d+\.\d+/')" ; then + echo "ERROR: Invalid CUDA_VERSION obtained: '${CUDA_VERSION}'. 
Attempting to use DEFAULT: '${DEFAULT_CUDA_VERSION}'" >&2 + CUDA_VERSION="${DEFAULT_CUDA_VERSION}" + fi + + echo "DEBUG: Effective CUDA_VERSION: '${CUDA_VERSION}'" + if test -n "$(echo "${CUDA_VERSION}" | perl -ne 'print if /\d+\.\d+\.\d+/')" ; then CUDA_FULL_VERSION="${CUDA_VERSION}" CUDA_VERSION="${CUDA_VERSION%.*}" @@ -245,10 +270,10 @@ function set_driver_version() { if [[ "${CUDA_URL_DRIVER_VERSION}" =~ ^[0-9]+.*[0-9]$ ]] ; then major_driver_version="${CUDA_URL_DRIVER_VERSION%%.*}" driver_max_maj_version=${DRIVER_SUBVER["${major_driver_version}"]} - if curl ${curl_retry_args} --head "${nv_xf86_x64_base}/${CUDA_URL_DRIVER_VERSION}/NVIDIA-Linux-x86_64-${CUDA_URL_DRIVER_VERSION}.run" | grep -E -q 'HTTP.*200' ; then + if curl "${curl_retry_args[@]}" --head "${nv_xf86_x64_base}/${CUDA_URL_DRIVER_VERSION}/NVIDIA-Linux-x86_64-${CUDA_URL_DRIVER_VERSION}.run" | grep -E -q 'HTTP.*200' ; then # use the version indicated by the cuda url as the default if it exists DEFAULT_DRIVER="${CUDA_URL_DRIVER_VERSION}" - elif curl ${curl_retry_args} --head "${nv_xf86_x64_base}/${driver_max_maj_version}/NVIDIA-Linux-x86_64-${driver_max_maj_version}.run" | grep -E -q 'HTTP.*200' ; then + elif curl "${curl_retry_args[@]}" --head "${nv_xf86_x64_base}/${driver_max_maj_version}/NVIDIA-Linux-x86_64-${driver_max_maj_version}.run" | grep -E -q 'HTTP.*200' ; then # use the maximum sub-version available for the major version indicated in cuda url as the default DEFAULT_DRIVER="${driver_max_maj_version}" fi @@ -260,8 +285,23 @@ function set_driver_version() { DEFAULT_DRIVER=${DRIVER_FOR_CUDA["${CUDA_VERSION}"]} fi - DRIVER_VERSION=$(get_metadata_attribute 'gpu-driver-version' "${DEFAULT_DRIVER}") + local raw_driver_version + raw_driver_version=$(get_metadata_attribute 'gpu-driver-version' '') + + if [[ -n "${raw_driver_version}" ]]; then + DRIVER_VERSION="${raw_driver_version}" + echo "DEBUG: Using gpu-driver-version from metadata: '${DRIVER_VERSION}'" + else + DRIVER_VERSION="${DEFAULT_DRIVER}" 
+ echo "DEBUG: gpu-driver-version metadata not found or empty, using default: '${DRIVER_VERSION}'" + fi + if ! test -n "$(echo "${DRIVER_VERSION}" | perl -ne 'print if /\d+\.\d+\.\d+/')" ; then + echo "ERROR: Invalid DRIVER_VERSION obtained: '${DRIVER_VERSION}'. Attempting to use DEFAULT: '${DEFAULT_DRIVER}'" >&2 + DRIVER_VERSION="${DEFAULT_DRIVER}" + fi + + echo "DEBUG: Effective DRIVER_VERSION: '${DRIVER_VERSION}'" readonly DRIVER_VERSION readonly DRIVER="${DRIVER_VERSION%%.*}" @@ -279,16 +319,16 @@ function set_driver_version() { if ! gsutil -q stat "${gcs_cache_path}"; then echo "Driver not found in GCS cache. Validating URL: ${gpu_driver_url}" # Use curl to check if the URL is valid (HEAD request) - if curl -sSLfI --connect-timeout 10 --max-time 30 "${gpu_driver_url}" 2>/dev/null | grep -E -q 'HTTP.*200'; then + if curl "${curl_retry_args[@]}" --head "${gpu_driver_url}" | grep -E -q 'HTTP.*200'; then echo "NVIDIA URL is valid. Downloading to cache..." local temp_driver_file="${tmpdir}/${driver_filename}" # Download the file echo "Downloading from ${gpu_driver_url} to ${temp_driver_file}" - if curl -sSLf -o "${temp_driver_file}" "${gpu_driver_url}"; then + if curl "${curl_retry_args[@]}" -o "${temp_driver_file}" "${gpu_driver_url}"; then echo "Download complete. Uploading to ${gcs_cache_path}" # Upload to GCS - if gsutil cp "${temp_driver_file}" "${gcs_cache_path}"; then + if "${gsutil_cmd[@]}" cp "${temp_driver_file}" "${gcs_cache_path}"; then echo "Successfully cached to GCS." 
rm -f "${temp_driver_file}" else @@ -429,6 +469,10 @@ function set_cuda_runfile_url() { ["12.4.0"]="550.54.14" ["12.4.1"]="550.54.15" # 550.54.15 is not a driver indexed at https://us.download.nvidia.com/XFree86/Linux-x86_64/ ["12.5.0"]="555.42.02" ["12.5.1"]="555.42.06" # 555.42.02 is indexed, 555.42.06 is not ["12.6.0"]="560.28.03" ["12.6.1"]="560.35.03" ["12.6.2"]="560.35.03" ["12.6.3"]="560.35.05" + ["12.8.0"]="570.86.10" ["12.8.1"]="570.124.06" + ["12.9.0"]="575.51.03" ["12.9.1"]="575.57.08" + ["13.0.0"]="580.65.06" ["13.0.1"]="580.82.07" ["13.0.2"]="580.95.05" + ["13.1.0"]="590.44.01" ["13.1.1"]="590.48.01" ) # Verify that the file with the indicated combination exists @@ -439,7 +483,7 @@ function set_cuda_runfile_url() { NVIDIA_CUDA_URL=$(get_metadata_attribute 'cuda-url' "${DEFAULT_NVIDIA_CUDA_URL}") - if ! curl ${curl_retry_args} --head "${NVIDIA_CUDA_URL}" | grep -E -q 'HTTP.*200' ; then + if ! curl "${curl_retry_args[@]}" --head "${NVIDIA_CUDA_URL}" | grep -E -q 'HTTP.*200' ; then echo "No CUDA distribution exists for this combination of DRIVER_VERSION=${drv_ver}, CUDA_VERSION=${CUDA_FULL_VERSION}" if [[ "${DEFAULT_NVIDIA_CUDA_URL}" != "${NVIDIA_CUDA_URL}" ]]; then echo "consider [${DEFAULT_NVIDIA_CUDA_URL}] instead" @@ -451,6 +495,31 @@ function set_cuda_runfile_url() { CUDA_RUNFILE="$(echo ${NVIDIA_CUDA_URL} | perl -pe 's{^.+/}{}')" readonly CUDA_RUNFILE + export local_cuda_runfile="${tmpdir}/${CUDA_RUNFILE}" + local gcs_cache_path="${pkg_bucket}/nvidia/${CUDA_RUNFILE}" + + echo "Checking for cached CUDA runfile at: ${gcs_cache_path}" + if "${gsutil_stat_cmd[@]}" "${gcs_cache_path}" > /dev/null 2>&1; then + echo "CUDA runfile found in GCS cache. Downloading from ${gcs_cache_path}" + if ! "${gsutil_cmd[@]}" cp "${gcs_cache_path}" "${local_cuda_runfile}"; then + echo "ERROR: Failed to download CUDA runfile from GCS cache." + exit 1 + fi + else + echo "CUDA runfile not found in GCS cache. 
Downloading from NVIDIA: ${NVIDIA_CUDA_URL}" + # URL validity was already checked above + echo "Downloading from ${NVIDIA_CUDA_URL} to ${local_cuda_runfile}" + if curl "${curl_retry_args[@]}" -o "${local_cuda_runfile}" "${NVIDIA_CUDA_URL}"; then + echo "Download complete. Uploading to GCS cache: ${gcs_cache_path}" + if ! "${gsutil_cmd[@]}" cp "${local_cuda_runfile}" "${gcs_cache_path}"; then + echo "WARN: Failed to upload CUDA runfile to GCS cache." + fi + else + echo "ERROR: Failed to download CUDA runfile from NVIDIA." + exit 1 + fi + fi + echo "DEBUG: Local CUDA runfile path: ${local_cuda_runfile}" if ( version_lt "${CUDA_FULL_VERSION}" "12.3.0" && ge_debian12 ) ; then echo "CUDA 12.3.0 is the minimum CUDA 12 version supported on Debian 12" @@ -512,22 +581,26 @@ IS_CUSTOM_IMAGE_BUILD="false" # Default function execute_with_retries() ( local -r cmd="$*" - if [[ "$cmd" =~ "^apt-get install" ]] ; then + if [[ "$cmd" =~ ^apt-get ]] ; then apt-get -y clean - apt-get -o DPkg::Lock::Timeout=60 -y autoremove + apt-get -y autoremove fi for ((i = 0; i < 3; i++)); do - time eval "$cmd" > "${install_log}" 2>&1 && retval=$? || { retval=$? 
; cat "${install_log}" ; } + set +e + time eval "$cmd" 2>&1 | tee "${install_log}" + retval=${PIPESTATUS[0]} + set -e if [[ $retval == 0 ]] ; then return 0 ; fi sleep 5 done + echo "ERROR: Command failed after 3 retries: ${cmd}" >&2 return 1 ) function install_cuda_keyring_pkg() { is_complete cuda-keyring-installed && return local kr_ver=1.1 - curl ${curl_retry_args} \ + curl "${curl_retry_args[@]}" \ "${NVIDIA_REPO_URL}/cuda-keyring_${kr_ver}-1_all.deb" \ -o "${tmpdir}/cuda-keyring.deb" dpkg -i "${tmpdir}/cuda-keyring.deb" @@ -549,15 +622,15 @@ function install_local_cuda_repo() { readonly LOCAL_DEB_URL="${NVIDIA_BASE_DL_URL}/cuda/${CUDA_FULL_VERSION}/local_installers/${LOCAL_INSTALLER_DEB}" readonly DIST_KEYRING_DIR="/var/${pkgname}" - curl ${curl_retry_args} \ + curl "${curl_retry_args[@]}" \ "${LOCAL_DEB_URL}" -o "${tmpdir}/${LOCAL_INSTALLER_DEB}" dpkg -i "${tmpdir}/${LOCAL_INSTALLER_DEB}" rm "${tmpdir}/${LOCAL_INSTALLER_DEB}" - cp ${DIST_KEYRING_DIR}/cuda-*-keyring.gpg /usr/share/keyrings/ + cp "${DIST_KEYRING_DIR}"/cuda-*-keyring.gpg /usr/share/keyrings/ if is_ubuntu ; then - curl ${curl_retry_args} \ + curl "${curl_retry_args[@]}" \ "${NVIDIA_REPO_URL}/cuda-${shortname}.pin" \ -o /etc/apt/preferences.d/cuda-repository-pin-600 fi @@ -577,7 +650,7 @@ function install_local_cudnn_repo() { local_deb_url="${NVIDIA_BASE_DL_URL}/cudnn/${CUDNN_VERSION%.*}/local_installers/${local_deb_fn}" # ${NVIDIA_BASE_DL_URL}/redist/cudnn/v8.6.0/local_installers/11.8/cudnn-linux-x86_64-8.6.0.163_cuda11-archive.tar.xz - curl ${curl_retry_args} \ + curl "${curl_retry_args[@]}" \ "${local_deb_url}" -o "${tmpdir}/local-installer.deb" dpkg -i "${tmpdir}/local-installer.deb" @@ -589,6 +662,216 @@ function install_local_cudnn_repo() { mark_complete install-local-cudnn-repo } +function create_conda_env() { + local env_name="$1" + shift + local packages=("$@") + + local conda_root_path="/opt/conda/default" + [[ -d ${conda_root_path} ]] || return 1 + local 
envpath="${conda_root_path}/envs/${env_name}" + + # Set numa node to 0 for all GPUs + for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node 2>/dev/null) ; do echo 0 > "${f}" || true ; done + + local build_tarball="${env_name}_${_shortname}_cuda${CUDA_VERSION}.tar.gz" + local local_tarball="${tmpdir}/${build_tarball}" + local gcs_tarball="${pkg_bucket}/conda/${_shortname}/${build_tarball}" + + if is_complete "install_env_${env_name}"; then + echo "Environment '${env_name}' sentinel found, skipping creation." + # Still register kernel if not already done + if ! [[ -d "/usr/local/share/jupyter/kernels/${env_name}" ]]; then + echo "Registering Jupyter kernel for '${env_name}'" + "${envpath}/bin/python3" -m ipykernel install --user --name "${env_name}" --display-name "Python (${env_name})" + fi + return 0 + fi + + echo "Creating Conda environment: ${env_name}" + + set +e + "${gsutil_stat_cmd[@]}" "${gcs_tarball}" > /dev/null 2>&1 + local cache_exists_code=$? + set -e + + if [[ ${cache_exists_code} -eq 0 ]]; then + echo "Cache hit for ${env_name}. Unpacking from ${gcs_tarball}" + if [[ -d "${envpath}" ]]; then + echo "INFO: Removing existing local Conda env directory: ${envpath}" + rm -rf "${envpath}" + fi + mkdir -p "${envpath}" + "${gsutil_cmd[@]}" cat "${gcs_tarball}" | tar -C "${envpath}" -xz + else + echo "Cache miss for ${env_name}. Building environment." + + # Wait for any other node to finish building this same tarball + if [[ "$(hostname -s)" =~ ^test ]] && (( $(nproc) < 32 )) ; then + sleep $(( ( RANDOM % 11 ) + 10 )) + fi + + # Check for the .building file + # Only respect the lock if we have a small number of cores; larger nodes + # should just build it concurrently to avoid 60 minute waits. + if (( $(nproc) < 16 )) ; then + local building_output + set +e # Don't exit if describe fails + building_output="$("${gsutil_stat_cmd[@]}" "${gcs_tarball}.building" 2>/dev/null)" + local gcs_describe_exit_code=$? 
+ set -e + if [[ ${gcs_describe_exit_code} -eq 0 ]] && [[ -n "${building_output}" ]]; then + local build_start_time + build_start_time=$(echo "${building_output}" | grep -oP 'Creation time:\s*\K.*' || echo "") + if [[ -n "${build_start_time}" ]]; then + local build_start_epoch + build_start_epoch="$(date -u -d "${build_start_time}" +%s)" + local timeout_epoch + timeout_epoch=$((build_start_epoch + 3600)) # 60 minutes + while "${gsutil_stat_cmd[@]}" "${gcs_tarball}.building" > /dev/null 2>&1 ; do + # Check if the main tarball has appeared in the meantime + if "${gsutil_stat_cmd[@]}" "${gcs_tarball}" > /dev/null 2>&1; then + echo "INFO: Cache file ${gcs_tarball} appeared while waiting. Skipping build." + break # Exit while loop, will be caught by the next check + fi + local now_epoch + now_epoch="$(date -u +%s)" + if (( now_epoch > timeout_epoch )) ; then + echo "WARN: Timeout waiting for ${gcs_tarball}.building to be removed. Removing it myself." + "${gsutil_cmd[@]}" rm "${gcs_tarball}.building" + break + fi + echo "INFO: Waiting for existing build of ${gcs_tarball} to complete..." + sleep 1m # Shorter sleep for faster detection + done + fi + fi + fi + + # Re-check if the tarball was created while we were waiting + if "${gsutil_stat_cmd[@]}" "${gcs_tarball}" > /dev/null 2>&1 ; then + echo "Cache hit for ${env_name}. Unpacking from ${gcs_tarball}" + if [[ -d "${envpath}" ]]; then + echo "INFO: Removing existing local Conda env directory: ${envpath}" + rm -rf "${envpath}" + fi + mkdir -p "${envpath}" + "${gsutil_cmd[@]}" cat "${gcs_tarball}" | tar -C "${envpath}" -xz + # Skip the rest of the build, go directly to jupyter kernel registration + echo "Registering Jupyter kernel for '${env_name}'" + "${envpath}/bin/python3" -m pip install ipykernel + "${envpath}/bin/python3" -m ipykernel install --user --name "${env_name}" --display-name "Python (${env_name})" + mark_complete "install_env_${env_name}" + return 0 + fi + + echo "INFO: Proceeding to build ${env_name}." 
+ # Clean up any previous partial build attempt (if timeout occurred) + "${gsutil_cmd[@]}" rm "${gcs_tarball}.building" || echo "WARN: No .building file to remove." + if [[ -d "${envpath}" ]]; then + echo "INFO: Removing existing local Conda env directory for rebuild: ${envpath}" + rm -rf "${envpath}" + fi + + touch "${local_tarball}.building" + "${gsutil_cmd[@]}" cp "${local_tarball}.building" "${gcs_tarball}.building" + building_file="${gcs_tarball}.building" + + local conda_path="${conda_root_path}/bin/mamba" + if ! command -v "${conda_path}" > /dev/null 2>&1; then + echo "Mamba not found, installing..." + "${conda_root_path}/bin/conda" install -n base -c conda-forge mamba -y \ + || echo "WARN: Mamba installation failed." + if ! command -v "${conda_path}" > /dev/null 2>&1; then + echo "Mamba not found, falling back to conda." + conda_path="${conda_root_path}/bin/conda" + fi + fi + + # Fallback to conda for older OSes due to download issues with mamba + if version_le "${DATAPROC_IMAGE_VERSION}" "2.0"; then + echo "INFO: Dataproc <= 2.0 detected, using conda instead of mamba for environment ${env_name}" + conda_path="${conda_root_path}/bin/conda" + fi + echo "Using installer: ${conda_path}" + + local conda_err_file="${tmpdir}/conda_create_${env_name}.err" + echo "DEBUG: About to run ${conda_path} create for ${env_name}" + set +e + + if version_le "${DATAPROC_IMAGE_VERSION}" "2.0"; then + timeout 3m "${conda_path}" create -y -n "${env_name}" "${packages[@]}" 2>&1 | tee "${conda_err_file}" + local conda_exit_code=${PIPESTATUS[0]} + + if [[ "${conda_exit_code}" == 124 ]]; then + echo "WARN: Timed out (3m) attempting to resolve ${env_name} dependencies." >&2 + echo "WARN: The classic Conda dependency solver frequently deadlocks when installing massive packages like PyTorch or RAPIDS." >&2 + echo "WARN: GPU-accelerated Machine Learning environments are not supported on Dataproc 2.0 (Debian 10/Ubuntu 18.04/Rocky 8)." 
>&2 + echo "WARN: Please upgrade to Dataproc 2.1 or newer (Debian 11+/Ubuntu 20.04+/Rocky 8 on 2.1) to utilize these features." >&2 + if [[ -n "${building_file:-}" ]]; then + "${gsutil_cmd[@]}" rm "${building_file}" || true + building_file="" + fi + set -e + return 0 + fi + else + time "${conda_path}" create -y -n "${env_name}" "${packages[@]}" 2>&1 | tee "${conda_err_file}" + local conda_exit_code=${PIPESTATUS[0]} + fi + set -e + echo "DEBUG: ${conda_path} create finished with exit code ${conda_exit_code}" + + if [[ "${conda_exit_code}" -ne 0 ]]; then + cat "${conda_err_file}" >&2 + if [[ "${conda_path}" == *mamba ]] && grep -q "RuntimeError: Multi-download failed." "${conda_err_file}"; then + echo "ERROR: Mamba failed to create the environment, likely due to a proxy issue on this platform." >&2 + echo "ERROR: Please run this initialization action in a non-proxied environment at least once to build and populate the GCS cache for '${gcs_tarball}'." >&2 + echo "ERROR: Once the cache exists, subsequent runs in the proxied environment should succeed." >&2 + exit 1 + else + echo "ERROR: Conda/Mamba environment creation failed with exit code ${conda_exit_code}." >&2 + exit "${conda_exit_code}" + fi + fi + rm -f "${conda_err_file}" + + # Activate environment for any pip installs + echo "Activating ${env_name} environment..." + source "${conda_root_path}/etc/profile.d/conda.sh" + set +u # Temporarily disable unbound variable check + conda activate "${env_name}" + set -u # Re-enable unbound variable check + echo "Activated $(which python)" + + if [[ "${env_name}" == "tensorflow" ]]; then + echo "Installing TensorFlow with GPU support using pip in '${env_name}' env..." 
+ python -m pip install --upgrade pip + python -m pip install --no-cache-dir 'tensorflow[and-cuda]>=2.16.0,<2.17.0' + fi + + set +u # Temporarily disable unbound variable check + conda deactivate + set -u # Re-enable unbound variable check + + echo "Packaging environment '${env_name}'" + pushd "${envpath}" + tar czf "${local_tarball}" . + popd + "${gsutil_cmd[@]}" cp "${local_tarball}" "${gcs_tarball}" + if [[ -n "${building_file:-}" ]]; then + "${gsutil_cmd[@]}" rm "${building_file}" || true + building_file="" + fi + rm -f "${local_tarball}" + echo "Environment '${env_name}' built and cached." + fi + + echo "Registering Jupyter kernel for '${env_name}'" + "${envpath}/bin/python3" -m pip install ipykernel + "${envpath}/bin/python3" -m ipykernel install --user --name "${env_name}" --display-name "Python (${env_name})" + mark_complete "install_env_${env_name}" +} function uninstall_local_cudnn_repo() { apt-get purge -yq "${CUDNN_PKG_NAME}" mark_incomplete install-local-cudnn-repo @@ -631,7 +914,60 @@ function install_local_cudnn8_repo() { cp "${cudnn_path}"/cudnn-local-*-keyring.gpg /usr/share/keyrings mark_complete install-local-cudnn8-repo } +function install_tensorflow() { + include_tensorflow="$(get_metadata_attribute 'include-tensorflow' 'false')" + echo "DEBUG: include-tensorflow metadata value: [${include_tensorflow}]" + if [[ "${include_tensorflow^^}" != "TRUE" && "${include_tensorflow^^}" != "YES" && "${include_tensorflow}" != "1" ]]; then + echo "Skipping TensorFlow installation." 
+ return 0 + fi + is_complete install_env_tensorflow && return + + local channels=('-c' 'conda-forge') + local packages=( + "python=3.11" "pyspark" "pandas" "numba" "pyarrow" + ) + create_conda_env "tensorflow" "${channels[@]}" "${packages[@]}" +} +function install_pytorch() { + include_pytorch="$(get_metadata_attribute 'include-pytorch' 'false')" + echo "DEBUG: 062: include-pytorch metadata value: [${include_pytorch}]" + if [[ "${include_pytorch^^}" != "TRUE" && "${include_pytorch^^}" != "YES" && "${include_pytorch}" != "1" ]]; then + echo "DEBUG: 062: Skipping PyTorch/Rapids installation." + return 0 + fi + + echo "DEBUG: 062: Passed include-pytorch check" + # Create isolated PyTorch environment + if ! is_complete install_env_pytorch; then + echo "DEBUG: 062: About to create pytorch env" + local channels=('-c' 'pytorch' '-c' 'nvidia') + local pt_packages=( + "python=3.11" "pytorch" "torchvision" "torchaudio" "pytorch-cuda=${CUDA_VERSION}" "pyspark" "numba" + ) + create_conda_env "pytorch" "${channels[@]}" "${pt_packages[@]}" + echo "DEBUG: 062: create_conda_env pytorch finished with exit code $?" + else + echo "DEBUG: 062: pytorch sentinel found, skipping creation" + fi + + echo "DEBUG: 062: After pytorch env block" + + # Create isolated Rapids environment + if ! is_complete install_env_rapids; then + echo "DEBUG: 062: About to create rapids env" + local channels=('-c' 'rapidsai' '-c' 'nvidia' '-c' 'conda-forge') + local rapids_packages=( + "python=3.11" "rapids" "pyspark" "numba" + ) + create_conda_env "rapids" "${channels[@]}" "${rapids_packages[@]}" + echo "DEBUG: 062: create_conda_env rapids finished with exit code $?" 
+ else + echo "DEBUG: 062: rapids sentinel found, skipping creation" + fi + echo "DEBUG: 062: End of install_pytorch function" +} function uninstall_local_cudnn8_repo() { apt-get purge -yq "${CUDNN8_PKG_NAME}" mark_incomplete install-local-cudnn8-repo @@ -650,12 +986,26 @@ function install_nvidia_nccl() { local -r nccl_version="${NCCL_VERSION}-1+cuda${CUDA_VERSION}" + if is_debuntu && dpkg-query -W "libnccl2" > /dev/null 2>&1 ; then + local installed_nccl + installed_nccl="$(dpkg-query -W -f='${Version}' libnccl2 2>/dev/null)" + if [[ "${installed_nccl}" == "${nccl_version}"* ]]; then + echo "INFO: NCCL ${nccl_version} is already installed." + mark_complete nccl + return 0 + fi + elif is_rocky && rpm -q "libnccl-${nccl_version}.x86_64" > /dev/null 2>&1; then + echo "INFO: NCCL ${nccl_version} is already installed." + mark_complete nccl + return 0 + fi + mkdir -p "${workdir}" pushd "${workdir}" test -d "${workdir}/nccl" || { local tarball_fn="v${NCCL_VERSION}-1.tar.gz" - curl ${curl_retry_args} \ + curl "${curl_retry_args[@]}" \ "https://github.com/NVIDIA/nccl/archive/refs/tags/${tarball_fn}" \ | tar xz mv "nccl-${NCCL_VERSION}-1" nccl @@ -670,20 +1020,20 @@ function install_nvidia_nccl() { local local_tarball="${workdir}/${build_tarball}" local gcs_tarball="${pkg_bucket}/nvidia/nccl/${_shortname}/${build_tarball}" - if [[ "$(hostname -s)" =~ ^test-gpu && "$(nproc)" < 32 ]] ; then + if [[ "$(hostname -s)" =~ ^test-gpu ]] && (( $(nproc) < 32 )) ; then # when running with fewer than 32 cores, yield to in-progress build sleep $(( ( RANDOM % 11 ) + 10 )) - local output="$(${gsutil_stat_cmd} "${gcs_tarball}.building"|grep '.reation.time')" + local output="$("${gsutil_stat_cmd[@]}" "${gcs_tarball}.building"|grep '.reation.time')" if [[ "$?" 
== "0" ]] ; then local build_start_time build_start_epoch timeout_epoch build_start_time="$(echo ${output} | awk -F': +' '{print $2}')" build_start_epoch="$(date -u -d "${build_start_time}" +%s)" timeout_epoch=$((build_start_epoch + 2700)) # 45 minutes - while ${gsutil_stat_cmd} "${gcs_tarball}.building" ; do + while "${gsutil_stat_cmd[@]}" "${gcs_tarball}.building" ; do local now_epoch="$(date -u +%s)" if (( now_epoch > timeout_epoch )) ; then # detect unexpected build failure after 45m - ${gsutil_cmd} rm "${gcs_tarball}.building" + "${gsutil_cmd[@]}" rm "${gcs_tarball}.building" break fi sleep 5m @@ -691,14 +1041,14 @@ function install_nvidia_nccl() { fi fi - if ${gsutil_stat_cmd} "${gcs_tarball}" ; then + if "${gsutil_stat_cmd[@]}" "${gcs_tarball}" ; then # cache hit - unpack from cache echo "cache hit" - ${gsutil_cmd} cat "${gcs_tarball}" | tar xvz + "${gsutil_cmd[@]}" cat "${gcs_tarball}" | tar xvz else # build and cache touch "${local_tarball}.building" - ${gsutil_cmd} cp "${local_tarball}.building" "${gcs_tarball}.building" + "${gsutil_cmd[@]}" cp "${local_tarball}.building" "${gcs_tarball}.building" building_file="${gcs_tarball}.building" pushd nccl # https://github.com/NVIDIA/nccl?tab=readme-ov-file#install @@ -719,8 +1069,10 @@ function install_nvidia_nccl() { # Ada: SM_89, compute_89 # Hopper: SM_90,SM_90a compute_90,compute_90a # Blackwell: SM_100, compute_100 - local nvcc_gencode=("-gencode=arch=compute_70,code=sm_70" "-gencode=arch=compute_72,code=sm_72" - "-gencode=arch=compute_80,code=sm_80" "-gencode=arch=compute_86,code=sm_86") + local nvcc_gencode=("-gencode=arch=compute_75,code=sm_75" "-gencode=arch=compute_80,code=sm_80" "-gencode=arch=compute_86,code=sm_86") + if version_lt "${CUDA_VERSION}" "13.0" ; then + nvcc_gencode+=("-gencode=arch=compute_70,code=sm_70" "-gencode=arch=compute_72,code=sm_72") + fi if version_gt "${CUDA_VERSION}" "11.6" ; then nvcc_gencode+=("-gencode=arch=compute_87,code=sm_87") @@ -747,11 +1099,11 @@ function 
install_nvidia_nccl() { execute_with_retries make -j$(nproc) pkg.redhat.build fi tar czvf "${local_tarball}" "../${build_path}" - make clean + make clean || true popd tar xzvf "${local_tarball}" - ${gsutil_cmd} cp "${local_tarball}" "${gcs_tarball}" - if ${gsutil_stat_cmd} "${gcs_tarball}.building" ; then ${gsutil_cmd} rm "${gcs_tarball}.building" || true ; fi + "${gsutil_cmd[@]}" cp "${local_tarball}" "${gcs_tarball}" + if "${gsutil_stat_cmd[@]}" "${gcs_tarball}.building" ; then "${gsutil_cmd[@]}" rm "${gcs_tarball}.building" || true ; fi building_file="" rm "${local_tarball}" fi @@ -773,151 +1125,105 @@ function is_src_os() { [[ "${GPU_DRIVER_PROVIDER}" == "OS" ]] ; } function install_nvidia_cudnn() { is_complete cudnn && return if le_debian10 ; then return ; fi - local major_version - major_version="${CUDNN_VERSION%%.*}" - local cudnn_pkg_version - cudnn_pkg_version="${CUDNN_VERSION}-1+cuda${CUDA_VERSION}" - - if is_rocky ; then - if is_cudnn8 ; then - execute_with_retries dnf -y -q install \ - "libcudnn${major_version}" \ - "libcudnn${major_version}-devel" - sync - elif is_cudnn9 ; then - execute_with_retries dnf -y -q install \ - "libcudnn9-static-cuda-${CUDA_VERSION%%.*}" \ - "libcudnn9-devel-cuda-${CUDA_VERSION%%.*}" - sync + + local source_method="${1:-package}" + + if [[ "${source_method}" == "tarball" ]]; then + local local_tarball="${tmpdir}/${CUDNN_TARBALL}" + cache_fetched_package "${CUDNN_TARBALL_URL}" "${pkg_bucket}/nvidia/cudnn/${CUDNN_TARBALL}" "${local_tarball}" + + pushd "${tmpdir}" + if [[ "${CUDNN_TARBALL}" == *.tar.xz ]]; then + tar xJf "${local_tarball}" else - echo "Unsupported cudnn version: '${major_version}'" + tar xzf "${local_tarball}" fi - elif is_debuntu; then - if ge_debian12 && is_src_os ; then - apt-get -y install nvidia-cudnn - else - if is_cudnn8 ; then - add_repo_cuda - apt-get update -qq - # Ignore version requested and use the latest version in the package index - cudnn_pkg_version="$(apt-cache show libcudnn8 | awk 
"/^Ver.*cuda${CUDA_VERSION%%.*}.*/ {print \$2}" | sort -V | tail -1)" + local extracted_dir + extracted_dir="$(find . -maxdepth 1 -type d -name 'cudnn-*' -o -name 'cuda' | grep -v '\.tar' | head -n1)" + + if [[ -d "${extracted_dir}/include" ]]; then + cp -P "${extracted_dir}"/include/cudnn*.h /usr/local/cuda/include/ + cp -P "${extracted_dir}"/lib/libcudnn* /usr/local/cuda/lib64/ + elif [[ -d "${extracted_dir}/cuda/include" ]]; then + cp -P "${extracted_dir}"/cuda/include/cudnn*.h /usr/local/cuda/include/ + cp -P "${extracted_dir}"/cuda/lib64/libcudnn* /usr/local/cuda/lib64/ + fi + chmod a+r /usr/local/cuda/include/cudnn*.h /usr/local/cuda/lib64/libcudnn* + + popd + rm -f "${local_tarball}" + rm -rf "${tmpdir}/${extracted_dir}" - execute_with_retries \ - apt-get -y install --no-install-recommends \ - "libcudnn8=${cudnn_pkg_version}" \ - "libcudnn8-dev=${cudnn_pkg_version}" + elif [[ "${source_method}" == "package" ]]; then + local major_version + major_version="${CUDNN_VERSION%%.*}" + local cudnn_pkg_version + cudnn_pkg_version="${CUDNN_VERSION}-1+cuda${CUDA_VERSION}" + if is_rocky ; then + if is_cudnn8 ; then + execute_with_retries dnf -y -q install \ + "libcudnn${major_version}" \ + "libcudnn${major_version}-devel" sync elif is_cudnn9 ; then - install_cuda_keyring_pkg - - apt-get update -qq - - execute_with_retries \ - apt-get -y install --no-install-recommends \ - "libcudnn9-cuda-${CUDA_VERSION%%.*}" \ - "libcudnn9-dev-cuda-${CUDA_VERSION%%.*}" \ - "libcudnn9-static-cuda-${CUDA_VERSION%%.*}" - + execute_with_retries dnf -y -q install \ + "libcudnn9-static-cuda-${CUDA_VERSION%%.*}" \ + "libcudnn9-devel-cuda-${CUDA_VERSION%%.*}" sync else - echo "Unsupported cudnn version: [${CUDNN_VERSION}]" + echo "Unsupported cudnn version: '${major_version}'" fi - fi - else - echo "Unsupported OS: '${OS_NAME}'" - exit 1 - fi - - ldconfig + elif is_debuntu; then + if ge_debian12 && is_src_os ; then + apt-get -y install nvidia-cudnn + else + if is_cudnn8 ; then + add_repo_cuda - 
echo "NVIDIA cuDNN successfully installed for ${OS_NAME}." - mark_complete cudnn -} + apt-get update -qq + # Ignore version requested and use the latest version in the package index + cudnn_pkg_version="$(apt-cache show libcudnn8 | awk "/^Ver.*cuda${CUDA_VERSION%%.*}.*/ {print \$2}" | sort -V | tail -1)" -function install_pytorch() { - is_complete pytorch && return + execute_with_retries \ + apt-get -y install --no-install-recommends \ + "libcudnn8=${cudnn_pkg_version}" \ + "libcudnn8-dev=${cudnn_pkg_version}" - local env - env=$(get_metadata_attribute 'gpu-conda-env' 'dpgce') + sync + elif is_cudnn9 ; then + install_cuda_keyring_pkg - local conda_root_path - if version_lt "${DATAPROC_IMAGE_VERSION}" "2.3" ; then - conda_root_path="/opt/conda/miniconda3" - else - conda_root_path="/opt/conda" - fi - [[ -d ${conda_root_path} ]] || return - local envpath="${conda_root_path}/envs/${env}" - if [[ "${env}" == "base" ]]; then - echo "WARNING: installing to base environment known to cause solve issues" ; envpath="${conda_root_path}" ; fi - # Set numa node to 0 for all GPUs - for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node) ; do echo 0 > ${f} ; done + apt-get update -qq - local build_tarball="pytorch_${env}_${_shortname}_cuda${CUDA_VERSION}.tar.gz" - local local_tarball="${workdir}/${build_tarball}" - local gcs_tarball="${pkg_bucket}/conda/${_shortname}/${build_tarball}" + execute_with_retries \ + apt-get -y install --no-install-recommends \ + "libcudnn9-cuda-${CUDA_VERSION%%.*}" \ + "libcudnn9-dev-cuda-${CUDA_VERSION%%.*}" \ + "libcudnn9-static-cuda-${CUDA_VERSION%%.*}" - if [[ "$(hostname -s)" =~ ^test && "$(nproc)" < 32 ]] ; then - # when running with fewer than 32 cores, yield to in-progress build - sleep $(( ( RANDOM % 11 ) + 10 )) - local output="$(${gsutil_stat_cmd} "${gcs_tarball}.building"|grep '.reation.time')" - if [[ "$?" 
== "0" ]] ; then - local build_start_time build_start_epoch timeout_epoch - build_start_time="$(echo ${output} | awk -F': +' '{print $2}')" - build_start_epoch="$(date -u -d "${build_start_time}" +%s)" - timeout_epoch=$((build_start_epoch + 2700)) # 45 minutes - while ${gsutil_stat_cmd} "${gcs_tarball}.building" ; do - local now_epoch="$(date -u +%s)" - if (( now_epoch > timeout_epoch )) ; then - # detect unexpected build failure after 45m - ${gsutil_cmd} rm "${gcs_tarball}.building" - break + sync + else + echo "Unsupported cudnn version: [${CUDNN_VERSION}]" fi - sleep 5m - done + fi + else + echo "Unsupported OS: '${OS_NAME}'" + exit 1 fi - fi - - if ${gsutil_stat_cmd} "${gcs_tarball}" ; then - # cache hit - unpack from cache - echo "cache hit" - mkdir -p "${envpath}" - ${gsutil_cmd} cat "${gcs_tarball}" | tar -C "${envpath}" -xz else - touch "${local_tarball}.building" - ${gsutil_cmd} cp "${local_tarball}.building" "${gcs_tarball}.building" - building_file="${gcs_tarball}.building" - local verb=create - if test -d "${envpath}" ; then verb=install ; fi - cudart_spec="cuda-cudart" - if le_cuda11 ; then cudart_spec="cudatoolkit" ; fi - - # Install pytorch and company to this environment - "${conda_root_path}/bin/mamba" "${verb}" -n "${env}" \ - -c conda-forge -c nvidia -c rapidsai \ - numba pytorch tensorflow[and-cuda] rapids pyspark \ - "cuda-version<=${CUDA_VERSION}" "${cudart_spec}" - - # Install jupyter kernel in this environment - "${envpath}/bin/python3" -m pip install ipykernel - - # package environment and cache in GCS - pushd "${envpath}" - tar czf "${local_tarball}" . 
- popd - ${gsutil_cmd} cp "${local_tarball}" "${gcs_tarball}" - if ${gsutil_stat_cmd} "${gcs_tarball}.building" ; then ${gsutil_cmd} rm "${gcs_tarball}.building" || true ; fi - building_file="" + echo "Unknown install method: ${source_method}" + exit 1 fi - # register the environment as a selectable kernel - "${envpath}/bin/python3" -m ipykernel install --name "${env}" --display-name "Python (${env})" + ldconfig - mark_complete pytorch + echo "NVIDIA cuDNN successfully installed for ${OS_NAME}." + mark_complete cudnn } + function configure_dkms_certs() { if test -v PSN && [[ -z "${PSN}" ]]; then echo "No signing secret provided. skipping"; @@ -1022,6 +1328,56 @@ function add_nonfree_components() { sed -i -e 's/ main$/ main contrib non-free/' /etc/apt/sources.list fi } +function import_gpg_keys() { + local keyring_path="$1" + shift + local keys=("$@") + + mkdir -p "$(dirname "${keyring_path}")" + + local GPG_PROXY_ARGS=() + if [[ -n "${HTTP_PROXY:-}" ]]; then + GPG_PROXY_ARGS=(--keyserver-options "http-proxy=${HTTP_PROXY}") + elif [[ -n "${http_proxy:-}" ]]; then + GPG_PROXY_ARGS=(--keyserver-options "http-proxy=${http_proxy}") + fi + + local tmp_keyring + tmp_keyring=$(mktemp) + local keyserver_keys_found=0 + + for key in "${keys[@]}"; do + echo "DEBUG: Importing GPG key: ${key} into ${keyring_path}" + if [[ "${key}" =~ ^https?:// ]]; then + # Import dearmored key from URL, overwrites keyring_path + if ! execute_with_retries curl "${curl_retry_args[@]}" "${key}" | gpg --dearmor --yes -o "${keyring_path}"; then + echo "ERROR: Failed to import GPG key from URL: ${key}" + rm -f "${tmp_keyring}" + exit 1 + fi + elif [[ "${key}" =~ ^0x ]]; then + # Fetch key from keyserver into tmp_keyring + keyserver_keys_found=1 + if ! 
execute_with_retries gpg --keyserver keyserver.ubuntu.com "${GPG_PROXY_ARGS[@]}" --no-default-keyring --keyring "${tmp_keyring}" --recv-keys "${key}"; then + echo "ERROR: Failed to receive GPG key from keyserver: ${key}" + rm -f "${tmp_keyring}" + exit 1 + fi + else + echo "WARN: Unrecognized key format, skipping: ${key}" + fi + done + + # If any keys were fetched from keyserver, export and dearmor them all into the final keyring + if [[ "${keyserver_keys_found}" -eq 1 ]]; then + if ! gpg --no-default-keyring --keyring "${tmp_keyring}" --export | gpg --dearmor --yes -o "${keyring_path}"; then + echo "ERROR: Failed to export/dearmor GPG keys from temporary keyring" + rm -f "${tmp_keyring}" + exit 1 + fi + fi + rm -f "${tmp_keyring}" +} # # Install package signing key and add corresponding repository @@ -1042,10 +1398,7 @@ function add_repo_nvidia_container_toolkit() { elif [[ -v http_proxy ]] ; then GPG_PROXY="--keyserver-options http-proxy=${http_proxy}" fi - execute_with_retries gpg --keyserver keyserver.ubuntu.com \ - ${GPG_PROXY_ARGS} \ - --no-default-keyring --keyring "${kr_path}" \ - --recv-keys "0xae09fe4bbd223a84b2ccfce3f60f4b3d7fa2af80" "0xeb693b3035cd5710e231e123a4b469963bf863cc" "0xc95b321b61e88c1809c4f759ddcae044f796ecb0" + import_gpg_keys "${kr_path}" "0xae09fe4bbd223a84b2ccfce3f60f4b3d7fa2af80" "0xeb693b3035cd5710e231e123a4b469963bf863cc" "0xc95b321b61e88c1809c4f759ddcae044f796ecb0" local -r repo_data="${nvctk_root}/stable/deb/\$(ARCH) /" local -r repo_path="/etc/apt/sources.list.d/${repo_name}.list" echo "deb [signed-by=${kr_path}] ${repo_data}" > "${repo_path}" @@ -1072,11 +1425,9 @@ function add_repo_cuda() { if [[ -n "${HTTP_PROXY}" ]] ; then GPG_PROXY="--keyserver-options http-proxy=${HTTP_PROXY}" elif [[ -n "${http_proxy}" ]] ; then - GPG_PROXY="--keyserver-options http-proxy=${http_proxy}" + GPG_PROXY="--keyserver-options http-proxy=\"${http_proxy}\"" fi - execute_with_retries gpg --keyserver keyserver.ubuntu.com ${GPG_PROXY_ARGS} \ - 
--no-default-keyring --keyring "${kr_path}" \ - --recv-keys "0xae09fe4bbd223a84b2ccfce3f60f4b3d7fa2af80" "0xeb693b3035cd5710e231e123a4b469963bf863cc" + import_gpg_keys "${kr_path}" "0xae09fe4bbd223a84b2ccfce3f60f4b3d7fa2af80" "0xeb693b3035cd5710e231e123a4b469963bf863cc" else install_cuda_keyring_pkg # 11.7+, 12.0+ fi @@ -1095,7 +1446,7 @@ function build_driver_from_github() { pushd "${workdir}" test -d "${workdir}/open-gpu-kernel-modules" || { tarball_fn="${DRIVER_VERSION}.tar.gz" - execute_with_retries curl ${curl_retry_args} \ + execute_with_retries curl "${curl_retry_args[@]}" \ "https://github.com/NVIDIA/open-gpu-kernel-modules/archive/refs/tags/${tarball_fn}" \ \| tar xz mv "open-gpu-kernel-modules-${DRIVER_VERSION}" open-gpu-kernel-modules @@ -1112,20 +1463,20 @@ function build_driver_from_github() { local gcs_tarball="${pkg_bucket}/nvidia/kmod/${_shortname}/${uname_r}/${build_dir}/${build_tarball}" - if [[ "$(hostname -s)" =~ ^test && "$(nproc)" < 32 ]] ; then + if [[ "$(hostname -s)" =~ ^test ]] && (( $(nproc) < 32 )) ; then # when running with fewer than 32 cores, yield to in-progress build sleep $(( ( RANDOM % 11 ) + 10 )) - local output="$(${gsutil_stat_cmd} "${gcs_tarball}.building"|grep '.reation.time')" + local output="$("${gsutil_stat_cmd[@]}" "${gcs_tarball}.building"|grep '.reation.time')" if [[ "$?" 
== "0" ]] ; then local build_start_time build_start_epoch timeout_epoch build_start_time="$(echo ${output} | awk -F': +' '{print $2}')" build_start_epoch="$(date -u -d "${build_start_time}" +%s)" timeout_epoch=$((build_start_epoch + 2700)) # 45 minutes - while ${gsutil_stat_cmd} "${gcs_tarball}.building" ; do + while "${gsutil_stat_cmd[@]}" "${gcs_tarball}.building" ; do local now_epoch="$(date -u +%s)" if (( now_epoch > timeout_epoch )) ; then # detect unexpected build failure after 45m - ${gsutil_cmd} rm "${gcs_tarball}.building" || echo "might have been deleted by a peer" + "${gsutil_cmd[@]}" rm "${gcs_tarball}.building" || echo "might have been deleted by a peer" break fi sleep 5m @@ -1133,12 +1484,12 @@ function build_driver_from_github() { fi fi - if ${gsutil_stat_cmd} "${gcs_tarball}" 2>&1 ; then + if "${gsutil_stat_cmd[@]}" "${gcs_tarball}" 2>&1 ; then echo "cache hit" else # build the kernel modules touch "${local_tarball}.building" - ${gsutil_cmd} cp "${local_tarball}.building" "${gcs_tarball}.building" + "${gsutil_cmd[@]}" cp "${local_tarball}.building" "${gcs_tarball}.building" building_file="${gcs_tarball}.building" pushd open-gpu-kernel-modules install_build_dependencies @@ -1167,14 +1518,14 @@ function build_driver_from_github() { tar czvf "${local_tarball}" \ "${workdir}/open-gpu-kernel-modules/kernel-open/"*.log \ $(find /lib/modules/${uname_r}/ -iname 'nvidia*.ko') - ${gsutil_cmd} cp "${local_tarball}" "${gcs_tarball}" - if ${gsutil_stat_cmd} "${gcs_tarball}.building" ; then ${gsutil_cmd} rm "${gcs_tarball}.building" || true ; fi + "${gsutil_cmd[@]}" cp "${local_tarball}" "${gcs_tarball}" + if "${gsutil_stat_cmd[@]}" "${gcs_tarball}.building" ; then "${gsutil_cmd[@]}" rm "${gcs_tarball}.building" || true ; fi building_file="" rm "${local_tarball}" make clean popd fi - ${gsutil_cmd} cat "${gcs_tarball}" | tar -C / -xzv + "${gsutil_cmd[@]}" cat "${gcs_tarball}" | tar -C / -xzv depmod -a } @@ -1237,6 +1588,17 @@ function 
install_nvidia_userspace_runfile() { # wget https://us.download.nvidia.com/XFree86/Linux-x86_64/560.35.03/NVIDIA-Linux-x86_64-560.35.03.run # sh ./NVIDIA-Linux-x86_64-560.35.03.run -x # this will allow you to review the contents of the package without installing it. is_complete userspace && return + + if command -v nvidia-smi >/dev/null 2>&1; then + local installed_version + installed_version="$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -n1)" + if [[ "${installed_version}" == "${DRIVER_VERSION}" ]]; then + echo "INFO: NVIDIA driver ${DRIVER_VERSION} is already installed." + mark_complete userspace + return 0 + fi + fi + local local_fn="${tmpdir}/${USERSPACE_RUNFILE}" cache_fetched_package "${USERSPACE_URL}" \ @@ -1259,31 +1621,32 @@ function install_nvidia_userspace_runfile() { || version_lt "${DRIVER_VERSION}" "${MIN_OPEN_DRIVER_VER}" \ || [[ "$((16#${pci_device_id}))" < "$((16#1E00))" ]] ) then + local build_tarball="kmod_${_shortname}_${DRIVER_VERSION}_nonfree.tar.gz" + local_tarball="${workdir}/${build_tarball}" + local build_dir + if test -v modulus_md5sum && [[ -n "${modulus_md5sum}" ]] + then build_dir="${modulus_md5sum}" + else build_dir="unsigned" ; fi + + local gcs_tarball="${pkg_bucket}/nvidia/kmod/${_shortname}/${uname_r}/${build_dir}/${build_tarball}" + local nvidia_ko_path="$(find /lib/modules/$(uname -r)/ -name 'nvidia.ko')" test -n "${nvidia_ko_path}" && test -f "${nvidia_ko_path}" || { - local build_tarball="kmod_${_shortname}_${DRIVER_VERSION}_nonfree.tar.gz" - local_tarball="${workdir}/${build_tarball}" - local build_dir - if test -v modulus_md5sum && [[ -n "${modulus_md5sum}" ]] - then build_dir="${modulus_md5sum}" - else build_dir="unsigned" ; fi - local gcs_tarball="${pkg_bucket}/nvidia/kmod/${_shortname}/${uname_r}/${build_dir}/${build_tarball}" - - if [[ "$(hostname -s)" =~ ^test && "$(nproc)" < 32 ]] ; then + if [[ "$(hostname -s)" =~ ^test ]] && (( $(nproc) < 32 )) ; then # when running with fewer than 32 cores, 
yield to in-progress build sleep $(( ( RANDOM % 11 ) + 10 )) - local output="$(${gsutil_stat_cmd} "${gcs_tarball}.building"|grep '.reation.time')" + local output="$("${gsutil_stat_cmd[@]}" "${gcs_tarball}.building"|grep '.reation.time')" if [[ "$?" == "0" ]] ; then local build_start_time build_start_epoch timeout_epoch build_start_time="$(echo ${output} | awk -F': +' '{print $2}')" build_start_epoch="$(date -u -d "${build_start_time}" +%s)" timeout_epoch=$((build_start_epoch + 2700)) # 45 minutes - while ${gsutil_stat_cmd} "${gcs_tarball}.building" ; do + while "${gsutil_stat_cmd[@]}" "${gcs_tarball}.building" ; do local now_epoch="$(date -u +%s)" if (( now_epoch > timeout_epoch )) ; then # detect unexpected build failure after 45m - ${gsutil_cmd} rm "${gcs_tarball}.building" + "${gsutil_cmd[@]}" rm "${gcs_tarball}.building" break fi sleep 5m @@ -1291,7 +1654,7 @@ function install_nvidia_userspace_runfile() { fi fi - if ${gsutil_stat_cmd} "${gcs_tarball}" ; then + if "${gsutil_stat_cmd[@]}" "${gcs_tarball}" ; then cache_hit="1" if version_ge "${DRIVER_VERSION}" "${MIN_OPEN_DRIVER_VER}" ; then runfile_args="${runfile_args} --no-kernel-modules" @@ -1300,7 +1663,7 @@ function install_nvidia_userspace_runfile() { else # build the kernel modules touch "${local_tarball}.building" - ${gsutil_cmd} cp "${local_tarball}.building" "${gcs_tarball}.building" + "${gsutil_cmd[@]}" cp "${local_tarball}.building" "${gcs_tarball}.building" building_file="${gcs_tarball}.building" install_build_dependencies configure_dkms_certs @@ -1335,16 +1698,16 @@ function install_nvidia_userspace_runfile() { || version_lt "${DRIVER_VERSION}" "${MIN_OPEN_DRIVER_VER}" \ || [[ "$((16#${pci_device_id}))" < "$((16#1E00))" ]] ) ; then if [[ "${cache_hit}" == "1" ]] ; then - ${gsutil_cmd} cat "${gcs_tarball}" | tar -C / -xzv + "${gsutil_cmd[@]}" cat "${gcs_tarball}" | tar -C / -xzv depmod -a else clear_dkms_key tar czvf "${local_tarball}" \ /var/log/nvidia-installer.log \ $(find /lib/modules/${uname_r}/ 
-iname 'nvidia*.ko') - ${gsutil_cmd} cp "${local_tarball}" "${gcs_tarball}" + "${gsutil_cmd[@]}" cp "${local_tarball}" "${gcs_tarball}" - if ${gsutil_stat_cmd} "${gcs_tarball}.building" ; then ${gsutil_cmd} rm "${gcs_tarball}.building" || true ; fi + if "${gsutil_stat_cmd[@]}" "${gcs_tarball}.building" ; then "${gsutil_cmd[@]}" rm "${gcs_tarball}.building" || true ; fi building_file="" fi fi @@ -1478,7 +1841,7 @@ function install_ops_agent(){ mkdir -p /opt/google cd /opt/google # https://cloud.google.com/stackdriver/docs/solutions/agents/ops-agent/installation - curl ${curl_retry_args} -O https://dl.google.com/cloudagents/add-google-cloud-ops-agent-repo.sh + curl "${curl_retry_args[@]}" -O https://dl.google.com/cloudagents/add-google-cloud-ops-agent-repo.sh local expected="038d98644e4c4a7969d26da790946720d278c8d49bb82b677f550c2a2b858411 add-google-cloud-ops-agent-repo.sh" execute_with_retries bash add-google-cloud-ops-agent-repo.sh --also-install @@ -1496,11 +1859,12 @@ function install_gpu_agent() { fi local install_dir=/opt/gpu-utilization-agent mkdir -p "${install_dir}" - curl ${curl_retry_args} \ + curl "${curl_retry_args[@]}" \ "${GPU_AGENT_REPO_URL}/requirements.txt" -o "${install_dir}/requirements.txt" - curl ${curl_retry_args} \ + curl "${curl_retry_args[@]}" \ "${GPU_AGENT_REPO_URL}/report_gpu_metrics.py" \ | sed -e 's/-u --format=/--format=/' \ + | sed -e 's|http://metadata/|http://metadata.google.internal/|g' \ | dd status=none of="${install_dir}/report_gpu_metrics.py" local venv="${install_dir}/venv" python_interpreter="/opt/conda/miniconda3/bin/python3" @@ -1511,7 +1875,7 @@ function install_gpu_agent() { "${python_interpreter}" -m venv "${venv}" ( source "${venv}/bin/activate" - if [[ -v METADATA_HTTP_PROXY_PEM_URI ]]; then + if [[ -n "${trusted_pem_path:-}" ]]; then export REQUESTS_CA_BUNDLE="${trusted_pem_path}" pip install pip-system-certs unset REQUESTS_CA_BUNDLE @@ -1529,6 +1893,7 @@ Description=GPU Utilization Metric Agent [Service] Type=simple 
PIDFile=/run/gpu_agent.pid +EnvironmentFile=-/etc/environment ExecStart=/bin/bash --login -c '. ${venv}/bin/activate ; python3 "${install_dir}/report_gpu_metrics.py"' User=root Group=root @@ -1761,7 +2126,7 @@ function install_build_dependencies() { is_complete build-dependencies && return if is_debuntu ; then - if is_ubuntu22 && is_cuda12 ; then + if is_ubuntu22 && ge_cuda12 ; then # On ubuntu22, the default compiler does not build some kernel module versions # https://forums.developer.nvidia.com/t/linux-new-kernel-6-5-0-14-ubuntu-22-04-can-not-compile-nvidia-display-card-driver/278553/11 execute_with_retries apt-get install -y -qq gcc-12 @@ -1825,6 +2190,98 @@ function is_complete() { phase="$1" test -f "${workdir}/complete/${phase}" } +function evaluate_network() { + local state_file="${tmpdir}/network_state.json" + echo "INFO: Evaluating network and writing state to ${state_file}" + + # Metadata checks + local http_proxy=$(get_metadata_attribute 'http-proxy' 'null') + if [[ "${http_proxy}" != "null" ]]; then http_proxy=""${http_proxy}""; fi + local swp_egress=$(get_metadata_attribute 'swp-egress' 'false') + + local instance_ips=$(hostname -I || echo "") + local has_external_ip="false" + # Crude check for non-internal IP + if [[ "${instance_ips}" =~ [^10\.|^172\.(1[6-9]|2[0-9]|3[0-1])\.|^192\.168] ]]; then + has_external_ip="true" + fi + + # Kernel Route Table + local default_route_v4="null" + local default_route_v6="null" + if ip -4 route show default | grep -q default; then + default_route_v4=""$(ip -4 route show default)"" + fi + if ip -6 route show default | grep -q default; then + default_route_v6=""$(ip -6 route show default)"" + fi + + # DNS & Connectivity Tests + local target_host="www.gstatic.com" + local dns_v4_ips=($(dig +short A "${target_host}" || true)) + local dns_v6_ips=($(dig +short AAAA "${target_host}" || true)) + + local dns_v4_ok="false"; [[ ${#dns_v4_ips[@]} -gt 0 ]] && dns_v4_ok="true" + local dns_v6_ok="false"; [[ ${#dns_v6_ips[@]} -gt 0 
]] && dns_v6_ok="true" + + local ping_v4_ok="false" + if [[ "${dns_v4_ok}" == "true" ]]; then + if ping -c 1 "${dns_v4_ips[0]}" >/dev/null 2>&1; then ping_v4_ok="true"; fi + fi + + local ping_v6_ok="false" + if [[ "${dns_v6_ok}" == "true" ]]; then + if ping -6 -c 1 "${dns_v6_ips[0]}" >/dev/null 2>&1; then ping_v6_ok="true"; fi + fi + + local curl_target="http://${target_host}/generate_204" + local curl_v4_ok="false" + if curl -4 -s -m 10 --head "${curl_target}" >/dev/null 2>&1; then + curl_v4_ok="true" + fi + + local curl_v6_ok="false" + if curl -6 -s -m 10 --head "${curl_target}" >/dev/null 2>&1; then + curl_v6_ok="true" + fi + + # More general checks + local nvidia_http_ok="false" + if curl -s -m 10 --head "https://us.download.nvidia.com" >/dev/null 2>&1; then + nvidia_http_ok="true" + fi + + # Assemble JSON + cat << EOF > "${state_file}" +{ + "config": { + "has_external_ip": ${has_external_ip}, + "http_proxy": ${http_proxy}, + "swp_egress": ${swp_egress} + }, + "routing": { + "default_route_v4": ${default_route_v4}, + "default_route_v6": ${default_route_v6} + }, + "gstatic": { + "dns_v4_ok": ${dns_v4_ok}, + "dns_v4_ips": [$(printf '"%s",' "${dns_v4_ips[@]}" | sed 's/,$//')], + "ping_v4_ok": ${ping_v4_ok}, + "curl_v4_ok": ${curl_v4_ok}, + "dns_v6_ok": ${dns_v6_ok}, + "dns_v6_ips": [$(printf '"%s",' "${dns_v6_ips[@]}" | sed 's/,$//')], + "ping_v6_ok": ${ping_v6_ok}, + "curl_v6_ok": ${curl_v6_ok} + }, + "http_checks": { + "https://us.download.nvidia.com": ${nvidia_http_ok} + } +} +EOF + + echo "INFO: Network state evaluation complete." 
+ cat "${state_file}" # For debugging +} function mark_complete() { phase="$1" @@ -1839,7 +2296,7 @@ function mark_incomplete() { function install_dependencies() { is_complete install-dependencies && return 0 - pkg_list="screen" + pkg_list="screen jq dnsutils" if is_debuntu ; then execute_with_retries apt-get -y -q install ${pkg_list} elif is_rocky ; then execute_with_retries dnf -y -q install ${pkg_list} ; fi mark_complete install-dependencies @@ -2036,6 +2493,8 @@ readonly HADOOP_CONF_DIR='/etc/hadoop/conf' readonly SPARK_CONF_DIR='/etc/spark/conf' readonly bdcfg="/usr/local/bin/bdconfig" readonly workdir=/opt/install-dpgce # Needed for cache_fetched_package +readonly tmpdir="${tmpdir}" +readonly install_log="${tmpdir}/install.log" # --- Define Necessary Global Arrays --- # These need to be explicitly defined here as they are not functions. @@ -2149,14 +2608,15 @@ $(declare -f cache_fetched_package) $(declare -f execute_with_retries) # --- Define gsutil/gcloud commands and curl args --- -gsutil_cmd="gcloud storage" -gsutil_stat_cmd="gcloud storage objects describe" -gcloud_sdk_version="\$(gcloud --version | awk -F'SDK ' '/Google Cloud SDK/ {print \$2}' || echo '0.0.0')" -if version_lt "\${gcloud_sdk_version}" "402.0.0" ; then - gsutil_cmd="gsutil -o GSUtil:check_hashes=never" - gsutil_stat_cmd="gsutil stat" +gcloud_sdk_version="$(gcloud --version | awk -F'SDK ' '/Google Cloud SDK/ {print $2}' || echo '0.0.0')" +if version_lt "${gcloud_sdk_version}" "402.0.0" ; then + gsutil_cmd=("gsutil" "-o" "GSUtil:check_hashes=never") + gsutil_stat_cmd=("gsutil" "stat") +else + gsutil_cmd=("gcloud" "storage") + gsutil_stat_cmd=("gcloud" "storage" "objects" "describe") fi -curl_retry_args="-fsSL --retry-connrefused --retry 10 --retry-max-time 30" +curl_retry_args=("-fsSL" "--retry-connrefused" "--retry" "10" "--retry-max-time" "30") # --- Include the main config function --- $(declare -f run_hadoop_spark_config) @@ -2237,15 +2697,21 @@ function main() { if [[ -n 
${CUDNN_VERSION} ]]; then install_nvidia_nccl - install_nvidia_cudnn + local default_cudnn_source="package" + if is_rocky && version_le "${DATAPROC_IMAGE_VERSION}" "2.1" ; then + default_cudnn_source="tarball" + fi + install_nvidia_cudnn "$(get_metadata_attribute 'cudnn-install-source' "${default_cudnn_source}")" fi - case "${INCLUDE_PYTORCH^^}" in - "1" | "YES" | "TRUE" ) install_pytorch ;; - esac + + install_tensorflow + install_pytorch #Install GPU metrics collection in Stackdriver if needed if [[ "${INSTALL_GPU_AGENT}" == "true" ]]; then + echo "DEBUG: About to call install_gpu_agent" #install_ops_agent install_gpu_agent + echo "DEBUG: Finished install_gpu_agent call. Exit code: $?" echo 'GPU metrics agent successfully deployed.' else echo 'GPU metrics agent will not be installed.' @@ -2253,7 +2719,7 @@ function main() { # for some use cases, the kernel module needs to be removed before first use of nvidia-smi for module in nvidia_uvm nvidia_drm nvidia_modeset nvidia ; do - rmmod ${module} > /dev/null 2>&1 || echo "unable to rmmod ${module}" + rmmod "${module}" > /dev/null 2>&1 || echo "unable to rmmod \"${module}\"" done if test -n "$(nvsmi -L)" ; then @@ -2322,11 +2788,11 @@ function cache_fetched_package() { local gcs_fn="$2" local local_fn="$3" - if ${gsutil_stat_cmd} "${gcs_fn}" 2>&1 ; then - execute_with_retries ${gsutil_cmd} cp "${gcs_fn}" "${local_fn}" + if "${gsutil_stat_cmd[@]}" "${gcs_fn}" > /dev/null 2>&1; then + execute_with_retries "${gsutil_cmd[@]}" cp "${gcs_fn}" "${local_fn}" else - time ( curl ${curl_retry_args} "${src_url}" -o "${local_fn}" && \ - execute_with_retries ${gsutil_cmd} cp "${local_fn}" "${gcs_fn}" ; ) + time ( curl "${curl_retry_args[@]}" "${src_url}" -o "${local_fn}" && \ + execute_with_retries "${gsutil_cmd[@]}" cp "${local_fn}" "${gcs_fn}" ; ) fi } @@ -2427,8 +2893,7 @@ function clean_up_sources_lists() { # if [[ -f /etc/apt/sources.list.d/mysql.list ]]; then rm -f /usr/share/keyrings/mysql.gpg - curl ${curl_retry_args} 
'https://keyserver.ubuntu.com/pks/lookup?op=get&search=0xBCA43417C3B485DD128EC6D4B7B3B788A8D3785C' | \ - gpg --dearmor -o /usr/share/keyrings/mysql.gpg + import_gpg_keys "/usr/share/keyrings/mysql.gpg" "0xBCA43417C3B485DD128EC6D4B7B3B788A8D3785C" sed -i -e 's:deb https:deb [signed-by=/usr/share/keyrings/mysql.gpg] https:g' /etc/apt/sources.list.d/mysql.list fi @@ -2442,7 +2907,7 @@ function exit_handler() { # clean up incomplete build indicators if test -n "${building_file}" ; then - if ${gsutil_stat_cmd} "${building_file}" ; then ${gsutil_cmd} rm "${building_file}" || true ; fi + if "${gsutil_stat_cmd[@]}" "${building_file}" ; then "${gsutil_cmd[@]}" rm "${building_file}" || true ; fi fi set +e # Allow cleanup commands to fail without exiting script @@ -2478,7 +2943,7 @@ function exit_handler() { apt-mark hold systemd libsystemd0 ; fi hold_nvidia_packages else - dnf clean all + execute_with_retries dnf clean all fi # print disk usage statistics for large components @@ -2672,12 +3137,12 @@ EOF echo "${output}" exit 1 } - output=$(curl --verbose -fsSL --retry-connrefused --retry 10 --retry-max-time 30 --head "https://google.com" 2>&1)|| { + output="$(curl --verbose -fsSL --retry-connrefused --retry 10 --retry-max-time 30 --head "https://google.com" 2>&1)" || { echo "curl rejects proxy configuration" - echo "${curl_output}" + echo "${output}" exit 1 } - output=$(curl --verbose -fsSL --retry-connrefused --retry 10 --retry-max-time 30 --head "https://developer.download.nvidia.com/compute/cuda/12.6.3/local_installers/cuda_12.6.3_560.35.05_linux.run" 2>&1)|| { + output="$(curl --verbose -fsSL --retry-connrefused --retry 10 --retry-max-time 30 --head "https://developer.download.nvidia.com/compute/cuda/12.6.3/local_installers/cuda_12.6.3_560.35.05_linux.run" 2>&1)" || { echo "curl rejects proxy configuration" echo "${output}" exit 1 @@ -2730,8 +3195,10 @@ function mount_ramdisk(){ # Download OS packages to tmpfs if is_debuntu ; then + mkdir -p /var/cache/apt/archives mount 
-t tmpfs tmpfs /var/cache/apt/archives else + mkdir -p /var/cache/dnf mount -t tmpfs tmpfs /var/cache/dnf fi } @@ -2761,6 +3228,16 @@ function harden_sshd_config() { } function prepare_to_install(){ + # Setup temporary directories (potentially on RAM disk) + tmpdir=/tmp/ # Default + mount_ramdisk # Updates tmpdir if successful + export tmpdir + install_log="${tmpdir}/install.log" # Set install log path based on final tmpdir + export install_log + + # Evaluate network and cache results *before* any network operations + evaluate_network + readonly uname_r=$(uname -r) # Verify OS compatability and Secure boot state check_os @@ -2780,17 +3257,17 @@ function prepare_to_install(){ # With the 402.0.0 release of gcloud sdk, `gcloud storage` can be # used as a more performant replacement for `gsutil` - gsutil_cmd="gcloud storage" - gsutil_stat_cmd="gcloud storage objects describe" + gsutil_cmd=("gcloud" "storage") + gsutil_stat_cmd=("gcloud" "storage" "objects" "describe") gcloud_sdk_version="$(gcloud --version | awk -F'SDK ' '/Google Cloud SDK/ {print $2}')" if version_lt "${gcloud_sdk_version}" "402.0.0" ; then - gsutil_cmd="gsutil -o GSUtil:check_hashes=never" - gsutil_stat_cmd="gsutil stat" + gsutil_cmd=("gsutil" "-o" "GSUtil:check_hashes=never") + gsutil_stat_cmd=("gsutil" "stat") fi # if fetches of nvidia packages fail, apply -k argument to the following. 
- curl_retry_args="-fsSL --retry-connrefused --retry 10 --retry-max-time 30" + curl_retry_args=("-fsSL" "--retry-connrefused" "--retry" "10" "--retry-max-time" "30") # After manually verifying the veracity of the asset, take note of sha256sum # of the downloaded files in your gcs bucket and submit these data with an @@ -2811,11 +3288,6 @@ function prepare_to_install(){ # ["NVIDIA-Linux-x86_64-550.135.run"]="a8c3ae0076f11e864745fac74bfdb01f" # ["NVIDIA-Linux-x86_64-550.142.run"]="e507e578ecf10b01a08e5424dddb25b8" - # Setup temporary directories (potentially on RAM disk) - tmpdir=/tmp/ # Default - mount_ramdisk # Updates tmpdir if successful - install_log="${tmpdir}/install.log" # Set install log path based on final tmpdir - workdir=/opt/install-dpgce # Set GCS bucket for caching temp_bucket="$(get_metadata_attribute dataproc-temp-bucket)" @@ -2835,11 +3307,14 @@ function prepare_to_install(){ harden_sshd_config if is_debuntu ; then + # Globally configure apt/dpkg to wait up to 60 seconds for locks + echo 'DPkg::Lock::Timeout="60";' > /etc/apt/apt.conf.d/99-dpkg-lock-timeout + repair_old_backports clean_up_sources_lists apt-get update -qq --allow-releaseinfo-change apt-get -y clean - apt-get -o DPkg::Lock::Timeout=60 -y autoremove + apt-get -y autoremove if ge_debian12 ; then apt-mark unhold systemd libsystemd0 ; fi if is_ubuntu ; then @@ -2847,7 +3322,7 @@ function prepare_to_install(){ while ! 
command -v gcloud ; do sleep 5s ; done fi else # Rocky - dnf clean all + execute_with_retries dnf clean all fi # zero free disk space (only if creating image) @@ -2919,7 +3394,7 @@ function apt_add_repo() { echo "deb [signed-by=${kr_path}] ${repo_data}" > "${repo_path}" if [[ "${include_src}" == "yes" ]] ; then - echo "deb-src [signed-by=${kr_path}] ${repo_data}" >> "${repo_path}" + echo "deb-src [signed-by='${kr_path}'] ${repo_data}" >> "${repo_path}" fi apt-get update -qq @@ -2934,7 +3409,7 @@ function dnf_add_repo() { local -r kr_path="${5:-/etc/pki/rpm-gpg/${repo_name}.gpg}" local -r repo_path="${6:-/etc/yum.repos.d/${repo_name}.repo}" - curl ${curl_retry_args} "${repo_url}" \ + curl "${curl_retry_args[@]}" "${repo_url}" \ | dd of="${repo_path}" status=progress } diff --git a/gpu/run-bazel-tests-with-podman.sh b/gpu/run-bazel-tests-with-podman.sh new file mode 100644 index 000000000..d43cea57e --- /dev/null +++ b/gpu/run-bazel-tests-with-podman.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +set -e + +# Ensure key file exists +if [ ! -f "key.json" ]; then + echo "Error: key.json not found. Please create it." + echo "Example: gcloud iam service-accounts keys create key.json --iam-account=YOUR-SA@YOUR-PROJECT.iam.gserviceaccount.com --project=YOUR-PROJECT" + exit 1 +fi + +# Create the host directory if it doesn't exist and make it writable +HOST_CACHE_DIR="${PWD}/tmp/bazel-cache" +mkdir -p "${HOST_CACHE_DIR}" +chmod 777 "${HOST_CACHE_DIR}" +echo "Host cache directory: ${HOST_CACHE_DIR}" + +podman build -f gpu/Dockerfile -t gpu-init-actions-runner:latest . 
+ +IMAGE_VERSION="${1:-2.2-debian12}" + +time podman run -it --rm \ + --name gpu-test-runner \ + -v ${HOST_CACHE_DIR}:/home/ia-tests/.cache/bazel:Z \ + -e GOOGLE_APPLICATION_CREDENTIALS=/init-actions/key.json \ + -e PROJECT_ID="${PROJECT_ID:-$(gcloud config get-value project 2>/dev/null)}" \ + -e REGION="${REGION:-$(gcloud config get-value compute/region 2>/dev/null)}" \ + --entrypoint /bin/bash \ + gpu-init-actions-runner:latest \ + /init-actions/gpu/run-bazel-tests.sh "$@" \ No newline at end of file diff --git a/gpu/run-bazel-tests.sh b/gpu/run-bazel-tests.sh index ae717bf5b..f9c59a278 100644 --- a/gpu/run-bazel-tests.sh +++ b/gpu/run-bazel-tests.sh @@ -6,18 +6,29 @@ IMAGE="rapids-actions-image:$BUILD_ID" max_parallel_tests=10 IMAGE_VERSION="$1" +shift if [[ -z "${IMAGE_VERSION}" ]] ; then IMAGE_VERSION="$(jq -r .IMAGE_VERSION env.json)" ; fi ; export IMAGE_VERSION #declare -a TESTS_TO_RUN=('dask:test_dask' 'rapids:test_rapids') #declare -a TESTS_TO_RUN=('dask:test_dask') #declare -a TESTS_TO_RUN=('rapids:test_rapids') +if [[ -n "${GOOGLE_APPLICATION_CREDENTIALS:-}" ]] && [[ -f "${GOOGLE_APPLICATION_CREDENTIALS}" ]]; then + echo "Authenticating gcloud with service account key..." 
+ gcloud auth activate-service-account --key-file="${GOOGLE_APPLICATION_CREDENTIALS}" + gcloud config set project "${PROJECT_ID}" +fi + declare -a TESTS_TO_RUN=('gpu:test_gpu') time bazel test \ --jobs="${max_parallel_tests}" \ --local_test_jobs="${max_parallel_tests}" \ --action_env="INTERNAL_IP_SSH=true" \ + --test_env="PROJECT_ID=${PROJECT_ID}" \ + --test_env="REGION=${REGION}" \ + --test_env="GOOGLE_APPLICATION_CREDENTIALS=${GOOGLE_APPLICATION_CREDENTIALS}" \ --test_output="errors" \ --test_arg="--image_version=${IMAGE_VERSION}" \ + "$@" \ "${TESTS_TO_RUN[@]}" diff --git a/gpu/split.go b/gpu/split.go new file mode 100644 index 000000000..992bfa7fa --- /dev/null +++ b/gpu/split.go @@ -0,0 +1,131 @@ +package main + +import ( + "bufio" + "fmt" + "io/ioutil" + "os" + "path/filepath" + "sort" + "strings" + + "mvdan.cc/sh/v3/syntax" +) + +type chunk struct { + startLine int + endLine int + name string + isFunc bool +} + +func main() { + if len(os.Args) < 2 { + fmt.Fprintf(os.Stderr, "Usage: %s \n", os.Args[0]) + os.Exit(1) + } + inputFile := os.Args[1] + outputDir := inputFile + ".d" + + if err := os.MkdirAll(outputDir, 0755); err != nil { + fmt.Fprintf(os.Stderr, "Error creating output directory: %v\n", err) + os.Exit(1) + } + + content, err := ioutil.ReadFile(inputFile) + if err != nil { + fmt.Fprintf(os.Stderr, "Error reading input file: %v\n", err) + os.Exit(1) + } + scriptContent := string(content) + lines := strings.Split(scriptContent, "\n") + + parser := syntax.NewParser() + f, err := parser.Parse(strings.NewReader(scriptContent), "") + if err != nil { + fmt.Fprintf(os.Stderr, "Error parsing script: %v\n", err) + os.Exit(1) + } + + var chunks []chunk + syntax.Walk(f, func(node syntax.Node) bool { + if node == nil { + return false + } + + switch x := node.(type) { + case *syntax.FuncDecl: + chunks = append(chunks, chunk{ + startLine: int(x.Pos().Line()), + endLine: int(x.End().Line()), + name: x.Name.Value, + isFunc: true, + }) + return false // Don't descend 
into function body + } + return true + }) + + sort.Slice(chunks, func(i, j int) bool { + return chunks[i].startLine < chunks[j].startLine + }) + + var fileIndex int + lastLine := 0 + + writeChunk := func(start, end int, name string) { + if start > end || start <= 0 || end <= 0 { + return + } + fileName := fmt.Sprintf("%03d_%s.sh", fileIndex, name) + filePath := filepath.Join(outputDir, fileName) + fileIndex++ + + fmt.Printf("Extracting lines %d to %d to %s\n", start, end, filePath) + outFile, err := os.Create(filePath) + if err != nil { + fmt.Fprintf(os.Stderr, "Error creating output file: %v\n", err) + return + } + defer outFile.Close() + + writer := bufio.NewWriter(outFile) + for i := start - 1; i < end && i < len(lines); i++ { + fmt.Fprintln(writer, lines[i]) + } + writer.Flush() + } + + // Header + if len(chunks) > 0 && chunks[0].startLine > 1 { + writeChunk(1, chunks[0].startLine-1, "header") + lastLine = chunks[0].startLine - 1 + } else if len(chunks) == 0 { + writeChunk(1, len(lines), "header") + lastLine = len(lines) + } + + for _, c := range chunks { + // Interim + if c.startLine > lastLine+1 { + writeChunk(lastLine+1, c.startLine-1, "interim") + } + + // Function + writeChunk(c.startLine, c.endLine, c.name) + lastLine = c.endLine + } + + // Footer (after the last function) + if lastLine < len(lines) { + finalEndLine := len(lines) + if len(lines) > 0 && lines[len(lines)-1] == "" { + finalEndLine-- + } + if lastLine < finalEndLine { + writeChunk(lastLine+1, finalEndLine, "footer") + } + } + + fmt.Println("Splitting complete.") +} diff --git a/gpu/test_gpu.py b/gpu/test_gpu.py index d6c86bd8c..bdc5d6c67 100644 --- a/gpu/test_gpu.py +++ b/gpu/test_gpu.py @@ -1,5 +1,6 @@ import pkg_resources import time +import os from absl.testing import absltest from absl.testing import parameterized @@ -18,11 +19,16 @@ class NvidiaGpuDriverTestCase(DataprocTestCase): GPU_A100 = "type=nvidia-tesla-a100,count=2" GPU_H100 = "type=nvidia-h100-80gb,count=2" - # Tests for PyTorch 
- TORCH_TEST_SCRIPT_FILE_NAME = "verify_pytorch.py" - - # Tests for TensorFlow - TF_TEST_SCRIPT_FILE_NAME = "verify_tensorflow.py" + @classmethod + def setUpClass(cls): + import os + if os.getenv("PROJECT_ID"): + os.environ["CLOUDSDK_CORE_PROJECT"] = os.getenv("PROJECT_ID") + DataprocTestCase.PROJECT = os.getenv("PROJECT_ID") + if os.getenv("REGION"): + os.environ["CLOUDSDK_COMPUTE_REGION"] = os.getenv("REGION") + DataprocTestCase.REGION = os.getenv("REGION") + super().setUpClass() def assert_instance_command(self, instance, @@ -63,18 +69,17 @@ def verify_pytorch(self, name): self.TORCH_TEST_SCRIPT_FILE_NAME) self.upload_test_file(test_filename, name) - conda_env="dpgce" - # until the numa node is selected, every time the GPU is accessed # from pytorch, log noise about numa node not being selected is # printed to the console. Selecting numa node before the python is # executed improves readability of the diagnostic information. - verify_cmd = \ - "env={} ; envpath=/opt/conda/miniconda3/envs/${env} ; ".format(conda_env) + \ - "for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node) ; do echo 0 > ${f} ; done ;" + \ - "${envpath}/bin/python {}".format( - self.TORCH_TEST_SCRIPT_FILE_NAME) + verify_cmd = ( + "for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node 2>/dev/null) ; do echo 0 > ${f} ; done ; " + "PY_BIN=$(find /opt/conda -maxdepth 6 -path '*/envs/pytorch/bin/python3' | head -n1); " + "if [[ -z \"$PY_BIN\" ]]; then echo 'PyTorch python not found'; exit 1; fi; " + f"$PY_BIN {self.TORCH_TEST_SCRIPT_FILE_NAME}" + ) self.assert_instance_command(name, verify_cmd) self.remove_test_script(self.TORCH_TEST_SCRIPT_FILE_NAME, name) @@ -83,15 +88,24 @@ def verify_tensorflow(self, name): self.TF_TEST_SCRIPT_FILE_NAME) self.upload_test_file(test_filename, name) # all on a single numa node - conda_env="dpgce" - verify_cmd = \ - "env={} ; envpath=/opt/conda/miniconda3/envs/${env} ; ".format(conda_env) + \ - "for f in $(ls 
/sys/module/nvidia/drivers/pci:nvidia/*/numa_node) ; do echo 0 > ${f} ; done ;" + \ - "${envpath}/bin/python {}".format( - self.TF_TEST_SCRIPT_FILE_NAME) + verify_cmd = ( + "for f in $(ls /sys/module/nvidia/drivers/pci:nvidia/*/numa_node 2>/dev/null) ; do echo 0 > ${f} ; done ; " + "PY_BIN=$(find /opt/conda -maxdepth 6 -path '*/envs/tensorflow/bin/python3' | head -n1); " + "if [[ -z \"$PY_BIN\" ]]; then echo 'TensorFlow python not found'; exit 1; fi; " + f"$PY_BIN {self.TF_TEST_SCRIPT_FILE_NAME}" + ) self.assert_instance_command(name, verify_cmd) self.remove_test_script(self.TF_TEST_SCRIPT_FILE_NAME, name) + def verify_rapids(self, name): + # Verify that rapids works + verify_cmd = ( + "PY_BIN=$(find /opt/conda -maxdepth 6 -path '*/envs/rapids/bin/python3' | head -n1); " + "if [[ -z \"$PY_BIN\" ]]; then echo 'Rapids python not found'; exit 1; fi; " + "$PY_BIN -c 'import cuml'" + ) + self.assert_instance_command(name, verify_cmd) + def verify_mig_instance(self, name): self.assert_instance_command(name, "/usr/bin/nvidia-smi --query-gpu=mig.mode.current --format=csv,noheader | uniq | xargs -I % test % = 'Enabled'") @@ -163,7 +177,7 @@ def verify_driver_signature(self, name): if self.getImageOs() == 'ubuntu': cert_path='/var/lib/shim-signed/mok/MOK.der' - cert_verification_cmd = """ + cert_verification_cmd = r""" perl -Mv5.10 -e ' my $cert = ( qx{openssl x509 -inform DER -in {} -text} =~ /Serial Number:.*? 
+(.+?)\s*$/ms ); @@ -180,8 +194,7 @@ def verify_driver_signature(self, name): def test_install_gpu_without_agent(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, driver_provider): -# if self.getImageOs() == 'rocky' and self.getImageVersion() >= pkg_resources.parse_version("2.2"): -# self.skipTest("disabling rocky9 builds due to out of date base dataproc image") + self.skipTest('Limiting tests as we probe for success') metadata = "install-gpu-agent=false" if configuration == 'SINGLE' \ @@ -200,7 +213,7 @@ def test_install_gpu_without_agent(self, configuration, machine_suffixes, worker_accelerator=worker_accelerator, metadata=metadata, timeout_in_minutes=90, - boot_disk_size="50GB") + boot_disk_size="60GB") for machine_suffix in machine_suffixes: machine_name="{}-{}".format(self.getClusterName(),machine_suffix) self.verify_instance(machine_name) @@ -213,8 +226,6 @@ def test_install_gpu_without_agent(self, configuration, machine_suffixes, def test_install_gpu_with_agent(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, driver_provider): -# if self.getImageOs() == 'rocky' and self.getImageVersion() >= pkg_resources.parse_version("2.2"): -# self.skipTest("disabling rocky9 builds due to out of date base dataproc image") self.skipTest("No need to regularly test installing the agent on its own cluster ; this is exercised elsewhere") @@ -234,7 +245,7 @@ def test_install_gpu_with_agent(self, configuration, machine_suffixes, worker_accelerator=worker_accelerator, metadata=metadata, timeout_in_minutes=90, - boot_disk_size="50GB", + boot_disk_size="60GB", scopes="https://www.googleapis.com/auth/monitoring.write") for machine_suffix in machine_suffixes: machine_name="{}-{}".format(self.getClusterName(),machine_suffix) @@ -250,8 +261,9 @@ def test_install_gpu_with_agent(self, configuration, machine_suffixes, def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, 
cuda_version): -# if self.getImageOs() == 'rocky' and self.getImageVersion() >= pkg_resources.parse_version("2.2"): -# self.skipTest("disabling rocky9 builds due to out of date base dataproc image") + + if self.getImageOs() == 'rocky' and self.getImageVersion() <= pkg_resources.parse_version("2.0"): + self.skipTest("2.0-rocky8 known to fail") if configuration == 'KERBEROS' \ and self.getImageVersion() <= pkg_resources.parse_version("2.1"): @@ -283,7 +295,7 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes, worker_accelerator=worker_accelerator, metadata=metadata, timeout_in_minutes=90, - boot_disk_size="50GB") + boot_disk_size="60GB") for machine_suffix in machine_suffixes: machine_name="{}-{}".format(self.getClusterName(),machine_suffix) @@ -300,8 +312,6 @@ def test_install_gpu_cuda_nvidia(self, configuration, machine_suffixes, def test_install_gpu_with_mig(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, driver_provider, cuda_version): -# if self.getImageOs() == 'rocky' and self.getImageVersion() >= pkg_resources.parse_version("2.2"): -# self.skipTest("disabling rocky9 builds due to out of date base dataproc image") # Operation [projects/.../regions/.../operations/...] 
failed: # Invalid value for field 'resource.machineType': \ @@ -331,7 +341,7 @@ def test_install_gpu_with_mig(self, configuration, machine_suffixes, worker_accelerator=worker_accelerator, metadata=metadata, timeout_in_minutes=90, - boot_disk_size="50GB", + boot_disk_size="60GB", startup_script="gpu/mig.sh") for machine_suffix in ["w-0", "w-1"]: @@ -344,8 +354,9 @@ def test_install_gpu_with_mig(self, configuration, machine_suffixes, ) def test_gpu_allocation(self, configuration, master_accelerator, worker_accelerator, driver_provider): -# if self.getImageOs() == 'rocky' and self.getImageVersion() >= pkg_resources.parse_version("2.2"): -# self.skipTest("disabling rocky9 builds due to out of date base dataproc image") + + if self.getImageOs() == 'rocky' and self.getImageVersion() <= pkg_resources.parse_version("2.0"): + self.skipTest("2.0-rocky8 known to fail") if configuration == 'SINGLE' \ and self.getImageOs() == 'rocky' \ @@ -364,7 +375,7 @@ def test_gpu_allocation(self, configuration, master_accelerator, machine_type="n1-standard-16", master_accelerator=master_accelerator, worker_accelerator=worker_accelerator, - boot_disk_size="50GB", + boot_disk_size="60GB", timeout_in_minutes=90) self.verify_instance_spark() @@ -379,8 +390,7 @@ def test_gpu_allocation(self, configuration, master_accelerator, def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suffixes, master_accelerator, worker_accelerator, cuda_version): -# if self.getImageOs() == 'rocky' and self.getImageVersion() >= pkg_resources.parse_version("2.2"): -# self.skipTest("disabling rocky9 builds due to out of date base dataproc image") + self.skipTest('Limiting tests as we probe for success') if pkg_resources.parse_version(cuda_version) > pkg_resources.parse_version("12.4") \ and ( ( self.getImageOs() == 'ubuntu' and self.getImageVersion() <= pkg_resources.parse_version("2.0") ) or \ @@ -397,7 +407,7 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf # 
('2.1-rocky8 and 2.0-rocky8 tests are known to fail in SINGLE configuration with errors about nodes_include being empty') self.skipTest("known to fail") - metadata = "install-gpu-agent=true,gpu-driver-provider=NVIDIA,cuda-version={}".format(cuda_version) + metadata = "install-gpu-agent=true,gpu-driver-provider=NVIDIA,cuda-version={},include-tensorflow=true,include-pytorch=yes".format(cuda_version) self.createCluster( configuration, self.INIT_ACTIONS, @@ -406,13 +416,21 @@ def test_install_gpu_cuda_nvidia_with_spark_job(self, configuration, machine_suf worker_accelerator=worker_accelerator, metadata=metadata, timeout_in_minutes=90, - boot_disk_size="50GB", + boot_disk_size="60GB", scopes="https://www.googleapis.com/auth/monitoring.write") for machine_suffix in machine_suffixes: machine_name="{}-{}".format(self.getClusterName(),machine_suffix) self.verify_instance(machine_name) self.verify_instance_gpu_agent(machine_name) + + self.verify_tensorflow(machine_name) + if self.getImageVersion() >= pkg_resources.parse_version("2.1"): + self.verify_pytorch(machine_name) + self.verify_rapids(machine_name) + else: + print("Skipping PyTorch and RAPIDS verification on Dataproc < 2.1 due to expected Conda solver timeout.") + self.verify_instance_spark() @parameterized.parameters( @@ -461,7 +479,7 @@ def untested_driver_signing(self, configuration, machine_suffixes, worker_accelerator=worker_accelerator, metadata=metadata, timeout_in_minutes=90, - boot_disk_size="50GB", + boot_disk_size="60GB", scopes="https://www.googleapis.com/auth/monitoring.write") for machine_suffix in machine_suffixes: hostname="{}-{}".format(self.getClusterName(),machine_suffix)