Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 49 additions & 0 deletions api/nvidia/v1/clusterpolicy_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,8 @@ type ClusterPolicySpec struct {
HostPaths HostPathsSpec `json:"hostPaths,omitempty"`
// KataSandboxDevicePlugin component spec
KataSandboxDevicePlugin KataDevicePluginSpec `json:"kataSandboxDevicePlugin,omitempty"`
// FabricManager component spec
FabricManager FabricManagerSpec `json:"fabricManager,omitempty"`
}

// Runtime defines container runtime type
Expand Down Expand Up @@ -1819,6 +1821,38 @@ type CDIConfigSpec struct {
NRIPluginEnabled *bool `json:"nriPluginEnabled,omitempty"`
}

// FabricMode names the operating mode that NVIDIA Fabric Manager runs in.
type FabricMode string

// Supported Fabric Manager modes.
const (
	// FabricModeFullPassthrough selects Full-passthrough mode (FABRIC_MODE=0).
	FabricModeFullPassthrough FabricMode = "full-passthrough"
	// FabricModeSharedNVSwitch selects Shared NVSwitch Virtualization mode (FABRIC_MODE=1).
	FabricModeSharedNVSwitch FabricMode = "shared-nvswitch"
)

// String returns the textual form of a recognized mode and the empty string
// for any other value.
func (f FabricMode) String() string {
	// Only the two declared modes have a textual form; their underlying
	// string is exactly that form.
	if f == FabricModeFullPassthrough || f == FabricModeSharedNVSwitch {
		return string(f)
	}
	return ""
}

// FabricManagerSpec defines the properties for NVIDIA Fabric Manager configuration.
// It is exposed on ClusterPolicySpec through the fabricManager field. The markers
// on Mode restrict it to full-passthrough or shared-nvswitch and declare
// full-passthrough as the schema default.
type FabricManagerSpec struct {
	// Mode indicates the Fabric Manager mode
	// +kubebuilder:validation:Enum=full-passthrough;shared-nvswitch
	// +kubebuilder:default=full-passthrough
	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Fabric Manager Mode"
	// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:select:full-passthrough,urn:alm:descriptor:com.tectonic.ui:select:shared-nvswitch"
	Mode FabricMode `json:"mode,omitempty"`
}

// MIGStrategy indicates MIG mode
type MIGStrategy string

Expand Down Expand Up @@ -2334,3 +2368,18 @@ func (c *MIGPartedConfigSpec) GetName() string {
func (c *VGPUDevicesConfigSpec) GetName() string {
	// Deref substitutes an empty VGPUDevicesConfigSpec when c is nil, so the
	// lookup below never dereferences a nil receiver.
	spec := ptr.Deref(c, VGPUDevicesConfigSpec{})
	return spec.Name
}

// IsSharedNVSwitchMode reports whether Fabric Manager is configured for
// Shared NVSwitch mode. A nil receiver is treated as "not configured" and
// yields false instead of panicking.
func (f *FabricManagerSpec) IsSharedNVSwitchMode() bool {
	return f != nil && f.Mode == FabricModeSharedNVSwitch
}

// ValidateFabricManagerConfig checks that the Fabric Manager settings are
// compatible with the rest of the spec. It returns an error only when the
// default sandbox workload is vm-passthrough, Fabric Manager is in Shared
// NVSwitch mode, and the driver is disabled; in every other case it returns nil.
func (c *ClusterPolicySpec) ValidateFabricManagerConfig() error {
	// Each guard rules out one precondition of the invalid combination, in
	// the same order the conditions are evaluated.
	if c.SandboxWorkloads.DefaultWorkload != "vm-passthrough" {
		return nil
	}
	if !c.FabricManager.IsSharedNVSwitchMode() {
		return nil
	}
	if c.Driver.IsEnabled() {
		return nil
	}
	return fmt.Errorf("driver must be enabled when using vm-passthrough with Fabric Manager Shared NVSwitch mode")
}
16 changes: 16 additions & 0 deletions api/nvidia/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 8 additions & 2 deletions assets/state-driver/0400_configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,14 @@ data:
fi

if ! nvidia-smi; then
echo "nvidia-smi failed"
exit 1
# For vm-passthrough with shared-nvswitch mode, nvidia-smi may fail due to unbound devices
# Fall back to checking if nvidia module is loaded when FABRIC_MANAGER_FABRIC_MODE=1
Comment on lines +25 to +26
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Question (for my understanding) -- GPUs may not be bound to the nvidia driver since there is a chance that the vfio-manager already ran and unbound the devices? Am I understanding this correctly?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, that's correct. In shared-nvswitch mode, the vfio-manager runs vfio-manage unbind --all to unbind GPUs from the nvidia driver and rebind them to vfio-pci for VM passthrough. The nvidia kernel module remains loaded (needed for Fabric Manager / NVSwitch management), but since the GPU devices are no longer bound to the nvidia driver, nvidia-smi fails. So we fall back to just verifying the kernel module is loaded, which is sufficient for this mode.

if [ "${FABRIC_MANAGER_FABRIC_MODE:-}" = "1" ]; then
echo "nvidia-smi failed but nvidia module is loaded (vm-passthrough with shared-nvswitch mode)"
else
echo "nvidia-smi failed"
exit 1
fi
fi

GPU_DIRECT_RDMA_ENABLED="${GPU_DIRECT_RDMA_ENABLED:-false}"
Expand Down
7 changes: 7 additions & 0 deletions assets/state-sandbox-validation/0200_role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,10 @@ rules:
- use
resourceNames:
- privileged
- apiGroups:
- apps
resources:
- daemonsets
verbs:
- get
- list
33 changes: 33 additions & 0 deletions assets/state-sandbox-validation/0500_daemonset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,36 @@ spec:
priorityClassName: system-node-critical
serviceAccountName: nvidia-sandbox-validator
initContainers:
- name: driver-validation
image: "FILLED BY THE OPERATOR"
command: ["sh", "-c"]
args: ["nvidia-validator"]
env:
- name: WITH_WAIT
value: "true"
- name: COMPONENT
value: driver
- name: OPERATOR_NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
securityContext:
privileged: true
seLinuxOptions:
level: "s0"
volumeMounts:
- name: host-root
mountPath: /host
readOnly: true
mountPropagation: HostToContainer
- name: driver-install-path
mountPath: /run/nvidia/driver
mountPropagation: HostToContainer
- name: run-nvidia-validations
mountPath: /run/nvidia/validations
mountPropagation: Bidirectional
- name: host-dev-char
mountPath: /host-dev-char
- name: cc-manager-validation
image: "FILLED BY THE OPERATOR"
command: ['sh', '-c']
Expand Down Expand Up @@ -145,3 +175,6 @@ spec:
- name: host-root
hostPath:
path: /
- name: host-dev-char
hostPath:
path: /dev/char
30 changes: 30 additions & 0 deletions assets/state-vfio-manager/0400_configmap.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: nvidia-vfio-manager-entrypoint
namespace: "FILLED BY THE OPERATOR"
labels:
app: nvidia-vfio-manager
data:
init-entrypoint.sh: |-
#!/bin/sh
# Entrypoint script mounted into the vfio-manager pod at /bin/init-entrypoint.sh.
# FABRIC_MANAGER_MODE selects the behavior:
#   shared-nvswitch - wait for the driver-ready marker, then unbind all devices
#   anything else   - uninstall the driver

if [ "${FABRIC_MANAGER_MODE}" = "shared-nvswitch" ]; then
# In shared-nvswitch mode, wait for driver to be ready before unbinding devices
echo "Shared NVSwitch mode detected, waiting for driver readiness..."
# Poll every 5 seconds until the driver-ready marker file exists.
until [ -f /run/nvidia/validations/driver-ready ]
do
echo "waiting for the driver validations to be ready..."
sleep 5
done

# allexport marks every variable assigned from here on for export, so the
# values sourced from driver-ready are inherited by the exec'd command below.
# The file is also printed to stdout before it is sourced.
set -o allexport
cat /run/nvidia/validations/driver-ready
. /run/nvidia/validations/driver-ready

echo "Driver is ready, proceeding with device unbind"
# exec replaces this shell, so the script's exit status is that of vfio-manage.
exec vfio-manage unbind --all
else
# Default mode: uninstall the driver
exec driver-manager uninstall_driver
fi
20 changes: 18 additions & 2 deletions assets/state-vfio-manager/0500_daemonset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,9 @@ spec:
- name: k8s-driver-manager
image: "FILLED BY THE OPERATOR"
imagePullPolicy: IfNotPresent
command: ["driver-manager"]
args: ["uninstall_driver"]
command: ["/bin/sh", "-c"]
args:
- /bin/init-entrypoint.sh
env:
- name: NODE_NAME
valueFrom:
Expand All @@ -47,6 +48,10 @@ spec:
securityContext:
privileged: true
volumeMounts:
- name: nvidia-vfio-manager-entrypoint
readOnly: true
mountPath: /bin/init-entrypoint.sh
subPath: init-entrypoint.sh
- name: run-nvidia
mountPath: /run/nvidia
mountPropagation: Bidirectional
Expand Down Expand Up @@ -80,6 +85,9 @@ spec:
readOnly: true
- name: host-root
mountPath: /host
- name: run-nvidia-validations
mountPath: /run/nvidia/validations
mountPropagation: Bidirectional
securityContext:
privileged: true
seLinuxOptions:
Expand All @@ -90,6 +98,10 @@ spec:
command: ["vfio-manage unbind --all"]
terminationGracePeriodSeconds: 30
volumes:
- name: nvidia-vfio-manager-entrypoint
configMap:
name: nvidia-vfio-manager-entrypoint
defaultMode: 448
- name: host-sys
hostPath:
path: /sys
Expand All @@ -102,6 +114,10 @@ spec:
hostPath:
path: /run/nvidia
type: DirectoryOrCreate
- name: run-nvidia-validations
hostPath:
path: /run/nvidia/validations
type: DirectoryOrCreate
- name: host-root
hostPath:
path: "/"
11 changes: 11 additions & 0 deletions bundle/manifests/nvidia.com_clusterpolicies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1297,6 +1297,17 @@ spec:
type: string
type: object
type: object
fabricManager:
description: FabricManager component spec
properties:
mode:
default: full-passthrough
description: Mode indicates the Fabric Manager mode
enum:
- full-passthrough
- shared-nvswitch
type: string
type: object
gdrcopy:
description: GDRCopy component spec
properties:
Expand Down
24 changes: 14 additions & 10 deletions cmd/nvidia-validator/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -1657,18 +1657,22 @@ func (v *VfioPCI) validate() error {
return err
}

err = v.runValidation()
if err != nil {
return err
}
log.Info("Validation completed successfully - all devices are bound to vfio-pci")
for {
log.Info("Attempting to validate that all device are bound to vfio-pci")
err := v.runValidation()
if err != nil {
if !withWaitFlag {
return fmt.Errorf("error validating vfio-pci: %w", err)
}
log.Warningf("failed to validate vfio-pci, retrying after %d seconds\n", sleepIntervalSecondsFlag)
time.Sleep(time.Duration(sleepIntervalSecondsFlag) * time.Second)
continue
}

// delete status file is already present
err = createStatusFile(outputDirFlag + "/" + vfioPCIStatusFile)
if err != nil {
return err
log.Info("Validation completed successfully - all devices are bound to vfio-pci")

return createStatusFile(outputDirFlag + "/" + vfioPCIStatusFile)
}
return nil
}

func (v *VfioPCI) runValidation() error {
Expand Down
11 changes: 11 additions & 0 deletions config/crd/bases/nvidia.com_clusterpolicies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1297,6 +1297,17 @@ spec:
type: string
type: object
type: object
fabricManager:
description: FabricManager component spec
properties:
mode:
default: full-passthrough
description: Mode indicates the Fabric Manager mode
enum:
- full-passthrough
- shared-nvswitch
type: string
type: object
gdrcopy:
description: GDRCopy component spec
properties:
Expand Down
Loading