From 6b2724e091d20d8971222be29493b667af57a16f Mon Sep 17 00:00:00 2001 From: Paulius Velesko Date: Sat, 9 May 2026 10:50:32 +0300 Subject: [PATCH] [phase4][chipstar][l0-igba] cache zeKernelSetIndirectAccess per kernel The L0 backend was calling zeKernelSetIndirectAccess on every kernel launch when the module's HasNoIGBAs flag was false. The Intel L0 driver appears to serialize this entry point globally, which caused 8 HecBench benchmarks (entropy, layout, ldpc, minisweep, p4, scan2, simpleSpmv, sptrsv) to time out under back-to-back launches while passing under the OpenCL backend. The indirect-access flags are a property of the kernel handle and only need to be set once. Add an atomic flag to CHIPKernelLevel0 and use a CAS so that the call is performed exactly once per kernel handle. On failure the flag is cleared so a later launch can retry. --- src/backend/Level0/CHIPBackendLevel0.cc | 21 +++++++++++++++++---- src/backend/Level0/CHIPBackendLevel0.hh | 7 +++++++ 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/src/backend/Level0/CHIPBackendLevel0.cc b/src/backend/Level0/CHIPBackendLevel0.cc index 54f6298a8..380295e21 100644 --- a/src/backend/Level0/CHIPBackendLevel0.cc +++ b/src/backend/Level0/CHIPBackendLevel0.cc @@ -1392,10 +1392,23 @@ CHIPQueueLevel0::launchImpl(chipstar::ExecItem *ExecItem) { if (!ModInfo.HasNoIGBAs) { // skpiing this check because PVC has a hardcoded value for this flag even though it's not supported: // if (!LzDev->hasOnDemandPaging()) - zeStatus = zeKernelSetIndirectAccess( - KernelZe, ZE_KERNEL_INDIRECT_ACCESS_FLAG_DEVICE | - ZE_KERNEL_INDIRECT_ACCESS_FLAG_HOST); - CHIPERR_CHECK_LOG_AND_THROW_TABLE(zeKernelSetIndirectAccess); + // The L0 driver appears to serialize zeKernelSetIndirectAccess across + // threads (it can deadlock or crawl under back-to-back launches). + // The flag is a property of the kernel handle and only needs to be set + // once per kernel, not per launch. 
Cache the result and skip the call + // if it has already been performed for this kernel. + bool Expected = false; + if (ChipKernel->IndirectAccessSet_.compare_exchange_strong( + Expected, true, std::memory_order_acq_rel)) { + zeStatus = zeKernelSetIndirectAccess( + KernelZe, ZE_KERNEL_INDIRECT_ACCESS_FLAG_DEVICE | + ZE_KERNEL_INDIRECT_ACCESS_FLAG_HOST); + if (zeStatus != ZE_RESULT_SUCCESS) { + // Reset the flag so a future launch can retry. + ChipKernel->IndirectAccessSet_.store(false, std::memory_order_release); + } + CHIPERR_CHECK_LOG_AND_THROW_TABLE(zeKernelSetIndirectAccess); + } } // if there's a spill buffer, we must use an event so we can track when diff --git a/src/backend/Level0/CHIPBackendLevel0.hh b/src/backend/Level0/CHIPBackendLevel0.hh index 298f7b3b3..2780dc357 100644 --- a/src/backend/Level0/CHIPBackendLevel0.hh +++ b/src/backend/Level0/CHIPBackendLevel0.hh @@ -30,6 +30,7 @@ #include "../src/common.hh" #include "zeHipErrorConversion.hh" #include +#include <atomic> static thread_local ze_result_t zeStatus; // instantiated in CHIPBackendLevel0.cc @@ -629,6 +630,12 @@ protected: CHIPDeviceLevel0 *Device; public: + // Tracks whether zeKernelSetIndirectAccess has already been called on + // this kernel handle. The L0 driver appears to serialize this call + // globally; calling it once per kernel instead of once per launch + // avoids contention/deadlocks with high launch rates. + std::atomic<bool> IndirectAccessSet_{false}; + CHIPKernelLevel0(); virtual ~CHIPKernelLevel0() {