diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index e6e50e041195..33916b74d982 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -1639,3 +1639,12 @@ static __inline__ void ggml_cuda_kernel_launch(Kernel kernel, const ggml_cuda_ke CUDA_CHECK(cudaGetLastError()); } +// Opt-in stderr logging for MMQ/MMVQ kernel dimensions (set GGML_CUDA_MM_LOG=1). +static inline bool ggml_cuda_mm_log_enabled() { + static int enabled = -1; + if (enabled < 0) { + enabled = getenv("GGML_CUDA_MM_LOG") ? 1 : 0; + } + return enabled; +} + diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu index e1add5e03316..e400fcefa987 100644 --- a/ggml/src/ggml-cuda/mmq.cu +++ b/ggml/src/ggml-cuda/mmq.cu @@ -157,6 +157,15 @@ void ggml_cuda_mul_mat_q( ne02, ne12, s02, s12, s2, ne03, ne13, s03, s13, s3, use_stream_k, ne1}; + + // Log kernel dimensions for profiling analysis (GGML_CUDA_MM_LOG=1). + // dst[ne01, ne11] = src0[ne00, ne01] @ src1[ne10, ne11]; ne00==ne10 (K) + // GEMM C[M,N] = W[N,K] @ X[K,M]: M=ne11, N=ne01, K=ne00 + if (ggml_cuda_mm_log_enabled()) { + fprintf(stderr, "[MUL_MAT_Q] M=%ld N=%ld K=%ld ne00=%ld ne01=%ld ne1=%ld ne11=%ld ne02=%ld ne12=%ld ne03=%ld ne13=%ld type=%d\n", + ne11, ne01, ne00, ne00, ne01, ne1, ne11, ne02, ne12, ne03, ne13, src0->type); + } + ggml_cuda_mul_mat_q_switch_type(ctx, args, stream); return; } @@ -219,6 +228,12 @@ void ggml_cuda_mul_mat_q( ne03, ne13, s03, s13, s3, use_stream_k, ne12}; + // Log kernel dimensions for profiling analysis (batched/MoE path, GGML_CUDA_MM_LOG=1). + if (ggml_cuda_mm_log_enabled()) { + fprintf(stderr, "[MUL_MAT_Q_MoE] M=%ld N=%ld K=%ld ne_get_rows=%ld ne00=%ld ne01=%ld ne02=%ld ne12=%ld type=%d\n", + ne11_flat, ne01, ne00, ne_get_rows, ne00, ne01, ne02, ne12, src0->type); + } + ggml_cuda_mul_mat_q_switch_type(ctx, args, stream); } diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh index b58ac9e7b428..0b42888daefe 100644 --- a/ggml/src/ggml-cuda/mmq.cuh +++ b/ggml/src/ggml-cuda/mmq.cuh @@ -6,9 +6,135 @@ #include #include +#include +#include +#include using namespace ggml_cuda_mma; +#if defined(GGML_USE_HIP) +enum mmq_profile_phase { + MMQ_PROFILE_LOAD_TILES = 0, + MMQ_PROFILE_Y_ACT = 1, + MMQ_PROFILE_VEC_DOT = 2, + MMQ_PROFILE_KB_ITERS = 3, + MMQ_PROFILE_N = 4, +}; + +static __device__ __forceinline__ bool mmq_profile_want_sample(uint64_t * const profile) { + return profile && blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0 + && threadIdx.x == 0 && threadIdx.y == 0; +} + +static __device__ __forceinline__ void mmq_profile_kb_iter(uint64_t * const profile) { + if (!mmq_profile_want_sample(profile)) { + return; + } + ++profile[MMQ_PROFILE_KB_ITERS]; +} + +static __device__ __forceinline__ void mmq_profile_phase_begin(uint64_t * const profile, uint64_t & t0) { + if (!mmq_profile_want_sample(profile)) { + return; + } + t0 = clock64(); +} + +// Safe delta for clock64(): unsigned (t1 - t0) wraps to ~2^64 when t1 < t0 (non-monotonic +// timestamp across __syncthreads / wave scheduling). Discard small backward glitches; keep +// genuine 64-bit counter wrap (t0 near UINT64_MAX, t1 small). +static __device__ __forceinline__ uint64_t mmq_profile_phase_cycles(const uint64_t t0, const uint64_t t1) { + if (t1 >= t0) { + return t1 - t0; + } + const uint64_t backward = t0 - t1; + if (backward > (UINT64_MAX / 2)) { + return t1 - t0; // unsigned wrap + } + return 0; +} + +static __device__ __forceinline__ void mmq_profile_phase_end(uint64_t * const profile, const uint64_t t0, const int phase) { + if (!mmq_profile_want_sample(profile)) { + return; + } + profile[phase] += mmq_profile_phase_cycles(t0, clock64()); +} + +static bool mmq_profile_phases_enabled() { + static int enabled = -1; + if (enabled < 0) { + enabled = getenv("MMQ_PROFILE_PHASES") ? 1 : 0; + } + return enabled; +} + +struct mmq_profile_guard { + ggml_cuda_pool_alloc buf; + cudaStream_t stream = nullptr; + int type = 0; + int mmq_x = 0; + int64_t nrows_x = 0; + int64_t ncols_max = 0; + bool active = false; + + mmq_profile_guard(ggml_cuda_pool & pool, const bool enable, const int type_in, const int mmq_x_in, + const int64_t nrows_x_in, const int64_t ncols_max_in, cudaStream_t stream_in) + : buf(pool), stream(stream_in), type(type_in), mmq_x(mmq_x_in), nrows_x(nrows_x_in), ncols_max(ncols_max_in), active(enable) { + if (!active) { + return; + } + buf.alloc(MMQ_PROFILE_N); + CUDA_CHECK(cudaMemset(buf.get(), 0, MMQ_PROFILE_N*sizeof(uint64_t))); + } + + uint64_t * ptr() { + return active ? buf.get() : nullptr; + } + + ~mmq_profile_guard() { + if (!active) { + return; + } + CUDA_CHECK(cudaStreamSynchronize(stream)); + uint64_t h[MMQ_PROFILE_N] = {}; + CUDA_CHECK(cudaMemcpy(h, buf.get(), sizeof(h), cudaMemcpyDeviceToHost)); + const uint64_t total = h[MMQ_PROFILE_LOAD_TILES] + h[MMQ_PROFILE_Y_ACT] + h[MMQ_PROFILE_VEC_DOT]; + // Skip corrupt rows (overflow / legacy bad deltas) — see mmq_profile_phase_cycles. + if (total > 0 && total < 5000000000ULL) { + fprintf(stderr, + "[MMQ_PROFILE] type=%d mmq_x=%d nrows_x=%ld ncols_max=%ld " + "load_tiles=%.1f%% y_act=%.1f%% vec_dot=%.1f%% " + "kb_iters=%llu cycles_load=%llu cycles_y=%llu cycles_vec=%llu cycles_total=%llu\n", + type, mmq_x, (long) nrows_x, (long) ncols_max, + 100.0*h[MMQ_PROFILE_LOAD_TILES]/total, + 100.0*h[MMQ_PROFILE_Y_ACT]/total, + 100.0*h[MMQ_PROFILE_VEC_DOT]/total, + (unsigned long long) h[MMQ_PROFILE_KB_ITERS], + (unsigned long long) h[MMQ_PROFILE_LOAD_TILES], + (unsigned long long) h[MMQ_PROFILE_Y_ACT], + (unsigned long long) h[MMQ_PROFILE_VEC_DOT], + (unsigned long long) total); + } + } +}; +#else +static __device__ __forceinline__ void mmq_profile_kb_iter(uint64_t *) {} + +static __device__ __forceinline__ void mmq_profile_phase_begin(uint64_t *, uint64_t &) {} + +static __device__ __forceinline__ void mmq_profile_phase_end(uint64_t *, const uint64_t, const int) {} + +static bool mmq_profile_phases_enabled() { + return false; +} + +struct mmq_profile_guard { + mmq_profile_guard(ggml_cuda_pool &, const bool, const int, const int, const int64_t, const int64_t, cudaStream_t) {} + uint64_t * ptr() { return nullptr; } +}; +#endif // GGML_USE_HIP + #define MMQ_DP4A_MAX_BATCH_SIZE 64 // Max. batch size to use for dp4a MMQ kernels when FP16 tensor cores are available. #define MMQ_ITER_K 256 #define MMQ_ITER_K_FP4 512 @@ -275,8 +401,26 @@ static constexpr __host__ __device__ int mmq_get_mma_tile_x_k(ggml_type type) { #define MMQ_TILE_Y_K (MMQ_TILE_NE_K + MMQ_TILE_NE_K / QI8_1) #define MMQ_TILE_Y_FP4_K MMQ_TILE_Y_K +static int mmq_get_tile_y_k_padded_host(const int mmq_x, const int cc, const int warp_size, const int nwarps) { + const int pad = GGML_CUDA_CC_IS_RDNA3_5(cc) ? 2*nwarps*warp_size : nwarps*warp_size; + return GGML_PAD(mmq_x*MMQ_TILE_Y_K, pad); +} + +#if defined(RDNA3_5) +static constexpr __device__ int mmq_get_tile_y_k_padded_device(const int mmq_x, const int nwarps, const int warp_size) { + return GGML_PAD(mmq_x*MMQ_TILE_Y_K, 2*nwarps*warp_size); +} +#else +static constexpr __device__ int mmq_get_tile_y_k_padded_device(const int mmq_x, const int nwarps, const int warp_size) { + return GGML_PAD(mmq_x*MMQ_TILE_Y_K, nwarps*warp_size); +} +#endif // RDNA3_5 + static int mmq_get_granularity_host(const int mmq_x, const int cc) { if (amd_mfma_available(cc) || amd_wmma_available(cc)) { + if (GGML_CUDA_CC_IS_RDNA3_5(cc) && mmq_x >= 64) { + return 32; + } return mmq_x >= 128 ? 32 : 16; } else if (turing_mma_available(cc) && mmq_x >= 48) { return 16; @@ -287,7 +431,11 @@ static int mmq_get_granularity_host(const int mmq_x, const int cc) { #if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) static constexpr __device__ int mmq_get_granularity_device(const int mmq_x) { +#if defined(RDNA3_5) + return mmq_x >= 64 ? 32 : 16; +#else return mmq_x >= 128 ? 32 : 16; +#endif // RDNA3_5 } #elif defined(TURING_MMA_AVAILABLE) static constexpr __device__ int mmq_get_granularity_device(const int mmq_x) { @@ -1303,6 +1451,133 @@ static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mma( #endif // defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) } +template +static __device__ __forceinline__ void vec_dot_q4_0_q8_1_mma( + const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { +#if defined(RDNA3_5) && defined(AMD_WMMA_AVAILABLE) && !defined(AMD_MFMA_AVAILABLE) + constexpr data_layout input_layout = get_input_data_layout(); + typedef tile<16, 8, int, input_layout> tile_A; + typedef tile<16, 8, int, input_layout> tile_B; + typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C; + + constexpr int granularity = mmq_get_granularity_device(mmq_x); + constexpr int rows_per_warp = granularity; + constexpr int ntx = rows_per_warp/tile_C::I; + + y += (threadIdx.y % ntx) * (tile_C::J*MMQ_TILE_Y_K); + + const int * x_qs = (const int *) x; + const float * x_df = (const float *) x_qs + 2*MMQ_TILE_NE_K; + const int * y_qs = (const int *) y + 4; + const half2 * y_ds = (const half2 *) y; + + const int i0 = (threadIdx.y / ntx) * rows_per_warp; + + for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += QI8_0) { + const int k0 = k00 + k01; + + tile_A A[ntx]; + float dA[ntx][tile_C::ne]; + +#pragma unroll + for (int n = 0; n < ntx; ++n) { + load_ldmatrix(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q8_0 + k0, MMQ_MMA_TILE_X_K_Q8_0); + +#pragma unroll + for (int l = 0; l < tile_C::ne; ++l) { + const int i = i0 + n*tile_A::I + tile_C::get_i(l); + dA[n][l] = x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + k0/QI8_0]; + } + } + +#pragma unroll + for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) { + tile_B B; + load_ldmatrix(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K); + + const int j = j0 + tile_C::get_j(0); + const float dB = __low2float(y_ds[j*MMQ_TILE_Y_K + k01/QI8_1]); + +#pragma unroll + for (int n = 0; n < ntx; ++n) { + tile_C C; + mma(C, A[n], B); + +#pragma unroll + for (int l = 0; l < tile_C::ne; ++l) { + sum[(j0/tile_C::J + n)*tile_C::ne + l] += C.x[l]*dA[n][l]*dB; + } + } + } + } +#else + vec_dot_q8_0_q8_1_mma(x, y, sum, k00); +#endif +} + +template +static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mma_rdna35( + const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { +#if defined(RDNA3_5) && defined(AMD_WMMA_AVAILABLE) && !defined(AMD_MFMA_AVAILABLE) + constexpr data_layout input_layout = get_input_data_layout(); + typedef tile<16, 8, int, input_layout> tile_A; + typedef tile<16, 8, int, input_layout> tile_B; + typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C; + + constexpr int granularity = mmq_get_granularity_device(mmq_x); + constexpr int rows_per_warp = granularity; + constexpr int ntx = rows_per_warp/tile_C::I; + + y += (threadIdx.y % ntx) * (tile_C::J*MMQ_TILE_Y_K); + + const int * x_qs = (const int *) x; + const float * x_df = (const float *) x_qs + 2*MMQ_TILE_NE_K; + const int * y_qs = (const int *) y + 4; + const float * y_df = (const float *) y; + + const int i0 = (threadIdx.y / ntx) * rows_per_warp; + + for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += QI8_0) { + const int k0 = k00 + k01; + + tile_A A[ntx]; + float dA[ntx][tile_C::ne]; + +#pragma unroll + for (int n = 0; n < ntx; ++n) { + load_ldmatrix(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q8_0 + k0, MMQ_MMA_TILE_X_K_Q8_0); + +#pragma unroll + for (int l = 0; l < tile_C::ne; ++l) { + const int i = i0 + n*tile_A::I + tile_C::get_i(l); + dA[n][l] = x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + k0/QI8_0]; + } + } + +#pragma unroll + for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) { + tile_B B; + load_ldmatrix(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K); + + const int j = j0 + tile_C::get_j(0); + const float dB = y_df[j*MMQ_TILE_Y_K + k01/QI8_1]; + +#pragma unroll + for (int n = 0; n < ntx; ++n) { + tile_C C; + mma(C, A[n], B); + +#pragma unroll + for (int l = 0; l < tile_C::ne; ++l) { + sum[(j0/tile_C::J + n)*tile_C::ne + l] += C.x[l]*dA[n][l]*dB; + } + } + } + } +#else + vec_dot_q8_0_q8_1_mma(x, y, sum, k00); +#endif +} template static __device__ __forceinline__ void vec_dot_q8_1_q8_1_dp4a( @@ -2100,6 +2375,78 @@ static __device__ __forceinline__ int unpack_scales_q45_K(const int * scales, co ((scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030); // upper 2 bits } +// LDS int offset for pseudo-q8_1 qs within one weight row (matches Q4_1 / WMMA ldmatrix layout). +static __device__ __forceinline__ int mmq_q4_K_qs_lds_k(const int txi, const int nibble) { + return ((txi >> 3) << 4) + (txi & 7) + (nibble << 3); +} + +#if defined(RDNA3_5) +// gfx115x mmq_y=64: 128 threads map 2:1 to rows for scale/dm prep (no warp divergence). +template +static __device__ __forceinline__ void load_tiles_q4_K_dm_rdna35( + const char * __restrict__ x, half2 * __restrict__ x_dm, const int kbx0, const int i_max, const int stride) { + constexpr int warp_size = ggml_cuda_get_physical_warp_size(); + + const int dm_tid = threadIdx.y*warp_size + threadIdx.x; + const int dm_row = dm_tid/2; + const int dm_ksc = dm_tid%2; + + if (dm_row >= mmq_y) { + return; + } + + int i = dm_row; + if (need_check) { + i = min(i, i_max); + } + + const block_q4_K * bxi = (const block_q4_K *) x + kbx0 + i*stride; + const int * scales = (const int *) bxi->scales; + + const int sc32 = unpack_scales_q45_K(scales, dm_ksc + 0); + const int m32 = unpack_scales_q45_K(scales, dm_ksc + 2); + + const uint8_t * sc8 = (const uint8_t *) &sc32; + const uint8_t * m8 = (const uint8_t *) &m32; + + const half2 dm = bxi->dm * make_half2(1.0f, -1.0f); + +#pragma unroll + for (int l = 0; l < 4; ++l) { + x_dm[i*MMQ_MMA_TILE_X_K_Q8_1 + sizeof(int)*dm_ksc + l] = dm*make_half2(sc8[l], m8[l]); + } +} + +// gfx115x WMMA: one warp per row, coalesced qs global load, nibble expand to ldmatrix-ready LDS. +template +static __device__ __forceinline__ void load_tiles_q4_K_qs_wmma_rdna35( + const char * __restrict__ x, int * __restrict__ x_qs, const int kbx0, const int i_max, const int stride) { + constexpr int nwarps = mmq_get_nwarps_device(); + constexpr int warp_size = ggml_cuda_get_physical_warp_size(); + constexpr int qs_per_row = MMQ_ITER_K / (4 * QR4_K); // 32 ints per block_q4_K::qs + static_assert(qs_per_row == 32, "bad Q4_K qs_per_row"); + constexpr int nrows = warp_size / qs_per_row; + static_assert(nrows == 1, "Q4_K RDNA3.5 WMMA qs path expects one row per warp"); + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nrows*nwarps) { + int i = i0 + threadIdx.y; + + if (need_check) { + i = min(i, i_max); + } + + const block_q4_K * bxi = (const block_q4_K *) x + kbx0 + i*stride; + const int qs0 = ((const int *) bxi->qs)[threadIdx.x]; + + int * row_qs = x_qs + i*MMQ_MMA_TILE_X_K_Q8_1; + const int kqs = mmq_q4_K_qs_lds_k(threadIdx.x, 0); + row_qs[kqs] = qs0 & 0x0F0F0F0F; + row_qs[kqs+8] = (qs0 >> 4) & 0x0F0F0F0F; + } +} +#endif // RDNA3_5 + template static __device__ __forceinline__ void load_tiles_q4_K( const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { constexpr int nwarps = mmq_get_nwarps_device(); @@ -2115,6 +2462,11 @@ template static __device__ __forceinline__ void loa int * x_sc = (int *) (x_dm + txs.dm); #endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) +#if defined(RDNA3_5) && defined(AMD_WMMA_AVAILABLE) && !defined(AMD_MFMA_AVAILABLE) + // dm first: warms cache lines before qs reads from same block_q4_K. + load_tiles_q4_K_dm_rdna35(x, x_dm, kbx0, i_max, stride); + load_tiles_q4_K_qs_wmma_rdna35(x, x_qs, kbx0, i_max, stride); +#else constexpr int threads_per_row = MMQ_ITER_K / (4 * QR4_K); constexpr int nrows = warp_size / threads_per_row; const int txi = warp_size > threads_per_row ? threadIdx.x % threads_per_row : threadIdx.x; @@ -2128,30 +2480,36 @@ template static __device__ __forceinline__ void loa } const block_q4_K * bxi = (const block_q4_K *) x + kbx0 + i*stride; +#if defined(RDNA3_5) + const int qs0 = ((const int *) bxi->qs)[txi]; +#else const int qs0 = get_int_b4(bxi->qs, txi); +#endif #if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) - x_qs[i*MMQ_MMA_TILE_X_K_Q8_1 + 16*(txi/8) + txi % 8 + 0] = (qs0 >> 0) & 0x0F0F0F0F; - x_qs[i*MMQ_MMA_TILE_X_K_Q8_1 + 16*(txi/8) + txi % 8 + 8] = (qs0 >> 4) & 0x0F0F0F0F; + x_qs[i*MMQ_MMA_TILE_X_K_Q8_1 + mmq_q4_K_qs_lds_k(txi, 0)] = (qs0 >> 0) & 0x0F0F0F0F; + x_qs[i*MMQ_MMA_TILE_X_K_Q8_1 + mmq_q4_K_qs_lds_k(txi, 1)] = (qs0 >> 4) & 0x0F0F0F0F; #else x_qs[i*(MMQ_TILE_NE_K + 1) + txi] = qs0; #endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) } +#endif // RDNA3_5 WMMA qs path #if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) +#if defined(RDNA3_5) && defined(AMD_WMMA_AVAILABLE) && !defined(AMD_MFMA_AVAILABLE) + // dm handled above. +#else constexpr int rows_per_warp = warp_size / 2; #pragma unroll for (int i0 = 0; i0 < mmq_y; i0 += nwarps*rows_per_warp) { -#if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) - // Need if on AMD instead of % because warp_size == 64 - // This causes double work and throughput loss (MI300X) - // H100 loses about 100 t/s with 'if' condition over '%' +#if defined(AMD_MFMA_AVAILABLE) + // Need if on CDNA (warp_size == 64) instead of %. int i = i0 + threadIdx.y*rows_per_warp + threadIdx.x/2; if (i < mmq_y) { #else int i = (i0 + threadIdx.y*rows_per_warp + threadIdx.x/2) % mmq_y; { -#endif // defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) +#endif // defined(AMD_MFMA_AVAILABLE) if (need_check) { i = min(i, i_max); } @@ -2175,6 +2533,7 @@ template static __device__ __forceinline__ void loa } } } +#endif // RDNA3_5 RDNA WMMA dm path #else #pragma unroll for (int i0 = 0; i0 < mmq_y; i0 += nwarps*warp_size) { @@ -2244,6 +2603,81 @@ static __device__ __forceinline__ void vec_dot_q4_K_q8_1_dp4a( } } +template +static __device__ __forceinline__ void vec_dot_q4_K_q8_1_mma( + const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { +#if defined(RDNA3_5) && defined(AMD_WMMA_AVAILABLE) && !defined(AMD_MFMA_AVAILABLE) + // Hoist dmA per k-slice only — full A+dmA hoist spills registers (~2x slower). + constexpr data_layout input_layout = get_input_data_layout(); + typedef tile<16, 8, int, input_layout> tile_A; + typedef tile<16, 8, int, input_layout> tile_B; + typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C; + + constexpr int granularity = mmq_get_granularity_device(mmq_x); + constexpr int rows_per_warp = granularity; + constexpr int ntx = rows_per_warp/tile_C::I; + + y += (threadIdx.y % ntx) * (tile_C::J*MMQ_TILE_Y_K); + + const int * x_qs = (const int *) x; + const half2 * x_dm = (const half2 *) x_qs + 2*MMQ_TILE_NE_K; + const int * y_qs = (const int *) y + 4; + const half2 * y_dm = (const half2 *) y; + + const int i0 = (threadIdx.y / ntx) * rows_per_warp; + + for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += QI8_1) { + const int k0 = k00 + k01; + + tile_A A[ntx]; + float2 dmA[ntx][tile_C::ne]; + +#pragma unroll + for (int n = 0; n < ntx; ++n) { + load_ldmatrix(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q8_1 + k0, MMQ_MMA_TILE_X_K_Q8_1); + +#pragma unroll + for (int l = 0; l < tile_C::ne; ++l) { + const int i = i0 + n*tile_A::I + tile_C::get_i(l); + dmA[n][l] = __half22float2(x_dm[i*MMQ_MMA_TILE_X_K_Q8_1 + k0/QI8_1]); + } + } + +#pragma unroll + for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) { + tile_B B; + load_ldmatrix(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K); + + const int j = j0 + tile_C::get_j(0); + const float2 dsB = __half22float2(y_dm[j*MMQ_TILE_Y_K + k01/QI8_1]); + +#pragma unroll + for (int n = 0; n < ntx; ++n) { + float2 scl[tile_C::ne]; + +#pragma unroll + for (int l = 0; l < tile_C::ne; ++l) { + scl[l].x = dmA[n][l].x*dsB.x; + scl[l].y = dmA[n][l].y*dsB.y; + } + + tile_C C; + mma(C, A[n], B); + +#pragma unroll + for (int l = 0; l < tile_C::ne; ++l) { + const int si = (j0/tile_C::J + n)*tile_C::ne + l; + sum[si] += scl[l].x*C.x[l]; + sum[si] += scl[l].y; + } + } + } + } +#else + vec_dot_q8_1_q8_1_mma(x, y, sum, k00); +#endif +} + template static __device__ __forceinline__ void load_tiles_q5_K( const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) { constexpr int nwarps = mmq_get_nwarps_device(); @@ -2525,7 +2959,85 @@ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_dp4a( template static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mma( const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) { -#if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) +#if defined(RDNA3_5) && defined(AMD_WMMA_AVAILABLE) && !defined(AMD_MFMA_AVAILABLE) + // Hoist sclA = sc[k01/4]*d per k-slice — scales were loaded per (n,l) in j-loop. + constexpr data_layout input_layout = get_input_data_layout(); + typedef tile<16, 4, int, input_layout> tile_A; + typedef tile<16, 4, int, input_layout> tile_B; + typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C; + + constexpr int granularity = mmq_get_granularity_device(mmq_x); + constexpr int rows_per_warp = granularity; + constexpr int ntx = rows_per_warp/tile_C::I; + + y += (threadIdx.y % ntx) * (tile_C::J*MMQ_TILE_Y_K); + + const int * x_qs = (const int *) x; + const float * x_df = (const float *) x_qs + MMQ_TILE_NE_K*2; + const int * x_sc = (const int *) x_df + MMQ_TILE_NE_K/QI6_K; + const int * y_qs = (const int *) y + 4; + const float * y_df = (const float *) y; + + const int i0 = (threadIdx.y / ntx) * rows_per_warp; + + for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += 4) { + const int k0 = k00 + k01; + + tile_A A[ntx]; + float sclA[ntx][tile_C::ne]; + +#pragma unroll + for (int n = 0; n < ntx; ++n) { + load_ldmatrix(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q6_K + k0, MMQ_MMA_TILE_X_K_Q6_K); + +#pragma unroll + for (int l = 0; l < tile_C::ne; ++l) { + const int i = i0 + n*tile_A::I + tile_C::get_i(l); + const int8_t * sc = (const int8_t *) (x_sc + i*MMQ_MMA_TILE_X_K_Q6_K + k00/16); + sclA[n][l] = (float) sc[k01/4] * x_df[i*MMQ_MMA_TILE_X_K_Q6_K]; + } + } + + constexpr int j_step = ntx*tile_C::J; + + tile_B B0; + load_ldmatrix(B0, y_qs + 0, MMQ_TILE_Y_K); + float dB0 = y_df[tile_C::get_j(0)*MMQ_TILE_Y_K + k01/QI8_1]; + +#pragma unroll + for (int j0 = 0; j0 < mmq_x; j0 += j_step) { + const int j0_next = j0 + j_step; + + tile_B B1; + float dB1 = 0.0f; + if (j0_next < mmq_x) { + load_ldmatrix(B1, y_qs + j0_next*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K); + const int jn = j0_next + tile_C::get_j(0); + dB1 = y_df[jn*MMQ_TILE_Y_K + k01/QI8_1]; + } + + tile_C C[ntx]; + +#pragma unroll + for (int n = 0; n < ntx; ++n) { + mma(C[n], A[n], B0); + } + +#pragma unroll + for (int n = 0; n < ntx; ++n) { +#pragma unroll + for (int l = 0; l < tile_C::ne; ++l) { + sum[(j0/tile_C::J + n)*tile_C::ne + l] += C[n].x[l]*sclA[n][l]*dB0; + } + } + + if (j0_next < mmq_x) { + B0 = B1; + dB0 = dB1; + } + } + } +#elif defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) constexpr data_layout input_layout = get_input_data_layout(); typedef tile<16, 4, int, input_layout> tile_A; typedef tile<16, 4, int, input_layout> tile_B; @@ -3287,7 +3799,7 @@ template struct mmq_type_traits { static constexpr int vdr = VDR_Q4_0_Q8_1_MMQ; static constexpr load_tiles_mmq_t load_tiles = load_tiles_q4_0; - static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_q8_1_mma; + static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q4_0_q8_1_mma; static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q4_0_q8_1_dp4a; }; @@ -3319,7 +3831,7 @@ template struct mmq_type_traits { static constexpr int vdr = VDR_Q8_0_Q8_1_MMQ; static constexpr load_tiles_mmq_t load_tiles = load_tiles_q8_0; - static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_q8_1_mma; + static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_0_q8_1_mma_rdna35; static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a; }; @@ -3369,7 +3881,7 @@ template struct mmq_type_traits { static constexpr int vdr = VDR_Q4_K_Q8_1_MMQ; static constexpr load_tiles_mmq_t load_tiles = load_tiles_q4_K; - static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q8_1_q8_1_mma; + static constexpr vec_dot_mmq_t vec_dot_mma = vec_dot_q4_K_q8_1_mma; static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q4_K_q8_1_dp4a; }; @@ -3453,12 +3965,48 @@ struct mmq_type_traits { static constexpr vec_dot_mmq_t vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a; }; + +#if defined(RDNA3_5) +// Software-pipeline activation tile loads: issue global loads into registers, run WMMA, +// then store to LDS. Overlaps memory latency with vec_dot on gfx115x (no ISA prefetch). +template +static __device__ __forceinline__ void mmq_tile_y_load_global( + int * __restrict__ tile_y, const int * __restrict__ by) { +#pragma unroll + for (int c = 0; c < nchunks; ++c) { + const int l = c*(nwarps*warp_size) + threadIdx.y*warp_size + threadIdx.x; + tile_y[l] = by[l]; + } +} + +template +static __device__ __forceinline__ void mmq_tile_y_load_global_to_regs( + const int * __restrict__ by, int (&cache)[nchunks]) { +#pragma unroll + for (int c = 0; c < nchunks; ++c) { + const int l = c*(nwarps*warp_size) + threadIdx.y*warp_size + threadIdx.x; + cache[c] = by[l]; + } +} + +template +static __device__ __forceinline__ void mmq_tile_y_store_regs( + int * __restrict__ tile_y, const int (&cache)[nchunks]) { +#pragma unroll + for (int c = 0; c < nchunks; ++c) { + const int l = c*(nwarps*warp_size) + threadIdx.y*warp_size + threadIdx.x; + tile_y[l] = cache[c]; + } +} +#endif // RDNA3_5 + template static __device__ __forceinline__ void mul_mat_q_process_tile( const char * __restrict__ x, const int offset_x, const int * __restrict__ y, const int * __restrict__ ids_dst, float * __restrict__ dst, float * __restrict__ tmp_fixup, const int stride_row_x, const int ncols_y, const int stride_col_dst, - const int tile_x_max_i, const int tile_y_max_j, const int kb0_start, const int kb0_stop) { + const int tile_x_max_i, const int tile_y_max_j, const int kb0_start, const int kb0_stop, + uint64_t * mmq_profile) { constexpr int warp_size = ggml_cuda_get_physical_warp_size(); constexpr int nwarps = mmq_get_nwarps_device(); @@ -3468,7 +4016,7 @@ static __device__ __forceinline__ void mul_mat_q_process_tile( extern __shared__ int data_mul_mat_q[]; int * tile_y = data_mul_mat_q + mmq_x; - int * tile_x = tile_y + GGML_PAD(mmq_x*MMQ_TILE_Y_K, nwarps*warp_size); + int * tile_x = tile_y + mmq_get_tile_y_k_padded_device(mmq_x, nwarps, warp_size); #if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE) constexpr vec_dot_mmq_t vec_dot = mmq_type_traits::vec_dot_mma; @@ -3492,39 +4040,206 @@ static __device__ __forceinline__ void mul_mat_q_process_tile( constexpr int sz = sizeof(block_q8_1_mmq) / sizeof(int); - for (int kb0 = kb0_start; kb0 < kb0_stop; kb0 += blocks_per_iter) { - load_tiles(x, tile_x, offset_x + kb0, tile_x_max_i, stride_row_x); - { - const int * by0 = y + ncols_y * (kb0 * qk / ne_block) * sz; -#pragma unroll - for (int l0 = 0; l0 < mmq_x * MMQ_TILE_Y_K; l0 += nwarps * warp_size) { - int l = l0 + threadIdx.y*warp_size + threadIdx.x; +#if defined(RDNA3_5) + constexpr int tile_y_elems = mmq_x*MMQ_TILE_Y_K; + constexpr int tile_y_load_stride = nwarps*warp_size; + if constexpr (mmq_x <= 64 && tile_y_elems % tile_y_load_stride == 0) { + constexpr int tile_y_nchunks = tile_y_elems/tile_y_load_stride; + + int y0_next_cache[tile_y_nchunks]; + bool have_y0_prefetch = false; + + if (mmq_profile) { + for (int kb0 = kb0_start; kb0 < kb0_stop; kb0 += blocks_per_iter) { + mmq_profile_kb_iter(mmq_profile); + + { + uint64_t t0 = 0; + __syncthreads(); + mmq_profile_phase_begin(mmq_profile, t0); + load_tiles(x, tile_x, offset_x + kb0, tile_x_max_i, stride_row_x); + __syncthreads(); + mmq_profile_phase_end(mmq_profile, t0, MMQ_PROFILE_LOAD_TILES); + } - tile_y[l] = by0[l]; + const int yk = kb0 * qk / ne_block; + const int * by0 = y + ncols_y * yk * sz; + const int * by1 = y + ncols_y * (yk + 1) * sz; + + { + uint64_t t0 = 0; + __syncthreads(); + mmq_profile_phase_begin(mmq_profile, t0); + + if (have_y0_prefetch) { + mmq_tile_y_store_regs(tile_y, y0_next_cache); + have_y0_prefetch = false; + } else { + mmq_tile_y_load_global(tile_y, by0); + } + + __syncthreads(); + + int y1_cache[tile_y_nchunks]; + mmq_tile_y_load_global_to_regs(by1, y1_cache); + + mmq_profile_phase_end(mmq_profile, t0, MMQ_PROFILE_Y_ACT); + + __syncthreads(); + mmq_profile_phase_begin(mmq_profile, t0); + + vec_dot(tile_x, tile_y, sum, 0); + + __syncthreads(); + + mmq_tile_y_store_regs(tile_y, y1_cache); + + __syncthreads(); + + const int kb0_next = kb0 + blocks_per_iter; + if (kb0_next < kb0_stop) { + const int * by0_next = y + ncols_y * (kb0_next * qk / ne_block) * sz; + mmq_tile_y_load_global_to_regs(by0_next, y0_next_cache); + have_y0_prefetch = true; + } + + vec_dot(tile_x, tile_y, sum, MMQ_TILE_NE_K); + + __syncthreads(); + mmq_profile_phase_end(mmq_profile, t0, MMQ_PROFILE_VEC_DOT); + } + } + } else { + for (int kb0 = kb0_start; kb0 < kb0_stop; kb0 += blocks_per_iter) { + load_tiles(x, tile_x, offset_x + kb0, tile_x_max_i, stride_row_x); + + const int yk = kb0 * qk / ne_block; + const int * by0 = y + ncols_y * yk * sz; + const int * by1 = y + ncols_y * (yk + 1) * sz; + + if (have_y0_prefetch) { + mmq_tile_y_store_regs(tile_y, y0_next_cache); + have_y0_prefetch = false; + } else { + mmq_tile_y_load_global(tile_y, by0); + } + + __syncthreads(); + + int y1_cache[tile_y_nchunks]; + mmq_tile_y_load_global_to_regs(by1, y1_cache); + + vec_dot(tile_x, tile_y, sum, 0); + + __syncthreads(); + + mmq_tile_y_store_regs(tile_y, y1_cache); + + __syncthreads(); + + const int kb0_next = kb0 + blocks_per_iter; + if (kb0_next < kb0_stop) { + const int * by0_next = y + ncols_y * (kb0_next * qk / ne_block) * sz; + mmq_tile_y_load_global_to_regs(by0_next, y0_next_cache); + have_y0_prefetch = true; + } + + vec_dot(tile_x, tile_y, sum, MMQ_TILE_NE_K); + + __syncthreads(); } } + } else +#endif // RDNA3_5 + { + if (mmq_profile) { + for (int kb0 = kb0_start; kb0 < kb0_stop; kb0 += blocks_per_iter) { + mmq_profile_kb_iter(mmq_profile); + + { + uint64_t t0 = 0; + __syncthreads(); + mmq_profile_phase_begin(mmq_profile, t0); + load_tiles(x, tile_x, offset_x + kb0, tile_x_max_i, stride_row_x); + __syncthreads(); + mmq_profile_phase_end(mmq_profile, t0, MMQ_PROFILE_LOAD_TILES); + } - __syncthreads(); + const int yk = kb0 * qk / ne_block; + const int * by0 = y + ncols_y * yk * sz; + const int * by1 = y + ncols_y * (yk + 1) * sz; - vec_dot(tile_x, tile_y, sum, 0); + { + uint64_t t0 = 0; + __syncthreads(); + mmq_profile_phase_begin(mmq_profile, t0); +#pragma unroll + for (int l0 = 0; l0 < mmq_x * MMQ_TILE_Y_K; l0 += nwarps * warp_size) { + int l = l0 + threadIdx.y*warp_size + threadIdx.x; - __syncthreads(); + tile_y[l] = by0[l]; + } + __syncthreads(); + mmq_profile_phase_end(mmq_profile, t0, MMQ_PROFILE_Y_ACT); + } - { - const int * by0 = y + ncols_y * ((kb0 * qk / ne_block) * sz + sz); + { + uint64_t t0 = 0; + __syncthreads(); + mmq_profile_phase_begin(mmq_profile, t0); + vec_dot(tile_x, tile_y, sum, 0); + __syncthreads(); #pragma unroll - for (int l0 = 0; l0 < mmq_x * MMQ_TILE_Y_K; l0 += nwarps * warp_size) { - int l = l0 + threadIdx.y*warp_size + threadIdx.x; + for (int l0 = 0; l0 < mmq_x * MMQ_TILE_Y_K; l0 += nwarps * warp_size) { + int l = l0 + threadIdx.y*warp_size + threadIdx.x; - tile_y[l] = by0[l]; + tile_y[l] = by1[l]; + } + __syncthreads(); + vec_dot(tile_x, tile_y, sum, MMQ_TILE_NE_K); + __syncthreads(); + mmq_profile_phase_end(mmq_profile, t0, MMQ_PROFILE_VEC_DOT); + } } - } + } else { + for (int kb0 = kb0_start; kb0 < kb0_stop; kb0 += blocks_per_iter) { + load_tiles(x, tile_x, offset_x + kb0, tile_x_max_i, stride_row_x); - __syncthreads(); + const int yk = kb0 * qk / ne_block; + const int * by0 = y + ncols_y * yk * sz; + const int * by1 = y + ncols_y * (yk + 1) * sz; - vec_dot(tile_x, tile_y, sum, MMQ_TILE_NE_K); + { +#pragma unroll + for (int l0 = 0; l0 < mmq_x * MMQ_TILE_Y_K; l0 += nwarps * warp_size) { + int l = l0 + threadIdx.y*warp_size + threadIdx.x; - __syncthreads(); + tile_y[l] = by0[l]; + } + } + + __syncthreads(); + + vec_dot(tile_x, tile_y, sum, 0); + + __syncthreads(); + + { +#pragma unroll + for (int l0 = 0; l0 < mmq_x * MMQ_TILE_Y_K; l0 += nwarps * warp_size) { + int l = l0 + threadIdx.y*warp_size + threadIdx.x; + + tile_y[l] = by1[l]; + } + } + + __syncthreads(); + + vec_dot(tile_x, tile_y, sum, MMQ_TILE_NE_K); + + __syncthreads(); + } + } } if (fixup) { @@ -3535,6 +4250,7 @@ static __device__ __forceinline__ void mul_mat_q_process_tile( } + // The mul_mat_q kernel implements "stream-k" work partitioning as described in https://arxiv.org/abs/2301.03598 template @@ -3555,7 +4271,7 @@ static __global__ void mul_mat_q( const uint3 blocks_per_ne00, const int nrows_x, const int ncols_dst, const int stride_row_x, const int ncols_y, const int stride_col_dst, const uint3 channel_ratio, const uint3 nchannels_y, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst, const uint3 sample_ratio, const uint3 nsamples_y, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst, - const uint3 ntx) { + const uint3 ntx, uint64_t * mmq_profile) { // Skip unused template specializations for faster compilation: if (mmq_x > get_mmq_x_max_device() || mmq_x % mmq_get_granularity_device(mmq_x) != 0) { @@ -3640,7 +4356,7 @@ static __global__ void mul_mat_q( constexpr bool fixup = false; mul_mat_q_process_tile (x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, stride_row_x, ncols_y, stride_col_dst, - tile_x_max_i, tile_y_max_j, 0, blocks_per_ne00.z); + tile_x_max_i, tile_y_max_j, 0, blocks_per_ne00.z, mmq_profile); return; } #endif // (defined(GGML_USE_HIP) && !defined(CDNA4) && !defined(CDNA3)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA @@ -3720,7 +4436,7 @@ static __global__ void mul_mat_q( constexpr bool fixup = false; // All but (potentially) the last iterations write their data to dst rather than the fixup buffer. mul_mat_q_process_tile (x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, stride_row_x, ncols_y, stride_col_dst, - tile_x_max_i, tile_y_max_j, kb0_start, kb0_stop); + tile_x_max_i, tile_y_max_j, kb0_start, kb0_stop, mmq_profile); kbc += blocks_per_ne00.z; kbc -= fastmodulo(kbc, blocks_per_ne00); @@ -3789,7 +4505,7 @@ static __global__ void mul_mat_q( constexpr bool fixup = true; // Last index writes its data to fixup buffer to avoid data races with other blocks. mul_mat_q_process_tile (x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, stride_row_x, ncols_y, stride_col_dst, - tile_x_max_i, tile_y_max_j, kb0_start, kb0_stop); + tile_x_max_i, tile_y_max_j, kb0_start, kb0_stop, mmq_profile); } template @@ -3939,6 +4655,23 @@ struct mmq_args { bool use_stream_k; int64_t ncols_max; }; +#if defined(GGML_USE_HIP) +// RDNA3.5 dual-WG (mmq_x=64, nbytes <= smpbo/2) helps K-quants with large per-tile LDS +// (Q6_K WMMA tuning). Block quants (Q5_0, Q8_0, Q4_0) are faster at mmq_x=128 ntx=1. +static bool mmq_rdna35_dual_wg_eligible(const ggml_type type) { + switch (type) { + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + return true; + default: + return false; + } +} +#endif // GGML_USE_HIP + template static size_t mmq_get_nbytes_shared(const int mmq_x, const int mmq_y, const int cc, const int warp_size, const int nwarps) { const tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(type, mmq_y); @@ -3946,7 +4679,11 @@ static size_t mmq_get_nbytes_shared(const int mmq_x, const int mmq_y, const int const size_t nbs_ids = mmq_x*sizeof(int); const size_t nbs_x = (turing_mma_available(cc) || amd_mfma_available(cc) || amd_wmma_available(cc)) ? mmq_y*mmq_tile_x_k*sizeof(int) : txs.qs*sizeof(int) + txs.dm*sizeof(half2) + txs.sc*sizeof(int); const size_t nbs_y = mmq_x * (sizeof(block_q8_1_mmq)); - return nbs_ids + nbs_x + GGML_PAD(nbs_y, nwarps*warp_size*sizeof(int)); + const int tile_y_k_padded = mmq_get_tile_y_k_padded_host(mmq_x, cc, warp_size, nwarps); + const size_t nbs_y_padded = std::max(nbs_y, (size_t) tile_y_k_padded*sizeof(int)); + const int pad = GGML_CUDA_CC_IS_RDNA3_5(cc) ? 2*nwarps*warp_size : nwarps*warp_size; + size_t nbytes = nbs_ids + nbs_x + GGML_PAD(nbs_y_padded, pad*sizeof(int)); + return nbytes; } template @@ -3957,9 +4694,13 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a const int warp_size = ggml_cuda_info().devices[id].warp_size; const int nwarps = mmq_get_nwarps_host(cc, warp_size); const int mmq_y = get_mmq_y_host(cc); + const size_t smpbo = ggml_cuda_info().devices[id].smpbo; const dim3 block_dims(warp_size, nwarps, 1); + mmq_profile_guard prof_guard(ctx.pool(id), mmq_profile_phases_enabled(), type, mmq_x, + args.nrows_x, args.ncols_max, stream); + const int nbytes_shared = mmq_get_nbytes_shared(mmq_x, mmq_y, cc, warp_size, nwarps); CUDA_SET_SHARED_MEMORY_LIMIT((mul_mat_q), nbytes_shared); @@ -3970,6 +4711,22 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a const int ntzw = args.nchannels_y * args.nsamples_y; const dim3 block_nums_xy_tiling(nty, ntx, ntzw); + const auto log_launch_config = [&](const dim3 & grid, const bool need_check, const char * path) { + if (!ggml_cuda_mm_log_enabled()) { + return; + } + fprintf(stderr, + "[MUL_MAT_Q_LAUNCH] type=%d mmq_x=%d mmq_y=%d nwarps=%d warp_size=%d " + "nbytes_shared=%d smpbo=%zu grid=(%u,%u,%u) block=(%u,%u,%u) " + "ntx=%d nty=%d ntzw=%d ncols_max=%ld nrows_x=%ld use_stream_k=%d need_check=%d path=%s\n", + type, mmq_x, mmq_y, nwarps, warp_size, + nbytes_shared, smpbo, + grid.x, grid.y, grid.z, + block_dims.x, block_dims.y, block_dims.z, + ntx, nty, ntzw, args.ncols_max, args.nrows_x, + args.use_stream_k ? 1 : 0, need_check ? 1 : 0, path); + }; + GGML_ASSERT(args.nchannels_y % args.nchannels_x == 0); GGML_ASSERT(args.nsamples_y % args.nsamples_x == 0); const int channel_ratio = args.nchannels_y / args.nchannels_x; @@ -3985,20 +4742,22 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a if (!args.use_stream_k) { if (args.nrows_x % mmq_y == 0) { constexpr bool need_check = false; + log_launch_config(block_nums_xy_tiling, need_check, "tiling"); mul_mat_q<<>> (args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, nullptr, blocks_per_ne00_fd, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst, channel_ratio_fd, nchannels_y_fd, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst, sample_ratio_fd, nsamples_y_fd, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst, - ntx_fd); + ntx_fd, prof_guard.ptr()); } else { constexpr bool need_check = true; + log_launch_config(block_nums_xy_tiling, need_check, "tiling"); mul_mat_q<<>> (args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, nullptr, blocks_per_ne00_fd, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst, channel_ratio_fd, nchannels_y_fd, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst, sample_ratio_fd, nsamples_y_fd, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst, - ntx_fd); + ntx_fd, prof_guard.ptr()); } return; } @@ -4025,12 +4784,13 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a if (args.nrows_x % mmq_y == 0) { constexpr bool need_check = false; + log_launch_config(block_nums_stream_k, need_check, "stream_k"); mul_mat_q<<>> (args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, blocks_per_ne00_fd, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst, channel_ratio_fd, nchannels_y_fd, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst, sample_ratio_fd, nsamples_y_fd, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst, - ntx_fd); + ntx_fd, prof_guard.ptr()); if (!fixup_needed) { return; @@ -4043,12 +4803,13 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a ntx_fd); } else { constexpr bool need_check = true; + log_launch_config(block_nums_stream_k, need_check, "stream_k"); mul_mat_q<<>> (args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr, blocks_per_ne00_fd, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst, channel_ratio_fd, nchannels_y_fd, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst, sample_ratio_fd, nsamples_y_fd, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst, - ntx_fd); + ntx_fd, prof_guard.ptr()); if (!fixup_needed) { return; @@ -4091,6 +4852,89 @@ void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cuda } } +#if defined(GGML_USE_HIP) + // RDNA3.5 (gfx115x): mmq_x=128 uses ~59% of LDS/CU → 1 WG/CU. For K-quants only, + // pick the smallest mmq_x with nbytes_shared <= smpbo/2 (2 WGs/CU). + if (GGML_CUDA_CC_IS_RDNA3_5(cc) && mmq_x_best > 0 && mmq_rdna35_dual_wg_eligible(type)) { + const size_t lds_dual_wg = smpbo / 2; + const size_t nbytes_best = mmq_get_nbytes_shared(mmq_x_best, mmq_y, cc, warp_size, nwarps); + if (nbytes_best > lds_dual_wg) { + int mmq_x_lds = 0; + size_t nbytes_lds_min = SIZE_MAX; + for (int mmq_x = 8; mmq_x <= mmq_x_max; mmq_x += 8) { + const int granularity = mmq_get_granularity_host(mmq_x, cc); + if (mmq_x % granularity != 0) { + continue; + } + const size_t nbytes = mmq_get_nbytes_shared(mmq_x, mmq_y, cc, warp_size, nwarps); + if (nbytes > smpbo || nbytes > lds_dual_wg) { + continue; + } + const int ntiles_x = (args.ncols_max + mmq_x - 1) / mmq_x; + if (ntiles_x > ntiles_x_best * 2) { + continue; + } + if (nbytes < nbytes_lds_min) { + mmq_x_lds = mmq_x; + nbytes_lds_min = nbytes; + } + } + if (mmq_x_lds > 0) { + mmq_x_best = mmq_x_lds; + } + } + } +#endif // GGML_USE_HIP + +#if defined(GGML_USE_HIP) + // Tiny M (e.g. gate 128×32×4096 Q8_0): prefer largest mmq_x within LDS so N is not + // split; occupancy is irrelevant when only a handful of blocks launch. + if (GGML_CUDA_CC_IS_RDNA3_5(cc) && args.nrows_x <= 32) { + const int nty = (args.nrows_x + mmq_y - 1) / mmq_y; + const int ntzw = static_cast(args.nchannels_y * args.nsamples_y); + + int mmq_x_tiny = 0; + int ntiles_tiny = INT_MAX; + for (int mmq_x = mmq_x_max; mmq_x >= 8; mmq_x -= 8) { + const int granularity = mmq_get_granularity_host(mmq_x, cc); + if (mmq_x % granularity != 0) { + continue; + } + const size_t nbytes = mmq_get_nbytes_shared(mmq_x, mmq_y, cc, warp_size, nwarps); + if (nbytes > smpbo) { + continue; + } + const int ntx = (args.ncols_max + mmq_x - 1) / mmq_x; + const int grid_blocks = nty * ntx * ntzw; + if (grid_blocks <= 4 && ntx <= ntiles_tiny) { + mmq_x_tiny = mmq_x; + ntiles_tiny = ntx; + if (ntiles_tiny == 1) { + break; + } + } + } + if (mmq_x_tiny > 0) { + mmq_x_best = mmq_x_tiny; + } + } + + // Q6_K / K-quants narrow-N: dual-WG path (mmq_x=64, ntx=2) from smpbo/2 selection. +#endif // GGML_USE_HIP + +#if defined(GGML_USE_HIP) + if (GGML_CUDA_CC_IS_RDNA3_5(cc)) { + if (const char * force = getenv("MMQ_FORCE_MMQ_X")) { + const int mmq_x_force = atoi(force); + if (mmq_x_force >= 8 && mmq_x_force <= mmq_x_max && + mmq_x_force % mmq_get_granularity_host(mmq_x_force, cc) == 0 && + mmq_get_nbytes_shared(mmq_x_force, mmq_y, cc, warp_size, nwarps) <= smpbo) { + mmq_x_best = mmq_x_force; + } + } + } +#endif // GGML_USE_HIP + switch (mmq_x_best) { case 8: launch_mul_mat_q(ctx, args, stream); diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu index fe44a58da918..cee73315d5ef 100644 --- a/ggml/src/ggml-cuda/mmvq.cu +++ b/ggml/src/ggml-cuda/mmvq.cu @@ -1213,6 +1213,14 @@ void ggml_cuda_mul_mat_vec_q( const int64_t ids_stride = ids ? ids->nb[1] / ggml_type_size(ids->type) : 0; + // Log kernel dimensions for profiling analysis (GGML_CUDA_MM_LOG=1). + // dst[ne01] = src0[ne00, ne01] @ src1[ne10]; ne00==ne10 (K), ne11==1 for decode + // GEMV y[M] = W[M,K] @ x[K]: M=ne01, K=ne00 + if (ggml_cuda_mm_log_enabled()) { + fprintf(stderr, "[MUL_MAT_VEC_Q] M=%ld K=%ld ne00=%ld ne01=%ld ne10=%ld ne11=%ld ne02=%ld ne12=%ld ne03=%ld ne13=%ld type=%d\n", + ne01, ne00, ne00, ne01, ne10, ne11, ne02, ne12, ne03, ne13, src0->type); + } + mul_mat_vec_q_switch_type( src0->data, src0->type, src1_q8_1.get(), ids_d, fusion_local, dst_d, ne00, ne01, ncols_dst, s01, stride_col_y, stride_col_dst, diff --git a/ggml/src/ggml-cuda/quantize.cu b/ggml/src/ggml-cuda/quantize.cu index 39a500a17041..7d23fd8886dd 100644 --- a/ggml/src/ggml-cuda/quantize.cu +++ b/ggml/src/ggml-cuda/quantize.cu @@ -400,6 +400,13 @@ void quantize_mmq_q8_1_cuda( const int64_t block_num_y = (ne0 + 4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ - 1) / (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ); const dim3 num_blocks(ne1, block_num_y, ne2*ne3); const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE_MMQ, 1, 1); + + // Log kernel dimensions for profiling analysis (GGML_CUDA_MM_LOG=1). + if (ggml_cuda_mm_log_enabled()) { + fprintf(stderr, "[QUANTIZE_MMQ_Q8_1] ne00=%ld ne0=%ld ne1=%ld ne2=%ld ne3=%ld grid=(%u,%u,%u) type=%d\n", + ne00, ne0, ne1, ne2, ne3, num_blocks.x, num_blocks.y, num_blocks.z, type_src0); + } + switch (mmq_get_q8_1_ds_layout(type_src0)) { case MMQ_Q8_1_DS_LAYOUT_D4: quantize_mmq_q8_1 diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h index a6115cd80dc4..2ad1207bef98 100644 --- a/ggml/src/ggml-cuda/vendors/hip.h +++ b/ggml/src/ggml-cuda/vendors/hip.h @@ -94,6 +94,7 @@ #define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost #define cudaMemcpyHostToDevice hipMemcpyHostToDevice #define cudaMemcpyKind hipMemcpyKind +#define cudaMemcpyToSymbol hipMemcpyToSymbol #define cudaMemset hipMemset #define cudaMemsetAsync hipMemsetAsync #define cudaMemGetInfo hipMemGetInfo