diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index e6e50e041195..33916b74d982 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -1639,3 +1639,12 @@ static __inline__ void ggml_cuda_kernel_launch(Kernel kernel, const ggml_cuda_ke
     CUDA_CHECK(cudaGetLastError());
 }
 
+// Opt-in stderr logging for MMQ/MMVQ kernel dimensions (set GGML_CUDA_MM_LOG=1).
+static inline bool ggml_cuda_mm_log_enabled() {
+    static int enabled = -1;
+    if (enabled < 0) {
+        enabled = getenv("GGML_CUDA_MM_LOG") ? 1 : 0;
+    }
+    return enabled;
+}
+
diff --git a/ggml/src/ggml-cuda/mmq.cu b/ggml/src/ggml-cuda/mmq.cu
index e1add5e03316..e400fcefa987 100644
--- a/ggml/src/ggml-cuda/mmq.cu
+++ b/ggml/src/ggml-cuda/mmq.cu
@@ -157,6 +157,15 @@ void ggml_cuda_mul_mat_q(
             ne02, ne12, s02, s12, s2,
             ne03, ne13, s03, s13, s3,
             use_stream_k, ne1};
+
+        // Log kernel dimensions for profiling analysis (GGML_CUDA_MM_LOG=1).
+        // dst[ne01, ne11] = src0[ne00, ne01] @ src1[ne10, ne11]; ne00==ne10 (K)
+        // GEMM C[M,N] = W[N,K] @ X[K,M]: M=ne11, N=ne01, K=ne00
+        if (ggml_cuda_mm_log_enabled()) {
+            fprintf(stderr, "[MUL_MAT_Q] M=%ld N=%ld K=%ld ne00=%ld ne01=%ld ne1=%ld ne11=%ld ne02=%ld ne12=%ld ne03=%ld ne13=%ld type=%d\n",
+                    ne11, ne01, ne00, ne00, ne01, ne1, ne11, ne02, ne12, ne03, ne13, src0->type);
+        }
+
         ggml_cuda_mul_mat_q_switch_type(ctx, args, stream);
         return;
     }
@@ -219,6 +228,12 @@ void ggml_cuda_mul_mat_q(
         ne03, ne13, s03, s13, s3,
         use_stream_k, ne12};
 
+    // Log kernel dimensions for profiling analysis (batched/MoE path, GGML_CUDA_MM_LOG=1).
+    if (ggml_cuda_mm_log_enabled()) {
+        fprintf(stderr, "[MUL_MAT_Q_MoE] M=%ld N=%ld K=%ld ne_get_rows=%ld ne00=%ld ne01=%ld ne02=%ld ne12=%ld type=%d\n",
+                ne11_flat, ne01, ne00, ne_get_rows, ne00, ne01, ne02, ne12, src0->type);
+    }
+
     ggml_cuda_mul_mat_q_switch_type(ctx, args, stream);
 }
 
diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh
index b58ac9e7b428..0b42888daefe 100644
--- a/ggml/src/ggml-cuda/mmq.cuh
+++ b/ggml/src/ggml-cuda/mmq.cuh
@@ -6,9 +6,135 @@
 
 #include <climits>
 #include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <algorithm>
 
 using namespace ggml_cuda_mma;
 
+#if defined(GGML_USE_HIP)
+enum mmq_profile_phase {
+    MMQ_PROFILE_LOAD_TILES = 0,
+    MMQ_PROFILE_Y_ACT      = 1,
+    MMQ_PROFILE_VEC_DOT    = 2,
+    MMQ_PROFILE_KB_ITERS   = 3,
+    MMQ_PROFILE_N          = 4,
+};
+
+static __device__ __forceinline__ bool mmq_profile_want_sample(uint64_t * const profile) {
+    return profile && blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0
+        && threadIdx.x == 0 && threadIdx.y == 0;
+}
+
+static __device__ __forceinline__ void mmq_profile_kb_iter(uint64_t * const profile) {
+    if (!mmq_profile_want_sample(profile)) {
+        return;
+    }
+    ++profile[MMQ_PROFILE_KB_ITERS];
+}
+
+static __device__ __forceinline__ void mmq_profile_phase_begin(uint64_t * const profile, uint64_t & t0) {
+    if (!mmq_profile_want_sample(profile)) {
+        return;
+    }
+    t0 = clock64();
+}
+
+// Safe delta for clock64(): unsigned (t1 - t0) wraps to ~2^64 when t1 < t0 (non-monotonic
+// timestamp across __syncthreads / wave scheduling). Discard small backward glitches; keep
+// genuine 64-bit counter wrap (t0 near UINT64_MAX, t1 small).
+static __device__ __forceinline__ uint64_t mmq_profile_phase_cycles(const uint64_t t0, const uint64_t t1) {
+    if (t1 >= t0) {
+        return t1 - t0;
+    }
+    const uint64_t backward = t0 - t1;
+    if (backward > (UINT64_MAX / 2)) {
+        return t1 - t0; // unsigned wrap
+    }
+    return 0;
+}
+
+static __device__ __forceinline__ void mmq_profile_phase_end(uint64_t * const profile, const uint64_t t0, const int phase) {
+    if (!mmq_profile_want_sample(profile)) {
+        return;
+    }
+    profile[phase] += mmq_profile_phase_cycles(t0, clock64());
+}
+
+static bool mmq_profile_phases_enabled() {
+    static int enabled = -1;
+    if (enabled < 0) {
+        enabled = getenv("MMQ_PROFILE_PHASES") ? 1 : 0;
+    }
+    return enabled;
+}
+
+struct mmq_profile_guard {
+    ggml_cuda_pool_alloc<uint64_t> buf;
+    cudaStream_t stream = nullptr;
+    int           type  = 0;
+    int           mmq_x = 0;
+    int64_t       nrows_x = 0;
+    int64_t       ncols_max = 0;
+    bool          active = false;
+
+    mmq_profile_guard(ggml_cuda_pool & pool, const bool enable, const int type_in, const int mmq_x_in,
+            const int64_t nrows_x_in, const int64_t ncols_max_in, cudaStream_t stream_in)
+        : buf(pool), stream(stream_in), type(type_in), mmq_x(mmq_x_in), nrows_x(nrows_x_in), ncols_max(ncols_max_in), active(enable) {
+        if (!active) {
+            return;
+        }
+        buf.alloc(MMQ_PROFILE_N);
+        CUDA_CHECK(cudaMemset(buf.get(), 0, MMQ_PROFILE_N*sizeof(uint64_t)));
+    }
+
+    uint64_t * ptr() {
+        return active ? buf.get() : nullptr;
+    }
+
+    ~mmq_profile_guard() {
+        if (!active) {
+            return;
+        }
+        CUDA_CHECK(cudaStreamSynchronize(stream));
+        uint64_t h[MMQ_PROFILE_N] = {};
+        CUDA_CHECK(cudaMemcpy(h, buf.get(), sizeof(h), cudaMemcpyDeviceToHost));
+        const uint64_t total = h[MMQ_PROFILE_LOAD_TILES] + h[MMQ_PROFILE_Y_ACT] + h[MMQ_PROFILE_VEC_DOT];
+        // Skip corrupt rows (overflow / legacy bad deltas) — see mmq_profile_phase_cycles.
+        if (total > 0 && total < 5000000000ULL) {
+            fprintf(stderr,
+                "[MMQ_PROFILE] type=%d mmq_x=%d nrows_x=%ld ncols_max=%ld "
+                "load_tiles=%.1f%% y_act=%.1f%% vec_dot=%.1f%% "
+                "kb_iters=%llu cycles_load=%llu cycles_y=%llu cycles_vec=%llu cycles_total=%llu\n",
+                type, mmq_x, (long) nrows_x, (long) ncols_max,
+                100.0*h[MMQ_PROFILE_LOAD_TILES]/total,
+                100.0*h[MMQ_PROFILE_Y_ACT]/total,
+                100.0*h[MMQ_PROFILE_VEC_DOT]/total,
+                (unsigned long long) h[MMQ_PROFILE_KB_ITERS],
+                (unsigned long long) h[MMQ_PROFILE_LOAD_TILES],
+                (unsigned long long) h[MMQ_PROFILE_Y_ACT],
+                (unsigned long long) h[MMQ_PROFILE_VEC_DOT],
+                (unsigned long long) total);
+        }
+    }
+};
+#else
+static __device__ __forceinline__ void mmq_profile_kb_iter(uint64_t *) {}
+
+static __device__ __forceinline__ void mmq_profile_phase_begin(uint64_t *, uint64_t &) {}
+
+static __device__ __forceinline__ void mmq_profile_phase_end(uint64_t *, const uint64_t, const int) {}
+
+static bool mmq_profile_phases_enabled() {
+    return false;
+}
+
+struct mmq_profile_guard {
+    mmq_profile_guard(ggml_cuda_pool &, const bool, const int, const int, const int64_t, const int64_t, cudaStream_t) {}
+    uint64_t * ptr() { return nullptr; }
+};
+#endif // GGML_USE_HIP
+
 #define MMQ_DP4A_MAX_BATCH_SIZE 64 // Max. batch size to use for dp4a MMQ kernels when FP16 tensor cores are available.
 #define MMQ_ITER_K             256
 #define MMQ_ITER_K_FP4         512
@@ -275,8 +401,26 @@ static constexpr __host__ __device__ int mmq_get_mma_tile_x_k(ggml_type type) {
 #define MMQ_TILE_Y_K     (MMQ_TILE_NE_K + MMQ_TILE_NE_K / QI8_1)
 #define MMQ_TILE_Y_FP4_K MMQ_TILE_Y_K
 
+static int mmq_get_tile_y_k_padded_host(const int mmq_x, const int cc, const int warp_size, const int nwarps) {
+    const int pad = GGML_CUDA_CC_IS_RDNA3_5(cc) ? 2*nwarps*warp_size : nwarps*warp_size;
+    return GGML_PAD(mmq_x*MMQ_TILE_Y_K, pad);
+}
+
+#if defined(RDNA3_5)
+static constexpr __device__ int mmq_get_tile_y_k_padded_device(const int mmq_x, const int nwarps, const int warp_size) {
+    return GGML_PAD(mmq_x*MMQ_TILE_Y_K, 2*nwarps*warp_size);
+}
+#else
+static constexpr __device__ int mmq_get_tile_y_k_padded_device(const int mmq_x, const int nwarps, const int warp_size) {
+    return GGML_PAD(mmq_x*MMQ_TILE_Y_K, nwarps*warp_size);
+}
+#endif // RDNA3_5
+
 static int mmq_get_granularity_host(const int mmq_x, const int cc) {
     if (amd_mfma_available(cc) || amd_wmma_available(cc)) {
+        if (GGML_CUDA_CC_IS_RDNA3_5(cc) && mmq_x >= 64) {
+            return 32;
+        }
         return mmq_x >= 128 ? 32 : 16;
     } else if (turing_mma_available(cc) && mmq_x >= 48) {
         return 16;
@@ -287,7 +431,11 @@ static int mmq_get_granularity_host(const int mmq_x, const int cc) {
 
 #if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
 static constexpr __device__ int mmq_get_granularity_device(const int mmq_x) {
+#if defined(RDNA3_5)
+    return mmq_x >= 64 ? 32 : 16;
+#else
     return mmq_x >= 128 ? 32 : 16;
+#endif // RDNA3_5
 }
 #elif defined(TURING_MMA_AVAILABLE)
 static constexpr __device__ int mmq_get_granularity_device(const int mmq_x) {
@@ -1303,6 +1451,133 @@ static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mma(
 #endif // defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
 }
 
+template <int mmq_x, int mmq_y>
+static __device__ __forceinline__ void vec_dot_q4_0_q8_1_mma(
+    const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
+#if defined(RDNA3_5) && defined(AMD_WMMA_AVAILABLE) && !defined(AMD_MFMA_AVAILABLE)
+    constexpr data_layout input_layout = get_input_data_layout();
+    typedef tile<16,  8, int, input_layout>        tile_A;
+    typedef tile<16,  8, int, input_layout>        tile_B;
+    typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C;
+
+    constexpr int granularity   = mmq_get_granularity_device(mmq_x);
+    constexpr int rows_per_warp = granularity;
+    constexpr int ntx           = rows_per_warp/tile_C::I;
+
+    y += (threadIdx.y % ntx) * (tile_C::J*MMQ_TILE_Y_K);
+
+    const int   * x_qs = (const int   *) x;
+    const float * x_df = (const float *) x_qs + 2*MMQ_TILE_NE_K;
+    const int   * y_qs = (const int   *) y + 4;
+    const half2 * y_ds = (const half2 *) y;
+
+    const int i0 = (threadIdx.y / ntx) * rows_per_warp;
+
+    for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += QI8_0) {
+        const int k0 = k00 + k01;
+
+        tile_A A[ntx];
+        float dA[ntx][tile_C::ne];
+
+#pragma unroll
+        for (int n = 0; n < ntx; ++n) {
+            load_ldmatrix(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q8_0 + k0, MMQ_MMA_TILE_X_K_Q8_0);
+
+#pragma unroll
+            for (int l = 0; l < tile_C::ne; ++l) {
+                const int i = i0 + n*tile_A::I + tile_C::get_i(l);
+                dA[n][l] = x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + k0/QI8_0];
+            }
+        }
+
+#pragma unroll
+        for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) {
+            tile_B B;
+            load_ldmatrix(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
+
+            const int j = j0 + tile_C::get_j(0);
+            const float dB = __low2float(y_ds[j*MMQ_TILE_Y_K + k01/QI8_1]);
+
+#pragma unroll
+            for (int n = 0; n < ntx; ++n) {
+                tile_C C;
+                mma(C, A[n], B);
+
+#pragma unroll
+                for (int l = 0; l < tile_C::ne; ++l) {
+                    sum[(j0/tile_C::J + n)*tile_C::ne + l] += C.x[l]*dA[n][l]*dB;
+                }
+            }
+        }
+    }
+#else
+    vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, MMQ_Q8_1_DS_LAYOUT_DS4>(x, y, sum, k00);
+#endif
+}
+
+template <int mmq_x, int mmq_y>
+static __device__ __forceinline__ void vec_dot_q8_0_q8_1_mma_rdna35(
+    const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
+#if defined(RDNA3_5) && defined(AMD_WMMA_AVAILABLE) && !defined(AMD_MFMA_AVAILABLE)
+    constexpr data_layout input_layout = get_input_data_layout();
+    typedef tile<16,  8, int, input_layout>        tile_A;
+    typedef tile<16,  8, int, input_layout>        tile_B;
+    typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C;
+
+    constexpr int granularity   = mmq_get_granularity_device(mmq_x);
+    constexpr int rows_per_warp = granularity;
+    constexpr int ntx           = rows_per_warp/tile_C::I;
+
+    y += (threadIdx.y % ntx) * (tile_C::J*MMQ_TILE_Y_K);
+
+    const int   * x_qs = (const int   *) x;
+    const float * x_df = (const float *) x_qs + 2*MMQ_TILE_NE_K;
+    const int   * y_qs = (const int   *) y + 4;
+    const float * y_df = (const float *) y;
+
+    const int i0 = (threadIdx.y / ntx) * rows_per_warp;
+
+    for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += QI8_0) {
+        const int k0 = k00 + k01;
+
+        tile_A A[ntx];
+        float dA[ntx][tile_C::ne];
+
+#pragma unroll
+        for (int n = 0; n < ntx; ++n) {
+            load_ldmatrix(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q8_0 + k0, MMQ_MMA_TILE_X_K_Q8_0);
+
+#pragma unroll
+            for (int l = 0; l < tile_C::ne; ++l) {
+                const int i = i0 + n*tile_A::I + tile_C::get_i(l);
+                dA[n][l] = x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + k0/QI8_0];
+            }
+        }
+
+#pragma unroll
+        for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) {
+            tile_B B;
+            load_ldmatrix(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
+
+            const int j = j0 + tile_C::get_j(0);
+            const float dB = y_df[j*MMQ_TILE_Y_K + k01/QI8_1];
+
+#pragma unroll
+            for (int n = 0; n < ntx; ++n) {
+                tile_C C;
+                mma(C, A[n], B);
+
+#pragma unroll
+                for (int l = 0; l < tile_C::ne; ++l) {
+                    sum[(j0/tile_C::J + n)*tile_C::ne + l] += C.x[l]*dA[n][l]*dB;
+                }
+            }
+        }
+    }
+#else
+    vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, MMQ_Q8_1_DS_LAYOUT_D4>(x, y, sum, k00);
+#endif
+}
 
 template <int mmq_x, int mmq_y>
 static __device__ __forceinline__ void vec_dot_q8_1_q8_1_dp4a(
@@ -2100,6 +2375,78 @@ static __device__ __forceinline__ int unpack_scales_q45_K(const int * scales, co
            ((scales[ksc/2]              >> (2 * (ksc % 2)))       & 0x30303030);  // upper 2 bits
 }
 
+// LDS int offset for pseudo-q8_1 qs within one weight row (matches Q4_1 / WMMA ldmatrix layout).
+static __device__ __forceinline__ int mmq_q4_K_qs_lds_k(const int txi, const int nibble) {
+    return ((txi >> 3) << 4) + (txi & 7) + (nibble << 3);
+}
+
+#if defined(RDNA3_5)
+// gfx115x mmq_y=64: 128 threads map 2:1 to rows for scale/dm prep (no warp divergence).
+template <int mmq_y, bool need_check>
+static __device__ __forceinline__ void load_tiles_q4_K_dm_rdna35(
+        const char * __restrict__ x, half2 * __restrict__ x_dm, const int kbx0, const int i_max, const int stride) {
+    constexpr int warp_size = ggml_cuda_get_physical_warp_size();
+
+    const int dm_tid = threadIdx.y*warp_size + threadIdx.x;
+    const int dm_row = dm_tid/2;
+    const int dm_ksc = dm_tid%2;
+
+    if (dm_row >= mmq_y) {
+        return;
+    }
+
+    int i = dm_row;
+    if (need_check) {
+        i = min(i, i_max);
+    }
+
+    const block_q4_K * bxi = (const block_q4_K *) x + kbx0 + i*stride;
+    const int * scales = (const int *) bxi->scales;
+
+    const int sc32 = unpack_scales_q45_K(scales, dm_ksc + 0);
+    const int  m32 = unpack_scales_q45_K(scales, dm_ksc + 2);
+
+    const uint8_t * sc8 = (const uint8_t *) &sc32;
+    const uint8_t *  m8 = (const uint8_t *)  &m32;
+
+    const half2 dm = bxi->dm * make_half2(1.0f, -1.0f);
+
+#pragma unroll
+    for (int l = 0; l < 4; ++l) {
+        x_dm[i*MMQ_MMA_TILE_X_K_Q8_1 + sizeof(int)*dm_ksc + l] = dm*make_half2(sc8[l], m8[l]);
+    }
+}
+
+// gfx115x WMMA: one warp per row, coalesced qs global load, nibble expand to ldmatrix-ready LDS.
+template <int mmq_y, bool need_check>
+static __device__ __forceinline__ void load_tiles_q4_K_qs_wmma_rdna35(
+        const char * __restrict__ x, int * __restrict__ x_qs, const int kbx0, const int i_max, const int stride) {
+    constexpr int nwarps      = mmq_get_nwarps_device();
+    constexpr int warp_size   = ggml_cuda_get_physical_warp_size();
+    constexpr int qs_per_row  = MMQ_ITER_K / (4 * QR4_K); // 32 ints per block_q4_K::qs
+    static_assert(qs_per_row == 32, "bad Q4_K qs_per_row");
+    constexpr int nrows       = warp_size / qs_per_row;
+    static_assert(nrows == 1, "Q4_K RDNA3.5 WMMA qs path expects one row per warp");
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nrows*nwarps) {
+        int i = i0 + threadIdx.y;
+
+        if (need_check) {
+            i = min(i, i_max);
+        }
+
+        const block_q4_K * bxi = (const block_q4_K *) x + kbx0 + i*stride;
+        const int qs0 = ((const int *) bxi->qs)[threadIdx.x];
+
+        int * row_qs = x_qs + i*MMQ_MMA_TILE_X_K_Q8_1;
+        const int kqs = mmq_q4_K_qs_lds_k(threadIdx.x, 0);
+        row_qs[kqs]    = qs0 & 0x0F0F0F0F;
+        row_qs[kqs+8]  = (qs0 >> 4) & 0x0F0F0F0F;
+    }
+}
+#endif // RDNA3_5
+
 template <int mmq_y, bool need_check> static __device__ __forceinline__ void load_tiles_q4_K(
     const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
     constexpr int nwarps = mmq_get_nwarps_device();
@@ -2115,6 +2462,11 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
     int   * x_sc = (int   *) (x_dm + txs.dm);
 #endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
 
+#if defined(RDNA3_5) && defined(AMD_WMMA_AVAILABLE) && !defined(AMD_MFMA_AVAILABLE)
+    // dm first: warms cache lines before qs reads from same block_q4_K.
+    load_tiles_q4_K_dm_rdna35<mmq_y, need_check>(x, x_dm, kbx0, i_max, stride);
+    load_tiles_q4_K_qs_wmma_rdna35<mmq_y, need_check>(x, x_qs, kbx0, i_max, stride);
+#else
     constexpr int threads_per_row = MMQ_ITER_K / (4 * QR4_K);
     constexpr int nrows = warp_size / threads_per_row;
     const int txi = warp_size > threads_per_row ? threadIdx.x % threads_per_row : threadIdx.x;
@@ -2128,30 +2480,36 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
         }
 
         const block_q4_K * bxi = (const block_q4_K *) x + kbx0 + i*stride;
+#if defined(RDNA3_5)
+        const int qs0 = ((const int *) bxi->qs)[txi];
+#else
         const int qs0 = get_int_b4(bxi->qs, txi);
+#endif
 
 #if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
-        x_qs[i*MMQ_MMA_TILE_X_K_Q8_1 + 16*(txi/8) + txi % 8 + 0] = (qs0 >> 0) & 0x0F0F0F0F;
-        x_qs[i*MMQ_MMA_TILE_X_K_Q8_1 + 16*(txi/8) + txi % 8 + 8] = (qs0 >> 4) & 0x0F0F0F0F;
+        x_qs[i*MMQ_MMA_TILE_X_K_Q8_1 + mmq_q4_K_qs_lds_k(txi, 0)] = (qs0 >> 0) & 0x0F0F0F0F;
+        x_qs[i*MMQ_MMA_TILE_X_K_Q8_1 + mmq_q4_K_qs_lds_k(txi, 1)] = (qs0 >> 4) & 0x0F0F0F0F;
 #else
         x_qs[i*(MMQ_TILE_NE_K + 1) + txi] = qs0;
 #endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
     }
+#endif // RDNA3_5 WMMA qs path
 
 #if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
+#if defined(RDNA3_5) && defined(AMD_WMMA_AVAILABLE) && !defined(AMD_MFMA_AVAILABLE)
+    // dm handled above.
+#else
     constexpr int rows_per_warp = warp_size / 2;
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps*rows_per_warp) {
-#if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
-        // Need if on AMD instead of % because warp_size == 64
-        // This causes double work and throughput loss (MI300X)
-        // H100 loses about 100 t/s with 'if' condition over '%'
+#if defined(AMD_MFMA_AVAILABLE)
+        // Need if on CDNA (warp_size == 64) instead of %.
         int i = i0 + threadIdx.y*rows_per_warp + threadIdx.x/2;
         if (i < mmq_y) {
 #else
         int i = (i0 + threadIdx.y*rows_per_warp + threadIdx.x/2) % mmq_y;
         {
-#endif // defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
+#endif // defined(AMD_MFMA_AVAILABLE)
             if (need_check) {
                 i = min(i, i_max);
             }
@@ -2175,6 +2533,7 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
             }
         }
     }
+#endif // RDNA3_5 RDNA WMMA dm path
 #else
 #pragma unroll
     for (int i0 = 0; i0 < mmq_y; i0 += nwarps*warp_size) {
@@ -2244,6 +2603,81 @@ static __device__ __forceinline__ void vec_dot_q4_K_q8_1_dp4a(
     }
 }
 
+template <int mmq_x, int mmq_y>
+static __device__ __forceinline__ void vec_dot_q4_K_q8_1_mma(
+    const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
+#if defined(RDNA3_5) && defined(AMD_WMMA_AVAILABLE) && !defined(AMD_MFMA_AVAILABLE)
+    // Hoist dmA per k-slice only — full A+dmA hoist spills registers (~2x slower).
+    constexpr data_layout input_layout = get_input_data_layout();
+    typedef tile<16,  8, int, input_layout>        tile_A;
+    typedef tile<16,  8, int, input_layout>        tile_B;
+    typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C;
+
+    constexpr int granularity   = mmq_get_granularity_device(mmq_x);
+    constexpr int rows_per_warp = granularity;
+    constexpr int ntx           = rows_per_warp/tile_C::I;
+
+    y += (threadIdx.y % ntx) * (tile_C::J*MMQ_TILE_Y_K);
+
+    const int   * x_qs = (const int   *) x;
+    const half2 * x_dm = (const half2 *) x_qs + 2*MMQ_TILE_NE_K;
+    const int   * y_qs = (const int   *) y + 4;
+    const half2 * y_dm = (const half2 *) y;
+
+    const int i0 = (threadIdx.y / ntx) * rows_per_warp;
+
+    for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += QI8_1) {
+        const int k0 = k00 + k01;
+
+        tile_A A[ntx];
+        float2 dmA[ntx][tile_C::ne];
+
+#pragma unroll
+        for (int n = 0; n < ntx; ++n) {
+            load_ldmatrix(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q8_1 + k0, MMQ_MMA_TILE_X_K_Q8_1);
+
+#pragma unroll
+            for (int l = 0; l < tile_C::ne; ++l) {
+                const int i = i0 + n*tile_A::I + tile_C::get_i(l);
+                dmA[n][l] = __half22float2(x_dm[i*MMQ_MMA_TILE_X_K_Q8_1 + k0/QI8_1]);
+            }
+        }
+
+#pragma unroll
+        for (int j0 = 0; j0 < mmq_x; j0 += ntx*tile_C::J) {
+            tile_B B;
+            load_ldmatrix(B, y_qs + j0*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
+
+            const int j = j0 + tile_C::get_j(0);
+            const float2 dsB = __half22float2(y_dm[j*MMQ_TILE_Y_K + k01/QI8_1]);
+
+#pragma unroll
+            for (int n = 0; n < ntx; ++n) {
+                float2 scl[tile_C::ne];
+
+#pragma unroll
+                for (int l = 0; l < tile_C::ne; ++l) {
+                    scl[l].x = dmA[n][l].x*dsB.x;
+                    scl[l].y = dmA[n][l].y*dsB.y;
+                }
+
+                tile_C C;
+                mma(C, A[n], B);
+
+#pragma unroll
+                for (int l = 0; l < tile_C::ne; ++l) {
+                    const int si = (j0/tile_C::J + n)*tile_C::ne + l;
+                    sum[si] += scl[l].x*C.x[l];
+                    sum[si] += scl[l].y;
+                }
+            }
+        }
+    }
+#else
+    vec_dot_q8_1_q8_1_mma<mmq_x, mmq_y>(x, y, sum, k00);
+#endif
+}
+
 template <int mmq_y, bool need_check> static __device__ __forceinline__ void load_tiles_q5_K(
     const char * __restrict__ x, int * __restrict__ x_tile, const int kbx0, const int i_max, const int stride) {
     constexpr int nwarps = mmq_get_nwarps_device();
@@ -2525,7 +2959,85 @@ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_dp4a(
 template <int mmq_x, int mmq_y>
 static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mma(
     const int * __restrict__ x, const int * __restrict__ y, float * __restrict__ sum, const int k00) {
-#if defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
+#if defined(RDNA3_5) && defined(AMD_WMMA_AVAILABLE) && !defined(AMD_MFMA_AVAILABLE)
+    // Hoist sclA = sc[k01/4]*d per k-slice — scales were loaded per (n,l) in j-loop.
+    constexpr data_layout input_layout = get_input_data_layout();
+    typedef tile<16,  4, int, input_layout>        tile_A;
+    typedef tile<16,  4, int, input_layout>        tile_B;
+    typedef tile<16, 16, int, DATA_LAYOUT_J_MAJOR> tile_C;
+
+    constexpr int granularity   = mmq_get_granularity_device(mmq_x);
+    constexpr int rows_per_warp = granularity;
+    constexpr int ntx           = rows_per_warp/tile_C::I;
+
+    y += (threadIdx.y % ntx) * (tile_C::J*MMQ_TILE_Y_K);
+
+    const int   * x_qs = (const int   *) x;
+    const float * x_df = (const float *) x_qs + MMQ_TILE_NE_K*2;
+    const int   * x_sc = (const int   *) x_df + MMQ_TILE_NE_K/QI6_K;
+    const int   * y_qs = (const int   *) y + 4;
+    const float * y_df = (const float *) y;
+
+    const int i0 = (threadIdx.y / ntx) * rows_per_warp;
+
+    for (int k01 = 0; k01 < MMQ_TILE_NE_K; k01 += 4) {
+        const int k0 = k00 + k01;
+
+        tile_A A[ntx];
+        float sclA[ntx][tile_C::ne];
+
+#pragma unroll
+        for (int n = 0; n < ntx; ++n) {
+            load_ldmatrix(A[n], x_qs + (i0 + n*tile_A::I)*MMQ_MMA_TILE_X_K_Q6_K + k0, MMQ_MMA_TILE_X_K_Q6_K);
+
+#pragma unroll
+            for (int l = 0; l < tile_C::ne; ++l) {
+                const int i = i0 + n*tile_A::I + tile_C::get_i(l);
+                const int8_t * sc = (const int8_t *) (x_sc + i*MMQ_MMA_TILE_X_K_Q6_K + k00/16);
+                sclA[n][l] = (float) sc[k01/4] * x_df[i*MMQ_MMA_TILE_X_K_Q6_K];
+            }
+        }
+
+        constexpr int j_step = ntx*tile_C::J;
+
+        tile_B B0;
+        load_ldmatrix(B0, y_qs + 0, MMQ_TILE_Y_K);
+        float dB0 = y_df[tile_C::get_j(0)*MMQ_TILE_Y_K + k01/QI8_1];
+
+#pragma unroll
+        for (int j0 = 0; j0 < mmq_x; j0 += j_step) {
+            const int j0_next = j0 + j_step;
+
+            tile_B B1;
+            float dB1 = 0.0f;
+            if (j0_next < mmq_x) {
+                load_ldmatrix(B1, y_qs + j0_next*MMQ_TILE_Y_K + k01, MMQ_TILE_Y_K);
+                const int jn = j0_next + tile_C::get_j(0);
+                dB1 = y_df[jn*MMQ_TILE_Y_K + k01/QI8_1];
+            }
+
+            tile_C C[ntx];
+
+#pragma unroll
+            for (int n = 0; n < ntx; ++n) {
+                mma(C[n], A[n], B0);
+            }
+
+#pragma unroll
+            for (int n = 0; n < ntx; ++n) {
+#pragma unroll
+                for (int l = 0; l < tile_C::ne; ++l) {
+                    sum[(j0/tile_C::J + n)*tile_C::ne + l] += C[n].x[l]*sclA[n][l]*dB0;
+                }
+            }
+
+            if (j0_next < mmq_x) {
+                B0 = B1;
+                dB0 = dB1;
+            }
+        }
+    }
+#elif defined(AMD_MFMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
     constexpr data_layout input_layout = get_input_data_layout();
     typedef tile<16,  4, int, input_layout>        tile_A;
     typedef tile<16,  4, int, input_layout>        tile_B;
@@ -3287,7 +3799,7 @@ template <int mmq_x, int mmq_y, bool need_check>
 struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_Q4_0> {
     static constexpr int              vdr          = VDR_Q4_0_Q8_1_MMQ;
     static constexpr load_tiles_mmq_t load_tiles   = load_tiles_q4_0<mmq_y, need_check>;
-    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, MMQ_Q8_1_DS_LAYOUT_DS4>;
+    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q4_0_q8_1_mma<mmq_x, mmq_y>;
     static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q4_0_q8_1_dp4a<mmq_x, mmq_y>;
 };
 
@@ -3319,7 +3831,7 @@ template <int mmq_x, int mmq_y, bool need_check>
 struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_Q8_0> {
     static constexpr int              vdr          = VDR_Q8_0_Q8_1_MMQ;
     static constexpr load_tiles_mmq_t load_tiles   = load_tiles_q8_0<mmq_y, need_check>;
-    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_0_q8_1_mma<mmq_x, mmq_y, MMQ_Q8_1_DS_LAYOUT_D4>;
+    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_0_q8_1_mma_rdna35<mmq_x, mmq_y>;
     static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y>;
 };
 
@@ -3369,7 +3881,7 @@ template <int mmq_x, int mmq_y, bool need_check>
 struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_Q4_K> {
     static constexpr int              vdr          = VDR_Q4_K_Q8_1_MMQ;
     static constexpr load_tiles_mmq_t load_tiles   = load_tiles_q4_K<mmq_y, need_check>;
-    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q8_1_q8_1_mma<mmq_x, mmq_y>;
+    static constexpr vec_dot_mmq_t    vec_dot_mma  = vec_dot_q4_K_q8_1_mma<mmq_x, mmq_y>;
     static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q4_K_q8_1_dp4a<mmq_x, mmq_y>;
 };
 
@@ -3453,12 +3965,48 @@ struct mmq_type_traits<mmq_x, mmq_y, need_check, GGML_TYPE_IQ4_XS> {
     static constexpr vec_dot_mmq_t    vec_dot_dp4a = vec_dot_q8_0_q8_1_dp4a<mmq_x, mmq_y>;
 };
 
+
+#if defined(RDNA3_5)
+// Software-pipeline activation tile loads: issue global loads into registers, run WMMA,
+// then store to LDS. Overlaps memory latency with vec_dot on gfx115x (no ISA prefetch).
+template <int mmq_x, int nwarps, int warp_size, int nchunks>
+static __device__ __forceinline__ void mmq_tile_y_load_global(
+        int * __restrict__ tile_y, const int * __restrict__ by) {
+#pragma unroll
+    for (int c = 0; c < nchunks; ++c) {
+        const int l = c*(nwarps*warp_size) + threadIdx.y*warp_size + threadIdx.x;
+        tile_y[l] = by[l];
+    }
+}
+
+template <int nwarps, int warp_size, int nchunks>
+static __device__ __forceinline__ void mmq_tile_y_load_global_to_regs(
+        const int * __restrict__ by, int (&cache)[nchunks]) {
+#pragma unroll
+    for (int c = 0; c < nchunks; ++c) {
+        const int l = c*(nwarps*warp_size) + threadIdx.y*warp_size + threadIdx.x;
+        cache[c] = by[l];
+    }
+}
+
+template <int nwarps, int warp_size, int nchunks>
+static __device__ __forceinline__ void mmq_tile_y_store_regs(
+        int * __restrict__ tile_y, const int (&cache)[nchunks]) {
+#pragma unroll
+    for (int c = 0; c < nchunks; ++c) {
+        const int l = c*(nwarps*warp_size) + threadIdx.y*warp_size + threadIdx.x;
+        tile_y[l] = cache[c];
+    }
+}
+#endif // RDNA3_5
+
 template <ggml_type type, int mmq_x, bool need_check, bool fixup>
 static __device__ __forceinline__ void mul_mat_q_process_tile(
         const char * __restrict__ x, const int offset_x, const int * __restrict__ y,
         const int * __restrict__ ids_dst, float * __restrict__ dst, float * __restrict__ tmp_fixup,
         const int stride_row_x, const int ncols_y, const int stride_col_dst,
-        const int tile_x_max_i, const int tile_y_max_j, const int kb0_start, const int kb0_stop) {
+        const int tile_x_max_i, const int tile_y_max_j, const int kb0_start, const int kb0_stop,
+        uint64_t * mmq_profile) {
 
     constexpr int              warp_size  = ggml_cuda_get_physical_warp_size();
     constexpr int              nwarps     = mmq_get_nwarps_device();
@@ -3468,7 +4016,7 @@ static __device__ __forceinline__ void mul_mat_q_process_tile(
 
     extern __shared__ int data_mul_mat_q[];
     int * tile_y = data_mul_mat_q + mmq_x;
-    int * tile_x = tile_y + GGML_PAD(mmq_x*MMQ_TILE_Y_K, nwarps*warp_size);
+    int * tile_x = tile_y + mmq_get_tile_y_k_padded_device(mmq_x, nwarps, warp_size);
 
 #if defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
     constexpr vec_dot_mmq_t    vec_dot    = mmq_type_traits<mmq_x, mmq_y, need_check, type>::vec_dot_mma;
@@ -3492,39 +4040,206 @@ static __device__ __forceinline__ void mul_mat_q_process_tile(
 
     constexpr int sz = sizeof(block_q8_1_mmq) / sizeof(int);
 
-    for (int kb0 = kb0_start; kb0 < kb0_stop; kb0 += blocks_per_iter) {
-        load_tiles(x, tile_x, offset_x + kb0, tile_x_max_i, stride_row_x);
-        {
-            const int * by0 = y + ncols_y * (kb0 * qk / ne_block) * sz;
-#pragma unroll
-            for (int l0 = 0; l0 < mmq_x * MMQ_TILE_Y_K; l0 += nwarps * warp_size) {
-                int l = l0 + threadIdx.y*warp_size + threadIdx.x;
+#if defined(RDNA3_5)
+    constexpr int tile_y_elems       = mmq_x*MMQ_TILE_Y_K;
+    constexpr int tile_y_load_stride   = nwarps*warp_size;
+    if constexpr (mmq_x <= 64 && tile_y_elems % tile_y_load_stride == 0) {
+        constexpr int tile_y_nchunks = tile_y_elems/tile_y_load_stride;
+
+        int  y0_next_cache[tile_y_nchunks];
+        bool have_y0_prefetch = false;
+
+        if (mmq_profile) {
+            for (int kb0 = kb0_start; kb0 < kb0_stop; kb0 += blocks_per_iter) {
+                mmq_profile_kb_iter(mmq_profile);
+
+                {
+                    uint64_t t0 = 0;
+                    __syncthreads();
+                    mmq_profile_phase_begin(mmq_profile, t0);
+                    load_tiles(x, tile_x, offset_x + kb0, tile_x_max_i, stride_row_x);
+                    __syncthreads();
+                    mmq_profile_phase_end(mmq_profile, t0, MMQ_PROFILE_LOAD_TILES);
+                }
 
-                tile_y[l] = by0[l];
+                const int yk = kb0 * qk / ne_block;
+                const int * by0 = y + ncols_y * yk * sz;
+                const int * by1 = y + ncols_y * (yk + 1) * sz;
+
+                {
+                    uint64_t t0 = 0;
+                    __syncthreads();
+                    mmq_profile_phase_begin(mmq_profile, t0);
+
+                    if (have_y0_prefetch) {
+                        mmq_tile_y_store_regs<nwarps, warp_size, tile_y_nchunks>(tile_y, y0_next_cache);
+                        have_y0_prefetch = false;
+                    } else {
+                        mmq_tile_y_load_global<mmq_x, nwarps, warp_size, tile_y_nchunks>(tile_y, by0);
+                    }
+
+                    __syncthreads();
+
+                    int y1_cache[tile_y_nchunks];
+                    mmq_tile_y_load_global_to_regs<nwarps, warp_size, tile_y_nchunks>(by1, y1_cache);
+
+                    mmq_profile_phase_end(mmq_profile, t0, MMQ_PROFILE_Y_ACT);
+
+                    __syncthreads();
+                    mmq_profile_phase_begin(mmq_profile, t0);
+
+                    vec_dot(tile_x, tile_y, sum, 0);
+
+                    __syncthreads();
+
+                    mmq_tile_y_store_regs<nwarps, warp_size, tile_y_nchunks>(tile_y, y1_cache);
+
+                    __syncthreads();
+
+                    const int kb0_next = kb0 + blocks_per_iter;
+                    if (kb0_next < kb0_stop) {
+                        const int * by0_next = y + ncols_y * (kb0_next * qk / ne_block) * sz;
+                        mmq_tile_y_load_global_to_regs<nwarps, warp_size, tile_y_nchunks>(by0_next, y0_next_cache);
+                        have_y0_prefetch = true;
+                    }
+
+                    vec_dot(tile_x, tile_y, sum, MMQ_TILE_NE_K);
+
+                    __syncthreads();
+                    mmq_profile_phase_end(mmq_profile, t0, MMQ_PROFILE_VEC_DOT);
+                }
+            }
+        } else {
+            for (int kb0 = kb0_start; kb0 < kb0_stop; kb0 += blocks_per_iter) {
+                load_tiles(x, tile_x, offset_x + kb0, tile_x_max_i, stride_row_x);
+
+                const int yk = kb0 * qk / ne_block;
+                const int * by0 = y + ncols_y * yk * sz;
+                const int * by1 = y + ncols_y * (yk + 1) * sz;
+
+                if (have_y0_prefetch) {
+                    mmq_tile_y_store_regs<nwarps, warp_size, tile_y_nchunks>(tile_y, y0_next_cache);
+                    have_y0_prefetch = false;
+                } else {
+                    mmq_tile_y_load_global<mmq_x, nwarps, warp_size, tile_y_nchunks>(tile_y, by0);
+                }
+
+                __syncthreads();
+
+                int y1_cache[tile_y_nchunks];
+                mmq_tile_y_load_global_to_regs<nwarps, warp_size, tile_y_nchunks>(by1, y1_cache);
+
+                vec_dot(tile_x, tile_y, sum, 0);
+
+                __syncthreads();
+
+                mmq_tile_y_store_regs<nwarps, warp_size, tile_y_nchunks>(tile_y, y1_cache);
+
+                __syncthreads();
+
+                const int kb0_next = kb0 + blocks_per_iter;
+                if (kb0_next < kb0_stop) {
+                    const int * by0_next = y + ncols_y * (kb0_next * qk / ne_block) * sz;
+                    mmq_tile_y_load_global_to_regs<nwarps, warp_size, tile_y_nchunks>(by0_next, y0_next_cache);
+                    have_y0_prefetch = true;
+                }
+
+                vec_dot(tile_x, tile_y, sum, MMQ_TILE_NE_K);
+
+                __syncthreads();
             }
         }
+    } else
+#endif // RDNA3_5
+    {
+        if (mmq_profile) {
+            for (int kb0 = kb0_start; kb0 < kb0_stop; kb0 += blocks_per_iter) {
+                mmq_profile_kb_iter(mmq_profile);
+
+                {
+                    uint64_t t0 = 0;
+                    __syncthreads();
+                    mmq_profile_phase_begin(mmq_profile, t0);
+                    load_tiles(x, tile_x, offset_x + kb0, tile_x_max_i, stride_row_x);
+                    __syncthreads();
+                    mmq_profile_phase_end(mmq_profile, t0, MMQ_PROFILE_LOAD_TILES);
+                }
 
-        __syncthreads();
+                const int yk = kb0 * qk / ne_block;
+                const int * by0 = y + ncols_y * yk * sz;
+                const int * by1 = y + ncols_y * (yk + 1) * sz;
 
-        vec_dot(tile_x, tile_y, sum, 0);
+                {
+                    uint64_t t0 = 0;
+                    __syncthreads();
+                    mmq_profile_phase_begin(mmq_profile, t0);
+#pragma unroll
+                    for (int l0 = 0; l0 < mmq_x * MMQ_TILE_Y_K; l0 += nwarps * warp_size) {
+                        int l = l0 + threadIdx.y*warp_size + threadIdx.x;
 
-        __syncthreads();
+                        tile_y[l] = by0[l];
+                    }
+                    __syncthreads();
+                    mmq_profile_phase_end(mmq_profile, t0, MMQ_PROFILE_Y_ACT);
+                }
 
-        {
-            const int * by0 = y + ncols_y * ((kb0 * qk / ne_block) * sz + sz);
+                {
+                    uint64_t t0 = 0;
+                    __syncthreads();
+                    mmq_profile_phase_begin(mmq_profile, t0);
+                    vec_dot(tile_x, tile_y, sum, 0);
+                    __syncthreads();
 #pragma unroll
-            for (int l0 = 0; l0 < mmq_x * MMQ_TILE_Y_K; l0 += nwarps * warp_size) {
-                int l = l0 + threadIdx.y*warp_size + threadIdx.x;
+                    for (int l0 = 0; l0 < mmq_x * MMQ_TILE_Y_K; l0 += nwarps * warp_size) {
+                        int l = l0 + threadIdx.y*warp_size + threadIdx.x;
 
-                tile_y[l] = by0[l];
+                        tile_y[l] = by1[l];
+                    }
+                    __syncthreads();
+                    vec_dot(tile_x, tile_y, sum, MMQ_TILE_NE_K);
+                    __syncthreads();
+                    mmq_profile_phase_end(mmq_profile, t0, MMQ_PROFILE_VEC_DOT);
+                }
             }
-        }
+        } else {
+            for (int kb0 = kb0_start; kb0 < kb0_stop; kb0 += blocks_per_iter) {
+                load_tiles(x, tile_x, offset_x + kb0, tile_x_max_i, stride_row_x);
 
-        __syncthreads();
+                const int yk = kb0 * qk / ne_block;
+                const int * by0 = y + ncols_y * yk * sz;
+                const int * by1 = y + ncols_y * (yk + 1) * sz;
 
-        vec_dot(tile_x, tile_y, sum, MMQ_TILE_NE_K);
+                {
+#pragma unroll
+                    for (int l0 = 0; l0 < mmq_x * MMQ_TILE_Y_K; l0 += nwarps * warp_size) {
+                        int l = l0 + threadIdx.y*warp_size + threadIdx.x;
 
-        __syncthreads();
+                        tile_y[l] = by0[l];
+                    }
+                }
+
+                __syncthreads();
+
+                vec_dot(tile_x, tile_y, sum, 0);
+
+                __syncthreads();
+
+                {
+#pragma unroll
+                    for (int l0 = 0; l0 < mmq_x * MMQ_TILE_Y_K; l0 += nwarps * warp_size) {
+                        int l = l0 + threadIdx.y*warp_size + threadIdx.x;
+
+                        tile_y[l] = by1[l];
+                    }
+                }
+
+                __syncthreads();
+
+                vec_dot(tile_x, tile_y, sum, MMQ_TILE_NE_K);
+
+                __syncthreads();
+            }
+        }
     }
 
     if (fixup) {
@@ -3535,6 +4250,7 @@ static __device__ __forceinline__ void mul_mat_q_process_tile(
 }
 
 
+
 // The mul_mat_q kernel implements "stream-k" work partitioning as described in https://arxiv.org/abs/2301.03598
 
 template <ggml_type type, int mmq_x, bool need_check>
@@ -3555,7 +4271,7 @@ static __global__ void mul_mat_q(
         const uint3 blocks_per_ne00, const int nrows_x, const int ncols_dst, const int stride_row_x, const int ncols_y, const int stride_col_dst,
         const uint3 channel_ratio, const uint3 nchannels_y, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
         const uint3 sample_ratio, const uint3 nsamples_y, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst,
-        const uint3 ntx) {
+        const uint3 ntx, uint64_t * mmq_profile) {
 
     // Skip unused template specializations for faster compilation:
     if (mmq_x > get_mmq_x_max_device() || mmq_x % mmq_get_granularity_device(mmq_x) != 0) {
@@ -3640,7 +4356,7 @@ static __global__ void mul_mat_q(
         constexpr bool fixup = false;
         mul_mat_q_process_tile<type, mmq_x, need_check, fixup>
             (x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, stride_row_x, ncols_y, stride_col_dst,
-             tile_x_max_i, tile_y_max_j, 0, blocks_per_ne00.z);
+             tile_x_max_i, tile_y_max_j, 0, blocks_per_ne00.z, mmq_profile);
         return;
     }
 #endif // (defined(GGML_USE_HIP) && !defined(CDNA4) && !defined(CDNA3)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
@@ -3720,7 +4436,7 @@ static __global__ void mul_mat_q(
         constexpr bool fixup = false; // All but (potentially) the last iterations write their data to dst rather than the fixup buffer.
         mul_mat_q_process_tile<type, mmq_x, need_check, fixup>
             (x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, stride_row_x, ncols_y, stride_col_dst,
-             tile_x_max_i, tile_y_max_j, kb0_start, kb0_stop);
+             tile_x_max_i, tile_y_max_j, kb0_start, kb0_stop, mmq_profile);
 
         kbc += blocks_per_ne00.z;
         kbc -= fastmodulo(kbc, blocks_per_ne00);
@@ -3789,7 +4505,7 @@ static __global__ void mul_mat_q(
     constexpr bool fixup = true; // Last index writes its data to fixup buffer to avoid data races with other blocks.
     mul_mat_q_process_tile<type, mmq_x, need_check, fixup>
         (x, offset_x, y + offset_y, ids_dst_shared, dst + offset_dst, tmp_fixup, stride_row_x, ncols_y, stride_col_dst,
-         tile_x_max_i, tile_y_max_j, kb0_start, kb0_stop);
+         tile_x_max_i, tile_y_max_j, kb0_start, kb0_stop, mmq_profile);
 }
 
 template <ggml_type type, int mmq_x, bool need_check>
@@ -3939,6 +4655,23 @@ struct mmq_args {
     bool use_stream_k; int64_t ncols_max;
 };
 
+#if defined(GGML_USE_HIP)
+// RDNA3.5 dual-WG (mmq_x=64, nbytes <= smpbo/2) helps K-quants with large per-tile LDS
+// (Q6_K WMMA tuning). Block quants (Q5_0, Q8_0, Q4_0) are faster at mmq_x=128 ntx=1.
+static bool mmq_rdna35_dual_wg_eligible(const ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
+            return true;
+        default:
+            return false;
+    }
+}
+#endif // GGML_USE_HIP
+
 template<ggml_type type>
 static size_t mmq_get_nbytes_shared(const int mmq_x, const int mmq_y, const int cc, const int warp_size, const int nwarps) {
     const tile_x_sizes txs = mmq_get_dp4a_tile_x_sizes(type, mmq_y);
@@ -3946,7 +4679,11 @@ static size_t mmq_get_nbytes_shared(const int mmq_x, const int mmq_y, const int
     const size_t nbs_ids = mmq_x*sizeof(int);
     const size_t nbs_x = (turing_mma_available(cc) || amd_mfma_available(cc) || amd_wmma_available(cc)) ? mmq_y*mmq_tile_x_k*sizeof(int) : txs.qs*sizeof(int) + txs.dm*sizeof(half2) + txs.sc*sizeof(int);
     const size_t nbs_y = mmq_x * (sizeof(block_q8_1_mmq));
-    return nbs_ids + nbs_x + GGML_PAD(nbs_y, nwarps*warp_size*sizeof(int));
+    const int tile_y_k_padded = mmq_get_tile_y_k_padded_host(mmq_x, cc, warp_size, nwarps);
+    const size_t nbs_y_padded = std::max(nbs_y, (size_t) tile_y_k_padded*sizeof(int));
+    const int pad = GGML_CUDA_CC_IS_RDNA3_5(cc) ? 2*nwarps*warp_size : nwarps*warp_size;
+    size_t nbytes = nbs_ids + nbs_x + GGML_PAD(nbs_y_padded, pad*sizeof(int));
+    return nbytes;
 }
 
 template <ggml_type type, int mmq_x>
@@ -3957,9 +4694,13 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
     const int warp_size = ggml_cuda_info().devices[id].warp_size;
     const int nwarps = mmq_get_nwarps_host(cc, warp_size);
     const int mmq_y = get_mmq_y_host(cc);
+    const size_t smpbo = ggml_cuda_info().devices[id].smpbo;
 
     const dim3 block_dims(warp_size, nwarps, 1);
 
+    mmq_profile_guard prof_guard(ctx.pool(id), mmq_profile_phases_enabled(), type, mmq_x,
+        args.nrows_x, args.ncols_max, stream);
+
     const int nbytes_shared = mmq_get_nbytes_shared<type>(mmq_x, mmq_y, cc, warp_size, nwarps);
 
     CUDA_SET_SHARED_MEMORY_LIMIT((mul_mat_q<type, mmq_x, false>), nbytes_shared);
@@ -3970,6 +4711,22 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
     const int ntzw = args.nchannels_y * args.nsamples_y;
     const dim3 block_nums_xy_tiling(nty, ntx, ntzw);
 
+    const auto log_launch_config = [&](const dim3 & grid, const bool need_check, const char * path) {
+        if (!ggml_cuda_mm_log_enabled()) {
+            return;
+        }
+        fprintf(stderr,
+            "[MUL_MAT_Q_LAUNCH] type=%d mmq_x=%d mmq_y=%d nwarps=%d warp_size=%d "
+            "nbytes_shared=%d smpbo=%zu grid=(%u,%u,%u) block=(%u,%u,%u) "
+            "ntx=%d nty=%d ntzw=%d ncols_max=%ld nrows_x=%ld use_stream_k=%d need_check=%d path=%s\n",
+            type, mmq_x, mmq_y, nwarps, warp_size,
+            nbytes_shared, smpbo,
+            grid.x, grid.y, grid.z,
+            block_dims.x, block_dims.y, block_dims.z,
+            ntx, nty, ntzw, args.ncols_max, args.nrows_x,
+            args.use_stream_k ? 1 : 0, need_check ? 1 : 0, path);
+    };
+
     GGML_ASSERT(args.nchannels_y % args.nchannels_x == 0);
     GGML_ASSERT(args.nsamples_y  % args.nsamples_x  == 0);
     const int channel_ratio = args.nchannels_y / args.nchannels_x;
@@ -3985,20 +4742,22 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
     if (!args.use_stream_k) {
         if (args.nrows_x % mmq_y == 0) {
             constexpr bool need_check = false;
+            log_launch_config(block_nums_xy_tiling, need_check, "tiling");
             mul_mat_q<type, mmq_x, need_check><<<block_nums_xy_tiling, block_dims, nbytes_shared, stream>>>
                 (args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, nullptr,
                  blocks_per_ne00_fd, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
                  channel_ratio_fd, nchannels_y_fd, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
                  sample_ratio_fd, nsamples_y_fd, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst,
-                 ntx_fd);
+                 ntx_fd, prof_guard.ptr());
         } else {
             constexpr bool need_check = true;
+            log_launch_config(block_nums_xy_tiling, need_check, "tiling");
             mul_mat_q<type, mmq_x, need_check><<<block_nums_xy_tiling, block_dims, nbytes_shared, stream>>>
                 (args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, nullptr,
                  blocks_per_ne00_fd, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
                  channel_ratio_fd, nchannels_y_fd, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
                  sample_ratio_fd, nsamples_y_fd, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst,
-                 ntx_fd);
+                 ntx_fd, prof_guard.ptr());
         }
         return;
     }
@@ -4025,12 +4784,13 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
 
     if (args.nrows_x % mmq_y == 0) {
         constexpr bool need_check = false;
+        log_launch_config(block_nums_stream_k, need_check, "stream_k");
         mul_mat_q<type, mmq_x, need_check><<<block_nums_stream_k, block_dims, nbytes_shared, stream>>>
             (args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr,
              blocks_per_ne00_fd, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
              channel_ratio_fd, nchannels_y_fd, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
              sample_ratio_fd, nsamples_y_fd, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst,
-             ntx_fd);
+             ntx_fd, prof_guard.ptr());
 
         if (!fixup_needed) {
             return;
@@ -4043,12 +4803,13 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
              ntx_fd);
     } else {
         constexpr bool need_check = true;
+        log_launch_config(block_nums_stream_k, need_check, "stream_k");
         mul_mat_q<type, mmq_x, need_check><<<block_nums_stream_k, block_dims, nbytes_shared, stream>>>
             (args.x, args.y, args.ids_dst, args.expert_bounds, args.dst, tmp_fixup.ptr,
              blocks_per_ne00_fd, args.nrows_x, args.ncols_dst, args.stride_row_x, args.ncols_y, args.nrows_dst,
              channel_ratio_fd, nchannels_y_fd, args.stride_channel_x, args.stride_channel_y, args.stride_channel_dst,
              sample_ratio_fd, nsamples_y_fd, args.stride_sample_x, args.stride_sample_y, args.stride_sample_dst,
-             ntx_fd);
+             ntx_fd, prof_guard.ptr());
 
         if (!fixup_needed) {
             return;
@@ -4091,6 +4852,89 @@ void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cuda
         }
     }
 
+#if defined(GGML_USE_HIP)
+    // RDNA3.5 (gfx115x): mmq_x=128 uses ~59% of LDS/CU → 1 WG/CU. For K-quants only,
+    // pick the smallest mmq_x with nbytes_shared <= smpbo/2 (2 WGs/CU).
+    if (GGML_CUDA_CC_IS_RDNA3_5(cc) && mmq_x_best > 0 && mmq_rdna35_dual_wg_eligible(type)) {
+        const size_t lds_dual_wg = smpbo / 2;
+        const size_t nbytes_best = mmq_get_nbytes_shared<type>(mmq_x_best, mmq_y, cc, warp_size, nwarps);
+        if (nbytes_best > lds_dual_wg) {
+            int mmq_x_lds = 0;
+            size_t nbytes_lds_min = SIZE_MAX;
+            for (int mmq_x = 8; mmq_x <= mmq_x_max; mmq_x += 8) {
+                const int granularity = mmq_get_granularity_host(mmq_x, cc);
+                if (mmq_x % granularity != 0) {
+                    continue;
+                }
+                const size_t nbytes = mmq_get_nbytes_shared<type>(mmq_x, mmq_y, cc, warp_size, nwarps);
+                if (nbytes > smpbo || nbytes > lds_dual_wg) {
+                    continue;
+                }
+                const int ntiles_x = (args.ncols_max + mmq_x - 1) / mmq_x;
+                if (ntiles_x > ntiles_x_best * 2) {
+                    continue;
+                }
+                if (nbytes < nbytes_lds_min) {
+                    mmq_x_lds = mmq_x;
+                    nbytes_lds_min = nbytes;
+                }
+            }
+            if (mmq_x_lds > 0) {
+                mmq_x_best = mmq_x_lds;
+            }
+        }
+    }
+#endif // GGML_USE_HIP
+
+#if defined(GGML_USE_HIP)
+    // Tiny M (e.g. gate 128×32×4096 Q8_0): prefer largest mmq_x within LDS so N is not
+    // split; occupancy is irrelevant when only a handful of blocks launch.
+    if (GGML_CUDA_CC_IS_RDNA3_5(cc) && args.nrows_x <= 32) {
+        const int nty  = (args.nrows_x + mmq_y - 1) / mmq_y;
+        const int ntzw = static_cast<int>(args.nchannels_y * args.nsamples_y);
+
+        int mmq_x_tiny  = 0;
+        int ntiles_tiny = INT_MAX;
+        for (int mmq_x = mmq_x_max; mmq_x >= 8; mmq_x -= 8) {
+            const int granularity = mmq_get_granularity_host(mmq_x, cc);
+            if (mmq_x % granularity != 0) {
+                continue;
+            }
+            const size_t nbytes = mmq_get_nbytes_shared<type>(mmq_x, mmq_y, cc, warp_size, nwarps);
+            if (nbytes > smpbo) {
+                continue;
+            }
+            const int ntx = (args.ncols_max + mmq_x - 1) / mmq_x;
+            const int grid_blocks = nty * ntx * ntzw;
+            if (grid_blocks <= 4 && ntx <= ntiles_tiny) {
+                mmq_x_tiny  = mmq_x;
+                ntiles_tiny = ntx;
+                if (ntiles_tiny == 1) {
+                    break;
+                }
+            }
+        }
+        if (mmq_x_tiny > 0) {
+            mmq_x_best = mmq_x_tiny;
+        }
+    }
+
+    // Q6_K / K-quants narrow-N: dual-WG path (mmq_x=64, ntx=2) from smpbo/2 selection.
+#endif // GGML_USE_HIP
+
+#if defined(GGML_USE_HIP)
+    if (GGML_CUDA_CC_IS_RDNA3_5(cc)) {
+        if (const char * force = getenv("MMQ_FORCE_MMQ_X")) {
+            const int mmq_x_force = atoi(force);
+            if (mmq_x_force >= 8 && mmq_x_force <= mmq_x_max &&
+                mmq_x_force % mmq_get_granularity_host(mmq_x_force, cc) == 0 &&
+                mmq_get_nbytes_shared<type>(mmq_x_force, mmq_y, cc, warp_size, nwarps) <= smpbo) {
+                mmq_x_best = mmq_x_force;
+            }
+        }
+    }
+#endif // GGML_USE_HIP
+
     switch (mmq_x_best) {
         case   8:
             launch_mul_mat_q<type,   8>(ctx, args, stream);
diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu
index fe44a58da918..cee73315d5ef 100644
--- a/ggml/src/ggml-cuda/mmvq.cu
+++ b/ggml/src/ggml-cuda/mmvq.cu
@@ -1213,6 +1213,14 @@ void ggml_cuda_mul_mat_vec_q(
 
     const int64_t ids_stride = ids ? ids->nb[1] / ggml_type_size(ids->type) : 0;
 
+    // Log kernel dimensions for profiling analysis (GGML_CUDA_MM_LOG=1).
+    // dst[ne01] = src0[ne00, ne01] @ src1[ne10]; ne00==ne10 (K), ne11==1 for decode
+    // GEMV y[M] = W[M,K] @ x[K]: M=ne01, K=ne00
+    if (ggml_cuda_mm_log_enabled()) {
+        fprintf(stderr, "[MUL_MAT_VEC_Q] M=%ld K=%ld ne00=%ld ne01=%ld ne10=%ld ne11=%ld ne02=%ld ne12=%ld ne03=%ld ne13=%ld type=%d\n",
+                ne01, ne00, ne00, ne01, ne10, ne11, ne02, ne12, ne03, ne13, src0->type);
+    }
+
     mul_mat_vec_q_switch_type(
         src0->data, src0->type, src1_q8_1.get(), ids_d, fusion_local, dst_d, ne00,
         ne01,              ncols_dst,     s01, stride_col_y,     stride_col_dst,
diff --git a/ggml/src/ggml-cuda/quantize.cu b/ggml/src/ggml-cuda/quantize.cu
index 39a500a17041..7d23fd8886dd 100644
--- a/ggml/src/ggml-cuda/quantize.cu
+++ b/ggml/src/ggml-cuda/quantize.cu
@@ -400,6 +400,13 @@ void quantize_mmq_q8_1_cuda(
     const int64_t block_num_y = (ne0 + 4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ - 1) / (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ);
     const dim3 num_blocks(ne1, block_num_y, ne2*ne3);
     const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE_MMQ, 1, 1);
+
+    // Log kernel dimensions for profiling analysis (GGML_CUDA_MM_LOG=1).
+    if (ggml_cuda_mm_log_enabled()) {
+        fprintf(stderr, "[QUANTIZE_MMQ_Q8_1] ne00=%ld ne0=%ld ne1=%ld ne2=%ld ne3=%ld grid=(%u,%u,%u) type=%d\n",
+                ne00, ne0, ne1, ne2, ne3, num_blocks.x, num_blocks.y, num_blocks.z, type_src0);
+    }
+
     switch (mmq_get_q8_1_ds_layout(type_src0)) {
         case MMQ_Q8_1_DS_LAYOUT_D4:
             quantize_mmq_q8_1<MMQ_Q8_1_DS_LAYOUT_D4>
diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h
index a6115cd80dc4..2ad1207bef98 100644
--- a/ggml/src/ggml-cuda/vendors/hip.h
+++ b/ggml/src/ggml-cuda/vendors/hip.h
@@ -94,6 +94,7 @@
 #define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
 #define cudaMemcpyHostToDevice hipMemcpyHostToDevice
 #define cudaMemcpyKind hipMemcpyKind
+#define cudaMemcpyToSymbol hipMemcpyToSymbol
 #define cudaMemset hipMemset
 #define cudaMemsetAsync hipMemsetAsync
 #define cudaMemGetInfo hipMemGetInfo