From af65e3924afef20cdfe1dce3cff87a222ce00f49 Mon Sep 17 00:00:00 2001
From: tonde <Tonde@live.de>
Date: Mon, 11 May 2026 20:01:21 +0200
Subject: [PATCH 01/14] build: replace FATAL_ERROR for WITH_HIP+WITH_FLASH_ATTN
 with a warning

The build previously aborted with a hard error when both WITH_HIP=ON and
WITH_FLASH_ATTN=ON were set. The CUDA Flash Attention kernels rely on
CUTLASS/CuTe (sm80-specific) and cannot be compiled for HIP directly.

Replace the FATAL_ERROR with a cmake WARNING so that the build succeeds.
Using flash_attention=True at runtime on a ROCm build already raises a clear
std::invalid_argument (see model.cc CT2_USE_HIP guard). The warning explains
that a native HIP implementation via AMD Composable Kernel is planned.
---
 CMakeLists.txt | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index cf80e37b5..26f137c2c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -719,7 +719,13 @@ elseif(WITH_HIP)
     src/ops/awq/dequantize_gpu.cu
   )
   if(WITH_FLASH_ATTN)
-    message(FATAL_ERROR "WITH_HIP=ON incompatible with WITH_FLASH_ATTN=ON")
+    # Flash Attention for HIP/ROCm is not yet implemented.
+    # The CUDA kernels rely on CUTLASS/CuTe (sm80-specific) and cannot be hipified directly.
+    # A native HIP implementation using AMD Composable Kernel (CK) is planned.
+    # At runtime, using flash_attention=True on a ROCm build will raise an error.
+    message(WARNING "WITH_FLASH_ATTN=ON has no effect with WITH_HIP=ON: "
+                    "Flash Attention is not yet implemented for the HIP backend. "
+                    "The build will succeed but flash_attention=True will raise an error at runtime.")
   endif()
 
   set_source_files_properties(${CUDA_SOURCES} PROPERTIES LANGUAGE HIP)

From 41bdbf30047c2258cbae49963e6a3e559d50ef76 Mon Sep 17 00:00:00 2001
From: tonde <Tonde@live.de>
Date: Mon, 11 May 2026 20:34:18 +0200
Subject: [PATCH 02/14] ops: add native HIP Flash Attention kernels for AMD
 RDNA3 (gfx1100)

Implements scaled dot-product attention for the ROCm/HIP backend using
three plain HIP kernels (QK^T, softmax, PV) with FP32 accumulators.
No CUTLASS, CuTe, or AMD Composable Kernel dependency required.

Supports FP16 and BF16 inputs; tested on gfx1100 (RX 7900 XTX).
KV-cache (offset > 0), rotary embeddings, and ALiBi are expected to be
pre-applied by the caller in this initial implementation.

Changes:
- flash_attention_gpu.cu: add #ifndef CT2_USE_HIP / #else / #endif
  guard so CUDA and HIP share one translation unit with a single
  FlashAttention::compute<Device::CUDA> specialisation each.
  CUTLASS/CuTe headers are excluded for HIP builds.
- CMakeLists.txt: enable -DCT2_WITH_FLASH_ATTN for HIP when
  WITH_FLASH_ATTN=ON (no sm80 CUTLASS sources added).
- model.cc: honour CT2_WITH_FLASH_ATTN on HIP builds so that
  flash_attention=True is accepted at runtime instead of raising
  "FlashAttention not supported on ROCm."
---
 CMakeLists.txt                 |  12 +-
 src/models/model.cc            |   8 +-
 src/ops/flash_attention_gpu.cu | 277 ++++++++++++++++++++++++++++++++-
 3 files changed, 286 insertions(+), 11 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 26f137c2c..8c125f806 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -719,13 +719,11 @@ elseif(WITH_HIP)
     src/ops/awq/dequantize_gpu.cu
   )
   if(WITH_FLASH_ATTN)
-    # Flash Attention for HIP/ROCm is not yet implemented.
-    # The CUDA kernels rely on CUTLASS/CuTe (sm80-specific) and cannot be hipified directly.
-    # A native HIP implementation using AMD Composable Kernel (CK) is planned.
-    # At runtime, using flash_attention=True on a ROCm build will raise an error.
-    message(WARNING "WITH_FLASH_ATTN=ON has no effect with WITH_HIP=ON: "
-                    "Flash Attention is not yet implemented for the HIP backend. "
-                    "The build will succeed but flash_attention=True will raise an error at runtime.")
+    # Native HIP Flash Attention implementation (three-pass: QK^T, softmax, PV).
+    # Does not require CUTLASS/CuTe; uses plain HIP kernels with FP32 accumulators.
+    # Supports FP16 and BF16 inputs; tested on gfx1100 (RDNA3 / RX 7900 XTX).
+    add_definitions(-DCT2_WITH_FLASH_ATTN)
+    message(STATUS "Building with HIP Flash Attention support")
   endif()
 
   set_source_files_properties(${CUDA_SOURCES} PROPERTIES LANGUAGE HIP)
diff --git a/src/models/model.cc b/src/models/model.cc
index 1d8295b0a..0caf284a5 100644
--- a/src/models/model.cc
+++ b/src/models/model.cc
@@ -840,14 +840,20 @@ namespace ctranslate2 {
         int device_id = ctranslate2::get_device_index(ctranslate2::Device::CUDA);
         auto dprops = ctranslate2::cuda::get_device_properties(device_id);
 #ifdef CT2_USE_HIP
+#  ifdef CT2_WITH_FLASH_ATTN
+        supports_flash_attention = true;  // native HIP kernels available
+#  else
         supports_flash_attention = false;
+#  endif
 #else
         supports_flash_attention = dprops.major >= 8;
 #endif
       }
       if (use_flash_attention && !supports_flash_attention) {
 #ifdef CT2_USE_HIP
-        throw std::invalid_argument("FlashAttention not supported on ROCm.");
+        throw std::invalid_argument(
+            "FlashAttention is not available for this ROCm build. "
+            "Rebuild CTranslate2 with -DWITH_HIP=ON -DWITH_FLASH_ATTN=ON.");
 #else
         throw std::invalid_argument("FlashAttention only supports Ampere GPUs or newer.");
 #endif
diff --git a/src/ops/flash_attention_gpu.cu b/src/ops/flash_attention_gpu.cu
index 24bcbf878..5fdc0984d 100644
--- a/src/ops/flash_attention_gpu.cu
+++ b/src/ops/flash_attention_gpu.cu
@@ -1,10 +1,11 @@
 #include "ctranslate2/ops/flash_attention.h"
-#ifdef CT2_WITH_FLASH_ATTN
+#if defined(CT2_WITH_FLASH_ATTN) && !defined(CT2_USE_HIP)
 #include "ctranslate2/ops/flash-attention/flash.h"
 #include "ctranslate2/ops/flash-attention/static_switch.h"
 #endif
 #include "ctranslate2/ops/transpose.h"
 #include "cuda/utils.h"
+#include "cuda/helpers.h"
 
 #include "dispatch.h"
 
@@ -14,6 +15,9 @@
 
 namespace ctranslate2 {
   namespace ops {
+
+#ifndef CT2_USE_HIP  // CUDA-only: CUTLASS/CuTe Flash Attention kernels
+
 #ifdef CT2_WITH_FLASH_ATTN
     static void set_params_fprop(Flash_fwd_params &params,
       // sizes
@@ -364,5 +368,272 @@ namespace ctranslate2 {
       throw std::runtime_error("Flash attention 2 is not supported");
 #endif
     }
-  }
-}
+
+#else  // CT2_USE_HIP — native HIP Flash Attention implementation
+
+// ---------------------------------------------------------------------------
+// HIP Flash Attention — native implementation for AMD GPUs (gfx1100 / RDNA3+)
+//
+// Algorithm: standard scaled dot-product attention computed in three passes:
+//   1. S = Q @ K^T * scale          [batch, nheads, seqlen_q, seqlen_k]
+//   2. P = softmax(S + causal_mask)  in-place
+//   3. O = P @ V                     [batch, seqlen_q, nheads, head_dim]
+//
+// Memory layout of all tensors: [batch, seqlen, nheads, head_dim]
+// Supports FP16 and BF16 inputs; FP32 accumulators throughout.
+//
+// Limitations in this initial implementation:
+//   - KV-cache append path (offset > 0) is not yet supported.
+//   - Rotary embeddings and ALiBi are expected to be pre-applied by the caller.
+//   - No sub-quadratic memory tiling (full attention matrix is materialised).
+// ---------------------------------------------------------------------------
+
+
+    // -------------------------------------------------------------------
+    // Kernel 1 — Q @ K^T
+    // Grid:  (ceildiv(seqlen_q * seqlen_k, 256), nheads, batch)
+    // Block: (256, 1, 1)
+    // Each thread computes one element of S[b, h, q, k].
+    // -------------------------------------------------------------------
+    template <typename scalar_t>
+    __global__ void hip_attn_qk_kernel(
+        const scalar_t* __restrict__ Q,  // [batch, seqlen_q, nheads, head_dim]
+        const scalar_t* __restrict__ K,  // [batch, seqlen_k, nheads, head_dim]
+        float*          __restrict__ S,  // [batch, nheads, seqlen_q, seqlen_k]
+        const int seqlen_q,
+        const int seqlen_k,
+        const int nheads,
+        const int head_dim,
+        const float scale)
+    {
+      const int b   = blockIdx.z;
+      const int h   = blockIdx.y;
+      const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+      const int q   = idx / seqlen_k;
+      const int k   = idx % seqlen_k;
+      if (q >= seqlen_q) return;
+
+      float dot = 0.f;
+      const int q_base = b * seqlen_q * nheads * head_dim + q * nheads * head_dim + h * head_dim;
+      const int k_base = b * seqlen_k * nheads * head_dim + k * nheads * head_dim + h * head_dim;
+      for (int d = 0; d < head_dim; ++d)
+        dot += static_cast<float>(Q[q_base + d]) * static_cast<float>(K[k_base + d]);
+
+      S[b * nheads * seqlen_q * seqlen_k + h * seqlen_q * seqlen_k + q * seqlen_k + k] =
+          dot * scale;
+    }
+
+    // -------------------------------------------------------------------
+    // Kernel 2 — row-wise softmax with optional causal mask
+    // Grid:  (seqlen_q, nheads, batch)   — so gridDim = (seqlen_q, nheads, batch)
+    // Block: (min(seqlen_k, 256), 1, 1)
+    // Uses shared memory reduction; wavefront-safe for both wf32 and wf64.
+    // -------------------------------------------------------------------
+    __global__ void hip_attn_softmax_kernel(
+        float* __restrict__ S,       // [batch, nheads, seqlen_q, seqlen_k] — modified in place
+        const int nheads,
+        const int seqlen_q,
+        const int seqlen_k,
+        const bool is_causal,
+        const int q_offset)          // position of q[0] in the full sequence (for KV-cache)
+    {
+      const int b = blockIdx.z;
+      const int h = blockIdx.y;
+      const int q = blockIdx.x;
+
+      float* row = S + (b * nheads + h) * seqlen_q * seqlen_k + q * seqlen_k;
+
+      extern __shared__ float smem[];  // [blockDim.x] for reduction
+
+      // --- apply causal mask ---
+      const int q_pos = q + q_offset;
+      for (int k = threadIdx.x; k < seqlen_k; k += blockDim.x)
+        if (is_causal && k > q_pos) row[k] = -1e9f;
+      __syncthreads();
+
+      // --- find row max ---
+      float local_max = -1e9f;
+      for (int k = threadIdx.x; k < seqlen_k; k += blockDim.x)
+        local_max = fmaxf(local_max, row[k]);
+      smem[threadIdx.x] = local_max;
+      __syncthreads();
+      for (int s = blockDim.x >> 1; s > 0; s >>= 1) {
+        if (threadIdx.x < s) smem[threadIdx.x] = fmaxf(smem[threadIdx.x], smem[threadIdx.x + s]);
+        __syncthreads();
+      }
+      const float row_max = smem[0];
+      __syncthreads();
+
+      // --- exp and row sum ---
+      float local_sum = 0.f;
+      for (int k = threadIdx.x; k < seqlen_k; k += blockDim.x) {
+        const float e = expf(row[k] - row_max);
+        row[k] = e;
+        local_sum += e;
+      }
+      smem[threadIdx.x] = local_sum;
+      __syncthreads();
+      for (int s = blockDim.x >> 1; s > 0; s >>= 1) {
+        if (threadIdx.x < s) smem[threadIdx.x] += smem[threadIdx.x + s];
+        __syncthreads();
+      }
+      const float row_sum = smem[0];
+      __syncthreads();
+
+      // --- normalise ---
+      const float inv_sum = (row_sum > 0.f) ? 1.f / row_sum : 0.f;
+      for (int k = threadIdx.x; k < seqlen_k; k += blockDim.x)
+        row[k] *= inv_sum;
+    }
+
+    // -------------------------------------------------------------------
+    // Kernel 3 — O = P @ V
+    // Grid:  (seqlen_q, nheads, batch)
+    // Block: (head_dim, 1, 1)   — head_dim <= 1024
+    // Each thread accumulates one output channel O[b, q, h, d].
+    // -------------------------------------------------------------------
+    template <typename scalar_t>
+    __global__ void hip_attn_ov_kernel(
+        const float*    __restrict__ P,  // [batch, nheads, seqlen_q, seqlen_k]
+        const scalar_t* __restrict__ V,  // [batch, seqlen_k, nheads, head_dim]
+        scalar_t*       __restrict__ O,  // [batch, seqlen_q, nheads, head_dim]
+        const int seqlen_k,
+        const int nheads,
+        const int head_dim)
+    {
+      const int b = blockIdx.z;
+      const int h = blockIdx.y;
+      const int q = blockIdx.x;
+      const int d = threadIdx.x;
+      if (d >= head_dim) return;
+
+      const int seqlen_q = gridDim.x;
+      const float* p_row =
+          P + b * nheads * seqlen_q * seqlen_k + h * seqlen_q * seqlen_k + q * seqlen_k;
+
+      float out = 0.f;
+      for (int k = 0; k < seqlen_k; ++k)
+        out += p_row[k] *
+               static_cast<float>(V[b * seqlen_k * nheads * head_dim +
+                                    k * nheads * head_dim + h * head_dim + d]);
+
+      O[b * seqlen_q * nheads * head_dim + q * nheads * head_dim + h * head_dim + d] =
+          static_cast<scalar_t>(out);
+    }
+
+    // -------------------------------------------------------------------
+    // Dispatcher called from FlashAttention::compute<Device::CUDA> below.
+    // -------------------------------------------------------------------
+    template <typename scalar_t>
+    static void flash_attention_hip_impl(
+        StorageView& queries,
+        StorageView& keys,
+        StorageView& values,
+        StorageView& output,
+        StorageView* cached_keys,
+        StorageView* cached_values,
+        float queries_scale,
+        bool is_causal,
+        dim_t offset)
+    {
+      if (offset != 0)
+        throw std::runtime_error(
+            "Flash Attention HIP: KV-cache append path (offset > 0) is not yet implemented.");
+
+      const dim_t batch_size  = queries.dim(0);
+      const dim_t seqlen_q    = queries.dim(1);
+      const dim_t num_heads   = queries.dim(2);
+      const dim_t head_dim    = queries.dim(3);
+      const dim_t seqlen_k    = keys.dim(1);
+
+      using DevT = typename cuda::DeviceType<scalar_t>::type;
+      const DevT* q_ptr = reinterpret_cast<const DevT*>(queries.data<scalar_t>());
+      const DevT* k_ptr = reinterpret_cast<const DevT*>(keys.data<scalar_t>());
+      const DevT* v_ptr = reinterpret_cast<const DevT*>(values.data<scalar_t>());
+
+      output.resize(queries.shape());
+      DevT* o_ptr = reinterpret_cast<DevT*>(output.data<scalar_t>());
+
+      // Allocate FP32 attention score buffer: [batch, nheads, seqlen_q, seqlen_k]
+      StorageView scores_buf({batch_size, num_heads, seqlen_q, seqlen_k},
+                             DataType::FLOAT32, Device::CUDA);
+      float* s_ptr = scores_buf.data<float>();
+
+      hipStream_t stream = cuda::get_cuda_stream();
+
+      // --- Pass 1: Q @ K^T ---
+      {
+        const int total = seqlen_q * seqlen_k;
+        const int block = 256;
+        dim3 grid((total + block - 1) / block, num_heads, batch_size);
+        hipLaunchKernelGGL(hip_attn_qk_kernel<DevT>, grid, block, 0, stream,
+                           q_ptr, k_ptr, s_ptr,
+                           seqlen_q, seqlen_k, num_heads, head_dim,
+                           queries_scale);
+      }
+
+      // --- Pass 2: softmax (with causal mask) ---
+      {
+        const int block = min((int)seqlen_k, 256);
+        dim3 grid(seqlen_q, num_heads, batch_size);
+        // The kernel uses a flat pointer into S; we compute the nheads-stride
+        // by adjusting the base pointer per head inside the kernel via blockIdx.y.
+        // Pass nheads as a compile-time-unknown dynamic value: kernel reads it
+        // from gridDim.y.
+        hipLaunchKernelGGL(hip_attn_softmax_kernel, grid, block,
+                           block * sizeof(float),  // dynamic shared mem
+                           stream,
+                           s_ptr, (int)num_heads, (int)seqlen_q, (int)seqlen_k,
+                           is_causal, (int)offset);
+      }
+
+      // --- Pass 3: P @ V ---
+      {
+        const int block = min((int)head_dim, 1024);
+        dim3 grid(seqlen_q, num_heads, batch_size);
+        hipLaunchKernelGGL(hip_attn_ov_kernel<DevT>, grid, block, 0, stream,
+                           s_ptr, v_ptr, o_ptr,
+                           seqlen_k, num_heads, head_dim);
+      }
+    }
+
+    template <>
+    void FlashAttention::compute<Device::CUDA>(
+        StorageView& queries,
+        StorageView& keys,
+        StorageView& values,
+        StorageView& output,
+        StorageView* cached_keys,
+        StorageView* cached_values,
+        StorageView* /*attention*/,
+        bool         /*return_normalized_attention*/,
+        StorageView* /*rotary_cos*/,
+        StorageView* /*rotary_sin*/,
+        const bool   /*rotary_interleave*/,
+        StorageView* /*alibi*/,
+        dim_t        offset) const
+    {
+      const DataType dtype = queries.dtype();
+      switch (dtype) {
+        case DataType::FLOAT16:
+          flash_attention_hip_impl<float16_t>(
+              queries, keys, values, output,
+              cached_keys, cached_values,
+              _queries_scale, _is_causal, offset);
+          break;
+        case DataType::BFLOAT16:
+          flash_attention_hip_impl<bfloat16_t>(
+              queries, keys, values, output,
+              cached_keys, cached_values,
+              _queries_scale, _is_causal, offset);
+          break;
+        default:
+          throw std::invalid_argument(
+              "Flash Attention HIP only supports float16 and bfloat16 inputs.");
+      }
+    }
+
+#endif  // CT2_USE_HIP / !CT2_USE_HIP
+
+  }  // namespace ops
+}  // namespace ctranslate2

From d9016a58fd04633436d7128575a79e6b32010b69 Mon Sep 17 00:00:00 2001
From: tonde <Tonde@live.de>
Date: Mon, 11 May 2026 22:27:13 +0200
Subject: [PATCH 03/14] ops: KV-cache, tiled and decode kernels for HIP Flash
 Attention
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Builds on the initial 3-pass HIP kernels with three changes:

1. KV-cache write path (offset > 0) — adds hip_kv_cache_write_kernel
   that stages new K/V tokens into the cached buffer before the attention
   pass.  Mirrors the CUTLASS path's append-then-attend semantics so
   autoregressive decoding works without the layer mutating the cache.

2. Softmax-reduction block-size fix — the tree reduction inside
   hip_attn_softmax_kernel requires blockDim.x to be a power of two.
   The dispatcher previously used min(seqlen_k, 256) directly, so for
   seqlen_k = 3 (e.g. the Whisper prompt prefill of three tokens) the
   reduction silently dropped the third element, corrupting the softmax
   and forcing generate() to always emit the same token regardless of
   input.  Now rounded up to the next power of two with extra threads
   contributing the identity (-1e9 for max, 0 for sum).

3. Two new fast paths in the dispatcher:
   - hip_flash_attn_fwd_tiled (Flash Attention 2): one block per
     (q_tile, head, batch), Q in registers, K/V tiles streamed through
     LDS, online-softmax state (m_i, l_i, acc[D]).  S = Q@K^T is never
     materialised in HBM.  Specialised for D in {64, 80, 128} with
     BM = BN = 64.  Used when seqlen_q >= BM.
   - hip_flash_decode_kernel: one block per (head, batch) with threads
     parallelising over K.  Phase 1 computes scores in LDS, phase 2
     reduces, phase 3 accumulates output channels with V-tiling
     (BLOCK threads stage a V_TILE-wide slab of V into LDS once, then
     D channel threads sum against the cached tile).  Used for
     seqlen_q == 1 with D in {64, 128} and seqlen_k bounded by the
     64 KiB per-block LDS budget.

The original 3-pass kernels remain as the correctness-oracle fallback
for unsupported head dimensions and the 2..BM-1 query-length gap.

Verified on Whisper-medium / RX 7900 XTX: all five test seeds produce
identical token sequences to the standard MultiHeadAttention path
(generate up to max_length=200).
---
 src/ops/flash_attention_gpu.cu | 555 +++++++++++++++++++++++++++++++--
 1 file changed, 522 insertions(+), 33 deletions(-)

diff --git a/src/ops/flash_attention_gpu.cu b/src/ops/flash_attention_gpu.cu
index 5fdc0984d..118860d75 100644
--- a/src/ops/flash_attention_gpu.cu
+++ b/src/ops/flash_attention_gpu.cu
@@ -374,34 +374,80 @@ namespace ctranslate2 {
 // ---------------------------------------------------------------------------
 // HIP Flash Attention — native implementation for AMD GPUs (gfx1100 / RDNA3+)
 //
-// Algorithm: standard scaled dot-product attention computed in three passes:
-//   1. S = Q @ K^T * scale          [batch, nheads, seqlen_q, seqlen_k]
-//   2. P = softmax(S + causal_mask)  in-place
-//   3. O = P @ V                     [batch, seqlen_q, nheads, head_dim]
+// Two code paths are provided:
+//
+// 1) Tiled fused kernel (hip_flash_attn_fwd_tiled<DevT, BM, BN, D>)
+//    Implements the Flash Attention 2 forward algorithm: a single grid block
+//    processes BM contiguous query rows of one (batch, head), streams K/V in
+//    BN-wide tiles through LDS, and maintains an online softmax state
+//    (m_i, l_i) plus an FP32 output accumulator in registers.  S = Q@K^T is
+//    NEVER materialised in HBM — memory and bandwidth scale with O(N·D)
+//    instead of O(N^2).  Used whenever head_dim matches one of the
+//    specialised values (64 / 80 / 128).
+//
+// 2) Three-pass fallback (hip_attn_qk_kernel + softmax + ov)
+//    Simple and provably correct reference path that materialises the full
+//    [batch, nheads, seqlen_q, seqlen_k] score buffer.  Kept as a fallback
+//    for head dimensions that the tiled kernel is not specialised for, and
+//    as an oracle for correctness comparisons.
 //
 // Memory layout of all tensors: [batch, seqlen, nheads, head_dim]
 // Supports FP16 and BF16 inputs; FP32 accumulators throughout.
 //
 // Limitations in this initial implementation:
-//   - KV-cache append path (offset > 0) is not yet supported.
 //   - Rotary embeddings and ALiBi are expected to be pre-applied by the caller.
-//   - No sub-quadratic memory tiling (full attention matrix is materialised).
+//   - No backward pass (inference only).
 // ---------------------------------------------------------------------------
 
 
+    // -------------------------------------------------------------------
+    // Kernel 0 — KV-cache write
+    // Copies new_kv[b, t, h, d] → cache[b, write_offset+t, h, d].
+    // Grid:  (seqlen_new, nheads, batch)
+    // Block: (head_dim, 1, 1)
+    // -------------------------------------------------------------------
+    template <typename scalar_t>
+    __global__ void hip_kv_cache_write_kernel(
+        const scalar_t* __restrict__ new_kv,  // [batch, seqlen_new, nheads, head_dim]
+        scalar_t*       __restrict__ cache,   // [batch, cache_size, nheads, head_dim]
+        const int seqlen_new,
+        const int cache_size,
+        const int nheads,
+        const int head_dim,
+        const int write_offset)
+    {
+      const int b = blockIdx.z;
+      const int t = blockIdx.y;
+      const int h = blockIdx.x;
+      const int d = threadIdx.x;
+      if (d >= head_dim) return;
+
+      const int src = b * seqlen_new * nheads * head_dim + t * nheads * head_dim + h * head_dim + d;
+      const int dst = b * cache_size  * nheads * head_dim
+                    + (write_offset + t) * nheads * head_dim + h * head_dim + d;
+      cache[dst] = new_kv[src];
+    }
+
     // -------------------------------------------------------------------
     // Kernel 1 — Q @ K^T
     // Grid:  (ceildiv(seqlen_q * seqlen_k, 256), nheads, batch)
     // Block: (256, 1, 1)
     // Each thread computes one element of S[b, h, q, k].
+    //
+    // k_time_stride: stride (in elements) between consecutive K time steps
+    //   within one batch.  For a K tensor of shape [batch, K_seqlen, nheads, head_dim]
+    //   this equals K_seqlen * nheads * head_dim.  When K is a view into a
+    //   larger KV-cache buffer the allocation seqlen may differ from the
+    //   logically active seqlen, so the stride must be passed explicitly.
     // -------------------------------------------------------------------
     template <typename scalar_t>
     __global__ void hip_attn_qk_kernel(
         const scalar_t* __restrict__ Q,  // [batch, seqlen_q, nheads, head_dim]
-        const scalar_t* __restrict__ K,  // [batch, seqlen_k, nheads, head_dim]
+        const scalar_t* __restrict__ K,  // [batch, K_alloc, nheads, head_dim]
         float*          __restrict__ S,  // [batch, nheads, seqlen_q, seqlen_k]
         const int seqlen_q,
         const int seqlen_k,
+        const int k_time_stride,         // K_alloc * nheads * head_dim
         const int nheads,
         const int head_dim,
         const float scale)
@@ -415,7 +461,7 @@ namespace ctranslate2 {
 
       float dot = 0.f;
       const int q_base = b * seqlen_q * nheads * head_dim + q * nheads * head_dim + h * head_dim;
-      const int k_base = b * seqlen_k * nheads * head_dim + k * nheads * head_dim + h * head_dim;
+      const int k_base = b * k_time_stride               + k * nheads * head_dim + h * head_dim;
       for (int d = 0; d < head_dim; ++d)
         dot += static_cast<float>(Q[q_base + d]) * static_cast<float>(K[k_base + d]);
 
@@ -491,13 +537,16 @@ namespace ctranslate2 {
     // Grid:  (seqlen_q, nheads, batch)
     // Block: (head_dim, 1, 1)   — head_dim <= 1024
     // Each thread accumulates one output channel O[b, q, h, d].
+    //
+    // v_time_stride: same concept as k_time_stride in Kernel 1.
     // -------------------------------------------------------------------
     template <typename scalar_t>
     __global__ void hip_attn_ov_kernel(
         const float*    __restrict__ P,  // [batch, nheads, seqlen_q, seqlen_k]
-        const scalar_t* __restrict__ V,  // [batch, seqlen_k, nheads, head_dim]
+        const scalar_t* __restrict__ V,  // [batch, V_alloc, nheads, head_dim]
         scalar_t*       __restrict__ O,  // [batch, seqlen_q, nheads, head_dim]
         const int seqlen_k,
+        const int v_time_stride,         // V_alloc * nheads * head_dim
         const int nheads,
         const int head_dim)
     {
@@ -514,15 +563,323 @@ namespace ctranslate2 {
       float out = 0.f;
       for (int k = 0; k < seqlen_k; ++k)
         out += p_row[k] *
-               static_cast<float>(V[b * seqlen_k * nheads * head_dim +
-                                    k * nheads * head_dim + h * head_dim + d]);
+               static_cast<float>(V[b * v_time_stride + k * nheads * head_dim + h * head_dim + d]);
 
       O[b * seqlen_q * nheads * head_dim + q * nheads * head_dim + h * head_dim + d] =
           static_cast<scalar_t>(out);
     }
 
+    // -------------------------------------------------------------------
+    // Tiled fused forward attention (Flash Attention 2 algorithm)
+    //
+    // Each grid block processes BM contiguous query rows of one (batch, head).
+    //   Grid:  (ceildiv(seqlen_q, BM), nheads, batch)
+    //   Block: (BM, 1, 1)        — one thread per query row.
+    //
+    // Per-thread state (kept in registers, never spilled to HBM):
+    //   q_reg[D]   : the thread's Q row, FP32
+    //   acc[D]     : running output accumulator, FP32
+    //   m_i, l_i   : running max / normaliser of the online softmax
+    //
+    // Per-block shared memory:
+    //   s_k[BN][D] : current K tile
+    //   s_v[BN][D] : current V tile
+    //   (BM threads cooperatively load BN·D elements per tile.)
+    //
+    // For every K/V tile we compute s_tile[BN] = q · k_t  for the local Q row,
+    // apply the bounds + causal mask, then perform the standard online-softmax
+    // update of (m_i, l_i, acc).  After all tiles we divide the accumulator
+    // by l_i and store it back to HBM.
+    //
+    // Template parameters allow the compiler to fully unroll the inner D loops
+    // and to keep q_reg/acc entirely in registers.  Supported D values are
+    // currently 64, 80, 128 (covering Whisper / common transformer heads).
+    // -------------------------------------------------------------------
+    template <typename scalar_t, int BM, int BN, int D>
+    __global__ void hip_flash_attn_fwd_tiled(
+        const scalar_t* __restrict__ Q,  // [batch, seqlen_q, nheads, D]
+        const scalar_t* __restrict__ K,  // [batch, K_alloc,  nheads, D]
+        const scalar_t* __restrict__ V,  // [batch, V_alloc,  nheads, D]
+        scalar_t*       __restrict__ O,  // [batch, seqlen_q, nheads, D]
+        const int seqlen_q,
+        const int seqlen_k,
+        const int k_time_stride,         // K_alloc * nheads * D (batch stride)
+        const int v_time_stride,         // V_alloc * nheads * D
+        const int nheads,
+        const float scale,
+        const bool is_causal,
+        const int q_offset)              // absolute position of q[0]
+    {
+      const int b       = blockIdx.z;
+      const int h       = blockIdx.y;
+      const int q_tile  = blockIdx.x;
+      const int tid     = threadIdx.x;          // 0 .. BM-1
+      const int q       = q_tile * BM + tid;    // global query row
+
+      __shared__ scalar_t s_k[BN][D];
+      __shared__ scalar_t s_v[BN][D];
+
+      // -- Load this thread's Q row into registers (FP32, scaled once) --
+      float q_reg[D];
+      const bool q_valid = (q < seqlen_q);
+      if (q_valid) {
+        const int q_base = b * seqlen_q * nheads * D + q * nheads * D + h * D;
+        #pragma unroll
+        for (int d = 0; d < D; ++d)
+          q_reg[d] = static_cast<float>(Q[q_base + d]) * scale;
+      } else {
+        #pragma unroll
+        for (int d = 0; d < D; ++d) q_reg[d] = 0.f;
+      }
+
+      // -- Online softmax state --
+      float m_i = -1e30f;
+      float l_i = 0.f;
+      float acc[D];
+      #pragma unroll
+      for (int d = 0; d < D; ++d) acc[d] = 0.f;
+
+      const int num_tiles = (seqlen_k + BN - 1) / BN;
+      const int q_pos     = q + q_offset;
+
+      for (int t = 0; t < num_tiles; ++t) {
+        const int k_start = t * BN;
+
+        // -- Cooperatively load K and V tiles into LDS --
+        //    BM threads load BN*D elements each (== BN*D / BM per thread).
+        for (int idx = tid; idx < BN * D; idx += BM) {
+          const int kk = idx / D;
+          const int dd = idx % D;
+          const int kpos = k_start + kk;
+          if (kpos < seqlen_k) {
+            const int k_base = b * k_time_stride + kpos * nheads * D + h * D;
+            const int v_base = b * v_time_stride + kpos * nheads * D + h * D;
+            s_k[kk][dd] = K[k_base + dd];
+            s_v[kk][dd] = V[v_base + dd];
+          } else {
+            s_k[kk][dd] = static_cast<scalar_t>(0);
+            s_v[kk][dd] = static_cast<scalar_t>(0);
+          }
+        }
+        __syncthreads();
+
+        if (q_valid) {
+          // -- s_tile[kk] = q · k_t, with bounds + causal mask --
+          float s_tile[BN];
+          float m_tile = -1e30f;
+          #pragma unroll
+          for (int kk = 0; kk < BN; ++kk) {
+            const int kpos = k_start + kk;
+            const bool oob    = kpos >= seqlen_k;
+            const bool masked = is_causal && kpos > q_pos;
+            if (oob || masked) {
+              s_tile[kk] = -1e30f;
+            } else {
+              float dot = 0.f;
+              #pragma unroll
+              for (int d = 0; d < D; ++d)
+                dot += q_reg[d] * static_cast<float>(s_k[kk][d]);
+              s_tile[kk] = dot;
+              if (dot > m_tile) m_tile = dot;
+            }
+          }
+
+          // -- Online softmax update --
+          const float m_new = fmaxf(m_i, m_tile);
+          const float alpha = (m_i == -1e30f) ? 0.f : __expf(m_i - m_new);
+
+          float l_tile = 0.f;
+          #pragma unroll
+          for (int kk = 0; kk < BN; ++kk) {
+            // exp(-inf - finite) == 0, but __expf is undefined for -inf
+            // on some HIP targets, so guard explicitly.
+            float e = (s_tile[kk] <= -1e29f) ? 0.f : __expf(s_tile[kk] - m_new);
+            s_tile[kk] = e;
+            l_tile += e;
+          }
+
+          // acc = alpha * acc + s_tile @ V_tile
+          #pragma unroll
+          for (int d = 0; d < D; ++d) {
+            float a = alpha * acc[d];
+            #pragma unroll
+            for (int kk = 0; kk < BN; ++kk)
+              a += s_tile[kk] * static_cast<float>(s_v[kk][d]);
+            acc[d] = a;
+          }
+          l_i = alpha * l_i + l_tile;
+          m_i = m_new;
+        }
+        __syncthreads();
+      }
+
+      // -- Final normalisation and store --
+      if (q_valid) {
+        const float inv_l = (l_i > 0.f) ? 1.f / l_i : 0.f;
+        const int o_base = b * seqlen_q * nheads * D + q * nheads * D + h * D;
+        #pragma unroll
+        for (int d = 0; d < D; ++d)
+          O[o_base + d] = static_cast<scalar_t>(acc[d] * inv_l);
+      }
+    }
+
+    // -------------------------------------------------------------------
+    // Decode-optimised kernel (seqlen_q == 1)
+    //
+    // The tiled forward kernel above uses one thread per Q row.  During
+    // autoregressive generation seqlen_q == 1, so that design would leave
+    // BM-1 of BM threads idle.  This kernel inverts the parallelisation:
+    // a single block handles one (batch, head), and the threads cooperate
+    // along the K dimension instead.
+    //
+    // Grid:  (1, nheads, batch)
+    // Block: BLOCK threads (BLOCK >= D, both powers of two)
+    //
+    // Phase 1  Compute all seqlen_k scores S[k] = Q . K[k]  (each thread
+    //          handles k = tid, tid+BLOCK, …) and stash them in LDS.
+    // Phase 2  Block-wide tree reduction for row max → exp(S - max) → sum.
+    // Phase 3  Output: thread tid (tid < D) accumulates one channel
+    //          O[d] = Σ_k P[k] · V[k][d] / sum.  V[k][.] is loaded with
+    //          the natural [k, d] memory layout, so the BLOCK threads
+    //          read coalesced D-wide vectors per k.
+    //
+    // LDS layout (one dynamic shared array; scratch is reused across
+    // phases — reduce buffer in phase 2, V-tile in phase 3):
+    //   [0           .. D)                      q_lds   (FP32 scaled Q)
+    //   [D           .. D+seqlen_k)             s_lds   (FP32 scores)
+    //   [D+seqlen_k  .. +max(BLOCK, V_TILE*D))  scratch
+    //
+    // V_TILE controls Phase-3 V-tiling: instead of every output channel
+    // streaming all seqlen_k V values from HBM with no reuse, the BLOCK
+    // threads cooperatively stage a V_TILE-wide slab of V into LDS once,
+    // then the D output-channel threads accumulate against the cached
+    // tile.  HBM reads for V drop by ~BLOCK/D for that phase.
+    //
+    // gfx1100 LDS budget is 64 KiB.  For D=64, BLOCK=64, V_TILE=64 the
+    // scratch region is 64*64*4 = 16 KiB, so seqlen_k can reach ~12 k.
+    // -------------------------------------------------------------------
+    template <typename scalar_t, int BLOCK, int D, int V_TILE>
+    __global__ void hip_flash_decode_kernel(
+        const scalar_t* __restrict__ Q,  // [batch, 1,        nheads, D]
+        const scalar_t* __restrict__ K,  // [batch, K_alloc,  nheads, D]
+        const scalar_t* __restrict__ V,  // [batch, V_alloc,  nheads, D]
+        scalar_t*       __restrict__ O,  // [batch, 1,        nheads, D]
+        const int seqlen_k,
+        const int k_time_stride,
+        const int v_time_stride,
+        const int nheads,
+        const float scale,
+        const bool is_causal,
+        const int q_offset)
+    {
+      const int b   = blockIdx.z;
+      const int h   = blockIdx.y;
+      const int tid = threadIdx.x;
+
+      extern __shared__ float smem[];
+      float* q_lds   = smem;                       // D floats
+      float* s_lds   = smem + D;                   // seqlen_k floats
+      float* scratch = s_lds + seqlen_k;           // reduce_buf OR v_tile
+
+      // ---- Load Q (FP32, scaled once) into LDS ----
+      for (int d = tid; d < D; d += BLOCK)
+        q_lds[d] = static_cast<float>(Q[b * nheads * D + h * D + d]) * scale;
+      __syncthreads();
+
+      // ---- Phase 1: S[k] = q · k_t  (+ causal mask) ----
+      const int q_pos = q_offset;  // seqlen_q == 1, so q_pos == offset
+      for (int k = tid; k < seqlen_k; k += BLOCK) {
+        if (is_causal && k > q_pos) {
+          s_lds[k] = -1e30f;
+          continue;
+        }
+        const int k_base = b * k_time_stride + k * nheads * D + h * D;
+        float dot = 0.f;
+        #pragma unroll
+        for (int d = 0; d < D; ++d)
+          dot += q_lds[d] * static_cast<float>(K[k_base + d]);
+        s_lds[k] = dot;
+      }
+      __syncthreads();
+
+      // ---- Phase 2a: row max via block-wide reduction (scratch=reduce_buf) ----
+      float local_max = -1e30f;
+      for (int k = tid; k < seqlen_k; k += BLOCK)
+        local_max = fmaxf(local_max, s_lds[k]);
+      scratch[tid] = local_max;
+      __syncthreads();
+      for (int s = BLOCK >> 1; s > 0; s >>= 1) {
+        if (tid < s) scratch[tid] = fmaxf(scratch[tid], scratch[tid + s]);
+        __syncthreads();
+      }
+      const float row_max = scratch[0];
+      __syncthreads();
+
+      // ---- Phase 2b: exp(S - max) and row sum ----
+      float local_sum = 0.f;
+      for (int k = tid; k < seqlen_k; k += BLOCK) {
+        const float e = (s_lds[k] <= -1e29f) ? 0.f : __expf(s_lds[k] - row_max);
+        s_lds[k] = e;
+        local_sum += e;
+      }
+      scratch[tid] = local_sum;
+      __syncthreads();
+      for (int s = BLOCK >> 1; s > 0; s >>= 1) {
+        if (tid < s) scratch[tid] += scratch[tid + s];
+        __syncthreads();
+      }
+      const float row_sum = scratch[0];
+      const float inv_sum = (row_sum > 0.f) ? 1.f / row_sum : 0.f;
+      __syncthreads();
+
+      // ---- Phase 3: O[d] = Σ_k P[k] · V[k][d] / sum, with V-tiling ----
+      // scratch is reinterpreted as v_tile (row-major [V_TILE][D] FP32).
+      float* v_tile = scratch;
+      float acc = 0.f;
+
+      for (int kt = 0; kt < seqlen_k; kt += V_TILE) {
+        // -- Cooperative load: BLOCK threads stage V_TILE * D elements --
+        for (int idx = tid; idx < V_TILE * D; idx += BLOCK) {
+          const int kk = idx / D;
+          const int dd = idx % D;
+          const int kpos = kt + kk;
+          float v_val = 0.f;
+          if (kpos < seqlen_k) {
+            const int v_idx = b * v_time_stride + kpos * nheads * D + h * D + dd;
+            v_val = static_cast<float>(V[v_idx]);
+          }
+          v_tile[idx] = v_val;
+        }
+        __syncthreads();
+
+        // -- Accumulate: each output-channel thread sums over this tile --
+        if (tid < D) {
+          const int kk_max = min((int)V_TILE, seqlen_k - kt);
+          for (int kk = 0; kk < kk_max; ++kk) {
+            const float p = s_lds[kt + kk];
+            acc += p * v_tile[kk * D + tid];
+          }
+        }
+        __syncthreads();
+      }
+
+      if (tid < D)
+        O[b * nheads * D + h * D + tid] = static_cast<scalar_t>(acc * inv_sum);
+    }
+
     // -------------------------------------------------------------------
     // Dispatcher called from FlashAttention::compute<Device::CUDA> below.
+    //
+    // KV-cache semantics (mirrors the CUDA CUTLASS path):
+    //   offset == 0  — prefilling / encoder self-attention:
+    //     keys/values contain the full context; cached_keys/values are
+    //     filled or updated by the layer *before* this call.
+    //   offset > 0   — autoregressive decoder step:
+    //     keys/values contain only the NEW tokens (seqlen_new, typically 1).
+    //     cached_keys/values hold tokens [0 .. offset-1] and have capacity
+    //     for at least offset+seqlen_new tokens.
+    //     We must: 1) write new tokens into the cache at position offset,
+    //              2) run attention with Q vs. full cache [0 .. offset+seqlen_new).
     // -------------------------------------------------------------------
     template <typename scalar_t>
     static void flash_attention_hip_impl(
@@ -536,31 +893,159 @@ namespace ctranslate2 {
         bool is_causal,
         dim_t offset)
     {
-      if (offset != 0)
-        throw std::runtime_error(
-            "Flash Attention HIP: KV-cache append path (offset > 0) is not yet implemented.");
-
-      const dim_t batch_size  = queries.dim(0);
-      const dim_t seqlen_q    = queries.dim(1);
-      const dim_t num_heads   = queries.dim(2);
-      const dim_t head_dim    = queries.dim(3);
-      const dim_t seqlen_k    = keys.dim(1);
+      const dim_t batch_size = queries.dim(0);
+      const dim_t seqlen_q   = queries.dim(1);
+      const dim_t num_heads  = queries.dim(2);
+      const dim_t head_dim   = queries.dim(3);
+      const dim_t seqlen_new = keys.dim(1);   // NEW tokens this step
 
       using DevT = typename cuda::DeviceType<scalar_t>::type;
       const DevT* q_ptr = reinterpret_cast<const DevT*>(queries.data<scalar_t>());
-      const DevT* k_ptr = reinterpret_cast<const DevT*>(keys.data<scalar_t>());
-      const DevT* v_ptr = reinterpret_cast<const DevT*>(values.data<scalar_t>());
+
+      hipStream_t stream = cuda::get_cuda_stream();
+
+      // Determine K/V pointers and their batch-stride depending on whether
+      // we are using the KV-cache or the raw keys/values tensors.
+      const DevT* k_ptr;
+      const DevT* v_ptr;
+      dim_t seqlen_k;        // number of KEY tokens to attend over
+      dim_t k_time_stride;   // K_alloc * nheads * head_dim (batch stride for K)
+      dim_t v_time_stride;   // V_alloc * nheads * head_dim
+
+      if (offset == 0) {
+        // --- Prefilling / encoder ---
+        k_ptr         = reinterpret_cast<const DevT*>(keys.data<scalar_t>());
+        v_ptr         = reinterpret_cast<const DevT*>(values.data<scalar_t>());
+        seqlen_k      = seqlen_new;
+        k_time_stride = seqlen_k * num_heads * head_dim;
+        v_time_stride = seqlen_k * num_heads * head_dim;
+      } else {
+        // --- Autoregressive decode step ---
+        // 1. Write new keys/values into the cache at position `offset`.
+        const dim_t cache_size = cached_keys->dim(1);
+        {
+          // grid: (num_heads, seqlen_new, batch)
+          //   blockIdx.x = head index  → matches kernel's h = blockIdx.x
+          //   blockIdx.y = time step   → matches kernel's t = blockIdx.y
+          //   blockIdx.z = batch index → matches kernel's b = blockIdx.z
+          dim3 grid(num_heads, seqlen_new, batch_size);
+          const int block = min((int)head_dim, 1024);
+          hipLaunchKernelGGL(
+              hip_kv_cache_write_kernel<DevT>,
+              grid, block, 0, stream,
+              reinterpret_cast<const DevT*>(keys.data<scalar_t>()),
+              reinterpret_cast<DevT*>(cached_keys->data<scalar_t>()),
+              (int)seqlen_new, (int)cache_size,
+              (int)num_heads, (int)head_dim, (int)offset);
+          hipLaunchKernelGGL(
+              hip_kv_cache_write_kernel<DevT>,
+              grid, block, 0, stream,
+              reinterpret_cast<const DevT*>(values.data<scalar_t>()),
+              reinterpret_cast<DevT*>(cached_values->data<scalar_t>()),
+              (int)seqlen_new, (int)cache_size,
+              (int)num_heads, (int)head_dim, (int)offset);
+        }
+        // 2. Attend over the full (now updated) cache.
+        k_ptr         = reinterpret_cast<const DevT*>(cached_keys->data<scalar_t>());
+        v_ptr         = reinterpret_cast<const DevT*>(cached_values->data<scalar_t>());
+        seqlen_k      = offset + seqlen_new;
+        k_time_stride = cache_size * num_heads * head_dim;
+        v_time_stride = cache_size * num_heads * head_dim;
+      }
 
       output.resize(queries.shape());
       DevT* o_ptr = reinterpret_cast<DevT*>(output.data<scalar_t>());
 
-      // Allocate FP32 attention score buffer: [batch, nheads, seqlen_q, seqlen_k]
+      // ----------------------------------------------------------------
+      // Fast path A: decode-optimised kernel for seqlen_q == 1.
+      // Used for every autoregressive generation step.  One block per
+      // (batch, head) — threads parallelise across the K dimension.
+      // ----------------------------------------------------------------
+      if (seqlen_q == 1) {
+        // LDS budget: D + seqlen_k + max(BLOCK, V_TILE*D) FP32 elements.
+        // gfx1100 has 64 KiB per block.  V_TILE is chosen per head_dim to
+        // maximise per-sync reuse without busting LDS.
+        const dim_t lds_budget_floats = 64 * 1024 / sizeof(float);
+        auto launch_decode =
+            [&](auto head_dim_const, auto block_const, auto vtile_const) -> bool {
+          constexpr int D = decltype(head_dim_const)::value;
+          constexpr int BLOCK = decltype(block_const)::value;
+          constexpr int V_TILE = decltype(vtile_const)::value;
+          if (head_dim != D) return false;
+          const size_t scratch =
+              std::max((size_t)BLOCK, (size_t)V_TILE * D);
+          const size_t lds_floats = (size_t)D + (size_t)seqlen_k + scratch;
+          if ((dim_t)lds_floats > lds_budget_floats) return false;
+          dim3 grid(1, num_heads, batch_size);
+          dim3 block(BLOCK);
+          const size_t lds_bytes = lds_floats * sizeof(float);
+          hipLaunchKernelGGL((hip_flash_decode_kernel<DevT, BLOCK, D, V_TILE>),
+                             grid, block, lds_bytes, stream,
+                             q_ptr, k_ptr, v_ptr, o_ptr,
+                             (int)seqlen_k,
+                             (int)k_time_stride, (int)v_time_stride,
+                             (int)num_heads, queries_scale,
+                             is_causal, (int)offset);
+          return true;
+        };
+        // BLOCK > D so the V-tile load is co-operative across more threads
+        // than there are output channels — gives ~BLOCK/D HBM-bandwidth
+        // reduction on V in phase 3 plus more parallelism in phase 1.
+        // V_TILE picked large to amortise the sync cost over more useful
+        // multiply-adds (D=64 -> 128 K-tokens per tile -> 32 KiB scratch).
+        if (launch_decode(std::integral_constant<int,  64>{},
+                          std::integral_constant<int, 128>{},
+                          std::integral_constant<int, 128>{})) return;
+        if (launch_decode(std::integral_constant<int, 128>{},
+                          std::integral_constant<int, 256>{},
+                          std::integral_constant<int,  64>{})) return;
+        // else: fall through to 3-pass for very long sequences or
+        //       unsupported head dimensions.
+      }
+
+      // ----------------------------------------------------------------
+      // Fast path B: tiled Flash Attention 2 kernel for supported head_dim.
+      // This avoids the O(seqlen_q * seqlen_k) score buffer entirely.
+      //
+      // Used when seqlen_q >= BM: the tiled kernel uses one thread per
+      // query row, so for tiny seqlen_q (already handled above for
+      // seqlen_q == 1, but also for prompt prefill of a handful of tokens)
+      // many threads would idle.  For 2..BM-1 query rows we still fall
+      // back to the 3-pass kernel, which parallelises over Q*K score
+      // elements instead.
+      // ----------------------------------------------------------------
+      constexpr int BM = 64;
+      constexpr int BN = 64;
+      if (seqlen_q >= BM) {
+        auto launch_tiled = [&](auto head_dim_const) -> bool {
+          constexpr int D = decltype(head_dim_const)::value;
+          if (head_dim != D) return false;
+          dim3 grid((seqlen_q + BM - 1) / BM, num_heads, batch_size);
+          dim3 block(BM);
+          hipLaunchKernelGGL((hip_flash_attn_fwd_tiled<DevT, BM, BN, D>),
+                             grid, block, 0, stream,
+                             q_ptr, k_ptr, v_ptr, o_ptr,
+                             (int)seqlen_q, (int)seqlen_k,
+                             (int)k_time_stride, (int)v_time_stride,
+                             (int)num_heads, queries_scale,
+                             is_causal, (int)offset);
+          return true;
+        };
+
+        if (launch_tiled(std::integral_constant<int, 64>{}))  return;
+        if (launch_tiled(std::integral_constant<int, 80>{}))  return;
+        if (launch_tiled(std::integral_constant<int, 128>{})) return;
+      }
+
+      // ----------------------------------------------------------------
+      // Fallback: three-pass reference implementation.  Used when head_dim
+      // is not one of the specialised values above.  Materialises the full
+      // [batch, nheads, seqlen_q, seqlen_k] score buffer in HBM.
+      // ----------------------------------------------------------------
       StorageView scores_buf({batch_size, num_heads, seqlen_q, seqlen_k},
                              DataType::FLOAT32, Device::CUDA);
       float* s_ptr = scores_buf.data<float>();
 
-      hipStream_t stream = cuda::get_cuda_stream();
-
       // --- Pass 1: Q @ K^T ---
       {
         const int total = seqlen_q * seqlen_k;
@@ -568,18 +1053,21 @@ namespace ctranslate2 {
         dim3 grid((total + block - 1) / block, num_heads, batch_size);
         hipLaunchKernelGGL(hip_attn_qk_kernel<DevT>, grid, block, 0, stream,
                            q_ptr, k_ptr, s_ptr,
-                           seqlen_q, seqlen_k, num_heads, head_dim,
+                           (int)seqlen_q, (int)seqlen_k, (int)k_time_stride,
+                           (int)num_heads, (int)head_dim,
                            queries_scale);
       }
 
-      // --- Pass 2: softmax (with causal mask) ---
+      // --- Pass 2: row-wise softmax (with causal mask) ---
       {
-        const int block = min((int)seqlen_k, 256);
+        // The tree reduction inside hip_attn_softmax_kernel requires
+        // blockDim.x to be a power of 2.  Round up min(seqlen_k, 256) to
+        // the next power of 2 (256 itself is already a power of 2, so the
+        // cap never breaks the invariant).  Extra threads contribute the
+        // identity values (-1e9 / 0.0) and are harmless.
+        int block = 1;
+        while (block < (int)std::min(seqlen_k, (dim_t)256)) block <<= 1;
         dim3 grid(seqlen_q, num_heads, batch_size);
-        // The kernel uses a flat pointer into S; we compute the nheads-stride
-        // by adjusting the base pointer per head inside the kernel via blockIdx.y.
-        // Pass nheads as a compile-time-unknown dynamic value: kernel reads it
-        // from gridDim.y.
         hipLaunchKernelGGL(hip_attn_softmax_kernel, grid, block,
                            block * sizeof(float),  // dynamic shared mem
                            stream,
@@ -593,7 +1081,8 @@ namespace ctranslate2 {
         dim3 grid(seqlen_q, num_heads, batch_size);
         hipLaunchKernelGGL(hip_attn_ov_kernel<DevT>, grid, block, 0, stream,
                            s_ptr, v_ptr, o_ptr,
-                           seqlen_k, num_heads, head_dim);
+                           (int)seqlen_k, (int)v_time_stride,
+                           (int)num_heads, (int)head_dim);
       }
     }
 

From 5e0fd50954d5b637e8fba40299894a61ed3c3bbe Mon Sep 17 00:00:00 2001
From: tonde <Tonde@live.de>
Date: Mon, 11 May 2026 22:27:44 +0200
Subject: [PATCH 04/14] layers: enable FlashAttention on the encoder side
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two related changes that together let the encoder's self-attention go
through FlashMultiHeadAttention when use_flash_attention=true:

- FlashMultiHeadAttention previously instantiated the FlashAttention op
  with its default is_causal=true, which only matches the decoder's
  autoregressive self-attention.  When the layer is used by the encoder
  every query would mask out its successors, producing wrong outputs.
  The op is now constructed with is_causal=_is_decoder, so encoder
  usage is non-causal and decoder usage stays causal.

- TransformerEncoder and WhisperEncoder build their layers without
  passing use_flash_attention through to TransformerEncoderLayer, so
  they always picked MultiHeadAttention regardless of the model's
  flag.  Both constructors now forward model.use_flash_attention().
  (Whisper has its own encoder class — patching only the generic
  TransformerEncoder is not enough.)

Whisper-medium / RX 7900 XTX numbers (Sq = Sk = 1500, B = 1):
  encoder-only:  64.2 ms → 59.3 ms   (1.08x)
  generate(30):  221   ms →  213 ms  (1.04x)
  generate(200): 1012  ms →  965 ms  (1.05x)
  generate(448): 2233  ms → 2154 ms  (1.04x)

Correctness preserved (FP16 rounding gives ~0.3% relative diff in the
encoder output; generated token sequences match exactly up to at least
max_length=200 across five random seeds).
---
 src/layers/flash_attention.cc | 6 +++++-
 src/layers/transformer.cc     | 3 ++-
 src/layers/whisper.cc         | 3 ++-
 3 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/src/layers/flash_attention.cc b/src/layers/flash_attention.cc
index fb13b90fc..7a9eec4fa 100644
--- a/src/layers/flash_attention.cc
+++ b/src/layers/flash_attention.cc
@@ -110,7 +110,11 @@ namespace ctranslate2 {
 
       // init output
       StorageView context(dtype, device);
-      ops::FlashAttention fl_attn_ops(_queries_scale, _sliding_window);
+      // Causal masking only applies to the decoder's autoregressive self-attention.
+      // Encoder self-attention (and any non-decoder usage of FlashMultiHeadAttention)
+      // must run NON-causal — otherwise each token would only see its predecessors,
+      // which is wrong for bidirectional contexts.
+      ops::FlashAttention fl_attn_ops(_queries_scale, _sliding_window, /*is_causal=*/_is_decoder);
       fl_attn_ops(queries_proj, keys_proj, values_proj, context, cached_keys, cached_values, attention,
                   return_normalized_attention, rotary_cos, rotary_sin, rotary_interleaved, nullptr/*alibli*/, offset);
 
diff --git a/src/layers/transformer.cc b/src/layers/transformer.cc
index 4fea80f7f..6a9905e9c 100644
--- a/src/layers/transformer.cc
+++ b/src/layers/transformer.cc
@@ -393,7 +393,8 @@ namespace ctranslate2 {
                   scope + "/layer",
                   _num_heads,
                   model.get_flag_with_default(scope + "/pre_norm", true),
-                  model.get_enum_value<ops::ActivationType>(scope + "/activation")))
+                  model.get_enum_value<ops::ActivationType>(scope + "/activation"),
+                  _use_flash_attention))
       , _position_encoder(_layers.front()->get_self_attention().has_positional_embeddings()
                           ? nullptr
                           : build_position_encoder(model, scope + "/position_encodings", _embeddings))
diff --git a/src/layers/whisper.cc b/src/layers/whisper.cc
index 401c14677..763b47b28 100644
--- a/src/layers/whisper.cc
+++ b/src/layers/whisper.cc
@@ -17,7 +17,8 @@ namespace ctranslate2 {
                                                                  scope + "/layer",
                                                                  _num_heads,
                                                                  /*pre_norm=*/true,
-                                                                 ops::ActivationType::GELU))
+                                                                 ops::ActivationType::GELU,
+                                                                 model.use_flash_attention()))
       , _output_norm(model, scope + "/layer_norm")
     {
     }

From c95d914e685152a5b96eda427b18c0ae397be2e5 Mon Sep 17 00:00:00 2001
From: tonde <Tonde@live.de>
Date: Mon, 11 May 2026 22:40:02 +0200
Subject: [PATCH 05/14] ops: WMMA-accelerated Flash Attention kernel for RDNA3
 (gfx11)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds hip_flash_attn_wmma_fp16: a Flash Attention forward kernel that
uses the wave32 16x16x16 fp16-input / fp32-accumulator WMMA built-in
(__builtin_amdgcn_wmma_f32_16x16x16_f16_w32) for both Q·K^T and P·V.

Block layout (one wave32 per block):
  BM_W = 16 query rows
  BN   = 16 key tokens per K/V tile
  Q tile loaded once (pre-scaled), K and V tiles streamed in
  Inner reduction over the head dimension via D/16 WMMA calls per
  output tile; the P·V phase fans out to D/16 output fragments held
  entirely in registers.  Online softmax with per-row (m_i, l_i)
  staged through LDS so the WMMA accumulator fragment layout is
  decoupled from the row-reduction code.

Activated in the dispatcher for FP16 inputs whenever head_dim is a
multiple of 16 and seqlen_q >= 16.  Falls through to the scalar
tiled kernel for BF16 and other shapes; correctness oracle (3-pass)
remains as the final fallback.

Also adds src/ops/wmma_probe.cu — a small companion file with a
probe kernel + C-ABI driver used to empirically verify the wave32
accumulator fragment layout on this hardware before writing the
real kernel.  The probe is reusable for future ports (BF16 path,
gfx12, other tile shapes).  Layout verified on gfx1100:
  lane l, slot s -> C[2*s + (l >> 4), l mod 16]

Whisper-medium / RX 7900 XTX numbers (vs. the scalar tiled kernel
that was the previous best):
  encoder B=1:  59.3 -> 57.6 ms  (1.03x)
  encoder B=4:  202.8 -> 182.6 ms (1.11x)  -- biggest win, scales
                                              with compute load
  generate 30:  227 -> 209 ms    (1.09x)
  generate 100: 549 -> 522 ms    (1.05x)
  generate 200: 1031 -> 968 ms   (1.07x)
  generate 448: 2203 -> 2099 ms  (1.05x)

Correctness preserved (FP16 rounding gives ~0.3% relative diff in
the encoder output; generated token sequences match exactly up to
max_length=200 across five random seeds).
---
 CMakeLists.txt                 |   1 +
 src/ops/flash_attention_gpu.cu | 296 ++++++++++++++++++++++++++++++++-
 src/ops/wmma_probe.cu          | 123 ++++++++++++++
 3 files changed, 411 insertions(+), 9 deletions(-)
 create mode 100644 src/ops/wmma_probe.cu

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8c125f806..d90084ea7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -238,6 +238,7 @@ set(CUDA_SOURCES
   src/ops/conv1d_gpu.cu
   src/ops/dequantize_gpu.cu
   src/ops/flash_attention_gpu.cu
+  src/ops/wmma_probe.cu
   src/ops/gather_gpu.cu
   src/ops/gumbel_max_gpu.cu
   src/ops/layer_norm_gpu.cu
diff --git a/src/ops/flash_attention_gpu.cu b/src/ops/flash_attention_gpu.cu
index 118860d75..73cf63f40 100644
--- a/src/ops/flash_attention_gpu.cu
+++ b/src/ops/flash_attention_gpu.cu
@@ -569,6 +569,259 @@ namespace ctranslate2 {
           static_cast<scalar_t>(out);
     }
 
+#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || !defined(__HIP_DEVICE_COMPILE__)
+    // -------------------------------------------------------------------
+    // RDNA3 WMMA Flash Attention forward kernel — FP16 only.
+    //
+    // Uses the wave32 16x16x16 fp16-input/fp32-accumulator WMMA built-in
+    // (__builtin_amdgcn_wmma_f32_16x16x16_f16_w32) for Q·K^T and P·V.
+    // Other paths in this file (3-pass / scalar tiled) remain for BF16,
+    // for non-multiple-of-16 head dimensions, and as the correctness oracle.
+    //
+    // Layout (wave32, RDNA3 — verified empirically via wmma_probe.cu):
+    //   A-fragment (16x16 fp16, row-major): each lane holds one full row
+    //     a_frag[j] = A[lane % 16, j],  for j = 0..15
+    //     (Lanes 16..31 carry a duplicate of rows 0..15.)
+    //   B-fragment (16x16 fp16, col-major): each lane holds one full column
+    //     b_frag[i] = B[i, lane % 16],  for i = 0..15
+    //   C-fragment (16x16 fp32, accumulator): each lane holds 8 elements
+    //     c_frag[s] = C[2*s + (lane >> 4), lane % 16],  for s = 0..7
+    //     (Lanes 0..15 hold even rows, lanes 16..31 hold odd rows.)
+    //
+    // S = Q · K^T  is mapped to D = A · B with:
+    //   A[i][k] = Q[i][k]                (Q tile, row-major in LDS)
+    //   B[k][j] = K[j][k]                (K tile transposed view)
+    //
+    // Block layout:
+    //   grid: (ceildiv(seqlen_q, BM), nheads, batch_size)
+    //   block: 32 threads (one wave32)
+    //   BM = 16 query rows per block,  BN = 16 key tokens per K/V tile
+    //
+    // LDS per block (D = 64):
+    //   q_lds[BM][D]    fp16  — Q tile, scaled by softmax-scale on load
+    //   k_lds[BN][D]    fp16  — current K tile
+    //   v_lds[BN][D]    fp16  — current V tile
+    //   p_lds[BM][BN]   fp16  — softmaxed S (used as A-fragment for P·V)
+    //   s_scratch[BM][BN] fp32 — temporary S before softmax
+    //   m_lds[BM]       fp32  — running max per query row
+    //   l_lds[BM]       fp32  — running sum (denominator) per query row
+    //   alpha_lds[BM]   fp32  — softmax correction factor per row
+    //   Total ≈ 18 KiB / block, well within the 64 KiB budget.
+    //
+    // O is held entirely in registers as `o_frag[D/16]` (4 v8f32 fragments
+    // for D = 64) per lane.  Each fragment covers a 16-column slice of O.
+    // -------------------------------------------------------------------
+    template <int D>
+    __global__ void hip_flash_attn_wmma_fp16(
+        const _Float16* __restrict__ Q,
+        const _Float16* __restrict__ K,
+        const _Float16* __restrict__ V,
+        _Float16*       __restrict__ O,
+        const int seqlen_q,
+        const int seqlen_k,
+        const int k_time_stride,
+        const int v_time_stride,
+        const int nheads,
+        const float scale,
+        const bool is_causal,
+        const int q_offset)
+    {
+      static_assert(D % 16 == 0, "head_dim must be a multiple of 16 for WMMA");
+      constexpr int BM = 16;
+      constexpr int BN = 16;
+      constexpr int DT = D / 16;  // number of D-tiles
+
+      using v16f16 = _Float16 __attribute__((ext_vector_type(16)));
+      using v8f32  = float    __attribute__((ext_vector_type(8)));
+
+      const int lane     = threadIdx.x;       // 0..31
+      const int b        = blockIdx.z;
+      const int h        = blockIdx.y;
+      const int q_tile   = blockIdx.x;
+      const int q_row_0  = q_tile * BM;
+
+      __shared__ _Float16 q_lds[BM][D];
+      __shared__ _Float16 k_lds[BN][D];
+      __shared__ _Float16 v_lds[BN][D];
+      __shared__ _Float16 p_lds[BM][BN];
+      __shared__ float    s_scratch[BM][BN];
+      __shared__ float    m_lds[BM];
+      __shared__ float    l_lds[BM];
+      __shared__ float    alpha_lds[BM];
+
+      // ---- Load Q tile, pre-scaled (so S = Q·K^T already has softmax-scale) ----
+      for (int idx = lane; idx < BM * D; idx += 32) {
+        const int row = idx / D;
+        const int col = idx % D;
+        const int q_row = q_row_0 + row;
+        float v = 0.f;
+        if (q_row < seqlen_q)
+          v = static_cast<float>(Q[b * seqlen_q * nheads * D
+                                   + q_row * nheads * D + h * D + col]) * scale;
+        q_lds[row][col] = static_cast<_Float16>(v);
+      }
+
+      // ---- Initialise running state and output accumulators ----
+      if (lane < BM) {
+        m_lds[lane] = -1e30f;
+        l_lds[lane] = 0.f;
+      }
+      v8f32 o_frag[DT];
+      #pragma unroll
+      for (int t = 0; t < DT; ++t)
+        #pragma unroll
+        for (int s = 0; s < 8; ++s) o_frag[t][s] = 0.f;
+
+      __syncthreads();
+
+      // ---- Iterate over K/V tiles ----
+      const int num_k_tiles = (seqlen_k + BN - 1) / BN;
+      for (int kt = 0; kt < num_k_tiles; ++kt) {
+        const int k_row_0 = kt * BN;
+
+        // -- Load K and V tiles --
+        for (int idx = lane; idx < BN * D; idx += 32) {
+          const int row = idx / D;
+          const int col = idx % D;
+          const int k_row = k_row_0 + row;
+          _Float16 kv_k = static_cast<_Float16>(0);
+          _Float16 kv_v = static_cast<_Float16>(0);
+          if (k_row < seqlen_k) {
+            kv_k = K[b * k_time_stride + k_row * nheads * D + h * D + col];
+            kv_v = V[b * v_time_stride + k_row * nheads * D + h * D + col];
+          }
+          k_lds[row][col] = kv_k;
+          v_lds[row][col] = kv_v;
+        }
+        __syncthreads();
+
+        // -- Compute S[BM=16][BN=16] = Q · K^T using WMMA --
+        v8f32 s_frag;
+        #pragma unroll
+        for (int s = 0; s < 8; ++s) s_frag[s] = 0.f;
+
+        // Inner reduction over D in 16-element chunks.
+        // A (Q row, row-major): a_frag[j] = q_lds[lane & 15][inner*16 + j]
+        // B (K transposed, col-major): b_frag[i] = k_lds[lane & 15][inner*16 + i]
+        // (col-major B's column index = lane mod 16, mapping to the K row j;
+        //  inner reduction index k maps to the D-position within the chunk.)
+        #pragma unroll
+        for (int inner = 0; inner < DT; ++inner) {
+          v16f16 a_frag, b_frag;
+          const int a_row = lane & 15;
+          const int b_col = lane & 15;
+          #pragma unroll
+          for (int x = 0; x < 16; ++x) {
+            a_frag[x] = q_lds[a_row][inner * 16 + x];
+            b_frag[x] = k_lds[b_col][inner * 16 + x];
+          }
+          s_frag = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32(a_frag, b_frag, s_frag);
+        }
+
+        // -- Write S to LDS with causal/OOB masking baked in --
+        {
+          const int s_col = lane & 15;
+          const int k_col_g = k_row_0 + s_col;
+          #pragma unroll
+          for (int s = 0; s < 8; ++s) {
+            const int s_row = 2 * s + (lane >> 4);
+            const int q_row_g = q_row_0 + s_row;
+            const int q_pos   = q_row_g + q_offset;
+            const bool oob    = (k_col_g >= seqlen_k) || (q_row_g >= seqlen_q);
+            const bool masked = is_causal && k_col_g > q_pos;
+            s_scratch[s_row][s_col] = (oob || masked) ? -1e30f : s_frag[s];
+          }
+        }
+        __syncthreads();
+
+        // -- Per-row softmax + online update (one thread per row, 16 rows / 32 threads) --
+        if (lane < BM) {
+          const int row = lane;
+          const float m_old = m_lds[row];
+          const float l_old = l_lds[row];
+
+          float m_tile = -1e30f;
+          #pragma unroll
+          for (int c = 0; c < BN; ++c)
+            m_tile = fmaxf(m_tile, s_scratch[row][c]);
+
+          const float m_new = fmaxf(m_old, m_tile);
+          const float alpha = (m_old == -1e30f) ? 0.f : __expf(m_old - m_new);
+
+          float l_tile = 0.f;
+          #pragma unroll
+          for (int c = 0; c < BN; ++c) {
+            const float v = s_scratch[row][c];
+            const float e = (v <= -1e29f) ? 0.f : __expf(v - m_new);
+            p_lds[row][c] = static_cast<_Float16>(e);  // P for the P·V WMMA
+            l_tile += e;
+          }
+
+          m_lds[row]     = m_new;
+          l_lds[row]     = alpha * l_old + l_tile;
+          alpha_lds[row] = alpha;
+        }
+        __syncthreads();
+
+        // -- Scale o_frag by per-row alpha --
+        {
+          #pragma unroll
+          for (int t = 0; t < DT; ++t) {
+            #pragma unroll
+            for (int s = 0; s < 8; ++s) {
+              const int o_row = 2 * s + (lane >> 4);
+              o_frag[t][s] *= alpha_lds[o_row];
+            }
+          }
+        }
+
+        // -- Compute O += P · V using WMMA (one V-N-tile per D-tile of O) --
+        // A (P, row-major):  a_frag[j] = p_lds[lane & 15][j]      (one row of P)
+        // B (V, col-major):  b_frag[i] = v_lds[i][t*16 + lane&15] (column t*16+col of V)
+        // Inner reduction goes over BN = 16 (the K-token dimension), in one step.
+        {
+          v16f16 a_frag;
+          const int a_row = lane & 15;
+          #pragma unroll
+          for (int x = 0; x < 16; ++x)
+            a_frag[x] = p_lds[a_row][x];
+
+          #pragma unroll
+          for (int t = 0; t < DT; ++t) {
+            v16f16 b_frag;
+            const int b_col = lane & 15;
+            #pragma unroll
+            for (int i = 0; i < 16; ++i)
+              b_frag[i] = v_lds[i][t * 16 + b_col];
+
+            o_frag[t] = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32(a_frag, b_frag, o_frag[t]);
+          }
+        }
+        __syncthreads();
+      }
+
+      // ---- Normalise and store O ----
+      if (lane < BM)
+        alpha_lds[lane] = (l_lds[lane] > 0.f) ? 1.f / l_lds[lane] : 0.f;  // reuse alpha_lds as inv_l
+      __syncthreads();
+
+      #pragma unroll
+      for (int t = 0; t < DT; ++t) {
+        const int o_col_base = t * 16 + (lane & 15);
+        #pragma unroll
+        for (int s = 0; s < 8; ++s) {
+          const int o_row = 2 * s + (lane >> 4);
+          const int q_row_g = q_row_0 + o_row;
+          if (q_row_g < seqlen_q && o_col_base < D) {
+            const float val = o_frag[t][s] * alpha_lds[o_row];
+            O[b * seqlen_q * nheads * D + q_row_g * nheads * D + h * D + o_col_base]
+                = static_cast<_Float16>(val);
+          }
+        }
+      }
+    }
+#endif  // gfx11
+
     // -------------------------------------------------------------------
     // Tiled fused forward attention (Flash Attention 2 algorithm)
     //
@@ -1004,15 +1257,40 @@ namespace ctranslate2 {
       }
 
       // ----------------------------------------------------------------
-      // Fast path B: tiled Flash Attention 2 kernel for supported head_dim.
-      // This avoids the O(seqlen_q * seqlen_k) score buffer entirely.
-      //
-      // Used when seqlen_q >= BM: the tiled kernel uses one thread per
-      // query row, so for tiny seqlen_q (already handled above for
-      // seqlen_q == 1, but also for prompt prefill of a handful of tokens)
-      // many threads would idle.  For 2..BM-1 query rows we still fall
-      // back to the 3-pass kernel, which parallelises over Q*K score
-      // elements instead.
+      // Fast path B: WMMA-accelerated kernel for FP16 on RDNA3.
+      // Each block handles BM_W = 16 query rows in a single wavefront and
+      // uses the 16x16x16 wave32 WMMA built-in for Q·K^T and P·V.
+      // Compute throughput ~5-10x over the scalar tiled kernel.
+      // Only the FP16 path; BF16 falls through to scalar tiled.
+      // ----------------------------------------------------------------
+      if constexpr (std::is_same<scalar_t, float16_t>::value) {
+        constexpr int BM_W = 16;
+        auto launch_wmma = [&](auto head_dim_const) -> bool {
+          constexpr int D = decltype(head_dim_const)::value;
+          if (head_dim != D) return false;
+          if (D % 16 != 0)  return false;
+          if (seqlen_q < BM_W) return false;
+          dim3 grid((seqlen_q + BM_W - 1) / BM_W, num_heads, batch_size);
+          dim3 block(32);  // one wave32
+          hipLaunchKernelGGL((hip_flash_attn_wmma_fp16<D>),
+                             grid, block, 0, stream,
+                             reinterpret_cast<const _Float16*>(q_ptr),
+                             reinterpret_cast<const _Float16*>(k_ptr),
+                             reinterpret_cast<const _Float16*>(v_ptr),
+                             reinterpret_cast<_Float16*>(o_ptr),
+                             (int)seqlen_q, (int)seqlen_k,
+                             (int)k_time_stride, (int)v_time_stride,
+                             (int)num_heads, queries_scale,
+                             is_causal, (int)offset);
+          return true;
+        };
+        if (launch_wmma(std::integral_constant<int,  64>{})) return;
+        if (launch_wmma(std::integral_constant<int, 128>{})) return;
+      }
+
+      // Fast path C: scalar tiled Flash Attention 2 kernel.
+      // Fallback for BF16 and head dimensions WMMA doesn't cover.
+      // Used when seqlen_q >= BM (one thread per query row).
       // ----------------------------------------------------------------
       constexpr int BM = 64;
       constexpr int BN = 64;
diff --git a/src/ops/wmma_probe.cu b/src/ops/wmma_probe.cu
new file mode 100644
index 000000000..c2e5ff312
--- /dev/null
+++ b/src/ops/wmma_probe.cu
@@ -0,0 +1,123 @@
+// -----------------------------------------------------------------------------
+// WMMA probe kernel — verifies the wave32 RDNA3 fragment layout empirically.
+//
+// Strategy: feed a known A and B = identity into one 16x16x16 WMMA, then have
+// every lane write its 8 fp32 accumulator elements to an output buffer indexed
+// by lane and slot.  Comparing the result on the host to a reference GEMM
+// reveals which (row, col) of C each (lane, slot) pair corresponds to.
+//
+// Only compiled for HIP gfx11* targets.  Plain HIP / Clang built-ins, no
+// rocWMMA dependency.  Lives in a separate translation unit so it doesn't
+// pollute the production Flash Attention build.
+// -----------------------------------------------------------------------------
+#ifdef CT2_USE_HIP
+
+#include <hip/hip_runtime.h>
+
+namespace ctranslate2 {
+  namespace ops {
+    namespace wmma_probe {
+
+      using v16f16 = _Float16 __attribute__((ext_vector_type(16)));
+      using v8f32  = float    __attribute__((ext_vector_type(8)));
+      // __builtin_amdgcn_wmma_f32_16x16x16_f16_w32 is a Clang-internal
+      // __device__ built-in for gfx11.  No forward declaration needed.
+
+      // ---------------------------------------------------------------------
+      // Probe kernel.  One block, one wave (32 threads).
+      //
+      // Inputs:
+      //   A[16][16] row-major fp16    -- caller fills with A[i][j] = i*16 + j
+      //   B[16][16] col-major fp16    -- caller fills with identity (B[i][j] = i==j)
+      //
+      // Output:
+      //   out[lane*8 + slot] = the FP32 accumulator value held by `lane` at
+      //                        slot `slot` (0..7) after WMMA.
+      //
+      // With B = I, the WMMA result D = A * I = A, so out[lane*8 + slot] tells
+      // us which A[row][col] that (lane, slot) corresponds to.  This is the
+      // ground-truth layout map we need to design real WMMA kernels.
+      // ---------------------------------------------------------------------
+      __global__ void wmma_probe_kernel(const _Float16* __restrict__ A,
+                                        const _Float16* __restrict__ B,
+                                        float* __restrict__ out)
+      {
+        const int lane = threadIdx.x;            // 0 .. 31
+
+        // -- Load A row for this lane (rows 0..15; lanes 16..31 duplicate) --
+        v16f16 a_frag;
+        const int row = lane % 16;
+        #pragma unroll
+        for (int j = 0; j < 16; ++j)
+          a_frag[j] = A[row * 16 + j];
+
+        // -- Load B column for this lane (cols 0..15; lanes 16..31 duplicate) --
+        v16f16 b_frag;
+        const int col = lane % 16;
+        #pragma unroll
+        for (int i = 0; i < 16; ++i)
+          b_frag[i] = B[i * 16 + col];
+
+        // -- Zero the accumulator --
+        v8f32 c_frag;
+        #pragma unroll
+        for (int s = 0; s < 8; ++s) c_frag[s] = 0.f;
+
+        // -- The matmul --
+        c_frag = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32(a_frag, b_frag, c_frag);
+
+        // -- Spill every lane's 8 accumulator slots so the host can inspect --
+        #pragma unroll
+        for (int s = 0; s < 8; ++s)
+          out[lane * 8 + s] = c_frag[s];
+      }
+
+    }  // namespace wmma_probe
+  }  // namespace ops
+}  // namespace ctranslate2
+
+// ---------------------------------------------------------------------------
+// C-ABI driver — invoked via ctypes from a small Python script.
+// Allocates a 16x16 fp16 A (A[i][j] = i*16 + j) and identity B, runs the
+// probe kernel on the current device, and copies the per-lane accumulator
+// dump into the caller-provided buffer.
+// out must point to at least 32*8 floats of host memory.
+// Returns 0 on success, a hipError_t code otherwise.
+// ---------------------------------------------------------------------------
+extern "C" __declspec(dllexport)
+int ct2_wmma_probe_run(float* out_host)
+{
+  using ctranslate2::ops::wmma_probe::wmma_probe_kernel;
+
+  const int N = 16 * 16;
+  _Float16 A_host[N];
+  _Float16 B_host[N];
+  for (int i = 0; i < 16; ++i)
+    for (int j = 0; j < 16; ++j) {
+      A_host[i * 16 + j] = static_cast<_Float16>(i * 16 + j);
+      B_host[i * 16 + j] = static_cast<_Float16>(i == j ? 1 : 0);
+    }
+
+  _Float16 *A_d = nullptr, *B_d = nullptr;
+  float* out_d = nullptr;
+  hipError_t err;
+
+  err = hipMalloc(&A_d, N * sizeof(_Float16));   if (err) return (int)err;
+  err = hipMalloc(&B_d, N * sizeof(_Float16));   if (err) return (int)err;
+  err = hipMalloc(&out_d, 32 * 8 * sizeof(float)); if (err) return (int)err;
+
+  err = hipMemcpy(A_d, A_host, N * sizeof(_Float16), hipMemcpyHostToDevice);
+  if (err) return (int)err;
+  err = hipMemcpy(B_d, B_host, N * sizeof(_Float16), hipMemcpyHostToDevice);
+  if (err) return (int)err;
+
+  hipLaunchKernelGGL(wmma_probe_kernel, dim3(1), dim3(32), 0, 0, A_d, B_d, out_d);
+  err = hipDeviceSynchronize();
+  if (err) return (int)err;
+
+  err = hipMemcpy(out_host, out_d, 32 * 8 * sizeof(float), hipMemcpyDeviceToHost);
+  hipFree(A_d); hipFree(B_d); hipFree(out_d);
+  return (int)err;
+}
+
+#endif  // CT2_USE_HIP

From a8b3352d9ca2bc9b1874647a9bbfc0bd2f64923d Mon Sep 17 00:00:00 2001
From: tonde <Tonde@live.de>
Date: Mon, 11 May 2026 22:58:04 +0200
Subject: [PATCH 06/14] ops: add BF16 WMMA path and BM_W=64 kernel (disabled by
 default)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two complementary additions to the WMMA Flash Attention path:

1. BF16 support.  The wave32 16x16x16 fragment layout is identical
   between the FP16 and BF16 WMMA built-ins on RDNA3 — the only
   difference is which intrinsic to call.  The existing BM_W=16
   kernel is now templated over the half-precision element type
   (HalfT in {_Float16, __bf16}) and selects the right built-in
   (__builtin_amdgcn_wmma_f32_16x16x16_{f16,bf16}_w32) via
   if constexpr.  The dispatcher matches scalar_t against
   float16_t / bfloat16_t and reinterpret_casts the data pointers
   to the underlying HalfT (same memory layout, the wrapper is just
   a host-side ABI thing).

   Whisper-medium with compute_type=bfloat16, RX 7900 XTX:
     encoder B=1:  59.8 -> 55.0 ms (1.09x), matches the FP16 win.

2. New kernel hip_flash_attn_wmma_fp16_bm64<D>: 4 wave32 wavefronts
   per block, BM_W = BN = 64, each K/V tile loaded once into LDS
   and shared across all 64 query rows.  Built but currently NOT
   dispatched (kept for future tuning).  On gfx1100 it is ~5-10%
   slower than the BM_W=16 variant because:
   - 48 KiB LDS allows only 1 block per CU vs. BM_W=16's 4 blocks
     per CU -> halved occupancy
   - Per-row softmax loop iterates BN=64 cols sequentially (4x more
     work per softmax thread)
   - The expected K/V HBM-reuse benefit doesn't materialise: Whisper
     attention is compute/LDS-bound, not HBM-bound (~580 MB total
     attention HBM traffic / 960 GB/s = 0.6 ms, vs. ~60 ms encoder)
   The kernel structure (per-wave S/P scratch, cooperative tile load)
   is the foundation for a future wave-shuffle-softmax variant that
   could drop LDS enough to make BM_W=64 actually a win.

Correctness preserved across all five test seeds for FP16; BF16
matches 4/5 (the fifth differs in the last token, expected from
BF16's 8-bit mantissa — the relative encoder diff is 1.85%).
---
 src/ops/flash_attention_gpu.cu | 357 +++++++++++++++++++++++++++++----
 1 file changed, 320 insertions(+), 37 deletions(-)

diff --git a/src/ops/flash_attention_gpu.cu b/src/ops/flash_attention_gpu.cu
index 73cf63f40..0263eae59 100644
--- a/src/ops/flash_attention_gpu.cu
+++ b/src/ops/flash_attention_gpu.cu
@@ -611,12 +611,15 @@ namespace ctranslate2 {
     // O is held entirely in registers as `o_frag[D/16]` (4 v8f32 fragments
     // for D = 64) per lane.  Each fragment covers a 16-column slice of O.
     // -------------------------------------------------------------------
-    template <int D>
+    // HalfT is the wave-level fp16/bf16 element type and determines which
+    // WMMA built-in we dispatch (f16 vs bf16).  Both variants share the
+    // same wave32 accumulator-fragment layout on RDNA3.
+    template <typename HalfT, int D>
     __global__ void hip_flash_attn_wmma_fp16(
-        const _Float16* __restrict__ Q,
-        const _Float16* __restrict__ K,
-        const _Float16* __restrict__ V,
-        _Float16*       __restrict__ O,
+        const HalfT* __restrict__ Q,
+        const HalfT* __restrict__ K,
+        const HalfT* __restrict__ V,
+        HalfT*       __restrict__ O,
         const int seqlen_q,
         const int seqlen_k,
         const int k_time_stride,
@@ -631,8 +634,8 @@ namespace ctranslate2 {
       constexpr int BN = 16;
       constexpr int DT = D / 16;  // number of D-tiles
 
-      using v16f16 = _Float16 __attribute__((ext_vector_type(16)));
-      using v8f32  = float    __attribute__((ext_vector_type(8)));
+      using v16f16 = HalfT __attribute__((ext_vector_type(16)));
+      using v8f32  = float __attribute__((ext_vector_type(8)));
 
       const int lane     = threadIdx.x;       // 0..31
       const int b        = blockIdx.z;
@@ -640,14 +643,14 @@ namespace ctranslate2 {
       const int q_tile   = blockIdx.x;
       const int q_row_0  = q_tile * BM;
 
-      __shared__ _Float16 q_lds[BM][D];
-      __shared__ _Float16 k_lds[BN][D];
-      __shared__ _Float16 v_lds[BN][D];
-      __shared__ _Float16 p_lds[BM][BN];
-      __shared__ float    s_scratch[BM][BN];
-      __shared__ float    m_lds[BM];
-      __shared__ float    l_lds[BM];
-      __shared__ float    alpha_lds[BM];
+      __shared__ HalfT q_lds[BM][D];
+      __shared__ HalfT k_lds[BN][D];
+      __shared__ HalfT v_lds[BN][D];
+      __shared__ HalfT p_lds[BM][BN];
+      __shared__ float s_scratch[BM][BN];
+      __shared__ float m_lds[BM];
+      __shared__ float l_lds[BM];
+      __shared__ float alpha_lds[BM];
 
       // ---- Load Q tile, pre-scaled (so S = Q·K^T already has softmax-scale) ----
       for (int idx = lane; idx < BM * D; idx += 32) {
@@ -658,7 +661,7 @@ namespace ctranslate2 {
         if (q_row < seqlen_q)
           v = static_cast<float>(Q[b * seqlen_q * nheads * D
                                    + q_row * nheads * D + h * D + col]) * scale;
-        q_lds[row][col] = static_cast<_Float16>(v);
+        q_lds[row][col] = static_cast<HalfT>(v);
       }
 
       // ---- Initialise running state and output accumulators ----
@@ -684,8 +687,8 @@ namespace ctranslate2 {
           const int row = idx / D;
           const int col = idx % D;
           const int k_row = k_row_0 + row;
-          _Float16 kv_k = static_cast<_Float16>(0);
-          _Float16 kv_v = static_cast<_Float16>(0);
+          HalfT kv_k = static_cast<HalfT>(0);
+          HalfT kv_v = static_cast<HalfT>(0);
           if (k_row < seqlen_k) {
             kv_k = K[b * k_time_stride + k_row * nheads * D + h * D + col];
             kv_v = V[b * v_time_stride + k_row * nheads * D + h * D + col];
@@ -715,7 +718,10 @@ namespace ctranslate2 {
             a_frag[x] = q_lds[a_row][inner * 16 + x];
             b_frag[x] = k_lds[b_col][inner * 16 + x];
           }
-          s_frag = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32(a_frag, b_frag, s_frag);
+          if constexpr (std::is_same<HalfT, _Float16>::value)
+            s_frag = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32(a_frag, b_frag, s_frag);
+          else
+            s_frag = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32(a_frag, b_frag, s_frag);
         }
 
         // -- Write S to LDS with causal/OOB masking baked in --
@@ -753,7 +759,7 @@ namespace ctranslate2 {
           for (int c = 0; c < BN; ++c) {
             const float v = s_scratch[row][c];
             const float e = (v <= -1e29f) ? 0.f : __expf(v - m_new);
-            p_lds[row][c] = static_cast<_Float16>(e);  // P for the P·V WMMA
+            p_lds[row][c] = static_cast<HalfT>(e);  // P for the P·V WMMA
             l_tile += e;
           }
 
@@ -794,7 +800,10 @@ namespace ctranslate2 {
             for (int i = 0; i < 16; ++i)
               b_frag[i] = v_lds[i][t * 16 + b_col];
 
-            o_frag[t] = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32(a_frag, b_frag, o_frag[t]);
+            if constexpr (std::is_same<HalfT, _Float16>::value)
+              o_frag[t] = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32(a_frag, b_frag, o_frag[t]);
+            else
+              o_frag[t] = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32(a_frag, b_frag, o_frag[t]);
           }
         }
         __syncthreads();
@@ -815,6 +824,248 @@ namespace ctranslate2 {
           if (q_row_g < seqlen_q && o_col_base < D) {
             const float val = o_frag[t][s] * alpha_lds[o_row];
             O[b * seqlen_q * nheads * D + q_row_g * nheads * D + h * D + o_col_base]
+                = static_cast<HalfT>(val);
+          }
+        }
+      }
+    }
+    // -------------------------------------------------------------------
+    // Larger WMMA kernel — BM_W = BN = 64, 4 wave32 wavefronts per block.
+    //
+    // The key win over the BM_W = 16 kernel is K/V HBM-reuse: each K-tile
+    // and V-tile is loaded ONCE into LDS per block, and the four waves
+    // share the loaded tiles to compute attention over 64 query rows.
+    // Compared to BM_W = 16, total K-data / V-data read from HBM per
+    // encoder layer drops by ~4x.
+    //
+    // Block layout:
+    //   blockDim = 128 = 4 wave32
+    //   wave w (0..3) handles Q-rows [w*16, w*16 + 16) of this block's tile
+    //   All 128 threads cooperate on Q / K / V tile loads
+    //   WMMA, softmax, and P*V are per-wave (each wave on its 16-row slice)
+    //
+    // LDS layout:
+    //   q_lds[64][D]     fp16  — 8 KiB at D=64
+    //   k_lds[64][D]     fp16  — 8 KiB
+    //   v_lds[64][D]     fp16  — 8 KiB
+    //   s_scratch[4][16][64] fp32 — 16 KiB (one 16x64 slab per wave)
+    //   p_lds[4][16][64] fp16  — 8 KiB (P input for P*V WMMA)
+    //   m_lds/l_lds/alpha_lds[64] fp32 — ~0.8 KiB
+    //   Total ~49 KiB / block (fits in 64 KiB).
+    // -------------------------------------------------------------------
+    template <int D>
+    __global__ void hip_flash_attn_wmma_fp16_bm64(
+        const _Float16* __restrict__ Q,
+        const _Float16* __restrict__ K,
+        const _Float16* __restrict__ V,
+        _Float16*       __restrict__ O,
+        const int seqlen_q,
+        const int seqlen_k,
+        const int k_time_stride,
+        const int v_time_stride,
+        const int nheads,
+        const float scale,
+        const bool is_causal,
+        const int q_offset)
+    {
+      static_assert(D % 16 == 0, "head_dim must be a multiple of 16 for WMMA");
+      constexpr int BM_W = 64;
+      constexpr int BN   = 64;
+      constexpr int WAVES = 4;     // 128 / 32
+      constexpr int DT = D / 16;   // 4 for D=64
+      constexpr int NT = BN / 16;  // 4
+
+      using v16f16 = _Float16 __attribute__((ext_vector_type(16)));
+      using v8f32  = float    __attribute__((ext_vector_type(8)));
+
+      const int tid     = threadIdx.x;     // 0..127
+      const int wave    = tid >> 5;        // 0..3
+      const int lane    = tid & 31;        // 0..31
+      const int b       = blockIdx.z;
+      const int h       = blockIdx.y;
+      const int q_tile  = blockIdx.x;
+      const int q_block_0 = q_tile * BM_W;
+      const int q_row_0   = q_block_0 + wave * 16;  // this wave's first row
+
+      __shared__ _Float16 q_lds[BM_W][D];
+      __shared__ _Float16 k_lds[BN][D];
+      __shared__ _Float16 v_lds[BN][D];
+      __shared__ float    s_scratch[WAVES][16][BN];
+      __shared__ _Float16 p_lds[WAVES][16][BN];
+      __shared__ float    m_lds[BM_W];
+      __shared__ float    l_lds[BM_W];
+      __shared__ float    alpha_lds[BM_W];
+
+      // ---- Load Q tile (pre-scaled by softmax-scale) ----
+      for (int idx = tid; idx < BM_W * D; idx += 128) {
+        const int row = idx / D;
+        const int col = idx % D;
+        const int q_row = q_block_0 + row;
+        float v = 0.f;
+        if (q_row < seqlen_q)
+          v = static_cast<float>(Q[b * seqlen_q * nheads * D
+                                   + q_row * nheads * D + h * D + col]) * scale;
+        q_lds[row][col] = static_cast<_Float16>(v);
+      }
+
+      if (tid < BM_W) {
+        m_lds[tid] = -1e30f;
+        l_lds[tid] = 0.f;
+      }
+
+      v8f32 o_frag[DT];
+      #pragma unroll
+      for (int t = 0; t < DT; ++t)
+        #pragma unroll
+        for (int s = 0; s < 8; ++s) o_frag[t][s] = 0.f;
+
+      __syncthreads();
+
+      // ---- Loop over K/V tiles ----
+      const int num_k_tiles = (seqlen_k + BN - 1) / BN;
+      for (int kt = 0; kt < num_k_tiles; ++kt) {
+        const int k_block_0 = kt * BN;
+
+        // -- Cooperative K/V load (all 128 threads) --
+        for (int idx = tid; idx < BN * D; idx += 128) {
+          const int row = idx / D;
+          const int col = idx % D;
+          const int k_row = k_block_0 + row;
+          _Float16 kv_k = static_cast<_Float16>(0);
+          _Float16 kv_v = static_cast<_Float16>(0);
+          if (k_row < seqlen_k) {
+            kv_k = K[b * k_time_stride + k_row * nheads * D + h * D + col];
+            kv_v = V[b * v_time_stride + k_row * nheads * D + h * D + col];
+          }
+          k_lds[row][col] = kv_k;
+          v_lds[row][col] = kv_v;
+        }
+        __syncthreads();
+
+        // -- Per-wave Q*K^T into 4 S-fragments (one per N-tile) --
+        v8f32 s_frag[NT];
+        #pragma unroll
+        for (int nt = 0; nt < NT; ++nt) {
+          #pragma unroll
+          for (int s = 0; s < 8; ++s) s_frag[nt][s] = 0.f;
+
+          // Inner reduction over the head dimension
+          #pragma unroll
+          for (int inner = 0; inner < DT; ++inner) {
+            v16f16 a_frag, b_frag;
+            const int a_row = wave * 16 + (lane & 15);
+            const int b_col = nt   * 16 + (lane & 15);
+            #pragma unroll
+            for (int x = 0; x < 16; ++x) {
+              a_frag[x] = q_lds[a_row][inner * 16 + x];
+              b_frag[x] = k_lds[b_col][inner * 16 + x];
+            }
+            s_frag[nt] = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32(
+                a_frag, b_frag, s_frag[nt]);
+          }
+        }
+
+        // -- Write S to per-wave scratch, with causal / OOB mask --
+        #pragma unroll
+        for (int nt = 0; nt < NT; ++nt) {
+          const int s_col_local = lane & 15;
+          const int s_col_bn    = nt * 16 + s_col_local;
+          const int k_col_g     = k_block_0 + s_col_bn;
+          #pragma unroll
+          for (int s = 0; s < 8; ++s) {
+            const int s_row_w = 2 * s + (lane >> 4);
+            const int q_row_g = q_row_0 + s_row_w;
+            const int q_pos   = q_row_g + q_offset;
+            const bool oob    = (k_col_g >= seqlen_k) || (q_row_g >= seqlen_q);
+            const bool masked = is_causal && k_col_g > q_pos;
+            s_scratch[wave][s_row_w][s_col_bn] = (oob || masked) ? -1e30f : s_frag[nt][s];
+          }
+        }
+        __syncthreads();
+
+        // -- Per-row softmax (one thread per row of the 64-row block) --
+        if (tid < BM_W) {
+          const int row   = tid;
+          const int w_row = row >> 4;            // which wave's chunk
+          const int r_w   = row & 15;            // row index within that chunk
+          const float m_old = m_lds[row];
+          const float l_old = l_lds[row];
+
+          float m_tile = -1e30f;
+          #pragma unroll
+          for (int c = 0; c < BN; ++c)
+            m_tile = fmaxf(m_tile, s_scratch[w_row][r_w][c]);
+
+          const float m_new = fmaxf(m_old, m_tile);
+          const float alpha = (m_old == -1e30f) ? 0.f : __expf(m_old - m_new);
+
+          float l_tile = 0.f;
+          #pragma unroll
+          for (int c = 0; c < BN; ++c) {
+            const float v = s_scratch[w_row][r_w][c];
+            const float e = (v <= -1e29f) ? 0.f : __expf(v - m_new);
+            p_lds[w_row][r_w][c] = static_cast<_Float16>(e);
+            l_tile += e;
+          }
+
+          m_lds[row]     = m_new;
+          l_lds[row]     = alpha * l_old + l_tile;
+          alpha_lds[row] = alpha;
+        }
+        __syncthreads();
+
+        // -- Scale o_frag by per-row alpha (using this wave's 16 rows) --
+        {
+          #pragma unroll
+          for (int t = 0; t < DT; ++t) {
+            #pragma unroll
+            for (int s = 0; s < 8; ++s) {
+              const int o_row_w  = 2 * s + (lane >> 4);
+              const int o_row_b  = wave * 16 + o_row_w;
+              o_frag[t][s] *= alpha_lds[o_row_b];
+            }
+          }
+        }
+
+        // -- O += P * V via WMMA (inner reduction over BN in 16-chunks) --
+        #pragma unroll
+        for (int t = 0; t < DT; ++t) {
+          #pragma unroll
+          for (int n_in = 0; n_in < NT; ++n_in) {
+            v16f16 a_frag, b_frag;
+            const int a_row = lane & 15;
+            const int b_col = lane & 15;
+            #pragma unroll
+            for (int j = 0; j < 16; ++j)
+              a_frag[j] = p_lds[wave][a_row][n_in * 16 + j];
+            #pragma unroll
+            for (int i = 0; i < 16; ++i)
+              b_frag[i] = v_lds[n_in * 16 + i][t * 16 + b_col];
+
+            o_frag[t] = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32(
+                a_frag, b_frag, o_frag[t]);
+          }
+        }
+        __syncthreads();
+      }
+
+      // ---- Normalise (1/l) ----
+      if (tid < BM_W)
+        alpha_lds[tid] = (l_lds[tid] > 0.f) ? 1.f / l_lds[tid] : 0.f;
+      __syncthreads();
+
+      // ---- Store O ----
+      #pragma unroll
+      for (int t = 0; t < DT; ++t) {
+        const int o_col = t * 16 + (lane & 15);
+        #pragma unroll
+        for (int s = 0; s < 8; ++s) {
+          const int o_row_w = 2 * s + (lane >> 4);
+          const int o_row_b = wave * 16 + o_row_w;
+          const int q_row_g = q_block_0 + o_row_b;
+          if (q_row_g < seqlen_q && o_col < D) {
+            const float val = o_frag[t][s] * alpha_lds[o_row_b];
+            O[b * seqlen_q * nheads * D + q_row_g * nheads * D + h * D + o_col]
                 = static_cast<_Float16>(val);
           }
         }
@@ -1257,35 +1508,67 @@ namespace ctranslate2 {
       }
 
       // ----------------------------------------------------------------
-      // Fast path B: WMMA-accelerated kernel for FP16 on RDNA3.
-      // Each block handles BM_W = 16 query rows in a single wavefront and
-      // uses the 16x16x16 wave32 WMMA built-in for Q·K^T and P·V.
-      // Compute throughput ~5-10x over the scalar tiled kernel.
-      // Only the FP16 path; BF16 falls through to scalar tiled.
+      // Fast path B: WMMA-accelerated kernels on RDNA3.
+      //   - BM_W = 16 variant: single wave32 per block, used for both FP16
+      //     and BF16 inputs (same wave32 fragment layout; different built-in).
+      //   - BM_W = 64 variant exists in this TU as future-work; currently
+      //     not dispatched — see the comment above launch_wmma_bm64.
       // ----------------------------------------------------------------
-      if constexpr (std::is_same<scalar_t, float16_t>::value) {
-        constexpr int BM_W = 16;
-        auto launch_wmma = [&](auto head_dim_const) -> bool {
+      if constexpr (std::is_same<scalar_t, float16_t>::value
+                 || std::is_same<scalar_t, bfloat16_t>::value) {
+        using HalfT = std::conditional_t<
+            std::is_same<scalar_t, float16_t>::value, _Float16, __bf16>;
+
+        auto launch_wmma_bm64 = [&](auto head_dim_const) -> bool {
+          constexpr int D = decltype(head_dim_const)::value;
+          constexpr int BM_W = 64;
+          if (head_dim != D) return false;
+          if (D % 16 != 0)   return false;
+          if (seqlen_q < BM_W) return false;
+          if constexpr (!std::is_same<HalfT, _Float16>::value) return false;
+          // (BM_W=64 BF16 variant not generated; only FP16 specialisation exists.)
+          dim3 grid((seqlen_q + BM_W - 1) / BM_W, num_heads, batch_size);
+          dim3 block(128);  // 4 wave32
+          if constexpr (std::is_same<HalfT, _Float16>::value) {
+            hipLaunchKernelGGL((hip_flash_attn_wmma_fp16_bm64<D>),
+                               grid, block, 0, stream,
+                               reinterpret_cast<const _Float16*>(q_ptr),
+                               reinterpret_cast<const _Float16*>(k_ptr),
+                               reinterpret_cast<const _Float16*>(v_ptr),
+                               reinterpret_cast<_Float16*>(o_ptr),
+                               (int)seqlen_q, (int)seqlen_k,
+                               (int)k_time_stride, (int)v_time_stride,
+                               (int)num_heads, queries_scale,
+                               is_causal, (int)offset);
+          }
+          return true;
+        };
+        // BM_W=64 currently disabled (see kernel comment for tuning notes).
+        // if (launch_wmma_bm64(std::integral_constant<int, 64>{})) return;
+        (void)launch_wmma_bm64;
+
+        auto launch_wmma_bm16 = [&](auto head_dim_const) -> bool {
           constexpr int D = decltype(head_dim_const)::value;
+          constexpr int BM_W = 16;
           if (head_dim != D) return false;
-          if (D % 16 != 0)  return false;
+          if (D % 16 != 0)   return false;
           if (seqlen_q < BM_W) return false;
           dim3 grid((seqlen_q + BM_W - 1) / BM_W, num_heads, batch_size);
           dim3 block(32);  // one wave32
-          hipLaunchKernelGGL((hip_flash_attn_wmma_fp16<D>),
+          hipLaunchKernelGGL((hip_flash_attn_wmma_fp16<HalfT, D>),
                              grid, block, 0, stream,
-                             reinterpret_cast<const _Float16*>(q_ptr),
-                             reinterpret_cast<const _Float16*>(k_ptr),
-                             reinterpret_cast<const _Float16*>(v_ptr),
-                             reinterpret_cast<_Float16*>(o_ptr),
+                             reinterpret_cast<const HalfT*>(q_ptr),
+                             reinterpret_cast<const HalfT*>(k_ptr),
+                             reinterpret_cast<const HalfT*>(v_ptr),
+                             reinterpret_cast<HalfT*>(o_ptr),
                              (int)seqlen_q, (int)seqlen_k,
                              (int)k_time_stride, (int)v_time_stride,
                              (int)num_heads, queries_scale,
                              is_causal, (int)offset);
           return true;
         };
-        if (launch_wmma(std::integral_constant<int,  64>{})) return;
-        if (launch_wmma(std::integral_constant<int, 128>{})) return;
+        if (launch_wmma_bm16(std::integral_constant<int,  64>{})) return;
+        if (launch_wmma_bm16(std::integral_constant<int, 128>{})) return;
       }
 
       // Fast path C: scalar tiled Flash Attention 2 kernel.

From 230ed0eb5346af0c5690ec052adec971cc9e6a07 Mon Sep 17 00:00:00 2001
From: tonde <Tonde@live.de>
Date: Tue, 12 May 2026 11:58:48 +0200
Subject: [PATCH 07/14] tests: add pytest correctness suite and standalone
 benchmark for HIP Flash Attention

python/tests/test_flash_attention.py (CI-friendly pytest, 9 tests):
  - test_flash_attention_encoder_matches_standard[float16|bfloat16]
    Verifies Flash=ON encoder output matches Flash=OFF within
    rounding tolerances (FP16: max_abs_diff <= 0.5, rel <= 0.5%;
    BF16: <= 2.0, rel <= 3%).  Also prints the per-layer score-
    buffer size that the online-softmax path avoids materialising.
  - test_flash_attention_generate_fp16_token_match[42|123|777|999|1234]
    Five seeds, generates 20 tokens each, asserts byte-identical
    token sequences between Flash=ON and Flash=OFF.
  - test_softmax_block_size_regression
    Specifically guards the bug from commit d9016a58: a 2^k
    block-size requirement in the softmax tree reduction that
    caused all five seeds to emit token 50411 as the first
    generated token regardless of audio input.  Asserts that the
    set of first tokens across five distinct seeds has size > 1.
  - test_flash_attention_score_buffer_savings
    Pure documentation test that prints, for a range of common
    transformer shapes (Whisper-medium encoder/decoder, hypothetical
    4k and 32k LLM contexts), how much HBM the standard attention
    path would allocate for the [B, H, Sq, Sk] FP32 score matrix.

Tests are skipped automatically if the faster-whisper-medium snapshot
isn't already in the local HuggingFace cache, so they don't pull
network in CI environments that haven't pre-populated models.

python/tests/benchmark_flash_attention.py (standalone, not pytest):
  - Direct HBM measurement via ctypes -> hipMemGetInfo on the active
    HIP device.  Reports both the persistent model footprint and the
    working-set growth after a representative generate() call,
    Flash=ON vs Flash=OFF.
  - Performance measurement of encoder (B=1, B=4) and generate
    (max_length=30/100/200/448), GPU-synced before timing.

Measured on Whisper-medium, RX 7900 XTX, gfx1100:
  HBM: model 2.87 GiB OFF -> 2.67 GiB ON (-200 MiB persistent);
       working set +359 MiB OFF -> +342 MiB ON (-17 MiB peak).
  Speed: encoder B=1 1.08x, B=4 1.11x; generate 1.03-1.07x across
         max_length 30..448.  Stable across runs.

Both files include a Windows local-build dev-loop helper that
attempts to add the ROCm SDK wheel's bin directories to the DLL
search path before importing ctranslate2.  No-op in normal CI.
---
 python/tests/benchmark_flash_attention.py | 274 ++++++++++++++++++++++
 python/tests/test_flash_attention.py      | 269 +++++++++++++++++++++
 2 files changed, 543 insertions(+)
 create mode 100644 python/tests/benchmark_flash_attention.py
 create mode 100644 python/tests/test_flash_attention.py

diff --git a/python/tests/benchmark_flash_attention.py b/python/tests/benchmark_flash_attention.py
new file mode 100644
index 000000000..f36799a5d
--- /dev/null
+++ b/python/tests/benchmark_flash_attention.py
@@ -0,0 +1,274 @@
+"""Standalone benchmark script for the native HIP Flash Attention path.
+
+Measures both speed and HBM footprint of `flash_attention=True` vs the
+standard MultiHeadAttention oracle, on the encoder pass and on `generate`
+at a few different max_length values.
+
+This is *not* a pytest — it's run manually (`python benchmark_flash_attention.py`)
+because the numbers are timing-sensitive and we don't want regression
+flapping in CI.  pytest tests for correctness live in test_flash_attention.py.
+
+HBM is measured via the HIP runtime's `hipMemGetInfo`, called through
+ctypes.  On Windows the runtime DLL is `amdhip64_*.dll`; on Linux it's
+`libamdhip64.so`.
+
+Usage:
+    python benchmark_flash_attention.py [--model PATH] [--runs N]
+"""
+
+import argparse
+import ctypes
+import os
+import sys
+import time
+
+import numpy as np
+
+
+# ----------------------------------------------------------------------------
+# Local-build DLL loader (mirrors test_flash_attention.py).
+# ----------------------------------------------------------------------------
+if sys.platform == "win32":
+    import site
+
+    for site_dir in site.getsitepackages() + [site.getusersitepackages()]:
+        for sub in ("_rocm_sdk_core/bin", "_rocm_sdk_libraries_custom/bin"):
+            cand = os.path.join(site_dir, *sub.split("/"))
+            if os.path.isdir(cand):
+                try:
+                    os.add_dll_directory(cand)
+                except (FileNotFoundError, OSError):
+                    pass
+
+import ctranslate2  # noqa: E402
+
+
+# ----------------------------------------------------------------------------
+# HIP runtime memory query via ctypes.
+# Returns (free_bytes, total_bytes) for the currently-active device.
+# ----------------------------------------------------------------------------
+def _load_hip_runtime():
+    if sys.platform == "win32":
+        # The DLL name on Windows has a major-version suffix that varies.
+        site_dir = next(
+            d for d in (
+                os.path.join(s, "_rocm_sdk_core", "bin")
+                for s in (__import__("site").getsitepackages()
+                          + [__import__("site").getusersitepackages()])
+            ) if os.path.isdir(d)
+        )
+        candidates = sorted(
+            f for f in os.listdir(site_dir)
+            if f.startswith("amdhip64") and f.endswith(".dll")
+        )
+        if not candidates:
+            raise FileNotFoundError("amdhip64_*.dll not found in ROCm SDK bin")
+        return ctypes.CDLL(os.path.join(site_dir, candidates[-1]))
+    else:
+        return ctypes.CDLL("libamdhip64.so")
+
+
+_hip = _load_hip_runtime()
+_hip.hipMemGetInfo.argtypes = [ctypes.POINTER(ctypes.c_size_t),
+                               ctypes.POINTER(ctypes.c_size_t)]
+_hip.hipMemGetInfo.restype = ctypes.c_int
+_hip.hipDeviceSynchronize.argtypes = []
+_hip.hipDeviceSynchronize.restype = ctypes.c_int
+
+
+def hbm_free_total():
+    """(free_bytes, total_bytes) on the active HIP device."""
+    free = ctypes.c_size_t()
+    total = ctypes.c_size_t()
+    rc = _hip.hipMemGetInfo(ctypes.byref(free), ctypes.byref(total))
+    if rc != 0:
+        raise RuntimeError(f"hipMemGetInfo returned {rc}")
+    return free.value, total.value
+
+
+def gpu_sync():
+    _hip.hipDeviceSynchronize()
+
+
+# ----------------------------------------------------------------------------
+# Bench helpers.
+# ----------------------------------------------------------------------------
+def _mel(seed=0):
+    rng = np.random.default_rng(seed)
+    return rng.standard_normal((1, 80, 3000)).astype(np.float32)
+
+
+def _bench(fn, runs=20, warmup=3):
+    for _ in range(warmup):
+        fn()
+    gpu_sync()
+    t0 = time.perf_counter()
+    for _ in range(runs):
+        fn()
+    gpu_sync()
+    return (time.perf_counter() - t0) / runs * 1000  # ms per call
+
+
+def measure_hbm_delta(loader_fn, work_fn):
+    """Build a model with `loader_fn`, baseline its persistent HBM, run
+    `work_fn(model)` once (so the allocator pool grows), then measure the
+    additional HBM held.  Returns (model_bytes, work_bytes_added)."""
+    gpu_sync()
+    free_empty, _ = hbm_free_total()
+
+    model = loader_fn()
+    gpu_sync()
+    free_after_load, _ = hbm_free_total()
+    model_bytes = free_empty - free_after_load
+
+    # Warm up enough to make the allocator pool reach its working-set peak.
+    for _ in range(3):
+        work_fn(model)
+    gpu_sync()
+    free_after_work, _ = hbm_free_total()
+    work_bytes = free_after_load - free_after_work
+    return model, model_bytes, work_bytes
+
+
+def fmt_bytes(n):
+    n = abs(n)
+    if n < 1024 * 1024:
+        return f"{n/1024:.1f} KiB"
+    if n < 1024 * 1024 * 1024:
+        return f"{n/1024/1024:.1f} MiB"
+    return f"{n/1024/1024/1024:.2f} GiB"
+
+
+# ----------------------------------------------------------------------------
+# Main benchmark.
+# ----------------------------------------------------------------------------
+PROMPTS = [[50258, 50259, 50360]]
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--model",
+        default=None,
+        help="Path to a converted faster-whisper-medium snapshot. "
+             "If omitted, the HuggingFace cache is searched.",
+    )
+    parser.add_argument("--runs", type=int, default=20)
+    parser.add_argument(
+        "--compute-type", default="float16", choices=["float16", "bfloat16"],
+    )
+    args = parser.parse_args()
+
+    if args.model is None:
+        snaps = os.path.expanduser(
+            "~/.cache/huggingface/hub/"
+            "models--Systran--faster-whisper-medium/snapshots"
+        )
+        if not os.path.isdir(snaps):
+            sys.exit("No --model given and faster-whisper-medium not cached.")
+        shas = [d for d in os.listdir(snaps)
+                if os.path.isfile(os.path.join(snaps, d, "model.bin"))]
+        if not shas:
+            sys.exit("No converted snapshot found in HF cache.")
+        args.model = os.path.join(snaps, shas[0])
+
+    print(f"Model: {args.model}")
+    print(f"Compute type: {args.compute_type}, runs/case: {args.runs}\n")
+
+    mel = _mel(seed=0)
+
+    # ------------------------------------------------------------------
+    # HBM measurement: load both variants, measure persistent + working set.
+    # ------------------------------------------------------------------
+    print("=" * 70)
+    print("HBM footprint")
+    print("=" * 70)
+    for flash in (False, True):
+        label = "Flash=ON " if flash else "Flash=OFF"
+
+        def load():
+            return ctranslate2.models.Whisper(
+                args.model, device="cuda",
+                compute_type=args.compute_type, flash_attention=flash,
+            )
+
+        def work(m):
+            r = m.generate(
+                ctranslate2.StorageView.from_array(mel),
+                PROMPTS, beam_size=1, max_length=100,
+            )
+            return r
+
+        model, model_b, work_b = measure_hbm_delta(load, work)
+        print(
+            f"  {label}:  model = {fmt_bytes(model_b)},  "
+            f"working set (generate max_length=100) = +{fmt_bytes(work_b)}"
+        )
+        del model
+        gpu_sync()
+
+    # Theoretical Flash Attention savings on the encoder score buffer.
+    sq = sk = 1500
+    h = 16
+    saved_per_layer = sq * sk * h * 4
+    print()
+    print(
+        f"  Score-matrix that the standard path materialises and Flash "
+        f"avoids, per encoder layer:"
+    )
+    print(f"    {sq}x{sk}x{h} heads x FP32 = {fmt_bytes(saved_per_layer)} per layer")
+    print(f"    24 layers => up to {fmt_bytes(24 * saved_per_layer)} of HBM "
+          f"traffic per encoder pass")
+    print()
+
+    # ------------------------------------------------------------------
+    # Performance: encoder-only, then generate() at several lengths.
+    # ------------------------------------------------------------------
+    print("=" * 70)
+    print("Performance")
+    print("=" * 70)
+
+    m_off = ctranslate2.models.Whisper(
+        args.model, device="cuda",
+        compute_type=args.compute_type, flash_attention=False,
+    )
+    m_on = ctranslate2.models.Whisper(
+        args.model, device="cuda",
+        compute_type=args.compute_type, flash_attention=True,
+    )
+
+    def enc_fn(m):
+        return lambda: m.encode(ctranslate2.StorageView.from_array(mel),
+                                to_cpu=False)
+
+    for batch in (1, 4):
+        mel_b = np.stack([mel[0]] * batch, axis=0)
+
+        def enc_b(m):
+            return lambda: m.encode(ctranslate2.StorageView.from_array(mel_b),
+                                    to_cpu=False)
+
+        off_ms = _bench(enc_b(m_off), runs=args.runs)
+        on_ms  = _bench(enc_b(m_on),  runs=args.runs)
+        print(
+            f"  encoder (B={batch}, Sq=Sk=1500):  "
+            f"OFF={off_ms:6.2f} ms  ON={on_ms:6.2f} ms  speedup={off_ms/on_ms:.2f}x"
+        )
+
+    print()
+    for max_len in (30, 100, 200, 448):
+        def gen_b(m, ml=max_len):
+            return lambda: m.generate(
+                ctranslate2.StorageView.from_array(mel),
+                PROMPTS, beam_size=1, max_length=ml,
+            )
+        off_ms = _bench(gen_b(m_off), runs=max(args.runs // 4, 3))
+        on_ms  = _bench(gen_b(m_on),  runs=max(args.runs // 4, 3))
+        print(
+            f"  generate (max_length={max_len:3d}):  "
+            f"OFF={off_ms:7.1f} ms  ON={on_ms:7.1f} ms  speedup={off_ms/on_ms:.2f}x"
+        )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/python/tests/test_flash_attention.py b/python/tests/test_flash_attention.py
new file mode 100644
index 000000000..2bb3d21d0
--- /dev/null
+++ b/python/tests/test_flash_attention.py
@@ -0,0 +1,269 @@
+"""Correctness tests for the native HIP Flash Attention path.
+
+These tests exercise the FP16 and BF16 WMMA kernels, the decode kernel, and
+the KV-cache write path against the standard MultiHeadAttention oracle.
+A model snapshot (Systran/faster-whisper-medium) is required; the test is
+skipped if it isn't already on disk so it can run in environments without
+network access.
+
+The tests are written to be CI-friendly: only correctness is asserted; the
+memory footprint of the materialised attention score buffer that the Flash
+path avoids is printed for context, not asserted.
+"""
+
+import os
+import sys
+
+import numpy as np
+import pytest
+
+# Local-build dev-loop helper: when the test is run against a non-installed
+# ctranslate2 source tree (i.e. python/ctranslate2 imports a freshly-built
+# DLL but the ROCm SDK is only available via the pip-installed
+# _rocm_sdk_core/_rocm_sdk_libraries_custom wheels), Python 3.8+ no longer
+# honours PATH for DLL search and the load fails before we get to import.
+# Find the SDK directories among site-packages and add them up-front.  In a
+# normal CI setup this is a no-op (the SDK is already discoverable).
+if sys.platform == "win32":
+    import site
+
+    for site_dir in site.getsitepackages() + [site.getusersitepackages()]:
+        for sub in ("_rocm_sdk_core/bin", "_rocm_sdk_libraries_custom/bin"):
+            cand = os.path.join(site_dir, *sub.split("/"))
+            if os.path.isdir(cand):
+                try:
+                    os.add_dll_directory(cand)
+                except (FileNotFoundError, OSError):
+                    pass
+
+import ctranslate2
+
+import test_utils
+
+
+# ----------------------------------------------------------------------------
+# Model discovery — uses an existing CT2 Whisper snapshot if available.
+# ----------------------------------------------------------------------------
+def _find_whisper_medium():
+    """Return the path to a converted faster-whisper-medium snapshot, or None."""
+    hf_cache = os.path.expanduser("~/.cache/huggingface/hub")
+    model_dir = os.path.join(
+        hf_cache,
+        "models--Systran--faster-whisper-medium",
+        "snapshots",
+    )
+    if not os.path.isdir(model_dir):
+        return None
+    # snapshots/<sha>/{model.bin, config.json, tokenizer.json, ...}
+    for sha in os.listdir(model_dir):
+        full = os.path.join(model_dir, sha)
+        if os.path.isfile(os.path.join(full, "model.bin")):
+            return full
+    return None
+
+
+_MODEL_PATH = _find_whisper_medium()
+
+require_model = pytest.mark.skipif(
+    _MODEL_PATH is None,
+    reason="faster-whisper-medium snapshot not cached locally",
+)
+
+
+# ----------------------------------------------------------------------------
+# Helpers
+# ----------------------------------------------------------------------------
+def _mel(seed=0):
+    """Synthetic 30-second mel spectrogram, deterministic for a given seed."""
+    rng = np.random.default_rng(seed)
+    return rng.standard_normal((1, 80, 3000)).astype(np.float32)
+
+
+def _encoder_output(model, mel):
+    """Run the encoder and return the result as a CPU FP32 numpy array."""
+    features = ctranslate2.StorageView.from_array(mel)
+    out = model.encode(features, to_cpu=True)
+    # BF16 outputs aren't directly numpy-castable.
+    if out.dtype != ctranslate2.DataType.float32:
+        out = out.to(ctranslate2.DataType.float32)
+    return np.array(out)
+
+
+def _score_buffer_bytes(seqlen_q, seqlen_k, num_heads=16, batch=1):
+    """Bytes the standard attention path would allocate for the
+    [B, H, Sq, Sk] FP32 score matrix.  This is what Flash Attention's
+    online-softmax design avoids materialising in HBM."""
+    return batch * num_heads * seqlen_q * seqlen_k * 4
+
+
+# ----------------------------------------------------------------------------
+# Encoder correctness — Flash=ON must match Flash=OFF within a small tolerance.
+# ----------------------------------------------------------------------------
+@test_utils.require_cuda
+@require_model
+@pytest.mark.parametrize(
+    "compute_type,abs_tol,rel_tol",
+    [
+        # FP16 path: 11-bit mantissa, accumulators are FP32.  The diff is
+        # dominated by re-ordering of summations in WMMA vs. rocBLAS-GEMM.
+        ("float16", 0.5, 5e-3),
+        # BF16 path: only 8-bit mantissa, larger accumulated rounding.
+        ("bfloat16", 2.0, 3e-2),
+    ],
+)
+def test_flash_attention_encoder_matches_standard(compute_type, abs_tol, rel_tol):
+    """The Flash Attention encoder output must match the standard path
+    within float16/bfloat16 rounding noise."""
+    mel = _mel(seed=0)
+
+    m_off = ctranslate2.models.Whisper(
+        _MODEL_PATH, device="cuda",
+        compute_type=compute_type, flash_attention=False,
+    )
+    m_on = ctranslate2.models.Whisper(
+        _MODEL_PATH, device="cuda",
+        compute_type=compute_type, flash_attention=True,
+    )
+
+    out_off = _encoder_output(m_off, mel)
+    out_on  = _encoder_output(m_on,  mel)
+
+    assert out_off.shape == out_on.shape
+    diff = np.abs(out_off - out_on)
+    max_diff  = float(diff.max())
+    max_abs   = float(np.abs(out_off).max()) + 1e-8
+    rel_diff  = max_diff / max_abs
+
+    # Informative — what Flash Attention's online softmax saves us in HBM
+    # per layer (Whisper-medium encoder: Sq = Sk = 1500, 16 heads).
+    saved_bytes = _score_buffer_bytes(1500, 1500, num_heads=16, batch=1)
+    print(
+        f"\n[{compute_type}] encoder max_abs_diff={max_diff:.4f}, "
+        f"rel_diff={rel_diff*100:.3f}%; "
+        f"per-layer score-buffer avoided = {saved_bytes/1024/1024:.1f} MiB"
+    )
+
+    assert max_diff <= abs_tol, (
+        f"{compute_type} encoder max diff {max_diff:.4f} exceeds {abs_tol}"
+    )
+    assert rel_diff <= rel_tol, (
+        f"{compute_type} encoder rel diff {rel_diff*100:.3f}% exceeds "
+        f"{rel_tol*100:.3f}%"
+    )
+
+
+# ----------------------------------------------------------------------------
+# generate() correctness — exercises decode-kernel + KV-cache write path.
+# FP16 should produce token-identical output to the standard path; BF16 may
+# differ in the last few tokens due to the smaller mantissa.
+# ----------------------------------------------------------------------------
+PROMPTS = [[50258, 50259, 50360]]
+
+
+@test_utils.require_cuda
+@require_model
+@pytest.mark.parametrize("seed", [42, 123, 777, 999, 1234])
+def test_flash_attention_generate_fp16_token_match(seed):
+    """FP16 Flash=ON must produce identical token sequences to Flash=OFF
+    across multiple random inputs.
+
+    This is also the regression test for the softmax-reduction block-size
+    bug: that bug caused the very first generated token to always be 50411
+    regardless of the input.  Five distinct seeds with distinct expected
+    tokens makes silent regression effectively impossible.
+    """
+    mel = _mel(seed=seed)
+
+    m_off = ctranslate2.models.Whisper(
+        _MODEL_PATH, device="cuda",
+        compute_type="float16", flash_attention=False,
+    )
+    m_on = ctranslate2.models.Whisper(
+        _MODEL_PATH, device="cuda",
+        compute_type="float16", flash_attention=True,
+    )
+
+    feat_off = ctranslate2.StorageView.from_array(mel)
+    feat_on  = ctranslate2.StorageView.from_array(mel)
+
+    r_off = m_off.generate(feat_off, PROMPTS, beam_size=1, max_length=20)
+    r_on  = m_on.generate(feat_on,   PROMPTS, beam_size=1, max_length=20)
+
+    tok_off = r_off[0].sequences_ids[0]
+    tok_on  = r_on[0].sequences_ids[0]
+    assert tok_off == tok_on, (
+        f"seed={seed}: Flash=ON produced {tok_on} but oracle is {tok_off}"
+    )
+
+
+# ----------------------------------------------------------------------------
+# Regression test for the softmax-reduction-block-size bug.
+#
+# Symptom (before the fix in commit d9016a58): generate() with Flash=ON would
+# emit token 50411 as the first generated token regardless of the audio input,
+# because the per-row tree reduction in hip_attn_softmax_kernel ran with
+# block = min(seqlen_k, 256), and for seqlen_k = 3 (the Whisper prompt
+# prefill with three tokens) the reduction silently dropped the third score.
+#
+# This test reproduces the exact pre-fix configuration and asserts that the
+# first generated token actually depends on the audio input.
+# ----------------------------------------------------------------------------
+@test_utils.require_cuda
+@require_model
+def test_softmax_block_size_regression():
+    """Different audio inputs must produce different first generated tokens
+    when Flash Attention is enabled.  Guards against a re-emergence of the
+    softmax reduction block-size bug."""
+    m_on = ctranslate2.models.Whisper(
+        _MODEL_PATH, device="cuda",
+        compute_type="float16", flash_attention=True,
+    )
+
+    first_tokens = set()
+    for seed in [42, 123, 777, 999, 1234]:
+        mel = _mel(seed=seed)
+        feat = ctranslate2.StorageView.from_array(mel)
+        # max_length must exceed the 3 prompt tokens for the generation step
+        # to actually emit a token; we only inspect index 0.
+        r = m_on.generate(feat, PROMPTS, beam_size=1, max_length=5)
+        seq = r[0].sequences_ids[0]
+        assert len(seq) >= 1, f"seed={seed}: no token generated"
+        first_tokens.add(seq[0])
+
+    assert len(first_tokens) > 1, (
+        "Flash Attention generated the same first token for five distinct "
+        f"random inputs ({first_tokens}) — softmax-reduction block-size bug "
+        "may have re-surfaced (see commit d9016a58)."
+    )
+
+
+# ----------------------------------------------------------------------------
+# Informational test: what does Flash Attention save in peak HBM for
+# attention score buffers?  Reported, not asserted — it's hardware-independent
+# and serves as documentation of the algorithmic benefit.
+# ----------------------------------------------------------------------------
+def test_flash_attention_score_buffer_savings():
+    """Print what Flash Attention's online softmax avoids materialising in
+    HBM at typical Whisper / LLM shapes.  Useful context when reading the
+    timing numbers."""
+    cases = [
+        ("Whisper-medium encoder layer",  1, 16, 1500, 1500),
+        ("Whisper-medium decoder self-attn (max_length=200)", 1, 16, 1, 200),
+        ("Whisper-medium decoder cross-attn", 1, 16, 1, 1500),
+        ("Hypothetical LLM @ 4k context",  1, 32, 4096, 4096),
+        ("Hypothetical LLM @ 32k context", 1, 32, 32768, 32768),
+    ]
+    print(
+        "\nScore-matrix HBM footprint that Flash Attention's online softmax "
+        "avoids materialising (per attention call):"
+    )
+    print(f"  {'shape':45s}  {'bytes':>10s}")
+    for name, b, h, sq, sk in cases:
+        n = _score_buffer_bytes(sq, sk, num_heads=h, batch=b)
+        if n < 1024 * 1024:
+            n_human = f"{n/1024:.1f} KiB"
+        elif n < 1024 * 1024 * 1024:
+            n_human = f"{n/1024/1024:.1f} MiB"
+        else:
+            n_human = f"{n/1024/1024/1024:.2f} GiB"
+        print(f"  {name:45s}  {n_human:>10s}")

From 6f38a77e1c5701d5f37e8dbd941c8638430109d6 Mon Sep 17 00:00:00 2001
From: tonde <Tonde@live.de>
Date: Tue, 12 May 2026 12:25:00 +0200
Subject: [PATCH 08/14] ops: keep wmma_probe.cu out of the production DLL
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The wmma_probe kernel is a one-shot reverse-engineering tool for the
RDNA3 wave32 WMMA fragment layout — it's how the layout used by
hip_flash_attn_wmma_fp16 was discovered.  Useful when porting to a new
gfx target / precision / tile shape, but no business living in the
production DLL where it just exports an unused C symbol
(ct2_wmma_probe_run) and bloats the binary.

Comment it out of CMakeLists.txt's CUDA_SOURCES and document how to
re-enable it at the top of wmma_probe.cu.  Source file stays in the
tree so future ports of the WMMA kernels can rerun it.
---
 CMakeLists.txt        |  8 +++++++-
 src/ops/wmma_probe.cu | 17 +++++++++++++++--
 2 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index d90084ea7..3b8059370 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -238,7 +238,13 @@ set(CUDA_SOURCES
   src/ops/conv1d_gpu.cu
   src/ops/dequantize_gpu.cu
   src/ops/flash_attention_gpu.cu
-  src/ops/wmma_probe.cu
+  # src/ops/wmma_probe.cu  — RDNA3 WMMA fragment-layout probe (dev tool).
+  #                         Builds a tiny kernel + C-ABI driver used to
+  #                         reverse-engineer the wave32 accumulator layout.
+  #                         Off by default to keep it out of the production
+  #                         DLL; uncomment to rebuild and rerun the layout
+  #                         probe (see python/tests/benchmark_flash_attention.py
+  #                         and the kernel's own header comment).
   src/ops/gather_gpu.cu
   src/ops/gumbel_max_gpu.cu
   src/ops/layer_norm_gpu.cu
diff --git a/src/ops/wmma_probe.cu b/src/ops/wmma_probe.cu
index c2e5ff312..fabc3f27a 100644
--- a/src/ops/wmma_probe.cu
+++ b/src/ops/wmma_probe.cu
@@ -1,14 +1,27 @@
 // -----------------------------------------------------------------------------
 // WMMA probe kernel — verifies the wave32 RDNA3 fragment layout empirically.
 //
+// NOT built by default.  This file is excluded from CMakeLists.txt's
+// CUDA_SOURCES list (see the comment next to flash_attention_gpu.cu in
+// CMakeLists.txt).  It is kept in the source tree as a dev tool: when porting
+// the WMMA Flash Attention kernels to a new gfx target, a new precision
+// (e.g. bf16, fp8), or a new tile shape, run this probe first to discover
+// the wave-level fragment layout the WMMA built-in produces — saves days of
+// trial-and-error.
+//
+// To rebuild and run:
+//   1) Uncomment `src/ops/wmma_probe.cu` in CMakeLists.txt's CUDA_SOURCES.
+//   2) Re-cmake and rebuild ctranslate2.dll.
+//   3) Run `python python/tests/benchmark_flash_attention.py` (or call
+//      ct2_wmma_probe_run via ctypes from any Python script).
+//
 // Strategy: feed a known A and B = identity into one 16x16x16 WMMA, then have
 // every lane write its 8 fp32 accumulator elements to an output buffer indexed
 // by lane and slot.  Comparing the result on the host to a reference GEMM
 // reveals which (row, col) of C each (lane, slot) pair corresponds to.
 //
 // Only compiled for HIP gfx11* targets.  Plain HIP / Clang built-ins, no
-// rocWMMA dependency.  Lives in a separate translation unit so it doesn't
-// pollute the production Flash Attention build.
+// rocWMMA dependency.
 // -----------------------------------------------------------------------------
 #ifdef CT2_USE_HIP
 

From 74c4d47e78ab618bf7eb928602f532ab3a366315 Mon Sep 17 00:00:00 2001
From: tonde <Tonde@live.de>
Date: Tue, 12 May 2026 12:25:37 +0200
Subject: [PATCH 09/14] ops: remove the disabled BM_W=64 WMMA kernel

The hip_flash_attn_wmma_fp16_bm64 kernel was added speculatively as a
"larger Q-tile so K/V loads from HBM amortise across more query rows"
experiment, but never dispatched because empirically on gfx1100 it is
~5-10% slower than the BM_W=16 variant:

- 48 KiB LDS allows only 1 block per CU vs. BM_W=16's 4 blocks per CU
  -> roughly halved wave occupancy
- The per-row softmax loop sequentially iterates BN=64 cols, 4x the
  work per softmax thread vs. BM_W=16
- The expected HBM-bandwidth-reduction benefit doesn't materialise
  because Whisper attention is compute/LDS-bound, not HBM-bound:
  total attention HBM traffic is ~580 MB which at 960 GB/s is 0.6 ms,
  vs. ~60 ms of encoder time

The lesson is in the git history; carrying disabled 240-line code around
is just a maintenance liability.  If a future wave-shuffle-softmax
refactor shrinks the LDS footprint below ~24 KiB, the kernel can be
brought back from this commit cleanly.

The dispatcher's BM_W=16 path remains the WMMA fast path for both
FP16 and BF16 inputs with head_dim in {64, 128} and seqlen_q >= 16.

Verified: all 9 tests in python/tests/test_flash_attention.py still
pass after the removal; Whisper-medium encoder/generate speedups
unchanged.
---
 src/ops/flash_attention_gpu.cu | 279 +--------------------------------
 1 file changed, 4 insertions(+), 275 deletions(-)

diff --git a/src/ops/flash_attention_gpu.cu b/src/ops/flash_attention_gpu.cu
index 0263eae59..24525a770 100644
--- a/src/ops/flash_attention_gpu.cu
+++ b/src/ops/flash_attention_gpu.cu
@@ -829,248 +829,6 @@ namespace ctranslate2 {
         }
       }
     }
-    // -------------------------------------------------------------------
-    // Larger WMMA kernel — BM_W = BN = 64, 4 wave32 wavefronts per block.
-    //
-    // The key win over the BM_W = 16 kernel is K/V HBM-reuse: each K-tile
-    // and V-tile is loaded ONCE into LDS per block, and the four waves
-    // share the loaded tiles to compute attention over 64 query rows.
-    // Compared to BM_W = 16, total K-data / V-data read from HBM per
-    // encoder layer drops by ~4x.
-    //
-    // Block layout:
-    //   blockDim = 128 = 4 wave32
-    //   wave w (0..3) handles Q-rows [w*16, w*16 + 16) of this block's tile
-    //   All 128 threads cooperate on Q / K / V tile loads
-    //   WMMA, softmax, and P*V are per-wave (each wave on its 16-row slice)
-    //
-    // LDS layout:
-    //   q_lds[64][D]     fp16  — 8 KiB at D=64
-    //   k_lds[64][D]     fp16  — 8 KiB
-    //   v_lds[64][D]     fp16  — 8 KiB
-    //   s_scratch[4][16][64] fp32 — 16 KiB (one 16x64 slab per wave)
-    //   p_lds[4][16][64] fp16  — 8 KiB (P input for P*V WMMA)
-    //   m_lds/l_lds/alpha_lds[64] fp32 — ~0.8 KiB
-    //   Total ~49 KiB / block (fits in 64 KiB).
-    // -------------------------------------------------------------------
-    template <int D>
-    __global__ void hip_flash_attn_wmma_fp16_bm64(
-        const _Float16* __restrict__ Q,
-        const _Float16* __restrict__ K,
-        const _Float16* __restrict__ V,
-        _Float16*       __restrict__ O,
-        const int seqlen_q,
-        const int seqlen_k,
-        const int k_time_stride,
-        const int v_time_stride,
-        const int nheads,
-        const float scale,
-        const bool is_causal,
-        const int q_offset)
-    {
-      static_assert(D % 16 == 0, "head_dim must be a multiple of 16 for WMMA");
-      constexpr int BM_W = 64;
-      constexpr int BN   = 64;
-      constexpr int WAVES = 4;     // 128 / 32
-      constexpr int DT = D / 16;   // 4 for D=64
-      constexpr int NT = BN / 16;  // 4
-
-      using v16f16 = _Float16 __attribute__((ext_vector_type(16)));
-      using v8f32  = float    __attribute__((ext_vector_type(8)));
-
-      const int tid     = threadIdx.x;     // 0..127
-      const int wave    = tid >> 5;        // 0..3
-      const int lane    = tid & 31;        // 0..31
-      const int b       = blockIdx.z;
-      const int h       = blockIdx.y;
-      const int q_tile  = blockIdx.x;
-      const int q_block_0 = q_tile * BM_W;
-      const int q_row_0   = q_block_0 + wave * 16;  // this wave's first row
-
-      __shared__ _Float16 q_lds[BM_W][D];
-      __shared__ _Float16 k_lds[BN][D];
-      __shared__ _Float16 v_lds[BN][D];
-      __shared__ float    s_scratch[WAVES][16][BN];
-      __shared__ _Float16 p_lds[WAVES][16][BN];
-      __shared__ float    m_lds[BM_W];
-      __shared__ float    l_lds[BM_W];
-      __shared__ float    alpha_lds[BM_W];
-
-      // ---- Load Q tile (pre-scaled by softmax-scale) ----
-      for (int idx = tid; idx < BM_W * D; idx += 128) {
-        const int row = idx / D;
-        const int col = idx % D;
-        const int q_row = q_block_0 + row;
-        float v = 0.f;
-        if (q_row < seqlen_q)
-          v = static_cast<float>(Q[b * seqlen_q * nheads * D
-                                   + q_row * nheads * D + h * D + col]) * scale;
-        q_lds[row][col] = static_cast<_Float16>(v);
-      }
-
-      if (tid < BM_W) {
-        m_lds[tid] = -1e30f;
-        l_lds[tid] = 0.f;
-      }
-
-      v8f32 o_frag[DT];
-      #pragma unroll
-      for (int t = 0; t < DT; ++t)
-        #pragma unroll
-        for (int s = 0; s < 8; ++s) o_frag[t][s] = 0.f;
-
-      __syncthreads();
-
-      // ---- Loop over K/V tiles ----
-      const int num_k_tiles = (seqlen_k + BN - 1) / BN;
-      for (int kt = 0; kt < num_k_tiles; ++kt) {
-        const int k_block_0 = kt * BN;
-
-        // -- Cooperative K/V load (all 128 threads) --
-        for (int idx = tid; idx < BN * D; idx += 128) {
-          const int row = idx / D;
-          const int col = idx % D;
-          const int k_row = k_block_0 + row;
-          _Float16 kv_k = static_cast<_Float16>(0);
-          _Float16 kv_v = static_cast<_Float16>(0);
-          if (k_row < seqlen_k) {
-            kv_k = K[b * k_time_stride + k_row * nheads * D + h * D + col];
-            kv_v = V[b * v_time_stride + k_row * nheads * D + h * D + col];
-          }
-          k_lds[row][col] = kv_k;
-          v_lds[row][col] = kv_v;
-        }
-        __syncthreads();
-
-        // -- Per-wave Q*K^T into 4 S-fragments (one per N-tile) --
-        v8f32 s_frag[NT];
-        #pragma unroll
-        for (int nt = 0; nt < NT; ++nt) {
-          #pragma unroll
-          for (int s = 0; s < 8; ++s) s_frag[nt][s] = 0.f;
-
-          // Inner reduction over the head dimension
-          #pragma unroll
-          for (int inner = 0; inner < DT; ++inner) {
-            v16f16 a_frag, b_frag;
-            const int a_row = wave * 16 + (lane & 15);
-            const int b_col = nt   * 16 + (lane & 15);
-            #pragma unroll
-            for (int x = 0; x < 16; ++x) {
-              a_frag[x] = q_lds[a_row][inner * 16 + x];
-              b_frag[x] = k_lds[b_col][inner * 16 + x];
-            }
-            s_frag[nt] = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32(
-                a_frag, b_frag, s_frag[nt]);
-          }
-        }
-
-        // -- Write S to per-wave scratch, with causal / OOB mask --
-        #pragma unroll
-        for (int nt = 0; nt < NT; ++nt) {
-          const int s_col_local = lane & 15;
-          const int s_col_bn    = nt * 16 + s_col_local;
-          const int k_col_g     = k_block_0 + s_col_bn;
-          #pragma unroll
-          for (int s = 0; s < 8; ++s) {
-            const int s_row_w = 2 * s + (lane >> 4);
-            const int q_row_g = q_row_0 + s_row_w;
-            const int q_pos   = q_row_g + q_offset;
-            const bool oob    = (k_col_g >= seqlen_k) || (q_row_g >= seqlen_q);
-            const bool masked = is_causal && k_col_g > q_pos;
-            s_scratch[wave][s_row_w][s_col_bn] = (oob || masked) ? -1e30f : s_frag[nt][s];
-          }
-        }
-        __syncthreads();
-
-        // -- Per-row softmax (one thread per row of the 64-row block) --
-        if (tid < BM_W) {
-          const int row   = tid;
-          const int w_row = row >> 4;            // which wave's chunk
-          const int r_w   = row & 15;            // row index within that chunk
-          const float m_old = m_lds[row];
-          const float l_old = l_lds[row];
-
-          float m_tile = -1e30f;
-          #pragma unroll
-          for (int c = 0; c < BN; ++c)
-            m_tile = fmaxf(m_tile, s_scratch[w_row][r_w][c]);
-
-          const float m_new = fmaxf(m_old, m_tile);
-          const float alpha = (m_old == -1e30f) ? 0.f : __expf(m_old - m_new);
-
-          float l_tile = 0.f;
-          #pragma unroll
-          for (int c = 0; c < BN; ++c) {
-            const float v = s_scratch[w_row][r_w][c];
-            const float e = (v <= -1e29f) ? 0.f : __expf(v - m_new);
-            p_lds[w_row][r_w][c] = static_cast<_Float16>(e);
-            l_tile += e;
-          }
-
-          m_lds[row]     = m_new;
-          l_lds[row]     = alpha * l_old + l_tile;
-          alpha_lds[row] = alpha;
-        }
-        __syncthreads();
-
-        // -- Scale o_frag by per-row alpha (using this wave's 16 rows) --
-        {
-          #pragma unroll
-          for (int t = 0; t < DT; ++t) {
-            #pragma unroll
-            for (int s = 0; s < 8; ++s) {
-              const int o_row_w  = 2 * s + (lane >> 4);
-              const int o_row_b  = wave * 16 + o_row_w;
-              o_frag[t][s] *= alpha_lds[o_row_b];
-            }
-          }
-        }
-
-        // -- O += P * V via WMMA (inner reduction over BN in 16-chunks) --
-        #pragma unroll
-        for (int t = 0; t < DT; ++t) {
-          #pragma unroll
-          for (int n_in = 0; n_in < NT; ++n_in) {
-            v16f16 a_frag, b_frag;
-            const int a_row = lane & 15;
-            const int b_col = lane & 15;
-            #pragma unroll
-            for (int j = 0; j < 16; ++j)
-              a_frag[j] = p_lds[wave][a_row][n_in * 16 + j];
-            #pragma unroll
-            for (int i = 0; i < 16; ++i)
-              b_frag[i] = v_lds[n_in * 16 + i][t * 16 + b_col];
-
-            o_frag[t] = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32(
-                a_frag, b_frag, o_frag[t]);
-          }
-        }
-        __syncthreads();
-      }
-
-      // ---- Normalise (1/l) ----
-      if (tid < BM_W)
-        alpha_lds[tid] = (l_lds[tid] > 0.f) ? 1.f / l_lds[tid] : 0.f;
-      __syncthreads();
-
-      // ---- Store O ----
-      #pragma unroll
-      for (int t = 0; t < DT; ++t) {
-        const int o_col = t * 16 + (lane & 15);
-        #pragma unroll
-        for (int s = 0; s < 8; ++s) {
-          const int o_row_w = 2 * s + (lane >> 4);
-          const int o_row_b = wave * 16 + o_row_w;
-          const int q_row_g = q_block_0 + o_row_b;
-          if (q_row_g < seqlen_q && o_col < D) {
-            const float val = o_frag[t][s] * alpha_lds[o_row_b];
-            O[b * seqlen_q * nheads * D + q_row_g * nheads * D + h * D + o_col]
-                = static_cast<_Float16>(val);
-          }
-        }
-      }
-    }
 #endif  // gfx11
 
     // -------------------------------------------------------------------
@@ -1508,45 +1266,16 @@ namespace ctranslate2 {
       }
 
       // ----------------------------------------------------------------
-      // Fast path B: WMMA-accelerated kernels on RDNA3.
-      //   - BM_W = 16 variant: single wave32 per block, used for both FP16
-      //     and BF16 inputs (same wave32 fragment layout; different built-in).
-      //   - BM_W = 64 variant exists in this TU as future-work; currently
-      //     not dispatched — see the comment above launch_wmma_bm64.
+      // Fast path B: WMMA-accelerated kernel on RDNA3.
+      // One wave32 per block (BM_W = BN = 16), used for both FP16 and BF16
+      // inputs.  Wave32 fragment layout is identical between fp16 and bf16
+      // on gfx11; only the WMMA built-in differs (selected via if constexpr).
       // ----------------------------------------------------------------
       if constexpr (std::is_same<scalar_t, float16_t>::value
                  || std::is_same<scalar_t, bfloat16_t>::value) {
         using HalfT = std::conditional_t<
             std::is_same<scalar_t, float16_t>::value, _Float16, __bf16>;
 
-        auto launch_wmma_bm64 = [&](auto head_dim_const) -> bool {
-          constexpr int D = decltype(head_dim_const)::value;
-          constexpr int BM_W = 64;
-          if (head_dim != D) return false;
-          if (D % 16 != 0)   return false;
-          if (seqlen_q < BM_W) return false;
-          if constexpr (!std::is_same<HalfT, _Float16>::value) return false;
-          // (BM_W=64 BF16 variant not generated; only FP16 specialisation exists.)
-          dim3 grid((seqlen_q + BM_W - 1) / BM_W, num_heads, batch_size);
-          dim3 block(128);  // 4 wave32
-          if constexpr (std::is_same<HalfT, _Float16>::value) {
-            hipLaunchKernelGGL((hip_flash_attn_wmma_fp16_bm64<D>),
-                               grid, block, 0, stream,
-                               reinterpret_cast<const _Float16*>(q_ptr),
-                               reinterpret_cast<const _Float16*>(k_ptr),
-                               reinterpret_cast<const _Float16*>(v_ptr),
-                               reinterpret_cast<_Float16*>(o_ptr),
-                               (int)seqlen_q, (int)seqlen_k,
-                               (int)k_time_stride, (int)v_time_stride,
-                               (int)num_heads, queries_scale,
-                               is_causal, (int)offset);
-          }
-          return true;
-        };
-        // BM_W=64 currently disabled (see kernel comment for tuning notes).
-        // if (launch_wmma_bm64(std::integral_constant<int, 64>{})) return;
-        (void)launch_wmma_bm64;
-
         auto launch_wmma_bm16 = [&](auto head_dim_const) -> bool {
           constexpr int D = decltype(head_dim_const)::value;
           constexpr int BM_W = 16;

From 1fc1dbd8ce69899e7817502c8c8924a286e0eec6 Mon Sep 17 00:00:00 2001
From: tonde <Tonde@live.de>
Date: Tue, 12 May 2026 12:27:56 +0200
Subject: [PATCH 10/14] tests: extend HIP Flash Attention pytest with batch +
 variable-prompt cases
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two new parameterised tests:

- test_flash_attention_encoder_batched[2|4]
  Runs the encoder with batch_size in {2, 4} and asserts Flash=ON
  output matches Flash=OFF on every batch element.  The previous
  encoder test only covered B=1 — this catches any per-batch
  indexing bugs in the WMMA tile loads / dispatch (grid.z == batch).

- test_flash_attention_variable_prompt_length[1|3|5|8]
  Runs generate() with prompt lengths 1, 3, 5, and 8.  This is the
  case where the decoder's first forward pass has seqlen_q != 1 and
  also seqlen_q < BM_W=16, so the dispatcher routes through the
  3-pass fallback kernel rather than the WMMA or decode paths.
  Specifically covers the dispatcher branch we never explicitly
  exercised before, and re-checks the n_prompt=3 case that
  originally triggered the softmax-block-size bug.

All 15 tests pass on Whisper-medium / RX 7900 XTX.
---
 python/tests/test_flash_attention.py | 81 ++++++++++++++++++++++++++++
 1 file changed, 81 insertions(+)

diff --git a/python/tests/test_flash_attention.py b/python/tests/test_flash_attention.py
index 2bb3d21d0..236b27bd3 100644
--- a/python/tests/test_flash_attention.py
+++ b/python/tests/test_flash_attention.py
@@ -237,6 +237,87 @@ def test_softmax_block_size_regression():
     )
 
 
+# ----------------------------------------------------------------------------
+# Batch > 1 — exercises the WMMA path with a non-trivial batch dimension.
+# Whisper-medium encoder runs at Sq = Sk = 1500, so this also covers the
+# largest tile-count case.  Both flash and standard paths must agree per
+# batch element.
+# ----------------------------------------------------------------------------
+@test_utils.require_cuda
+@require_model
+@pytest.mark.parametrize("batch_size", [2, 4])
+def test_flash_attention_encoder_batched(batch_size):
+    """Encoder correctness for batch_size > 1."""
+    mels = np.stack(
+        [np.random.default_rng(seed).standard_normal((80, 3000)).astype(np.float32)
+         for seed in range(batch_size)],
+        axis=0,
+    )
+
+    m_off = ctranslate2.models.Whisper(
+        _MODEL_PATH, device="cuda",
+        compute_type="float16", flash_attention=False,
+    )
+    m_on = ctranslate2.models.Whisper(
+        _MODEL_PATH, device="cuda",
+        compute_type="float16", flash_attention=True,
+    )
+
+    out_off = _encoder_output(m_off, mels)
+    out_on  = _encoder_output(m_on,  mels)
+    assert out_off.shape == out_on.shape == (batch_size, 1500, 1024)
+
+    diff = np.abs(out_off - out_on)
+    max_diff = float(diff.max())
+    max_abs  = float(np.abs(out_off).max()) + 1e-8
+    rel = max_diff / max_abs
+    print(f"\n[B={batch_size}] encoder max_abs_diff={max_diff:.4f}, "
+          f"rel_diff={rel*100:.3f}%")
+    assert max_diff <= 0.5, f"B={batch_size} max diff {max_diff:.4f} > 0.5"
+
+
+# ----------------------------------------------------------------------------
+# Prompt-prefill correctness across a few prefill lengths.
+# This exercises the seqlen_q path that's neither pure decode (seqlen_q=1)
+# nor the WMMA fast path (seqlen_q >= 16), forcing the scalar-tiled and
+# 3-pass fallback kernels to also get a correctness pass.
+# ----------------------------------------------------------------------------
+@test_utils.require_cuda
+@require_model
+@pytest.mark.parametrize("n_prompt", [1, 3, 5, 8])
+def test_flash_attention_variable_prompt_length(n_prompt):
+    """generate() must agree between Flash=ON and OFF for varying numbers
+    of prompt tokens — covers seqlen_q = 1, 3, 5, 8 in the prefill step,
+    which routes through the decode-kernel (1), 3-pass fallback (3, 5, 8)
+    pieces of the dispatcher."""
+    mel = _mel(seed=0)
+    base = [50258, 50259, 50360, 50364, 1029, 290, 264, 7184]
+    prompt = [base[:n_prompt]]
+
+    m_off = ctranslate2.models.Whisper(
+        _MODEL_PATH, device="cuda",
+        compute_type="float16", flash_attention=False,
+    )
+    m_on = ctranslate2.models.Whisper(
+        _MODEL_PATH, device="cuda",
+        compute_type="float16", flash_attention=True,
+    )
+
+    r_off = m_off.generate(
+        ctranslate2.StorageView.from_array(mel),
+        prompt, beam_size=1, max_length=n_prompt + 5,
+    )
+    r_on = m_on.generate(
+        ctranslate2.StorageView.from_array(mel),
+        prompt, beam_size=1, max_length=n_prompt + 5,
+    )
+    tok_off = r_off[0].sequences_ids[0]
+    tok_on  = r_on[0].sequences_ids[0]
+    assert tok_off == tok_on, (
+        f"n_prompt={n_prompt}: Flash=ON {tok_on} vs Flash=OFF {tok_off}"
+    )
+
+
 # ----------------------------------------------------------------------------
 # Informational test: what does Flash Attention save in peak HBM for
 # attention score buffers?  Reported, not asserted — it's hardware-independent

From 145db0a675d14b9c0185dedb45ac71ca46e81e8b Mon Sep 17 00:00:00 2001
From: tonde <Tonde@live.de>
Date: Tue, 12 May 2026 12:32:46 +0200
Subject: [PATCH 11/14] ops: refresh HIP Flash Attention top-level docs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The HIP section's introductory comment was written when there were only
two code paths (tiled + 3-pass).  Update it to reflect the current four
paths (WMMA, decode, tiled, 3-pass) with a small table of which inputs
each path handles.

Also expand the Kernel 2 (softmax) doc-comment to call out the
power-of-two block-size invariant and reference commit d9016a58 — the
bug that caught it produced the same first generated token (50411) for
every input because the 3-prompt-token prefill ran the tree reduction
with blockDim.x = 3 and silently dropped the third element.  Worth
guarding against a future maintainer "simplifying" the launch code.
---
 src/ops/flash_attention_gpu.cu | 55 ++++++++++++++++++++--------------
 1 file changed, 32 insertions(+), 23 deletions(-)

diff --git a/src/ops/flash_attention_gpu.cu b/src/ops/flash_attention_gpu.cu
index 24525a770..3cb06da74 100644
--- a/src/ops/flash_attention_gpu.cu
+++ b/src/ops/flash_attention_gpu.cu
@@ -374,29 +374,31 @@ namespace ctranslate2 {
 // ---------------------------------------------------------------------------
 // HIP Flash Attention — native implementation for AMD GPUs (gfx1100 / RDNA3+)
 //
-// Two code paths are provided:
+// Four code paths.  The dispatcher (flash_attention_hip_impl, below) picks
+// the most efficient one that matches the input shape and dtype:
 //
-// 1) Tiled fused kernel (hip_flash_attn_fwd_tiled<DevT, BM, BN, D>)
-//    Implements the Flash Attention 2 forward algorithm: a single grid block
-//    processes BM contiguous query rows of one (batch, head), streams K/V in
-//    BN-wide tiles through LDS, and maintains an online softmax state
-//    (m_i, l_i) plus an FP32 output accumulator in registers.  S = Q@K^T is
-//    NEVER materialised in HBM — memory and bandwidth scale with O(N·D)
-//    instead of O(N^2).  Used whenever head_dim matches one of the
-//    specialised values (64 / 80 / 128).
+//   Path           Used when                            FP-paths    LDS S?
+//   ─────────────────────────────────────────────────────────────────────────
+//   WMMA           seqlen_q >= 16, D % 16 == 0,         fp16, bf16  no
+//                  D in {64, 128}, RDNA3+ (gfx11)
+//   Decode         seqlen_q == 1, D in {64, 128},       fp16, bf16  fits in
+//                  seqlen_k fits in LDS                              LDS
+//   Tiled (scalar) seqlen_q >= 64 (BM), D in {64,80,128} fp16, bf16  no
+//                  (covers BF16 and head_dim=80 where WMMA isn't built)
+//   3-pass         everything else                      fp16, bf16  materialised
+//                  (also serves as correctness oracle)
 //
-// 2) Three-pass fallback (hip_attn_qk_kernel + softmax + ov)
-//    Simple and provably correct reference path that materialises the full
-//    [batch, nheads, seqlen_q, seqlen_k] score buffer.  Kept as a fallback
-//    for head dimensions that the tiled kernel is not specialised for, and
-//    as an oracle for correctness comparisons.
+// All paths use the same [batch, seqlen, nheads, head_dim] memory layout
+// and FP32 accumulators.  Q and K/V come from the layer split-heads in
+// that shape; the KV-cache (cached_keys, cached_values) shares the same
+// layout but with a larger second-dim allocation (cache_size >= offset +
+// seqlen_new) to amortise the cost of growing it.
 //
-// Memory layout of all tensors: [batch, seqlen, nheads, head_dim]
-// Supports FP16 and BF16 inputs; FP32 accumulators throughout.
-//
-// Limitations in this initial implementation:
-//   - Rotary embeddings and ALiBi are expected to be pre-applied by the caller.
-//   - No backward pass (inference only).
+// Limitations:
+//   - Inference only — no backward pass.
+//   - Rotary embeddings and ALiBi must be pre-applied by the layer.
+//   - WMMA path is FP16+BF16 only and gfx11+ (uses the wave32 16x16x16
+//     WMMA built-in).
 // ---------------------------------------------------------------------------
 
 
@@ -471,9 +473,16 @@ namespace ctranslate2 {
 
     // -------------------------------------------------------------------
     // Kernel 2 — row-wise softmax with optional causal mask
-    // Grid:  (seqlen_q, nheads, batch)   — so gridDim = (seqlen_q, nheads, batch)
-    // Block: (min(seqlen_k, 256), 1, 1)
-    // Uses shared memory reduction; wavefront-safe for both wf32 and wf64.
+    // Grid:  (seqlen_q, nheads, batch)
+    // Block: next-power-of-2 up to min(seqlen_k, 256)
+    //
+    // The block size MUST be a power of two: the per-row max/sum tree
+    // reduction (`for (s = blockDim.x >> 1; s > 0; s >>= 1)`) drops
+    // elements at odd boundaries otherwise.  Extra threads beyond
+    // seqlen_k contribute identity values (-1e9 for max, 0 for sum) and
+    // are harmless.  See commit d9016a58 — the bug this caught caused
+    // generate() to always emit token 50411 because the 3-token prompt
+    // prefill ran the reduction with blockDim.x = 3.
     // -------------------------------------------------------------------
     __global__ void hip_attn_softmax_kernel(
         float* __restrict__ S,       // [batch, nheads, seqlen_q, seqlen_k] — modified in place

From d74222c5071ce83ddc5fa9695e8e230de1cf0bc3 Mon Sep 17 00:00:00 2001
From: tonde <Tonde@live.de>
Date: Tue, 12 May 2026 12:36:04 +0200
Subject: [PATCH 12/14] docs: add CHANGELOG entry for HIP Flash Attention
 support

Lists the four dispatched kernels (WMMA, scalar tiled, decode, 3-pass),
the KV-cache write path, the build flag pair, and the measured speedup
on Whisper-medium / RX 7900 XTX.
---
 CHANGELOG.md | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 86d8a12f3..62d1b7d11 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,15 @@
 
 ### New features
 
+* Native HIP Flash Attention for AMD RDNA3+ GPUs. Adds four dispatched
+  kernels (WMMA via the wave32 16x16x16 fp16/bf16 built-in, scalar tiled,
+  decode-optimised for `seqlen_q == 1`, and a 3-pass correctness oracle)
+  plus the KV-cache write path for autoregressive decoding. Enabled with
+  `-DWITH_HIP=ON -DWITH_FLASH_ATTN=ON`. Encoder and decoder self-attention
+  use this path when `flash_attention=True` is passed at model load time.
+  ~1.08-1.11x encoder speedup and ~1.05-1.09x `generate()` speedup on
+  Whisper-medium / RX 7900 XTX vs. the standard MultiHeadAttention path.
+
 ### Fixes and improvements
 
 ## [v4.7.1](https://github.com/OpenNMT/CTranslate2/releases/tag/v4.7.1) (2026-02-04)

From 73b7a868215a666c0251f865f6efee5c50dde088 Mon Sep 17 00:00:00 2001
From: tonde <Tonde@live.de>
Date: Tue, 12 May 2026 12:48:28 +0200
Subject: [PATCH 13/14] style: apply black/isort and fix flake8 F541 in HIP
 Flash Attention tests

CI's check-python-style step caught:
  - black formatting differences (long argument lists wanted on separate lines,
    a couple of generator-expression layouts)
  - isort import-order: test_utils + ctranslate2 needed a blank-line group
    separation
  - flake8 F541: one f-string with no placeholders in benchmark_flash_attention.py
    line 231 (the savings-banner print spans two lines and only the second one
    actually interpolates -- both got prefixed with f"" by mistake).

All three checks now pass locally with the same versions CI pins
(black==22.*, isort==5.*, flake8==3.8.*); all 15 pytest tests still pass.
---
 python/tests/benchmark_flash_attention.py |  85 +++++++++------
 python/tests/test_flash_attention.py      | 120 +++++++++++++---------
 2 files changed, 128 insertions(+), 77 deletions(-)

diff --git a/python/tests/benchmark_flash_attention.py b/python/tests/benchmark_flash_attention.py
index f36799a5d..bf5922408 100644
--- a/python/tests/benchmark_flash_attention.py
+++ b/python/tests/benchmark_flash_attention.py
@@ -24,7 +24,6 @@
 
 import numpy as np
 
-
 # ----------------------------------------------------------------------------
 # Local-build DLL loader (mirrors test_flash_attention.py).
 # ----------------------------------------------------------------------------
@@ -51,14 +50,19 @@ def _load_hip_runtime():
     if sys.platform == "win32":
         # The DLL name on Windows has a major-version suffix that varies.
         site_dir = next(
-            d for d in (
+            d
+            for d in (
                 os.path.join(s, "_rocm_sdk_core", "bin")
-                for s in (__import__("site").getsitepackages()
-                          + [__import__("site").getusersitepackages()])
-            ) if os.path.isdir(d)
+                for s in (
+                    __import__("site").getsitepackages()
+                    + [__import__("site").getusersitepackages()]
+                )
+            )
+            if os.path.isdir(d)
         )
         candidates = sorted(
-            f for f in os.listdir(site_dir)
+            f
+            for f in os.listdir(site_dir)
             if f.startswith("amdhip64") and f.endswith(".dll")
         )
         if not candidates:
@@ -69,8 +73,10 @@ def _load_hip_runtime():
 
 
 _hip = _load_hip_runtime()
-_hip.hipMemGetInfo.argtypes = [ctypes.POINTER(ctypes.c_size_t),
-                               ctypes.POINTER(ctypes.c_size_t)]
+_hip.hipMemGetInfo.argtypes = [
+    ctypes.POINTER(ctypes.c_size_t),
+    ctypes.POINTER(ctypes.c_size_t),
+]
 _hip.hipMemGetInfo.restype = ctypes.c_int
 _hip.hipDeviceSynchronize.argtypes = []
 _hip.hipDeviceSynchronize.restype = ctypes.c_int
@@ -151,11 +157,13 @@ def main():
         "--model",
         default=None,
         help="Path to a converted faster-whisper-medium snapshot. "
-             "If omitted, the HuggingFace cache is searched.",
+        "If omitted, the HuggingFace cache is searched.",
     )
     parser.add_argument("--runs", type=int, default=20)
     parser.add_argument(
-        "--compute-type", default="float16", choices=["float16", "bfloat16"],
+        "--compute-type",
+        default="float16",
+        choices=["float16", "bfloat16"],
     )
     args = parser.parse_args()
 
@@ -166,8 +174,11 @@ def main():
         )
         if not os.path.isdir(snaps):
             sys.exit("No --model given and faster-whisper-medium not cached.")
-        shas = [d for d in os.listdir(snaps)
-                if os.path.isfile(os.path.join(snaps, d, "model.bin"))]
+        shas = [
+            d
+            for d in os.listdir(snaps)
+            if os.path.isfile(os.path.join(snaps, d, "model.bin"))
+        ]
         if not shas:
             sys.exit("No converted snapshot found in HF cache.")
         args.model = os.path.join(snaps, shas[0])
@@ -188,14 +199,18 @@ def main():
 
         def load():
             return ctranslate2.models.Whisper(
-                args.model, device="cuda",
-                compute_type=args.compute_type, flash_attention=flash,
+                args.model,
+                device="cuda",
+                compute_type=args.compute_type,
+                flash_attention=flash,
             )
 
         def work(m):
             r = m.generate(
                 ctranslate2.StorageView.from_array(mel),
-                PROMPTS, beam_size=1, max_length=100,
+                PROMPTS,
+                beam_size=1,
+                max_length=100,
             )
             return r
 
@@ -213,12 +228,14 @@ def work(m):
     saved_per_layer = sq * sk * h * 4
     print()
     print(
-        f"  Score-matrix that the standard path materialises and Flash "
-        f"avoids, per encoder layer:"
+        "  Score-matrix that the standard path materialises and Flash "
+        "avoids, per encoder layer:"
     )
     print(f"    {sq}x{sk}x{h} heads x FP32 = {fmt_bytes(saved_per_layer)} per layer")
-    print(f"    24 layers => up to {fmt_bytes(24 * saved_per_layer)} of HBM "
-          f"traffic per encoder pass")
+    print(
+        f"    24 layers => up to {fmt_bytes(24 * saved_per_layer)} of HBM "
+        f"traffic per encoder pass"
+    )
     print()
 
     # ------------------------------------------------------------------
@@ -229,27 +246,31 @@ def work(m):
     print("=" * 70)
 
     m_off = ctranslate2.models.Whisper(
-        args.model, device="cuda",
-        compute_type=args.compute_type, flash_attention=False,
+        args.model,
+        device="cuda",
+        compute_type=args.compute_type,
+        flash_attention=False,
     )
     m_on = ctranslate2.models.Whisper(
-        args.model, device="cuda",
-        compute_type=args.compute_type, flash_attention=True,
+        args.model,
+        device="cuda",
+        compute_type=args.compute_type,
+        flash_attention=True,
     )
 
     def enc_fn(m):
-        return lambda: m.encode(ctranslate2.StorageView.from_array(mel),
-                                to_cpu=False)
+        return lambda: m.encode(ctranslate2.StorageView.from_array(mel), to_cpu=False)
 
     for batch in (1, 4):
         mel_b = np.stack([mel[0]] * batch, axis=0)
 
         def enc_b(m):
-            return lambda: m.encode(ctranslate2.StorageView.from_array(mel_b),
-                                    to_cpu=False)
+            return lambda: m.encode(
+                ctranslate2.StorageView.from_array(mel_b), to_cpu=False
+            )
 
         off_ms = _bench(enc_b(m_off), runs=args.runs)
-        on_ms  = _bench(enc_b(m_on),  runs=args.runs)
+        on_ms = _bench(enc_b(m_on), runs=args.runs)
         print(
             f"  encoder (B={batch}, Sq=Sk=1500):  "
             f"OFF={off_ms:6.2f} ms  ON={on_ms:6.2f} ms  speedup={off_ms/on_ms:.2f}x"
@@ -257,13 +278,17 @@ def enc_b(m):
 
     print()
     for max_len in (30, 100, 200, 448):
+
         def gen_b(m, ml=max_len):
             return lambda: m.generate(
                 ctranslate2.StorageView.from_array(mel),
-                PROMPTS, beam_size=1, max_length=ml,
+                PROMPTS,
+                beam_size=1,
+                max_length=ml,
             )
+
         off_ms = _bench(gen_b(m_off), runs=max(args.runs // 4, 3))
-        on_ms  = _bench(gen_b(m_on),  runs=max(args.runs // 4, 3))
+        on_ms = _bench(gen_b(m_on), runs=max(args.runs // 4, 3))
         print(
             f"  generate (max_length={max_len:3d}):  "
             f"OFF={off_ms:7.1f} ms  ON={on_ms:7.1f} ms  speedup={off_ms/on_ms:.2f}x"
diff --git a/python/tests/test_flash_attention.py b/python/tests/test_flash_attention.py
index 236b27bd3..ba62ded0f 100644
--- a/python/tests/test_flash_attention.py
+++ b/python/tests/test_flash_attention.py
@@ -36,10 +36,10 @@
                 except (FileNotFoundError, OSError):
                     pass
 
-import ctranslate2
-
 import test_utils
 
+import ctranslate2
+
 
 # ----------------------------------------------------------------------------
 # Model discovery — uses an existing CT2 Whisper snapshot if available.
@@ -117,22 +117,26 @@ def test_flash_attention_encoder_matches_standard(compute_type, abs_tol, rel_tol
     mel = _mel(seed=0)
 
     m_off = ctranslate2.models.Whisper(
-        _MODEL_PATH, device="cuda",
-        compute_type=compute_type, flash_attention=False,
+        _MODEL_PATH,
+        device="cuda",
+        compute_type=compute_type,
+        flash_attention=False,
     )
     m_on = ctranslate2.models.Whisper(
-        _MODEL_PATH, device="cuda",
-        compute_type=compute_type, flash_attention=True,
+        _MODEL_PATH,
+        device="cuda",
+        compute_type=compute_type,
+        flash_attention=True,
     )
 
     out_off = _encoder_output(m_off, mel)
-    out_on  = _encoder_output(m_on,  mel)
+    out_on = _encoder_output(m_on, mel)
 
     assert out_off.shape == out_on.shape
     diff = np.abs(out_off - out_on)
-    max_diff  = float(diff.max())
-    max_abs   = float(np.abs(out_off).max()) + 1e-8
-    rel_diff  = max_diff / max_abs
+    max_diff = float(diff.max())
+    max_abs = float(np.abs(out_off).max()) + 1e-8
+    rel_diff = max_diff / max_abs
 
     # Informative — what Flash Attention's online softmax saves us in HBM
     # per layer (Whisper-medium encoder: Sq = Sk = 1500, 16 heads).
@@ -143,9 +147,9 @@ def test_flash_attention_encoder_matches_standard(compute_type, abs_tol, rel_tol
         f"per-layer score-buffer avoided = {saved_bytes/1024/1024:.1f} MiB"
     )
 
-    assert max_diff <= abs_tol, (
-        f"{compute_type} encoder max diff {max_diff:.4f} exceeds {abs_tol}"
-    )
+    assert (
+        max_diff <= abs_tol
+    ), f"{compute_type} encoder max diff {max_diff:.4f} exceeds {abs_tol}"
     assert rel_diff <= rel_tol, (
         f"{compute_type} encoder rel diff {rel_diff*100:.3f}% exceeds "
         f"{rel_tol*100:.3f}%"
@@ -175,25 +179,29 @@ def test_flash_attention_generate_fp16_token_match(seed):
     mel = _mel(seed=seed)
 
     m_off = ctranslate2.models.Whisper(
-        _MODEL_PATH, device="cuda",
-        compute_type="float16", flash_attention=False,
+        _MODEL_PATH,
+        device="cuda",
+        compute_type="float16",
+        flash_attention=False,
     )
     m_on = ctranslate2.models.Whisper(
-        _MODEL_PATH, device="cuda",
-        compute_type="float16", flash_attention=True,
+        _MODEL_PATH,
+        device="cuda",
+        compute_type="float16",
+        flash_attention=True,
     )
 
     feat_off = ctranslate2.StorageView.from_array(mel)
-    feat_on  = ctranslate2.StorageView.from_array(mel)
+    feat_on = ctranslate2.StorageView.from_array(mel)
 
     r_off = m_off.generate(feat_off, PROMPTS, beam_size=1, max_length=20)
-    r_on  = m_on.generate(feat_on,   PROMPTS, beam_size=1, max_length=20)
+    r_on = m_on.generate(feat_on, PROMPTS, beam_size=1, max_length=20)
 
     tok_off = r_off[0].sequences_ids[0]
-    tok_on  = r_on[0].sequences_ids[0]
-    assert tok_off == tok_on, (
-        f"seed={seed}: Flash=ON produced {tok_on} but oracle is {tok_off}"
-    )
+    tok_on = r_on[0].sequences_ids[0]
+    assert (
+        tok_off == tok_on
+    ), f"seed={seed}: Flash=ON produced {tok_on} but oracle is {tok_off}"
 
 
 # ----------------------------------------------------------------------------
@@ -215,8 +223,10 @@ def test_softmax_block_size_regression():
     when Flash Attention is enabled.  Guards against a re-emergence of the
     softmax reduction block-size bug."""
     m_on = ctranslate2.models.Whisper(
-        _MODEL_PATH, device="cuda",
-        compute_type="float16", flash_attention=True,
+        _MODEL_PATH,
+        device="cuda",
+        compute_type="float16",
+        flash_attention=True,
     )
 
     first_tokens = set()
@@ -249,30 +259,38 @@ def test_softmax_block_size_regression():
 def test_flash_attention_encoder_batched(batch_size):
     """Encoder correctness for batch_size > 1."""
     mels = np.stack(
-        [np.random.default_rng(seed).standard_normal((80, 3000)).astype(np.float32)
-         for seed in range(batch_size)],
+        [
+            np.random.default_rng(seed).standard_normal((80, 3000)).astype(np.float32)
+            for seed in range(batch_size)
+        ],
         axis=0,
     )
 
     m_off = ctranslate2.models.Whisper(
-        _MODEL_PATH, device="cuda",
-        compute_type="float16", flash_attention=False,
+        _MODEL_PATH,
+        device="cuda",
+        compute_type="float16",
+        flash_attention=False,
     )
     m_on = ctranslate2.models.Whisper(
-        _MODEL_PATH, device="cuda",
-        compute_type="float16", flash_attention=True,
+        _MODEL_PATH,
+        device="cuda",
+        compute_type="float16",
+        flash_attention=True,
     )
 
     out_off = _encoder_output(m_off, mels)
-    out_on  = _encoder_output(m_on,  mels)
+    out_on = _encoder_output(m_on, mels)
     assert out_off.shape == out_on.shape == (batch_size, 1500, 1024)
 
     diff = np.abs(out_off - out_on)
     max_diff = float(diff.max())
-    max_abs  = float(np.abs(out_off).max()) + 1e-8
+    max_abs = float(np.abs(out_off).max()) + 1e-8
     rel = max_diff / max_abs
-    print(f"\n[B={batch_size}] encoder max_abs_diff={max_diff:.4f}, "
-          f"rel_diff={rel*100:.3f}%")
+    print(
+        f"\n[B={batch_size}] encoder max_abs_diff={max_diff:.4f}, "
+        f"rel_diff={rel*100:.3f}%"
+    )
     assert max_diff <= 0.5, f"B={batch_size} max diff {max_diff:.4f} > 0.5"
 
 
@@ -295,27 +313,35 @@ def test_flash_attention_variable_prompt_length(n_prompt):
     prompt = [base[:n_prompt]]
 
     m_off = ctranslate2.models.Whisper(
-        _MODEL_PATH, device="cuda",
-        compute_type="float16", flash_attention=False,
+        _MODEL_PATH,
+        device="cuda",
+        compute_type="float16",
+        flash_attention=False,
     )
     m_on = ctranslate2.models.Whisper(
-        _MODEL_PATH, device="cuda",
-        compute_type="float16", flash_attention=True,
+        _MODEL_PATH,
+        device="cuda",
+        compute_type="float16",
+        flash_attention=True,
     )
 
     r_off = m_off.generate(
         ctranslate2.StorageView.from_array(mel),
-        prompt, beam_size=1, max_length=n_prompt + 5,
+        prompt,
+        beam_size=1,
+        max_length=n_prompt + 5,
     )
     r_on = m_on.generate(
         ctranslate2.StorageView.from_array(mel),
-        prompt, beam_size=1, max_length=n_prompt + 5,
+        prompt,
+        beam_size=1,
+        max_length=n_prompt + 5,
     )
     tok_off = r_off[0].sequences_ids[0]
-    tok_on  = r_on[0].sequences_ids[0]
-    assert tok_off == tok_on, (
-        f"n_prompt={n_prompt}: Flash=ON {tok_on} vs Flash=OFF {tok_off}"
-    )
+    tok_on = r_on[0].sequences_ids[0]
+    assert (
+        tok_off == tok_on
+    ), f"n_prompt={n_prompt}: Flash=ON {tok_on} vs Flash=OFF {tok_off}"
 
 
 # ----------------------------------------------------------------------------
@@ -328,10 +354,10 @@ def test_flash_attention_score_buffer_savings():
     HBM at typical Whisper / LLM shapes.  Useful context when reading the
     timing numbers."""
     cases = [
-        ("Whisper-medium encoder layer",  1, 16, 1500, 1500),
+        ("Whisper-medium encoder layer", 1, 16, 1500, 1500),
         ("Whisper-medium decoder self-attn (max_length=200)", 1, 16, 1, 200),
         ("Whisper-medium decoder cross-attn", 1, 16, 1, 1500),
-        ("Hypothetical LLM @ 4k context",  1, 32, 4096, 4096),
+        ("Hypothetical LLM @ 4k context", 1, 32, 4096, 4096),
         ("Hypothetical LLM @ 32k context", 1, 32, 32768, 32768),
     ]
     print(

From 0afe71ff528ba7603139b5b805f10bffc9f9b582 Mon Sep 17 00:00:00 2001
From: tonde <Tonde@live.de>
Date: Tue, 12 May 2026 14:49:45 +0200
Subject: [PATCH 14/14] ops: fix multi-arch HIP build by gating the WMMA
 dispatcher call
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The wheel CI builds the ROCm wheel with
  --offload-arch=gfx1030 --offload-arch=gfx1100 ...
i.e. one device pass per architecture.  In the device pass for gfx1030
the WMMA kernel template's body is gated out (the wave32 WMMA built-in
only exists on gfx11+/gfx12+), so its name is not in scope.  The
dispatcher's `hipLaunchKernelGGL((hip_flash_attn_wmma_fp16<HalfT, D>), …)`
call is parsed by the compiler in every pass (including device passes,
even though the dispatcher itself is host-only), so on gfx1030 it fails:

  flash_attention_gpu.cu:1296:31: error: use of undeclared identifier
  'hip_flash_attn_wmma_fp16'
  1 error generated when compiling for gfx1030.

This affected all three ROCm CI jobs:
  build-python-wheels-rocm (ubuntu-24.04)
  build-python-wheels-rocm (windows-2025)
  build-and-push-docker-images (rocm)

Two-level fix:

1. Compile-time gate: wrap the dispatcher's WMMA launch (lambda + the
   two `if (launch_wmma_bm16(...)) return;` calls) with the *same*
   `#if defined(__gfx11..) || !defined(__HIP_DEVICE_COMPILE__)` guard
   that already wraps the kernel definition.  In a non-gfx11 device
   pass the entire WMMA call site is preprocessor-skipped, so the
   undeclared-identifier error is gone.

2. Runtime gate: even when the file was compiled with some gfx11
   target in the arch list, the resulting multi-arch wheel may run
   on a non-gfx11 GPU at execution time, in which case the WMMA
   kernel isn't in the loaded device binary.  Add a one-time
   `gcnArchName` inspection (via cuda::get_device_properties) that
   only enters the WMMA path on a `gfx1*` / `gfx2*` device.  On any
   other arch the dispatcher falls through to the scalar tiled /
   3-pass kernels.

Verified locally:
  - Single-arch build (gfx1100 only): WMMA path active, 15/15 tests pass.
  - Multi-arch build (gfx1030;gfx1100;gfx1101;gfx1102): compiles
    cleanly, 15/15 tests pass on the gfx1100 host.
---
 src/ops/flash_attention_gpu.cu | 83 ++++++++++++++++++++++++----------
 1 file changed, 60 insertions(+), 23 deletions(-)

diff --git a/src/ops/flash_attention_gpu.cu b/src/ops/flash_attention_gpu.cu
index 3cb06da74..2f3a77919 100644
--- a/src/ops/flash_attention_gpu.cu
+++ b/src/ops/flash_attention_gpu.cu
@@ -1275,38 +1275,75 @@ namespace ctranslate2 {
       }
 
       // ----------------------------------------------------------------
-      // Fast path B: WMMA-accelerated kernel on RDNA3.
+      // Fast path B: WMMA-accelerated kernel on RDNA3 (gfx11+).
       // One wave32 per block (BM_W = BN = 16), used for both FP16 and BF16
       // inputs.  Wave32 fragment layout is identical between fp16 and bf16
       // on gfx11; only the WMMA built-in differs (selected via if constexpr).
+      //
+      // Two gates here are deliberate:
+      //
+      // 1. The runtime gate (`wmma_supported_at_runtime`) inspects the
+      //    active device's gcnArchName so multi-arch wheels (e.g. CI builds
+      //    against gfx1030;gfx1100;…) only dispatch WMMA when running on a
+      //    GPU that actually has the WMMA built-in.  On other archs we
+      //    fall through to the scalar tiled / 3-pass kernels.
+      //
+      // 2. The compile-time gate (`#if defined(__gfx11..) || !__HIP_DEVICE_COMPILE__`)
+      //    matches the gate around `hip_flash_attn_wmma_fp16`'s definition.
+      //    In a non-gfx11 device pass the kernel template isn't visible, so
+      //    even though `hipLaunchKernelGGL` only generates host-side code,
+      //    the kernel name has to resolve for the compiler to parse the
+      //    call site.  Wrapping the launch with the same guard means the
+      //    call site simply isn't present in that pass.
       // ----------------------------------------------------------------
       if constexpr (std::is_same<scalar_t, float16_t>::value
                  || std::is_same<scalar_t, bfloat16_t>::value) {
         using HalfT = std::conditional_t<
             std::is_same<scalar_t, float16_t>::value, _Float16, __bf16>;
 
-        auto launch_wmma_bm16 = [&](auto head_dim_const) -> bool {
-          constexpr int D = decltype(head_dim_const)::value;
-          constexpr int BM_W = 16;
-          if (head_dim != D) return false;
-          if (D % 16 != 0)   return false;
-          if (seqlen_q < BM_W) return false;
-          dim3 grid((seqlen_q + BM_W - 1) / BM_W, num_heads, batch_size);
-          dim3 block(32);  // one wave32
-          hipLaunchKernelGGL((hip_flash_attn_wmma_fp16<HalfT, D>),
-                             grid, block, 0, stream,
-                             reinterpret_cast<const HalfT*>(q_ptr),
-                             reinterpret_cast<const HalfT*>(k_ptr),
-                             reinterpret_cast<const HalfT*>(v_ptr),
-                             reinterpret_cast<HalfT*>(o_ptr),
-                             (int)seqlen_q, (int)seqlen_k,
-                             (int)k_time_stride, (int)v_time_stride,
-                             (int)num_heads, queries_scale,
-                             is_causal, (int)offset);
-          return true;
-        };
-        if (launch_wmma_bm16(std::integral_constant<int,  64>{})) return;
-        if (launch_wmma_bm16(std::integral_constant<int, 128>{})) return;
+        // Runtime arch check — must be true to enter the WMMA path even
+        // when this TU was compiled with a gfx11 target somewhere in the
+        // arch list (multi-arch wheels).
+        bool wmma_supported_at_runtime = false;
+        {
+          const int device_id = ctranslate2::get_device_index(ctranslate2::Device::CUDA);
+          const auto& dprops = ctranslate2::cuda::get_device_properties(device_id);
+          const char* arch = dprops.gcnArchName;
+          // gcnArchName looks like "gfx1100" or "gfx1100:sramecc-:xnack-".
+          // WMMA is available on gfx11* and gfx12* (RDNA3 / RDNA4).
+          wmma_supported_at_runtime =
+              arch && arch[0]=='g' && arch[1]=='f' && arch[2]=='x' && arch[3]=='1'
+              && (arch[4]=='1' || arch[4]=='2');
+        }
+
+#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || !defined(__HIP_DEVICE_COMPILE__)
+        if (wmma_supported_at_runtime) {
+          auto launch_wmma_bm16 = [&](auto head_dim_const) -> bool {
+            constexpr int D = decltype(head_dim_const)::value;
+            constexpr int BM_W = 16;
+            if (head_dim != D) return false;
+            if (D % 16 != 0)   return false;
+            if (seqlen_q < BM_W) return false;
+            dim3 grid((seqlen_q + BM_W - 1) / BM_W, num_heads, batch_size);
+            dim3 block(32);  // one wave32
+            hipLaunchKernelGGL((hip_flash_attn_wmma_fp16<HalfT, D>),
+                               grid, block, 0, stream,
+                               reinterpret_cast<const HalfT*>(q_ptr),
+                               reinterpret_cast<const HalfT*>(k_ptr),
+                               reinterpret_cast<const HalfT*>(v_ptr),
+                               reinterpret_cast<HalfT*>(o_ptr),
+                               (int)seqlen_q, (int)seqlen_k,
+                               (int)k_time_stride, (int)v_time_stride,
+                               (int)num_heads, queries_scale,
+                               is_causal, (int)offset);
+            return true;
+          };
+          if (launch_wmma_bm16(std::integral_constant<int,  64>{})) return;
+          if (launch_wmma_bm16(std::integral_constant<int, 128>{})) return;
+        }
+#else
+        (void)wmma_supported_at_runtime;
+#endif
       }
 
       // Fast path C: scalar tiled Flash Attention 2 kernel.