Skip to content

Commit 7a7476b

Browse files
jgmelber and claude committed
Fuse dual-GEMV + SiLU + Mul into single NPU design for SwiGLU decode
Collapses three separate NPU designs (GEMV W1, GEMV W2, SiLU+Mul) into a single fused operator. Each AIE core loads vector x once, processes both W1 and W2 rows through a shared A FIFO with pre-interleaved weights, then computes silu(left)*right entirely in L1 via kernel-local static buffers. The intermediate vectors never touch DRAM. Reduces SwiGLU decode from 4 to 2 runlist entries and eliminates the left/right buffer allocations. Uses 4 AIE columns (DMA channel limit: 2 input + 1 output per tile). Note: swiglu_prefill unchanged — uses GEMM (not GEMV) so dual-GEMV fusion does not apply. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 329ccf4 commit 7a7476b

File tree

9 files changed

+642
-102
lines changed

9 files changed

+642
-102
lines changed
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
// SPDX-FileCopyrightText: Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved.
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
// Fused dual-GEMV + SiLU + elementwise multiply kernel for AIE2.
5+
// Same structure as AIE2+ variant but uses LUT-based getTanhBf16.
6+
7+
#define NOCPP
8+
9+
#include "../aie_kernel_utils.h"
10+
#include "lut_based_ops.h"
11+
12+
#include <aie_api/aie.hpp>
13+
#include <stdint.h>
14+
#include <type_traits>
15+
16+
// Kernel-local L1 intermediates for the fused SwiGLU decode step:
// left_buf receives the W1@x dot products (SiLU gate input) and right_buf
// receives W2@x. 64-byte alignment allows aie::load_v/store_v vector access.
// Capacity is 1024 bf16 entries; callers must keep row_offset + m within it.
static bfloat16 left_buf[1024] __attribute__((aligned(64)));
17+
static bfloat16 right_buf[1024] __attribute__((aligned(64)));
18+
19+
template <uint32_t r>
20+
void matvec_vectorized(uint32_t m,
21+
uint32_t k,
22+
const bfloat16 *__restrict a,
23+
const bfloat16 *__restrict b,
24+
bfloat16 *__restrict c)
25+
{
26+
::aie::set_rounding(aie::rounding_mode::conv_even);
27+
bfloat16 *c_end = c + m;
28+
const bfloat16 *b_end = b + k;
29+
for (; c < c_end; c++) {
30+
aie::accum acc = aie::zeros<accfloat, r>();
31+
AIE_LOOP_MIN_ITERATION_COUNT(2)
32+
for (const bfloat16 *__restrict b_cur = b; b_cur < b_end; b_cur += r, a += r) {
33+
aie::vector<bfloat16, r> a_vec = aie::load_v<r>(a);
34+
aie::vector<bfloat16, r> b_vec = aie::load_v<r>(b_cur);
35+
acc = aie::mac(acc, a_vec, b_vec);
36+
}
37+
*c = static_cast<bfloat16>(aie::reduce_add(acc.template to_vector<float>()));
38+
}
39+
}
40+
41+
extern "C" {
42+
43+
void dual_gemv_matvec_bf16(uint32_t m,
44+
uint32_t k,
45+
uint32_t row_offset,
46+
const bfloat16 *__restrict a_in,
47+
const bfloat16 *__restrict b_in,
48+
uint32_t phase)
49+
{
50+
bfloat16 *dst = (phase == 0) ? left_buf : right_buf;
51+
dst += row_offset;
52+
matvec_vectorized<64>(m, k, a_in, b_in, dst);
53+
}
54+
55+
// Fused activation phase: c_out[i] = silu(left_buf[i]) * right_buf[i].
// SiLU is computed via the tanh identity x * sigmoid(x) = x * 0.5 * (1 + tanh(x/2)),
// with tanh supplied by the LUT-based getTanhBf16 (AIE2 variant).
// Processes 16 bf16 lanes per iteration; m_output is assumed to be a
// multiple of 16 — TODO confirm with the calling design.
void dual_gemv_silu_mul_bf16(bfloat16 *__restrict c_out, int32_t m_output)
{
    event0();

    aie::vector<bfloat16, 16> half = aie::broadcast<bfloat16, 16>(0.5f);
    aie::vector<bfloat16, 16> one = aie::broadcast<bfloat16, 16>(1.0f);
    AIE_PREPARE_FOR_PIPELINING
    for (int idx = 0; idx < m_output; idx += 16) {
        aie::vector<bfloat16, 16> gate = aie::load_v<16>(left_buf + idx);
        aie::vector<bfloat16, 16> up = aie::load_v<16>(right_buf + idx);

        // sigmoid(x) ~= 0.5 * (1 + tanh(x/2))
        aie::vector<bfloat16, 16> scaled = aie::mul(gate, half);
        aie::vector<bfloat16, 16> t = getTanhBf16(scaled);
        auto shifted = aie::add(t, one);
        aie::vector<bfloat16, 16> sigmoid = aie::mul(shifted, half);
        auto silu = aie::mul(gate, sigmoid);

        auto fused = aie::mul(silu.to_vector<bfloat16>(), up);
        aie::store_v(c_out + idx, fused.to_vector<bfloat16>());
    }

    event1();
}
78+
79+
} // extern "C"
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
// SPDX-FileCopyrightText: Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved.
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
// Fused dual-GEMV + SiLU + elementwise multiply kernel for AIE2+.
5+
//
6+
// Computes: output = silu(W1 @ x) * (W2 @ x)
7+
//
8+
// Two entry points called from the NPU design's core body:
9+
// 1. dual_gemv_matvec_bf16: GEMV writing to FIFO buffer c_out + row_offset
10+
// 2. dual_gemv_silu_mul_bf16: reads from static left_buf/right_buf, writes to FIFO c_out
11+
//
12+
// The static buffers are written via scalar stores (from matvec) and read
13+
// via aie::load_v in the silu_mul phase. Aligned to 64 bytes for safe vector access.
14+
15+
#define NOCPP
16+
17+
#include "../aie_kernel_utils.h"
18+
19+
#include <aie_api/aie.hpp>
20+
#include <stdint.h>
21+
#include <type_traits>
22+
23+
// Kernel-local L1 intermediates: left_buf holds W1@x (the SiLU gate input),
// right_buf holds W2@x. Written by the matvec phases, read vectorized by the
// silu_mul phase; 64-byte aligned so aie::load_v/store_v access is safe.
// Capacity is 1024 bf16 entries; callers must keep row_offset + m within it.
static bfloat16 left_buf[1024] __attribute__((aligned(64)));
24+
static bfloat16 right_buf[1024] __attribute__((aligned(64)));
25+
26+
template <uint32_t r>
27+
void matvec_vectorized(uint32_t m,
28+
uint32_t k,
29+
const bfloat16 *__restrict a,
30+
const bfloat16 *__restrict b,
31+
bfloat16 *__restrict c)
32+
{
33+
::aie::set_rounding(aie::rounding_mode::conv_even);
34+
bfloat16 *c_end = c + m;
35+
const bfloat16 *b_end = b + k;
36+
for (; c < c_end; c++) {
37+
aie::accum acc = aie::zeros<accfloat, r>();
38+
AIE_LOOP_MIN_ITERATION_COUNT(2)
39+
for (const bfloat16 *__restrict b_cur = b; b_cur < b_end; b_cur += r, a += r) {
40+
aie::vector<bfloat16, r> a_vec = aie::load_v<r>(a);
41+
aie::vector<bfloat16, r> b_vec = aie::load_v<r>(b_cur);
42+
acc = aie::mac(acc, a_vec, b_vec);
43+
}
44+
*c = static_cast<bfloat16>(aie::reduce_add(acc.template to_vector<float>()));
45+
}
46+
}
47+
48+
extern "C" {
49+
50+
// Phase 1 & 2: GEMV writing to a static buffer (left_buf or right_buf)
51+
// phase=0 writes to left_buf, phase=1 writes to right_buf
52+
void dual_gemv_matvec_bf16(uint32_t m,
53+
uint32_t k,
54+
uint32_t row_offset,
55+
const bfloat16 *__restrict a_in,
56+
const bfloat16 *__restrict b_in,
57+
uint32_t phase)
58+
{
59+
bfloat16 *dst = (phase == 0) ? left_buf : right_buf;
60+
dst += row_offset;
61+
matvec_vectorized<64>(m, k, a_in, b_in, dst);
62+
}
63+
64+
// Phase 3: silu(left_buf) * right_buf -> c_out (FIFO buffer)
65+
void dual_gemv_silu_mul_bf16(bfloat16 *__restrict c_out, int32_t m_output)
66+
{
67+
event0();
68+
69+
aie::vector<bfloat16, 16> register_0_5 = aie::broadcast<bfloat16, 16>(0.5f);
70+
aie::vector<bfloat16, 16> register_1 = aie::broadcast<bfloat16, 16>(1.0f);
71+
AIE_PREPARE_FOR_PIPELINING
72+
for (int i = 0; i < m_output; i += 16) {
73+
aie::vector<bfloat16, 16> left_val = aie::load_v<16>(left_buf + i);
74+
aie::vector<bfloat16, 16> right_val = aie::load_v<16>(right_buf + i);
75+
76+
// SiLU(x) = x * sigmoid(x) = x * 0.5 * (1 + tanh(x/2))
77+
auto half_x = aie::mul(left_val, register_0_5);
78+
auto tanh_half_x = aie::tanh<bfloat16>(half_x.to_vector<float>());
79+
auto tanh_half_x_approx = aie::add(tanh_half_x, register_1);
80+
aie::vector<bfloat16, 16> sigmoid_approx = aie::mul(tanh_half_x_approx, register_0_5);
81+
auto silu_output = aie::mul(left_val, sigmoid_approx);
82+
83+
auto fused_output = aie::mul(silu_output.to_vector<bfloat16>(), right_val);
84+
aie::store_v(c_out + i, fused_output.to_vector<bfloat16>());
85+
}
86+
87+
event1();
88+
}
89+
90+
} // extern "C"

iron/operators/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
from .axpy.op import AIEAXPY
55
from .dequant.op import AIEDequant
6+
from .dual_gemv_silu_mul.op import AIEDualGEMVSiLUMul
67
from .elementwise_add.op import AIEElementwiseAdd
78
from .elementwise_mul.op import AIEElementwiseMul
89
from .gelu.op import AIEGELU
Lines changed: 180 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,180 @@
1+
# SPDX-FileCopyrightText: Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
import numpy as np
5+
from pathlib import Path
6+
from ml_dtypes import bfloat16
7+
import argparse
8+
9+
import aie.dialects.index as index
10+
from aie.dialects.aie import *
11+
from aie.dialects.aiex import *
12+
from aie.helpers.dialects.scf import _for as range_
13+
from aie.iron import Kernel, ObjectFifo, Program, Runtime, Worker
14+
from aie.iron.placers import SequentialPlacer
15+
from aie.iron.device import NPU1, NPU2
16+
17+
"""
18+
Dual matrix-vector + SiLU + elementwise multiply design.
19+
20+
Computes: output = silu(W1 @ x) * (W2 @ x)
21+
22+
W1 and W2 rows are pre-interleaved in DDR by the operator (op.py).
23+
GEMV phases write to kernel-internal static buffers (left_buf, right_buf)
24+
controlled by a phase parameter. The silu_mul phase reads from those
25+
buffers and writes the result to the output C FIFO.
26+
27+
Each AIE core:
28+
1. Acquires vector x (held in L1 for both GEMV passes)
29+
2. Consumes W1 rows from A FIFO, writes dot products to left_buf (phase=0)
30+
3. Consumes W2 rows from A FIFO, writes dot products to right_buf (phase=1)
31+
4. Computes silu(left_buf) * right_buf -> C FIFO output
32+
"""
33+
34+
35+
def my_dual_gemv_silu_mul(dev, cols, M, K, m_input, m_output=None):
    """Build the fused dual-GEMV + SiLU + elementwise-mul NPU program.

    Computes output = silu(W1 @ x) * (W2 @ x). W1/W2 rows are assumed to be
    pre-interleaved in DDR by the operator layer. Each core holds x in L1,
    streams W1 then W2 tiles through one A FIFO into kernel-internal static
    buffers (selected by the phase argument), then emits silu(left)*right.

    Args:
        dev: "npu" selects NPU1, anything else NPU2.
        cols: number of AIE columns (cores) used.
        M: output rows per weight matrix; K: reduction dimension.
        m_input: rows per A-FIFO tile; m_output: rows per silu_mul/C tile
            (defaults to m_input).

    Returns:
        The resolved IRON Program (MLIR module).
    """
    if m_output is None:
        m_output = m_input

    # Tiling invariants: tiles must evenly partition each column's row share.
    assert m_output % m_input == 0 and m_output >= m_input
    assert m_output <= M // cols
    assert (M // cols) % m_output == 0
    assert m_input <= M // cols
    assert (M // cols) % m_input == 0

    # bf16 throughout, both input and output.
    elem_dt = np.dtype[bfloat16]

    assert M % cols == 0

    device = NPU1() if dev == "npu" else NPU2()

    # L1 (tile-local) tensor types.
    tile_a_ty = np.ndarray[(m_input, K), elem_dt]
    tile_b_ty = np.ndarray[(K,), elem_dt]
    tile_c_ty = np.ndarray[(m_output,), elem_dt]

    # L3 (DDR) tensor types. W carries both matrices interleaved: 2*M rows.
    ddr_w_ty = np.ndarray[(2 * M, K), elem_dt]
    ddr_b_ty = np.ndarray[(K,), elem_dt]
    ddr_c_ty = np.ndarray[(M,), elem_dt]

    # GEMV kernel: phase argument selects left_buf (0) or right_buf (1).
    matvec = Kernel(
        "dual_gemv_matvec_bf16",
        "dual_gemv_silu_mul.o",
        [np.int32, np.int32, np.int32, tile_a_ty, tile_b_ty, np.int32],
    )

    # Activation kernel: reads the static buffers, writes the C FIFO.
    silu_mul_fn = Kernel(
        "dual_gemv_silu_mul_bf16",
        "dual_gemv_silu_mul.o",
        [tile_c_ty, np.int32],
    )

    # Per-column FIFOs: 2 inputs + 1 output fits the tile DMA channel limits.
    A_fifos = []
    B_fifos = []
    C_fifos = []
    for col in range(cols):
        A_fifos.append(ObjectFifo(tile_a_ty, name=f"A_{col}", depth=2))
        B_fifos.append(ObjectFifo(tile_b_ty, name=f"B_{col}", depth=1))
        C_fifos.append(ObjectFifo(tile_c_ty, name=f"C_{col}", depth=2))

    def core_body(A_fifo, B_fifo, C_fifo, matvec_fn, silu_mul):
        # x is acquired once and reused across both GEMV passes.
        for _ in range_(0xFFFFFFFF):
            b = B_fifo.acquire(1)
            for _blk in range_(M // m_output // cols):
                # phase 0: W1 tiles -> left_buf; phase 1: W2 tiles -> right_buf.
                for phase in (0, 1):
                    for tile in range_(m_output // m_input):
                        offset = index.casts(T.i32(), tile) * m_input
                        a = A_fifo.acquire(1)
                        matvec_fn(m_input, K, offset, a, b, phase)
                        A_fifo.release(1)
                # silu(left_buf) * right_buf -> output tile.
                c = C_fifo.acquire(1)
                silu_mul(c, m_output)
                C_fifo.release(1)
            B_fifo.release(1)

    workers = []
    for col in range(cols):
        workers.append(
            Worker(
                core_body,
                [
                    A_fifos[col].cons(),
                    B_fifos[col].cons(),
                    C_fifos[col].prod(),
                    matvec,
                    silu_mul_fn,
                ],
            )
        )

    rows_per_col = M // cols
    A_taps = []
    C_taps = []
    for col in range(cols):
        # Each column streams its 2*rows_per_col interleaved W1/W2 rows
        # as one contiguous DDR span.
        A_taps.append(
            TensorAccessPattern(
                tensor_dims=(2 * M, K),
                offset=col * 2 * rows_per_col * K,
                sizes=[1, 1, 1, 2 * rows_per_col * K],
                strides=[0, 0, 0, 1],
            )
        )
        # Each column drains its rows_per_col outputs back contiguously.
        C_taps.append(
            TensorAccessPattern(
                tensor_dims=(1, M),
                offset=col * rows_per_col,
                sizes=[1, 1, 1, rows_per_col],
                strides=[0, 0, 0, 1],
            )
        )

    rt = Runtime()
    with rt.sequence(ddr_w_ty, ddr_b_ty, ddr_c_ty) as (W, B, C):
        rt.start(*workers)
        tg = rt.task_group()
        for col in range(cols):
            rt.fill(A_fifos[col].prod(), W, A_taps[col], task_group=tg)
            rt.fill(B_fifos[col].prod(), B, task_group=tg)
        for col in range(cols):
            rt.drain(C_fifos[col].cons(), C, C_taps[col], task_group=tg, wait=True)
        rt.finish_task_group(tg)

    return Program(device, rt).resolve_program(SequentialPlacer())
154+
155+
156+
if __name__ == "__main__":
    # CLI entry point: build the design and write the MLIR module to a file.
    parser = argparse.ArgumentParser(
        prog="AIE Dual GEMV + SiLU + Mul Design",
    )
    parser.add_argument("--dev", type=str, choices=["npu", "npu2"], default="npu")
    parser.add_argument("-M", type=int, required=True)
    parser.add_argument("-K", type=int, required=True)
    parser.add_argument("-m", type=int, required=True, dest="m_input")
    parser.add_argument("--m-output", type=int, default=None, dest="m_output")
    parser.add_argument("--cols", type=int, required=True)
    parser.add_argument(
        "--output-file-path",
        "-o",
        type=str,
        help="Output file path for the generated MLIR module",
    )
    args = parser.parse_args()

    module = my_dual_gemv_silu_mul(
        args.dev, args.cols, args.M, args.K, args.m_input, args.m_output
    )

    # Serialize the resolved module to the requested path.
    Path(args.output_file_path).write_text(str(module))

0 commit comments

Comments
 (0)