From 532fc529504cdb6fd340df0038f2a7e006e7f333 Mon Sep 17 00:00:00 2001 From: laochonlam Date: Sun, 22 Feb 2026 00:55:31 +0000 Subject: [PATCH 01/19] [EP] Add batched low-latency RDMA send path Add batched RDMA send handling and related low-latency path updates, including dispatch-side token-count signaling and receiver-side immediate data handling for batched sends. --- ep/bench/buffer.py | 34 +++++++++ ep/include/ep_config.hpp | 12 ++++ ep/include/uccl_ibgda.cuh | 10 ++- ep/src/internode_ll.cu | 141 +++++++++++++++++++++++++++++++------- ep/src/rdma.cpp | 9 ++- 5 files changed, 179 insertions(+), 27 deletions(-) diff --git a/ep/bench/buffer.py b/ep/bench/buffer.py index b49a528f9..e2e5c65a1 100644 --- a/ep/bench/buffer.py +++ b/ep/bench/buffer.py @@ -281,6 +281,24 @@ def low_latency_dispatch( hidden=x.shape[1], num_experts=num_experts, ) + + # DEBUG: Print data before CUDA kernel launch + if os.environ.get("DEBUG_DISPATCH", "0") == "1": + print(f"\n{'='*60}", flush=True) + print(f"[DEBUG] low_latency_dispatch - BEFORE CUDA kernel", flush=True) + print(f"{'='*60}", flush=True) + print(f" rank: {self.rank}", flush=True) + print(f" x.shape: {x.shape}, dtype: {x.dtype}", flush=True) + print(f" x.data_ptr: {hex(x.data_ptr())}", flush=True) + print(f" x[:3, :8]:\n{x[:3, :8]}", flush=True) + print(f" topk_idx.shape: {topk_idx.shape}, dtype: {topk_idx.dtype}", flush=True) + print(f" topk_idx[:5]:\n{topk_idx[:5]}", flush=True) + print(f" num_max_dispatch_tokens_per_rank: {num_max_dispatch_tokens_per_rank}", flush=True) + print(f" num_experts: {num_experts}", flush=True) + print(f" use_fp8: {use_fp8}, round_scale: {round_scale}, use_ue8m0: {use_ue8m0}", flush=True) + print(f" async_finish: {async_finish}, return_recv_hook: {return_recv_hook}", flush=True) + print(f"{'='*60}\n", flush=True) + ( packed_recv_x, packed_recv_x_scales, @@ -309,6 +327,22 @@ def low_latency_dispatch( x.size(1), num_experts, ) + + # DEBUG: Print data after CUDA kernel returns + if os.environ.get("DEBUG_DISPATCH", "0") == "1": + import torch + torch.cuda.synchronize() # Wait for kernel to complete + print(f"\n{'='*60}", flush=True) + print(f"[DEBUG] low_latency_dispatch - AFTER CUDA kernel", flush=True) + print(f"{'='*60}", flush=True) + print(f" packed_recv_x.shape: {packed_recv_x.shape}, dtype: {packed_recv_x.dtype}", flush=True) + print(f" packed_recv_count: {packed_recv_count}", flush=True) + print(f" packed_recv_src_info.shape: {packed_recv_src_info.shape}", flush=True) + print(f" packed_recv_layout_range.shape: {packed_recv_layout_range.shape}", flush=True) + if packed_recv_x_scales is not None: + print(f" packed_recv_x_scales.shape: {packed_recv_x_scales.shape}", flush=True) + print(f"{'='*60}\n", flush=True) + tensors_to_record = ( x, topk_idx, diff --git a/ep/include/ep_config.hpp b/ep/include/ep_config.hpp index 451b3d27b..082fcdf29 100644 --- a/ep/include/ep_config.hpp +++ b/ep/include/ep_config.hpp @@ -196,8 +196,20 @@ struct LowLatencyLayout { size_t num_bytes_per_combine_msg = hidden * sizeof(nv_bfloat16); // Send buffer +#ifdef LAM_DEV + // Lam: Buffer layout for batched RDMA sends: + // ┌─────────────────────────────────┬──────────────────────────────────────────────┐ + // │ Temp buffer (offset 0) │ RDMA batch buffer (offset num_max_tokens) │ + // │ rdma_x[token_idx] │ rdma_x[num_max_tokens + expert*max + slot] │ + // │ Size: num_max_tokens * msg_size │ Size: num_experts * num_max_tokens * msg_size│ + // └─────────────────────────────────┴──────────────────────────────────────────────┘ + // Flow: FP8 cast -> 
temp buffer -> copy to rdma_batch_buffer -> batch RDMA send + size_t dispatch_send_buffer_bytes = + (num_experts + 1) * num_max_dispatch_tokens_per_rank * num_bytes_per_dispatch_msg; +#else size_t dispatch_send_buffer_bytes = num_max_dispatch_tokens_per_rank * num_bytes_per_dispatch_msg; +#endif size_t combine_send_buffer_bytes = num_experts * num_max_dispatch_tokens_per_rank * num_bytes_per_combine_msg; diff --git a/ep/include/uccl_ibgda.cuh b/ep/include/uccl_ibgda.cuh index 41b48d1cd..753a5e63f 100644 --- a/ep/include/uccl_ibgda.cuh +++ b/ep/include/uccl_ibgda.cuh @@ -29,7 +29,7 @@ __device__ __forceinline__ void nvshmemi_ibgda_put_nbi_warp( int expert_idx, int lane_id, int message_idx, uint64_t const* d2h_channel_addrs, int num_d2h_channel_addrs, bool is_combine, int low_latency_buffer_idx = 0, uint64_t atomic_offset = 0, - uint64_t atomic_val = 0) { + uint64_t atomic_val = 0, int num_tokens = 1) { // NOTE(MaoZiming): different from the nvshmemi_ibgda_put_nbi_warp in // ibgda_device.cuh, we don't do warp-cooperation. if (lane_id != 0) return; @@ -67,6 +67,10 @@ __device__ __forceinline__ void nvshmemi_ibgda_put_nbi_warp( cmd.atomic_val = atomic_val; } else { cmd.expert_idx = expert_idx; + // Low-latency WRITE: use atomic_val byte for num_tokens (1..255). + cmd.atomic_val = (num_tokens <= 0 || num_tokens > 255) + ? 1 + : static_cast(num_tokens); } h->atomic_set_and_commit(cmd, &slot); } @@ -115,6 +119,10 @@ __device__ __forceinline__ void nvshmemi_ibgda_put_nbi_warp( cmd.atomic_val = atomic_val; } else { cmd.expert_idx = expert_idx; + // Low-latency WRITE: use atomic_val byte for num_tokens (1..255). + cmd.atomic_val = (num_tokens <= 0 || num_tokens > 255) + ? 1 + : static_cast(num_tokens); } h->atomic_set_and_commit(cmd, &slot); break; diff --git a/ep/src/internode_ll.cu b/ep/src/internode_ll.cu index e8c352af5..2a57e20ca 100644 --- a/ep/src/internode_ll.cu +++ b/ep/src/internode_ll.cu @@ -12,6 +12,9 @@ namespace cg = cooperative_groups; namespace uccl { namespace internode_ll { +// Lam: Global lock for debug printing (ensures printf calls don't interleave) +__device__ int g_print_lock = 0; + #if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) constexpr int kNumMaxWarpGroups = 16; #else @@ -53,6 +56,7 @@ __global__ __launch_bounds__(1024, 1) void dispatch( int64_t* dispatch_wait_recv_cost_stats, void* rdma_recv_x, int* rdma_recv_count, void* rdma_x, void const* x, int64_t const* topk_idx, int* atomic_counter_per_expert, int* atomic_finish_counter_per_expert, + int* atomic_send_counter_per_expert, int* next_clean, int64_t* next_clean_second, int num_next_clean_int, int num_tokens, int num_max_dispatch_tokens_per_rank, int num_topk, int num_experts, int rank, int num_ranks, int num_warp_groups, @@ -63,6 +67,11 @@ __global__ __launch_bounds__(1024, 1) void dispatch( void* atomic_buffer_ptr = nullptr, int64_t* rdma_recv_count_internode = nullptr, int* grid_sync_barrier_ptr = nullptr) { +// #ifdef LAM_DEV + // if (blockIdx.x == 0 && threadIdx.x == 0) { + // printf("[LAM_DEV] dispatch called\n"); + // } +// #endif auto const sm_id = static_cast(blockIdx.x); auto const thread_id = static_cast(threadIdx.x); auto const warp_id = thread_id / WARP_SIZE, lane_id = get_lane_id(); @@ -98,6 +107,11 @@ __global__ __launch_bounds__(1024, 1) void dispatch( // Expert counts __shared__ int shared_num_tokens_sent_per_expert[kNumMaxWarpGroups]; + // Lam: Send slots for each topk destination (for batched send buffer layout) + constexpr int kNumMaxTopK = 9; + __shared__ int shared_send_slots[kNumMaxTopK]; 
+ __shared__ int shared_dst_experts[kNumMaxTopK]; + #if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) // initialize barrier amd::barrier_init(1); @@ -136,6 +150,20 @@ __global__ __launch_bounds__(1024, 1) void dispatch( : -1; thread_id == 0 ? (*rdma_x_src_idx = token_idx) : 0; + // Lam: Allocate send slots for each topk destination + // Each warp (warp_id < num_topk) allocates a slot for its destination expert + if (warp_id < num_topk && lane_id == 0) { + shared_dst_experts[warp_id] = dst_expert_idx; + if (dst_expert_idx >= 0) { + shared_send_slots[warp_id] = + atomicAdd(atomic_send_counter_per_expert + dst_expert_idx, 1); + } else { + shared_send_slots[warp_id] = -1; + } + } + // Sync to make shared_send_slots visible to all threads + sync_barrier_1((num_warps - 1) * WARP_SIZE); + // FP8 cast #pragma unroll for (int i = thread_id; i < hidden_bf16_int4; i += num_threads) { @@ -221,15 +249,15 @@ __global__ __launch_bounds__(1024, 1) void dispatch( dst_rank, max_nvl_peers, 0) : 0; if (dst_p2p_ptr == 0) { - __threadfence_system(); - uccl::nvshmemi_ibgda_put_nbi_warp( - dst_ptr - reinterpret_cast(rdma_buffer_ptr), - src_ptr - reinterpret_cast(rdma_buffer_ptr), - num_bytes_per_msg, dst_rank, - /*warp_id=*/dst_expert_local_idx, // NOTE(Yang): for selecting - // rb. - lane_id, slot_idx, d2h_channel_addrs, num_d2h_channel_addrs, - false, low_latency_buffer_idx); + // Lam: IBGDA -> copy temp to rdma_batch_buffer, batch send later + auto const lam_slot = shared_send_slots[warp_id]; + auto const batch_buf_offset = num_max_dispatch_tokens_per_rank * num_bytes_per_msg; + auto const batch_buf_ptr = static_cast(rdma_x) + batch_buf_offset + + (dst_expert_idx * num_max_dispatch_tokens_per_rank + lam_slot) * num_bytes_per_msg; + auto const* src_int4_ptr = reinterpret_cast(rdma_x_src_idx); + auto* batch_buf_int4_ptr = reinterpret_cast(batch_buf_ptr); + UNROLLED_WARP_COPY(8, lane_id, num_int4_per_msg, batch_buf_int4_ptr, + src_int4_ptr, ld_nc_global, st_na_global); } else { // Intra-node: use direct memory copy via IPC auto const* src_int4_ptr = reinterpret_cast(src_ptr); @@ -288,6 +316,78 @@ __global__ __launch_bounds__(1024, 1) void dispatch( } } __syncthreads(); + + // Lam: Grid-wide sync before batch send phase. + // __syncthreads() only syncs within a single thread block (SM). + // The token loop distributes tokens round-robin across SMs + // (token_idx = sm_id, stepping by num_sms). When num_tokens < num_sms, + // most SMs skip the token loop and pass __syncthreads() immediately, + // while the SMs processing tokens are still writing to the batch buffer + // and incrementing atomic_send_counter_per_expert. + // Without grid sync, the batch send phase can read a stale/partial + // counter and send fewer tokens than actually produced, causing the + // receiver to hang waiting for data that never arrives. 
+#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) + amd::grid_sync(grid_sync_barrier_ptr, num_sms); +#else + cg::this_grid().sync(); +#endif + + // Lam: Batch RDMA send phase - send entire expert buffer in ONE IBGDA call + // Each warp group handles one expert (only first sub_warp does the send) + if (responsible_expert_idx < num_experts && sub_warp_id == 0) { + auto const dst_rank = responsible_expert_idx / num_local_experts; + auto const dst_expert_local_idx = responsible_expert_idx % num_local_experts; + + // Check if this destination is inter-node (needs IBGDA batch send) + // IPC destinations were already sent in the token loop + auto const test_dst_ptr = reinterpret_cast(rdma_recv_x); + auto const dst_p2p_ptr = + ipc_rdma_base_ptrs + ? uccl::get_ipc_p2p_ptr(test_dst_ptr, ipc_rdma_base_ptrs, rank, + dst_rank, max_nvl_peers, 0) + : 0; + + if (dst_p2p_ptr == 0) { + // Inter-node: batch send ALL tokens for this expert in ONE call + auto const num_tokens_to_send = + atomic_send_counter_per_expert[responsible_expert_idx]; + + if (num_tokens_to_send > 0) { + auto const batch_buf_offset = + num_max_dispatch_tokens_per_rank * num_bytes_per_msg; + // Source: start of this expert's batch buffer (contiguous) + auto const batch_buf_ptr = + static_cast(rdma_x) + batch_buf_offset + + responsible_expert_idx * num_max_dispatch_tokens_per_rank * + num_bytes_per_msg; + auto const src_ptr = reinterpret_cast(batch_buf_ptr); + // Destination: start of this expert's recv buffer on remote rank + auto const dst_ptr = + reinterpret_cast(rdma_recv_x) + + dst_expert_local_idx * num_ranks * + num_max_dispatch_tokens_per_rank * num_bytes_per_msg + + rank * num_max_dispatch_tokens_per_rank * num_bytes_per_msg; + // Total bytes: all tokens for this expert + auto const total_bytes = num_tokens_to_send * num_bytes_per_msg; + + __threadfence_system(); + + uccl::nvshmemi_ibgda_put_nbi_warp( + dst_ptr - reinterpret_cast(rdma_buffer_ptr), + src_ptr - reinterpret_cast(rdma_buffer_ptr), + total_bytes, dst_rank, + /*warp_id=*/dst_expert_local_idx, // NOTE(Yang): for selecting rb. + lane_id, /*slot=*/0, d2h_channel_addrs, num_d2h_channel_addrs, + false, low_latency_buffer_idx, 0, 0, num_tokens_to_send); + + } + } + // IPC: already sent in the token loop, nothing to do here + } + + __threadfence_system(); // Ensure batch sends are visible before count sends + // Issue count sends if (responsible_expert_idx < num_experts and sub_warp_id == 0 and lane_id == 0) { @@ -329,10 +429,12 @@ __global__ __launch_bounds__(1024, 1) void dispatch( st_release_sys_global(reinterpret_cast(dst_p2p_ptr), -num_tokens_sent - 1); } + // Clean workspace for next use atomic_counter_per_expert[responsible_expert_idx] = 0; atomic_finish_counter_per_expert[responsible_expert_idx] = 0; + atomic_send_counter_per_expert[responsible_expert_idx] = 0; // Clean `packed_recv_count` if (dst_rank == 0) packed_recv_count[dst_expert_local_idx] = 0; } @@ -440,13 +542,6 @@ LOW_LATENCY_DISPATCH_RECV: num_recv_tokens_internode != 0 ? -num_recv_tokens_internode - 1 : 0; num_recv_tokens_ipc = num_recv_tokens_ipc != 0 ? 
-num_recv_tokens_ipc - 1 : 0; - // printf( - // "num_recv_tokens_internode: %d, num_recv_tokens_ipc: %d, src_rank:" - // "%d, rank: %d, max_nvl_peers: %d, responsible_expert_idx: %d," - // "num_experts: %d, num_local_experts: %d\n", - // num_recv_tokens_internode, num_recv_tokens_ipc, src_rank, rank, - // max_nvl_peers, responsible_expert_idx, num_experts, - // num_local_experts); num_recv_tokens = num_recv_tokens_internode + num_recv_tokens_ipc; recv_token_begin_idx = atomicAdd(packed_recv_count + local_expert_idx, num_recv_tokens); @@ -522,8 +617,6 @@ LOW_LATENCY_DISPATCH_RECV: } } } - // if (blockIdx.x == 0 && threadIdx.x == 0) - // printf("[dispatch] RECV finished\n"); } } @@ -556,8 +649,10 @@ void dispatch(void* packed_recv_x, void* packed_recv_x_scales, auto atomic_counter_per_expert = static_cast(workspace); auto atomic_finish_counter_per_expert = atomic_counter_per_expert + num_experts; - auto grid_sync_barrier_ptr = atomic_finish_counter_per_expert + num_experts; - EP_HOST_ASSERT((num_experts * 2 + 1) * sizeof(int) <= NUM_WORKSPACE_BYTES); + auto atomic_send_counter_per_expert = + atomic_finish_counter_per_expert + num_experts; + auto grid_sync_barrier_ptr = atomic_send_counter_per_expert + num_experts; + EP_HOST_ASSERT((num_experts * 3 + 1) * sizeof(int) <= NUM_WORKSPACE_BYTES); // FP8 checks if (use_ue8m0) @@ -575,6 +670,7 @@ void dispatch(void* packed_recv_x, void* packed_recv_x_scales, cumulative_local_expert_recv_stats, dispatch_wait_recv_cost_stats, \ rdma_recv_x, rdma_recv_count, rdma_x, x, topk_idx, \ atomic_counter_per_expert, atomic_finish_counter_per_expert, \ + atomic_send_counter_per_expert, \ next_clean, next_clean_second, num_next_clean_int, num_tokens, \ num_max_dispatch_tokens_per_rank, num_topk, num_experts, rank, \ num_ranks, num_warp_groups, num_warps_per_group, round_scale, phases, \ @@ -946,8 +1042,6 @@ __global__ __launch_bounds__(1024, 1) void combine( // Receiving phase LOW_LATENCY_COMBINE_RECV: if ((phases & LOW_LATENCY_RECV_PHASE) == 0) { - // if (blockIdx.x == 0 && threadIdx.x == 0) - // printf("[combine] SEND finished\n"); return; } // Wait all ranks to arrive @@ -1060,8 +1154,6 @@ LOW_LATENCY_COMBINE_RECV: token_idx * hidden_bf16_int4)[hidden_idx] = combined_int4; } - // if (blockIdx.x == 0 && threadIdx.x == 0) - // printf("[combine] RECV finished\n"); } #if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) @@ -1103,7 +1195,6 @@ void combine(void* combined_x, void* rdma_recv_x, int* rdma_recv_flag, constexpr int kNumTMABytesPerWarp = 12 * (512 + 16); int const smem_size = kNumTMABytesPerWarp * num_warps; - // printf("Combine launched\n"); #define COMBINE_LAUNCH_CASE(hidden) \ { \ diff --git a/ep/src/rdma.cpp b/ep/src/rdma.cpp index cfa24d96c..f28bfb4aa 100644 --- a/ep/src/rdma.cpp +++ b/ep/src/rdma.cpp @@ -1307,9 +1307,16 @@ static void post_rdma_async_batched_fast_mode( get_low_latency(cmd.cmd_type)}; #endif #ifdef USE_RECEIVER_BARRIER + // Lam: Low-latency: num_tokens from cmd.atomic_val (GPU sets it); else 1. + // Use atomic_val whenever GPU set it (non-zero), not only when + // get_low_latency(cmd_type); dispatch can also set atomic_val. + uint32_t num_tokens_imm = + cmd.atomic_val ? static_cast(cmd.atomic_val) : 1u; + // get_is_combine(cmd.cmd_type) ? printf("Receiving this combine imm? num_tokens_imm: %d, cmd.atomic_val: %d\n", num_tokens_imm, cmd.atomic_val) : printf("Receiving this dispatch imm? 
num_tokens_imm: %d, cmd.atomic_val: %d\n", num_tokens_imm, cmd.atomic_val); + // fflush(stdout); uint32_t imm = WriteImm::Pack(get_is_combine(cmd.cmd_type), get_low_latency(cmd.cmd_type), - cmd.expert_idx, 1, my_rank) + cmd.expert_idx, num_tokens_imm, my_rank) .GetImmData(); ibv_wr_rdma_write_imm(qpx, ctx->remote_rkey, remote_addr, htonl(imm)); #else From 2cbed00192061f1db3924be2f59f0bae30499468 Mon Sep 17 00:00:00 2001 From: laochonlam Date: Sun, 22 Feb 2026 07:25:05 +0000 Subject: [PATCH 02/19] Cleanup code and comments --- ep/bench/buffer.py | 34 ---------------------------------- ep/include/ep_config.hpp | 20 ++++++++------------ ep/include/uccl_ibgda.cuh | 10 ++++------ ep/src/internode_ll.cu | 20 ++++++++++++++------ ep/src/rdma.cpp | 5 ----- 5 files changed, 26 insertions(+), 63 deletions(-) diff --git a/ep/bench/buffer.py b/ep/bench/buffer.py index b4afc3f23..14ef8cc34 100644 --- a/ep/bench/buffer.py +++ b/ep/bench/buffer.py @@ -284,24 +284,6 @@ def low_latency_dispatch( hidden=x.shape[1], num_experts=num_experts, ) - - # DEBUG: Print data before CUDA kernel launch - if os.environ.get("DEBUG_DISPATCH", "0") == "1": - print(f"\n{'='*60}", flush=True) - print(f"[DEBUG] low_latency_dispatch - BEFORE CUDA kernel", flush=True) - print(f"{'='*60}", flush=True) - print(f" rank: {self.rank}", flush=True) - print(f" x.shape: {x.shape}, dtype: {x.dtype}", flush=True) - print(f" x.data_ptr: {hex(x.data_ptr())}", flush=True) - print(f" x[:3, :8]:\n{x[:3, :8]}", flush=True) - print(f" topk_idx.shape: {topk_idx.shape}, dtype: {topk_idx.dtype}", flush=True) - print(f" topk_idx[:5]:\n{topk_idx[:5]}", flush=True) - print(f" num_max_dispatch_tokens_per_rank: {num_max_dispatch_tokens_per_rank}", flush=True) - print(f" num_experts: {num_experts}", flush=True) - print(f" use_fp8: {use_fp8}, round_scale: {round_scale}, use_ue8m0: {use_ue8m0}", flush=True) - print(f" async_finish: {async_finish}, return_recv_hook: {return_recv_hook}", flush=True) - print(f"{'='*60}\n", flush=True) - ( packed_recv_x, packed_recv_x_scales, @@ -330,22 +312,6 @@ def low_latency_dispatch( x.size(1), num_experts, ) - - # DEBUG: Print data after CUDA kernel returns - if os.environ.get("DEBUG_DISPATCH", "0") == "1": - import torch - torch.cuda.synchronize() # Wait for kernel to complete - print(f"\n{'='*60}", flush=True) - print(f"[DEBUG] low_latency_dispatch - AFTER CUDA kernel", flush=True) - print(f"{'='*60}", flush=True) - print(f" packed_recv_x.shape: {packed_recv_x.shape}, dtype: {packed_recv_x.dtype}", flush=True) - print(f" packed_recv_count: {packed_recv_count}", flush=True) - print(f" packed_recv_src_info.shape: {packed_recv_src_info.shape}", flush=True) - print(f" packed_recv_layout_range.shape: {packed_recv_layout_range.shape}", flush=True) - if packed_recv_x_scales is not None: - print(f" packed_recv_x_scales.shape: {packed_recv_x_scales.shape}", flush=True) - print(f"{'='*60}\n", flush=True) - tensors_to_record = ( x, topk_idx, diff --git a/ep/include/ep_config.hpp b/ep/include/ep_config.hpp index 082fcdf29..e4e96bcbc 100644 --- a/ep/include/ep_config.hpp +++ b/ep/include/ep_config.hpp @@ -196,20 +196,16 @@ struct LowLatencyLayout { size_t num_bytes_per_combine_msg = hidden * sizeof(nv_bfloat16); // Send buffer -#ifdef LAM_DEV - // Lam: Buffer layout for batched RDMA sends: - // ┌─────────────────────────────────┬──────────────────────────────────────────────┐ - // │ Temp buffer (offset 0) │ RDMA batch buffer (offset num_max_tokens) │ - // │ rdma_x[token_idx] │ rdma_x[num_max_tokens + expert*max + slot] │ - // │ 
Size: num_max_tokens * msg_size │ Size: num_experts * num_max_tokens * msg_size│ - // └─────────────────────────────────┴──────────────────────────────────────────────┘ - // Flow: FP8 cast -> temp buffer -> copy to rdma_batch_buffer -> batch RDMA send + // Buffer layout for RDMA sends, used by the batched RDMA-send path in the dispatch-LL kernel. + // ┌──────────────────────────────────────────┬──────────────────────────────────────────────────────────┐ + // │ Temp buffer (offset 0) │ Per-expert RDMA batch buffer (offset num_max_token) │ + // │ rdma_x[token_idx] │ rdma_x[num_max_token + expert * num_max_token + slot] │ + // │ Size: num_max_token * msg_size │ Size: num_experts * num_max_token * msg_size │ + // └──────────────────────────────────────────┴──────────────────────────────────────────────────────────┘ + // Flow: (optional FP8 cast) -> temp buffer -> copy to per-expert batch buffer -> batched RDMA send + // TODO: Support per-GPU destination batching in this path. size_t dispatch_send_buffer_bytes = (num_experts + 1) * num_max_dispatch_tokens_per_rank * num_bytes_per_dispatch_msg; -#else - size_t dispatch_send_buffer_bytes = - num_max_dispatch_tokens_per_rank * num_bytes_per_dispatch_msg; -#endif size_t combine_send_buffer_bytes = num_experts * num_max_dispatch_tokens_per_rank * num_bytes_per_combine_msg; diff --git a/ep/include/uccl_ibgda.cuh b/ep/include/uccl_ibgda.cuh index 753a5e63f..424fddec3 100644 --- a/ep/include/uccl_ibgda.cuh +++ b/ep/include/uccl_ibgda.cuh @@ -68,9 +68,8 @@ __device__ __forceinline__ void nvshmemi_ibgda_put_nbi_warp( } else { cmd.expert_idx = expert_idx; // Low-latency WRITE: use atomic_val byte for num_tokens (1..255). - cmd.atomic_val = (num_tokens <= 0 || num_tokens > 255) - ? 1 - : static_cast(num_tokens); + EP_DEVICE_ASSERT(num_tokens > 0 && num_tokens <= 255); + cmd.atomic_val = static_cast(num_tokens); } h->atomic_set_and_commit(cmd, &slot); } @@ -120,9 +119,8 @@ __device__ __forceinline__ void nvshmemi_ibgda_put_nbi_warp( } else { cmd.expert_idx = expert_idx; // Low-latency WRITE: use atomic_val byte for num_tokens (1..255). - cmd.atomic_val = (num_tokens <= 0 || num_tokens > 255) - ? 1 - : static_cast(num_tokens); + EP_DEVICE_ASSERT(num_tokens > 0 && num_tokens <= 255); + cmd.atomic_val = static_cast(num_tokens); } h->atomic_set_and_commit(cmd, &slot); break; diff --git a/ep/src/internode_ll.cu b/ep/src/internode_ll.cu index 728e2b844..303883916 100644 --- a/ep/src/internode_ll.cu +++ b/ep/src/internode_ll.cu @@ -67,11 +67,6 @@ __global__ __launch_bounds__(1024, 1) void dispatch( void* atomic_buffer_ptr = nullptr, int64_t* rdma_recv_count_internode = nullptr, int* grid_sync_barrier_ptr = nullptr) { -// #ifdef LAM_DEV - // if (blockIdx.x == 0 && threadIdx.x == 0) { - // printf("[LAM_DEV] dispatch called\n"); - // } -// #endif auto const sm_id = static_cast(blockIdx.x); auto const thread_id = static_cast(threadIdx.x); auto const warp_id = thread_id / WARP_SIZE, lane_id = get_lane_id(); @@ -443,6 +438,8 @@ __global__ __launch_bounds__(1024, 1) void dispatch( // Receiving phase LOW_LATENCY_DISPATCH_RECV: if ((phases & LOW_LATENCY_RECV_PHASE) == 0) { + // if (blockIdx.x == 0 && threadIdx.x == 0) + // printf("[combine] SEND finished\n"); return; } @@ -542,6 +539,13 @@ LOW_LATENCY_DISPATCH_RECV: num_recv_tokens_internode != 0 ? -num_recv_tokens_internode - 1 : 0; num_recv_tokens_ipc = num_recv_tokens_ipc != 0 ? 
-num_recv_tokens_ipc - 1 : 0; + // printf( + // "num_recv_tokens_internode: %d, num_recv_tokens_ipc: %d, src_rank:" + // "%d, rank: %d, max_nvl_peers: %d, responsible_expert_idx: %d," + // "num_experts: %d, num_local_experts: %d\n", + // num_recv_tokens_internode, num_recv_tokens_ipc, src_rank, rank, + // max_nvl_peers, responsible_expert_idx, num_experts, + // num_local_experts); num_recv_tokens = num_recv_tokens_internode + num_recv_tokens_ipc; recv_token_begin_idx = atomicAdd(packed_recv_count + local_expert_idx, num_recv_tokens); @@ -617,6 +621,8 @@ LOW_LATENCY_DISPATCH_RECV: } } } + // if (blockIdx.x == 0 && threadIdx.x == 0) + // printf("[dispatch] RECV finished\n"); } } @@ -1153,7 +1159,9 @@ LOW_LATENCY_COMBINE_RECV: (static_cast(combined_x) + token_idx * hidden_bf16_int4)[hidden_idx] = combined_int4; } - + + // if (blockIdx.x == 0 && threadIdx.x == 0) + // printf("[combine] RECV finished\n"); } #if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) diff --git a/ep/src/rdma.cpp b/ep/src/rdma.cpp index 4f42072e6..101388796 100644 --- a/ep/src/rdma.cpp +++ b/ep/src/rdma.cpp @@ -1399,13 +1399,8 @@ static void post_rdma_async_batched_fast_mode( get_low_latency(cmd.cmd_type)}; #endif #ifdef USE_RECEIVER_BARRIER - // Lam: Low-latency: num_tokens from cmd.atomic_val (GPU sets it); else 1. - // Use atomic_val whenever GPU set it (non-zero), not only when - // get_low_latency(cmd_type); dispatch can also set atomic_val. uint32_t num_tokens_imm = cmd.atomic_val ? static_cast(cmd.atomic_val) : 1u; - // get_is_combine(cmd.cmd_type) ? printf("Receiving this combine imm? num_tokens_imm: %d, cmd.atomic_val: %d\n", num_tokens_imm, cmd.atomic_val) : printf("Receiving this dispatch imm? num_tokens_imm: %d, cmd.atomic_val: %d\n", num_tokens_imm, cmd.atomic_val); - // fflush(stdout); uint32_t imm = WriteImm::Pack(get_is_combine(cmd.cmd_type), get_low_latency(cmd.cmd_type), cmd.expert_idx, num_tokens_imm, my_rank) From 38a54df357c2c841e54b2a49eb33d070eae4f856 Mon Sep 17 00:00:00 2001 From: laochonlam Date: Sun, 22 Feb 2026 07:30:05 +0000 Subject: [PATCH 03/19] Cleanup code and comments --- ep/src/internode_ll.cu | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/ep/src/internode_ll.cu b/ep/src/internode_ll.cu index 303883916..9b04b737d 100644 --- a/ep/src/internode_ll.cu +++ b/ep/src/internode_ll.cu @@ -1048,6 +1048,8 @@ __global__ __launch_bounds__(1024, 1) void combine( // Receiving phase LOW_LATENCY_COMBINE_RECV: if ((phases & LOW_LATENCY_RECV_PHASE) == 0) { + // if (blockIdx.x == 0 && threadIdx.x == 0) + // printf("[combine] SEND finished\n"); return; } // Wait all ranks to arrive @@ -1159,9 +1161,9 @@ LOW_LATENCY_COMBINE_RECV: (static_cast(combined_x) + token_idx * hidden_bf16_int4)[hidden_idx] = combined_int4; } - - // if (blockIdx.x == 0 && threadIdx.x == 0) - // printf("[combine] RECV finished\n"); + + // if (blockIdx.x == 0 && threadIdx.x == 0) + // printf("[combine] RECV finished\n"); } #if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) From 247af5695f55634b6644972d5839023d82cb66b0 Mon Sep 17 00:00:00 2001 From: laochonlam Date: Sun, 22 Feb 2026 07:45:47 +0000 Subject: [PATCH 04/19] Cleanup code and comments --- ep/include/ep_config.hpp | 11 ++-- ep/src/internode_ll.cu | 107 ++++++++++++++++++--------------------- 2 files changed, 56 insertions(+), 62 deletions(-) diff --git a/ep/include/ep_config.hpp b/ep/include/ep_config.hpp index e4e96bcbc..958fef041 100644 --- a/ep/include/ep_config.hpp +++ b/ep/include/ep_config.hpp @@ -196,16 +196,19 @@ struct 
LowLatencyLayout { size_t num_bytes_per_combine_msg = hidden * sizeof(nv_bfloat16); // Send buffer - // Buffer layout for RDMA sends, used by the batched RDMA-send path in the dispatch-LL kernel. + // Buffer layout for RDMA sends, used by the batched RDMA-send path in the + // dispatch-LL kernel. // ┌──────────────────────────────────────────┬──────────────────────────────────────────────────────────┐ // │ Temp buffer (offset 0) │ Per-expert RDMA batch buffer (offset num_max_token) │ // │ rdma_x[token_idx] │ rdma_x[num_max_token + expert * num_max_token + slot] │ // │ Size: num_max_token * msg_size │ Size: num_experts * num_max_token * msg_size │ // └──────────────────────────────────────────┴──────────────────────────────────────────────────────────┘ - // Flow: (optional FP8 cast) -> temp buffer -> copy to per-expert batch buffer -> batched RDMA send + // Flow: (optional FP8 cast) -> temp buffer -> copy to per-expert batch + // buffer -> batched RDMA send // TODO: Support per-GPU destination batching in this path. - size_t dispatch_send_buffer_bytes = - (num_experts + 1) * num_max_dispatch_tokens_per_rank * num_bytes_per_dispatch_msg; + size_t dispatch_send_buffer_bytes = (num_experts + 1) * + num_max_dispatch_tokens_per_rank * + num_bytes_per_dispatch_msg; size_t combine_send_buffer_bytes = num_experts * num_max_dispatch_tokens_per_rank * num_bytes_per_combine_msg; diff --git a/ep/src/internode_ll.cu b/ep/src/internode_ll.cu index 9b04b737d..981ab9014 100644 --- a/ep/src/internode_ll.cu +++ b/ep/src/internode_ll.cu @@ -12,9 +12,6 @@ namespace cg = cooperative_groups; namespace uccl { namespace internode_ll { -// Lam: Global lock for debug printing (ensures printf calls don't interleave) -__device__ int g_print_lock = 0; - #if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) constexpr int kNumMaxWarpGroups = 16; #else @@ -56,13 +53,12 @@ __global__ __launch_bounds__(1024, 1) void dispatch( int64_t* dispatch_wait_recv_cost_stats, void* rdma_recv_x, int* rdma_recv_count, void* rdma_x, void const* x, int64_t const* topk_idx, int* atomic_counter_per_expert, int* atomic_finish_counter_per_expert, - int* atomic_send_counter_per_expert, - int* next_clean, int64_t* next_clean_second, int num_next_clean_int, - int num_tokens, int num_max_dispatch_tokens_per_rank, int num_topk, - int num_experts, int rank, int num_ranks, int num_warp_groups, - int num_warps_per_group, bool round_scale, int phases, - uint64_t const* d2h_channel_addrs, int num_d2h_channel_addrs, - int max_nvl_peers, int low_latency_buffer_idx, + int* atomic_send_counter_per_expert, int* next_clean, + int64_t* next_clean_second, int num_next_clean_int, int num_tokens, + int num_max_dispatch_tokens_per_rank, int num_topk, int num_experts, + int rank, int num_ranks, int num_warp_groups, int num_warps_per_group, + bool round_scale, int phases, uint64_t const* d2h_channel_addrs, + int num_d2h_channel_addrs, int max_nvl_peers, int low_latency_buffer_idx, void** ipc_rdma_base_ptrs = nullptr, void* rdma_buffer_ptr = nullptr, void* atomic_buffer_ptr = nullptr, int64_t* rdma_recv_count_internode = nullptr, @@ -102,7 +98,7 @@ __global__ __launch_bounds__(1024, 1) void dispatch( // Expert counts __shared__ int shared_num_tokens_sent_per_expert[kNumMaxWarpGroups]; - // Lam: Send slots for each topk destination (for batched send buffer layout) + // Global counter slots used for batching sends to each top-k destination. 
constexpr int kNumMaxTopK = 9; __shared__ int shared_send_slots[kNumMaxTopK]; __shared__ int shared_dst_experts[kNumMaxTopK]; @@ -145,8 +141,9 @@ __global__ __launch_bounds__(1024, 1) void dispatch( : -1; thread_id == 0 ? (*rdma_x_src_idx = token_idx) : 0; - // Lam: Allocate send slots for each topk destination - // Each warp (warp_id < num_topk) allocates a slot for its destination expert + // Allocate per-expert send slots for top-k destinations. + // Each warp (warp_id < num_topk) reserves one slot for its destination + // expert. if (warp_id < num_topk && lane_id == 0) { shared_dst_experts[warp_id] = dst_expert_idx; if (dst_expert_idx >= 0) { @@ -244,12 +241,19 @@ __global__ __launch_bounds__(1024, 1) void dispatch( dst_rank, max_nvl_peers, 0) : 0; if (dst_p2p_ptr == 0) { - // Lam: IBGDA -> copy temp to rdma_batch_buffer, batch send later - auto const lam_slot = shared_send_slots[warp_id]; - auto const batch_buf_offset = num_max_dispatch_tokens_per_rank * num_bytes_per_msg; - auto const batch_buf_ptr = static_cast(rdma_x) + batch_buf_offset + - (dst_expert_idx * num_max_dispatch_tokens_per_rank + lam_slot) * num_bytes_per_msg; - auto const* src_int4_ptr = reinterpret_cast(rdma_x_src_idx); + // For inter-node send path, copy temp data to the per-expert batch + // buffer, then issue a batched RDMA send. + // TODO: This has an extra temp->per-expert copy in the FP8 path. + // FP8 output is written to the temp buffer first, then copied here. + auto const slot_idx = shared_send_slots[warp_id]; + auto const batch_buf_offset = + num_max_dispatch_tokens_per_rank * num_bytes_per_msg; + auto const batch_buf_ptr = + static_cast(rdma_x) + batch_buf_offset + + (dst_expert_idx * num_max_dispatch_tokens_per_rank + slot_idx) * + num_bytes_per_msg; + auto const* src_int4_ptr = + reinterpret_cast(rdma_x_src_idx); auto* batch_buf_int4_ptr = reinterpret_cast(batch_buf_ptr); UNROLLED_WARP_COPY(8, lane_id, num_int4_per_msg, batch_buf_int4_ptr, src_int4_ptr, ld_nc_global, st_na_global); @@ -312,27 +316,19 @@ __global__ __launch_bounds__(1024, 1) void dispatch( } __syncthreads(); - // Lam: Grid-wide sync before batch send phase. - // __syncthreads() only syncs within a single thread block (SM). - // The token loop distributes tokens round-robin across SMs - // (token_idx = sm_id, stepping by num_sms). When num_tokens < num_sms, - // most SMs skip the token loop and pass __syncthreads() immediately, - // while the SMs processing tokens are still writing to the batch buffer - // and incrementing atomic_send_counter_per_expert. - // Without grid sync, the batch send phase can read a stale/partial - // counter and send fewer tokens than actually produced, causing the - // receiver to hang waiting for data that never arrives. + // Grid-wide sync before batch-send. 
#if defined(__HIP_PLATFORM_AMD__) || defined(__HIPCC__) amd::grid_sync(grid_sync_barrier_ptr, num_sms); #else cg::this_grid().sync(); #endif - // Lam: Batch RDMA send phase - send entire expert buffer in ONE IBGDA call + // Batch RDMA send phase - send entire expert buffer in ONE IBGDA call // Each warp group handles one expert (only first sub_warp does the send) if (responsible_expert_idx < num_experts && sub_warp_id == 0) { auto const dst_rank = responsible_expert_idx / num_local_experts; - auto const dst_expert_local_idx = responsible_expert_idx % num_local_experts; + auto const dst_expert_local_idx = + responsible_expert_idx % num_local_experts; // Check if this destination is inter-node (needs IBGDA batch send) // IPC destinations were already sent in the token loop @@ -370,12 +366,11 @@ __global__ __launch_bounds__(1024, 1) void dispatch( uccl::nvshmemi_ibgda_put_nbi_warp( dst_ptr - reinterpret_cast(rdma_buffer_ptr), - src_ptr - reinterpret_cast(rdma_buffer_ptr), - total_bytes, dst_rank, + src_ptr - reinterpret_cast(rdma_buffer_ptr), total_bytes, + dst_rank, /*warp_id=*/dst_expert_local_idx, // NOTE(Yang): for selecting rb. lane_id, /*slot=*/0, d2h_channel_addrs, num_d2h_channel_addrs, false, low_latency_buffer_idx, 0, 0, num_tokens_to_send); - } } // IPC: already sent in the token loop, nothing to do here @@ -424,11 +419,9 @@ __global__ __launch_bounds__(1024, 1) void dispatch( st_release_sys_global(reinterpret_cast(dst_p2p_ptr), -num_tokens_sent - 1); } - // Clean workspace for next use atomic_counter_per_expert[responsible_expert_idx] = 0; atomic_finish_counter_per_expert[responsible_expert_idx] = 0; - atomic_send_counter_per_expert[responsible_expert_idx] = 0; // Clean `packed_recv_count` if (dst_rank == 0) packed_recv_count[dst_expert_local_idx] = 0; @@ -438,8 +431,6 @@ __global__ __launch_bounds__(1024, 1) void dispatch( // Receiving phase LOW_LATENCY_DISPATCH_RECV: if ((phases & LOW_LATENCY_RECV_PHASE) == 0) { - // if (blockIdx.x == 0 && threadIdx.x == 0) - // printf("[combine] SEND finished\n"); return; } @@ -664,26 +655,26 @@ void dispatch(void* packed_recv_x, void* packed_recv_x_scales, if (use_ue8m0) EP_HOST_ASSERT(round_scale and "UE8M0 SF requires `round_scale=True`"); -#define DISPATCH_LAUNCH_CASE(hidden) \ - { \ - auto dispatch_func = dispatch
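
Reviewer note (not part of the committed patches above): the pointer arithmetic and the immediate-data token-count encoding that the batched low-latency send path relies on, as quoted in the diffs, can be summarized in the standalone C++ sketch below. It only restates the layout documented in the `LowLatencyLayout` comment and the `cmd.atomic_val` / `WriteImm::Pack` usage; the helper names (`batch_slot_offset`, `recv_region_offset`, `tokens_to_imm_byte`, `tokens_from_imm_byte`) are hypothetical and do not exist in the repository.

    // Standalone sketch, mirroring the arithmetic in internode_ll.cu and rdma.cpp.
    // Helper names are illustrative only.
    #include <cassert>
    #include <cstddef>
    #include <cstdint>

    // Sender side: byte offset inside rdma_x where token `slot` bound for
    // `dst_expert_idx` is staged before the single batched RDMA WRITE.
    // Layout: [temp region of num_max_tokens msgs][per-expert batch regions].
    inline std::size_t batch_slot_offset(std::size_t msg_bytes, int num_max_tokens,
                                         int dst_expert_idx, int slot) {
      std::size_t temp_region =
          static_cast<std::size_t>(num_max_tokens) * msg_bytes;
      return temp_region +
             (static_cast<std::size_t>(dst_expert_idx) * num_max_tokens + slot) *
                 msg_bytes;
    }

    // Receiver side: byte offset inside rdma_recv_x of the region holding all
    // tokens sent by `src_rank` to local expert `dst_expert_local_idx`.
    inline std::size_t recv_region_offset(std::size_t msg_bytes, int num_max_tokens,
                                          int num_ranks, int dst_expert_local_idx,
                                          int src_rank) {
      return (static_cast<std::size_t>(dst_expert_local_idx) * num_ranks +
              src_rank) *
             num_max_tokens * msg_bytes;
    }

    // The token count rides in the single byte carried as cmd.atomic_val on the
    // GPU and unpacked as num_tokens_imm in rdma.cpp; a zero byte means a legacy
    // single-token send, so the receiver falls back to 1.
    inline std::uint8_t tokens_to_imm_byte(int num_tokens) {
      assert(num_tokens > 0 && num_tokens <= 255);
      return static_cast<std::uint8_t>(num_tokens);
    }

    inline std::uint32_t tokens_from_imm_byte(std::uint8_t atomic_val) {
      return atomic_val ? atomic_val : 1u;
    }

The 1..255 bound in the device-side assert follows from this single byte being the only space reserved for the token count in the WRITE-with-immediate payload.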