From 5e420bfa988911655d2752c0f8304eb70faadff3 Mon Sep 17 00:00:00 2001 From: Yuanyuan Chen Date: Thu, 16 Apr 2026 14:17:20 -0700 Subject: [PATCH] Validate total_num_blocks divisibility by my_size in block_bucketize (#5646) Summary: X-link: https://github.com/facebookresearch/FBGEMM/pull/2596 Add input validation for `total_num_blocks` divisibility by `my_size` in `block_bucketize_sparse_features` to prevent silent buffer overflows. Test Plan: # Full Suites # Requires: Any CUDA GPU (has CPU fallback) buck2 test fbcode//deeplearning/fbgemm/fbgemm_gpu/test/sparse:block_bucketize buck2 test fbcode//deeplearning/fbgemm/fbgemm_gpu/test/sparse:block_bucketize_2d_weights # Individual Tests (new in this diff) buck2 test fbcode//deeplearning/fbgemm/fbgemm_gpu/test/sparse:block_bucketize -- -k test_block_bucketize_sparse_features_total_num_blocks_not_divisible buck2 test fbcode//deeplearning/fbgemm/fbgemm_gpu/test/sparse:block_bucketize_2d_weights -- -k test_block_bucketize_sparse_features_2d_weights_total_num_blocks_not_divisible # Related Tests (validates existing behavior not broken) buck2 test fbcode//deeplearning/fbgemm/fbgemm_gpu/test/sparse:block_bucketize -- -k total_num_blocks # Benchmark (does NOT pass total_num_blocks, zero overhead on common path) buck2 run @//mode/opt fbcode//deeplearning/fbgemm/fbgemm_gpu/bench:sparse_ops -- block-bucketize-sparse-features-bench Reviewed By: henrylhtsang Differential Revision: D101141810 Pulled By: q10 --- fbgemm_gpu/include/fbgemm_gpu/sparse_ops.h | 24 +++++++++++++++++++ .../sparse_block_bucketize_features.cu | 5 ++++ ...rse_block_bucketize_features_2d_weights.cu | 5 ++++ fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp | 10 ++++++++ .../sparse/block_bucketize_2d_weights_test.py | 16 +++++++++++++ .../test/sparse/block_bucketize_test.py | 14 +++++++++++ 6 files changed, 74 insertions(+) diff --git a/fbgemm_gpu/include/fbgemm_gpu/sparse_ops.h b/fbgemm_gpu/include/fbgemm_gpu/sparse_ops.h index 0e7bd37234..4e11003ca4 100644 --- 
a/fbgemm_gpu/include/fbgemm_gpu/sparse_ops.h +++ b/fbgemm_gpu/include/fbgemm_gpu/sparse_ops.h @@ -18,6 +18,30 @@ namespace fbgemm_gpu { +// Validate that every element in total_num_blocks is divisible by my_size. +inline void check_total_num_blocks_divisibility( + const at::Tensor& total_num_blocks, + int64_t my_size) { + const auto tnb = total_num_blocks.cpu(); + AT_DISPATCH_INDEX_TYPES( + tnb.scalar_type(), + "block_bucketize_sparse_features_total_num_blocks_check", + [&] { + const auto* tnb_data = tnb.const_data_ptr<index_t>(); + for (const auto t : c10::irange(tnb.numel())) { + TORCH_CHECK( + tnb_data[t] % my_size == 0, + "block_bucketize_sparse_features: total_num_blocks[", + t, + "] = ", + tnb_data[t], + " must be a multiple of my_size (", + my_size, + ")"); + } + }); +} + /// @defgroup sparse-data-cuda Sparse Data CUDA Operators /// The following are CUDA operators /// diff --git a/fbgemm_gpu/src/sparse_ops/sparse_block_bucketize_features.cu b/fbgemm_gpu/src/sparse_ops/sparse_block_bucketize_features.cu index 8257ebeee7..1ae97de6e8 100644 --- a/fbgemm_gpu/src/sparse_ops/sparse_block_bucketize_features.cu +++ b/fbgemm_gpu/src/sparse_ops/sparse_block_bucketize_features.cu @@ -800,6 +800,11 @@ _block_bucketize_sparse_features_cuda( CUDA_DEVICE_GUARD(lengths); + if (total_num_blocks.has_value()) { + fbgemm_gpu::check_total_num_blocks_divisibility( + total_num_blocks.value(), my_size); + } + // allocate tensors and buffers const auto lengths_size = lengths.numel(); const auto T = block_sizes.numel(); diff --git a/fbgemm_gpu/src/sparse_ops/sparse_block_bucketize_features_2d_weights.cu b/fbgemm_gpu/src/sparse_ops/sparse_block_bucketize_features_2d_weights.cu index b747fb2cf1..6a120e39df 100644 --- a/fbgemm_gpu/src/sparse_ops/sparse_block_bucketize_features_2d_weights.cu +++ b/fbgemm_gpu/src/sparse_ops/sparse_block_bucketize_features_2d_weights.cu @@ -583,6 +583,11 @@ _block_bucketize_sparse_features_2d_weights_cuda( CUDA_DEVICE_GUARD(lengths); + if 
(total_num_blocks.has_value()) { + fbgemm_gpu::check_total_num_blocks_divisibility( + total_num_blocks.value(), my_size); + } + // allocate tensors and buffers const auto lengths_size = lengths.numel(); const auto T = block_sizes.numel(); diff --git a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp index 411da4fcc1..e43de9f5e6 100644 --- a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp +++ b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp @@ -1163,6 +1163,11 @@ _block_bucketize_sparse_features_cpu( const bool return_bucket_mapping, const bool keep_orig_idx, const std::optional<at::Tensor>& keep_orig_idx_per_feature = std::nullopt) { + if (total_num_blocks.has_value()) { + fbgemm_gpu::check_total_num_blocks_divisibility( + total_num_blocks.value(), my_size); + } + const auto lengths_size = lengths.numel(); const auto new_lengths_size = lengths_size * my_size; auto new_lengths = at::zeros({new_lengths_size}, lengths.options()); @@ -1619,6 +1624,11 @@ _block_bucketize_sparse_features_2d_weights_cpu( const bool return_bucket_mapping, const bool keep_orig_idx, const std::optional<at::Tensor>& keep_orig_idx_per_feature = std::nullopt) { + if (total_num_blocks.has_value()) { + fbgemm_gpu::check_total_num_blocks_divisibility( + total_num_blocks.value(), my_size); + } + const auto lengths_size = lengths.numel(); const auto new_lengths_size = lengths_size * my_size; auto new_lengths = at::zeros({new_lengths_size}, lengths.options()); diff --git a/fbgemm_gpu/test/sparse/block_bucketize_2d_weights_test.py b/fbgemm_gpu/test/sparse/block_bucketize_2d_weights_test.py index 4a43ec1cf2..9bc4aab1ee 100644 --- a/fbgemm_gpu/test/sparse/block_bucketize_2d_weights_test.py +++ b/fbgemm_gpu/test/sparse/block_bucketize_2d_weights_test.py @@ -998,6 +998,22 @@ def test_block_bucketize_sparse_features_2d_weights_with_variable_batch_sizes( new_indices_ref, new_indices_gpu.cpu(), new_lengths_ref ) + def 
test_block_bucketize_sparse_features_2d_weights_total_num_blocks_not_divisible( + self, + ) -> None: + indices = torch.tensor([1, 2, 10, 4, 16, 6, 7, 18, 19, 10, 0], dtype=torch.int) + with self.assertRaisesRegex(RuntimeError, "must be a multiple of my_size"): + torch.ops.fbgemm.block_bucketize_sparse_features_2d_weights( + torch.tensor([0, 3, 2, 0, 1, 5], dtype=torch.int), + indices, + False, + False, + torch.tensor([2, 3, 4], dtype=torch.int), + 3, + torch.rand(indices.numel(), 3), + total_num_blocks=torch.tensor([7, 6, 6], dtype=torch.int), + ) + extend_test_class(BlockBucketize2DWeightsTest) diff --git a/fbgemm_gpu/test/sparse/block_bucketize_test.py b/fbgemm_gpu/test/sparse/block_bucketize_test.py index 1ad1876368..37458ba7bf 100644 --- a/fbgemm_gpu/test/sparse/block_bucketize_test.py +++ b/fbgemm_gpu/test/sparse/block_bucketize_test.py @@ -2111,6 +2111,20 @@ def test_block_bucketize_sparse_features_float64_weights( new_pos_ref, new_pos_gpu.cpu(), new_lengths_ref ) + def test_block_bucketize_sparse_features_total_num_blocks_not_divisible( + self, + ) -> None: + with self.assertRaisesRegex(RuntimeError, "must be a multiple of my_size"): + torch.ops.fbgemm.block_bucketize_sparse_features( + torch.tensor([0, 3, 2, 0, 1, 5], dtype=torch.int), + torch.tensor([1, 2, 10, 4, 16, 6, 7, 18, 19, 10, 0], dtype=torch.int), + False, + False, + torch.tensor([2, 3, 4], dtype=torch.int), + 3, + total_num_blocks=torch.tensor([7, 6, 6], dtype=torch.int), + ) + extend_test_class(BlockBucketizeTest)