From 5e420bfa988911655d2752c0f8304eb70faadff3 Mon Sep 17 00:00:00 2001 From: Yuanyuan Chen Date: Thu, 16 Apr 2026 14:17:20 -0700 Subject: [PATCH] Validate total_num_blocks divisibility by my_size in block_bucketize (#5646) Summary: X-link: https://github.com/facebookresearch/FBGEMM/pull/2596 Add input validation for `total_num_blocks` divisibility by `my_size` in `block_bucketize_sparse_features` to prevent silent buffer overflows. Test Plan: # Full Suites # Requires: Any CUDA GPU (has CPU fallback) buck2 test fbcode//deeplearning/fbgemm/fbgemm_gpu/test/sparse:block_bucketize buck2 test fbcode//deeplearning/fbgemm/fbgemm_gpu/test/sparse:block_bucketize_2d_weights # Individual Tests (new in this diff) buck2 test fbcode//deeplearning/fbgemm/fbgemm_gpu/test/sparse:block_bucketize -- -k test_block_bucketize_sparse_features_total_num_blocks_not_divisible buck2 test fbcode//deeplearning/fbgemm/fbgemm_gpu/test/sparse:block_bucketize_2d_weights -- -k test_block_bucketize_sparse_features_2d_weights_total_num_blocks_not_divisible # Related Tests (validates existing behavior not broken) buck2 test fbcode//deeplearning/fbgemm/fbgemm_gpu/test/sparse:block_bucketize -- -k total_num_blocks # Benchmark (does NOT pass total_num_blocks, zero overhead on common path) buck2 run @//mode/opt fbcode//deeplearning/fbgemm/fbgemm_gpu/bench:sparse_ops -- block-bucketize-sparse-features-bench Reviewed By: henrylhtsang Differential Revision: D101141810 Pulled By: q10 --- fbgemm_gpu/include/fbgemm_gpu/sparse_ops.h | 24 +++++++++++++++++++ .../sparse_block_bucketize_features.cu | 5 ++++ ...rse_block_bucketize_features_2d_weights.cu | 5 ++++ fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp | 10 ++++++++ .../sparse/block_bucketize_2d_weights_test.py | 16 +++++++++++++ .../test/sparse/block_bucketize_test.py | 14 +++++++++++ 6 files changed, 74 insertions(+) diff --git a/fbgemm_gpu/include/fbgemm_gpu/sparse_ops.h b/fbgemm_gpu/include/fbgemm_gpu/sparse_ops.h index 0e7bd37234..4e11003ca4 100644 --- 
a/fbgemm_gpu/include/fbgemm_gpu/sparse_ops.h +++ b/fbgemm_gpu/include/fbgemm_gpu/sparse_ops.h @@ -18,6 +18,30 @@ namespace fbgemm_gpu { +// Validate that every element in total_num_blocks is divisible by my_size. +inline void check_total_num_blocks_divisibility( + const at::Tensor& total_num_blocks, + int64_t my_size) { + const auto tnb = total_num_blocks.cpu(); + AT_DISPATCH_INDEX_TYPES( + tnb.scalar_type(), + "block_bucketize_sparse_features_total_num_blocks_check", + [&] { + const auto* tnb_data = tnb.const_data_ptr<index_t>(); + for (const auto t : c10::irange(tnb.numel())) { + TORCH_CHECK( + tnb_data[t] % my_size == 0, + "block_bucketize_sparse_features: total_num_blocks[", + t, + "] = ", + tnb_data[t], + " must be a multiple of my_size (", + my_size, + ")"); + } + }); +} + /// @defgroup sparse-data-cuda Sparse Data CUDA Operators /// The following are CUDA operators /// diff --git a/fbgemm_gpu/src/sparse_ops/sparse_block_bucketize_features.cu b/fbgemm_gpu/src/sparse_ops/sparse_block_bucketize_features.cu index 8257ebeee7..1ae97de6e8 100644 --- a/fbgemm_gpu/src/sparse_ops/sparse_block_bucketize_features.cu +++ b/fbgemm_gpu/src/sparse_ops/sparse_block_bucketize_features.cu @@ -800,6 +800,11 @@ _block_bucketize_sparse_features_cuda( CUDA_DEVICE_GUARD(lengths); + if (total_num_blocks.has_value()) { + fbgemm_gpu::check_total_num_blocks_divisibility( + total_num_blocks.value(), my_size); + } + // allocate tensors and buffers const auto lengths_size = lengths.numel(); const auto T = block_sizes.numel(); diff --git a/fbgemm_gpu/src/sparse_ops/sparse_block_bucketize_features_2d_weights.cu b/fbgemm_gpu/src/sparse_ops/sparse_block_bucketize_features_2d_weights.cu index b747fb2cf1..6a120e39df 100644 --- a/fbgemm_gpu/src/sparse_ops/sparse_block_bucketize_features_2d_weights.cu +++ b/fbgemm_gpu/src/sparse_ops/sparse_block_bucketize_features_2d_weights.cu @@ -583,6 +583,11 @@ _block_bucketize_sparse_features_2d_weights_cuda( CUDA_DEVICE_GUARD(lengths); + if 
(total_num_blocks.has_value()) { + fbgemm_gpu::check_total_num_blocks_divisibility( + total_num_blocks.value(), my_size); + } + // allocate tensors and buffers const auto lengths_size = lengths.numel(); const auto T = block_sizes.numel(); diff --git a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp index 411da4fcc1..e43de9f5e6 100644 --- a/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp +++ b/fbgemm_gpu/src/sparse_ops/sparse_ops_cpu.cpp @@ -1163,6 +1163,11 @@ _block_bucketize_sparse_features_cpu( const bool return_bucket_mapping, const bool keep_orig_idx, const std::optional<at::Tensor>& keep_orig_idx_per_feature = std::nullopt) { + if (total_num_blocks.has_value()) { + fbgemm_gpu::check_total_num_blocks_divisibility( + total_num_blocks.value(), my_size); + } + const auto lengths_size = lengths.numel(); const auto new_lengths_size = lengths_size * my_size; auto new_lengths = at::zeros({new_lengths_size}, lengths.options()); @@ -1619,6 +1624,11 @@ _block_bucketize_sparse_features_2d_weights_cpu( const bool return_bucket_mapping, const bool keep_orig_idx, const std::optional<at::Tensor>& keep_orig_idx_per_feature = std::nullopt) { + if (total_num_blocks.has_value()) { + fbgemm_gpu::check_total_num_blocks_divisibility( + total_num_blocks.value(), my_size); + } + const auto lengths_size = lengths.numel(); const auto new_lengths_size = lengths_size * my_size; auto new_lengths = at::zeros({new_lengths_size}, lengths.options()); diff --git a/fbgemm_gpu/test/sparse/block_bucketize_2d_weights_test.py b/fbgemm_gpu/test/sparse/block_bucketize_2d_weights_test.py index 4a43ec1cf2..9bc4aab1ee 100644 --- a/fbgemm_gpu/test/sparse/block_bucketize_2d_weights_test.py +++ b/fbgemm_gpu/test/sparse/block_bucketize_2d_weights_test.py @@ -998,6 +998,22 @@ def test_block_bucketize_sparse_features_2d_weights_with_variable_batch_sizes( new_indices_ref, new_indices_gpu.cpu(), new_lengths_ref ) + def 
test_block_bucketize_sparse_features_2d_weights_total_num_blocks_not_divisible( + self, + ) -> None: + indices = torch.tensor([1, 2, 10, 4, 16, 6, 7, 18, 19, 10, 0], dtype=torch.int) + with self.assertRaisesRegex(RuntimeError, "must be a multiple of my_size"): + torch.ops.fbgemm.block_bucketize_sparse_features_2d_weights( + torch.tensor([0, 3, 2, 0, 1, 5], dtype=torch.int), + indices, + False, + False, + torch.tensor([2, 3, 4], dtype=torch.int), + 3, + torch.rand(indices.numel(), 3), + total_num_blocks=torch.tensor([7, 6, 6], dtype=torch.int), + ) + extend_test_class(BlockBucketize2DWeightsTest) diff --git a/fbgemm_gpu/test/sparse/block_bucketize_test.py b/fbgemm_gpu/test/sparse/block_bucketize_test.py index 1ad1876368..37458ba7bf 100644 --- a/fbgemm_gpu/test/sparse/block_bucketize_test.py +++ b/fbgemm_gpu/test/sparse/block_bucketize_test.py @@ -2111,6 +2111,20 @@ def test_block_bucketize_sparse_features_float64_weights( new_pos_ref, new_pos_gpu.cpu(), new_lengths_ref ) + def test_block_bucketize_sparse_features_total_num_blocks_not_divisible( + self, + ) -> None: + with self.assertRaisesRegex(RuntimeError, "must be a multiple of my_size"): + torch.ops.fbgemm.block_bucketize_sparse_features( + torch.tensor([0, 3, 2, 0, 1, 5], dtype=torch.int), + torch.tensor([1, 2, 10, 4, 16, 6, 7, 18, 19, 10, 0], dtype=torch.int), + False, + False, + torch.tensor([2, 3, 4], dtype=torch.int), + 3, + total_num_blocks=torch.tensor([7, 6, 6], dtype=torch.int), + ) + extend_test_class(BlockBucketizeTest)