diff --git a/include/fbgemm/UtilsAvx2.h b/include/fbgemm/UtilsAvx2.h index bc365bde85..f19302ca86 100644 --- a/include/fbgemm/UtilsAvx2.h +++ b/include/fbgemm/UtilsAvx2.h @@ -11,6 +11,7 @@ // flags. #include +#include #include namespace fbgemm { @@ -89,4 +90,22 @@ fbgemmAlignedAlloc(size_t align, size_t size, bool raiseException = false); */ FBGEMM_API void fbgemmAlignedFree(void* p); +/** + * @brief RAII wrapper for aligned allocations. + */ +struct AlignedFreeDeleter { + void operator()(void* p) const { + fbgemmAlignedFree(p); + } +}; + +template +using aligned_unique_ptr = std::unique_ptr; + +template +aligned_unique_ptr makeAlignedUniquePtr(size_t align, size_t count) { + return aligned_unique_ptr( + static_cast(fbgemmAlignedAlloc(align, count * sizeof(T)))); +} + } // namespace fbgemm diff --git a/src/FbgemmI8Depthwise2DAvx2-inl.h b/src/FbgemmI8Depthwise2DAvx2-inl.h index a17c3bc456..bd2c77ec7f 100644 --- a/src/FbgemmI8Depthwise2DAvx2-inl.h +++ b/src/FbgemmI8Depthwise2DAvx2-inl.h @@ -152,8 +152,9 @@ static ALWAYS_INLINE void depthwise_2d_( int W_OUT = (W + PAD_L + PAD_R - S) / stride_w + 1; const std::int8_t* Bp = B.PackedMat(); - int32_t* row_offsets = static_cast( - fbgemmAlignedAlloc(64, (IC + 31) / 32 * 32 * sizeof(int32_t))); + auto row_offsets_owner = + makeAlignedUniquePtr(64, (IC + 31) / 32 * 32); + int32_t* row_offsets = row_offsets_owner.get(); int64_t n_begin = 0, n_end = 0, h_begin = 0, h_end = 0, w_begin = 0, w_end = 0; @@ -487,8 +488,6 @@ static ALWAYS_INLINE void depthwise_2d_( } } } // for each n - - fbgemmAlignedFree(row_offsets); } // Dispatch A_SYMMETRIC and B_SYMMETRIC @@ -518,8 +517,9 @@ static void depthwise_2d_( const float* act_times_w_scale, int thread_id, int num_threads) { - int32_t* C_int32_temp = static_cast( - fbgemmAlignedAlloc(64, (OC + 31) / 32 * 32 * sizeof(int32_t))); + auto C_int32_temp_owner = + makeAlignedUniquePtr(64, (OC + 31) / 32 * 32); + int32_t* C_int32_temp = C_int32_temp_owner.get(); if (A_zero_point == 0 || col_offsets == nullptr) { if (Q_GRAN == QuantizationGranularity::TENSOR && B_zero_point[0] == 0) { depthwise_2d_< @@ -637,7 +637,6 @@ static void depthwise_2d_( num_threads); } } - fbgemmAlignedFree(C_int32_temp); } // Dispatch HAS_BIAS diff --git a/src/FbgemmI8Depthwise3DAvx2.cc b/src/FbgemmI8Depthwise3DAvx2.cc index 4a3023f4dd..8f2e703e4b 100644 --- a/src/FbgemmI8Depthwise3DAvx2.cc +++ b/src/FbgemmI8Depthwise3DAvx2.cc @@ -167,8 +167,9 @@ static ALWAYS_INLINE void depthwise_3d_same_pad_( int W_OUT = (W + PAD_L + PAD_R - K_W) / stride_w + 1; const int8_t* Bp = B.PackedMat(); - int32_t* row_offsets = static_cast( - fbgemmAlignedAlloc(64, (IC + 31) / 32 * 32 * sizeof(int32_t))); + auto row_offsets_owner = + makeAlignedUniquePtr(64, (IC + 31) / 32 * 32); + int32_t* row_offsets = row_offsets_owner.get(); int64_t n_begin = 0, n_end = 0, t_begin = 0, t_end = 0, h_begin = 0, h_end = 0; @@ -779,7 +780,6 @@ static ALWAYS_INLINE void depthwise_3d_same_pad_( } // h } // t } // for each n - fbgemmAlignedFree(row_offsets); } // Dispatch A_SYMMETRIC and B_SYMMETRIC @@ -802,8 +802,9 @@ static void depthwise_3d_same_pad_( const float* act_times_w_scale, int thread_id, int num_threads) { - int32_t* C_int32_temp = static_cast( - fbgemmAlignedAlloc(64, (conv_p.OC + 31) / 32 * 32 * sizeof(int32_t))); + auto C_int32_temp_owner = + makeAlignedUniquePtr(64, (conv_p.OC + 31) / 32 * 32); + int32_t* C_int32_temp = C_int32_temp_owner.get(); if (A_zero_point == 0 || col_offsets == nullptr) { if (Q_GRAN == QuantizationGranularity::TENSOR && B_zero_point[0] == 0) { depthwise_3d_same_pad_< @@ -893,7 +894,6 @@ static void depthwise_3d_same_pad_( num_threads); } } - fbgemmAlignedFree(C_int32_temp); } // Dispatch HAS_BIAS diff --git a/src/FbgemmI8Spmdm.cc b/src/FbgemmI8Spmdm.cc index b15dd57a1f..e2ef2d55c3 100644 --- a/src/FbgemmI8Spmdm.cc +++ b/src/FbgemmI8Spmdm.cc @@ -78,8 +78,8 @@ void CompressedSparseColumn::SpMDM( // If NNZ/K is small, it's not worth doing transpose so we just use this // scalar loop. #ifdef _MSC_VER - int32_t* C_temp = static_cast( - fbgemmAlignedAlloc(64, block.row_size * sizeof(int32_t))); + auto C_temp_owner = makeAlignedUniquePtr(64, block.row_size); + int32_t* C_temp = C_temp_owner.get(); #else int32_t C_temp[block.row_size]; #endif @@ -141,9 +141,6 @@ void CompressedSparseColumn::SpMDM( } } // for each column of B } -#ifdef _MSC_VER - fbgemmAlignedFree(C_temp); -#endif return; } @@ -165,10 +162,10 @@ void CompressedSparseColumn::SpMDM( // dynamically allocated memory for MSVC even though dynamically allocated // memory works for all compilers. #ifdef _MSC_VER - uint8_t* A_buffer = - static_cast(fbgemmAlignedAlloc(64, K * 32 * sizeof(uint8_t))); - int32_t* C_buffer = - static_cast(fbgemmAlignedAlloc(64, N * 32 * sizeof(int32_t))); + auto A_buffer_owner = makeAlignedUniquePtr(64, K * 32); + auto C_buffer_owner = makeAlignedUniquePtr(64, N * 32); + uint8_t* A_buffer = A_buffer_owner.get(); + int32_t* C_buffer = C_buffer_owner.get(); #else alignas(64) uint8_t A_buffer[K * 32]; alignas(64) int32_t C_buffer[N * 32]; @@ -180,8 +177,8 @@ void CompressedSparseColumn::SpMDM( // Transpose 32 x K submatrix of A if (i_end - i1 < 32) { #ifdef _MSC_VER - uint8_t* A_temp_buffer = static_cast( - fbgemmAlignedAlloc(64, K * 32 * sizeof(uint8_t))); + auto A_temp_buffer_owner = makeAlignedUniquePtr(64, K * 32); + uint8_t* A_temp_buffer = A_temp_buffer_owner.get(); #else alignas(64) uint8_t A_temp_buffer[K * 32]; #endif @@ -200,9 +197,6 @@ void CompressedSparseColumn::SpMDM( for (int i2 = (i_end - i1) / 8 * 8; i2 < 32; i2 += 8) { transpose_8rows(K, A_temp_buffer + i2 * K, K, A_buffer + i2, 32); } -#ifdef _MSC_VER - fbgemmAlignedFree(A_temp_buffer); -#endif } else { for (int i2 = 0; i2 < 32; i2 += 8) { transpose_8rows(K, A + (i1 + i2) * lda, lda, A_buffer + i2, 32); @@ -280,10 +274,6 @@ void CompressedSparseColumn::SpMDM( spmdm_run_time += (dt); t_start = std::chrono::high_resolution_clock::now(); #endif -#ifdef _MSC_VER - fbgemmAlignedFree(A_buffer); - fbgemmAlignedFree(C_buffer); -#endif #endif // __aarch64__ } diff --git a/src/PackAWithQuantRowOffset.cc b/src/PackAWithQuantRowOffset.cc index 3e4dd13bfe..536ee311f6 100644 --- a/src/PackAWithQuantRowOffset.cc +++ b/src/PackAWithQuantRowOffset.cc @@ -20,6 +20,7 @@ #include "./OptimizedKernelsAvx2.h" // @manual #include "fbgemm/Fbgemm.h" #include "fbgemm/QuantUtils.h" +#include "fbgemm/UtilsAvx2.h" namespace fbgemm { @@ -147,10 +148,12 @@ void PackAWithQuantRowOffset::pack( (block.col_start % (this->numCols() / this->numGroups())) != 0; int32_t* row_offset_buf = getRowOffsetBuffer(); + aligned_unique_ptr smat_transposed_owner; float* smat_transposed = nullptr; if (tr) { - smat_transposed = static_cast(fbgemmAlignedAlloc( - 64, block.row_size * block.col_size * sizeof(float))); + smat_transposed_owner = + makeAlignedUniquePtr(64, block.row_size * block.col_size); + smat_transposed = smat_transposed_owner.get(); transpose_simd( block.col_size, block.row_size, @@ -197,9 +200,6 @@ void PackAWithQuantRowOffset::pack( out[i * BaseType::blockColSize() + j] = 0; } } - if (smat_transposed) { - fbgemmAlignedFree(smat_transposed); - } } template diff --git a/src/PackDepthwiseConvMatrixAvx2.cc b/src/PackDepthwiseConvMatrixAvx2.cc index 2ce3e976c9..07a7be85d1 100644 --- a/src/PackDepthwiseConvMatrixAvx2.cc +++ b/src/PackDepthwiseConvMatrixAvx2.cc @@ -26,8 +26,9 @@ PackedDepthWiseConvMatrix::PackedDepthWiseConvMatrix( : OC_(OC), kernel_prod_(kernel_prod) { // The input is in OC T R S layout. // Transpose the input matrix to make packing faster. - int8_t* smat_transposed = static_cast( - fbgemmAlignedAlloc(64, OC * kernel_prod * sizeof(int8_t))); + auto smat_transposed_owner = + makeAlignedUniquePtr(64, OC * kernel_prod); + int8_t* smat_transposed = smat_transposed_owner.get(); for (int i = 0; i < kernel_prod; ++i) { for (int j = 0; j < OC; ++j) { smat_transposed[i * OC + j] = smat[i + j * kernel_prod]; @@ -93,12 +94,14 @@ PackedDepthWiseConvMatrix::PackedDepthWiseConvMatrix( // (28, 8), (28, 9), (28, 10), zero, ..., (31, 8), (31, 9), (31, 10), zero // Allocate buffers - auto b_v = static_cast<__m256i*>( - fbgemmAlignedAlloc(64, kernel_prod * sizeof(__m256i))); - auto b_interleaved_epi16 = static_cast<__m256i*>( - fbgemmAlignedAlloc(64, kernel_prod_aligned * sizeof(__m256i))); - auto b_interleaved_epi32 = static_cast<__m256i*>( - fbgemmAlignedAlloc(64, kernel_prod_aligned * sizeof(__m256i))); + auto b_v_owner = makeAlignedUniquePtr<__m256i>(64, kernel_prod); + auto b_v = b_v_owner.get(); + auto b_interleaved_epi16_owner = + makeAlignedUniquePtr<__m256i>(64, kernel_prod_aligned); + auto b_interleaved_epi16 = b_interleaved_epi16_owner.get(); + auto b_interleaved_epi32_owner = + makeAlignedUniquePtr<__m256i>(64, kernel_prod_aligned); + auto b_interleaved_epi32 = b_interleaved_epi32_owner.get(); for (int k1 = 0; k1 < OC; k1 += 32) { int remainder = OC - k1; if (remainder < 32) { @@ -154,10 +157,6 @@ PackedDepthWiseConvMatrix::PackedDepthWiseConvMatrix( b_interleaved_epi32[i]); } } - fbgemmAlignedFree(b_v); - fbgemmAlignedFree(b_interleaved_epi16); - fbgemmAlignedFree(b_interleaved_epi32); - fbgemmAlignedFree(smat_transposed); } int PackedDepthWiseConvMatrix::addr(int r, int c) { diff --git a/src/PackWeightsForDirectConv.cc b/src/PackWeightsForDirectConv.cc index 0a22997b7b..fcbcd8dec4 100644 --- a/src/PackWeightsForDirectConv.cc +++ b/src/PackWeightsForDirectConv.cc @@ -257,10 +257,12 @@ void fbgemmDirectConv( fn = codeObj.getOrCreateDirectConvTrans( true, conv_p.stride[1], conv_p.K[1]); - int32_t* inSum = static_cast(fbgemmAlignedAlloc( - 64, conv_p.IN_DIM[0] * conv_p.IN_DIM[1] * sizeof(int32_t))); - int32_t* rowSum = static_cast(fbgemmAlignedAlloc( - 64, conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1] * sizeof(int32_t))); + auto inSum_owner = makeAlignedUniquePtr( + 64, conv_p.IN_DIM[0] * conv_p.IN_DIM[1]); + int32_t* inSum = inSum_owner.get(); + auto rowSum_owner = makeAlignedUniquePtr( + 64, conv_p.OUT_DIM[0] * conv_p.OUT_DIM[1]); + int32_t* rowSum = rowSum_owner.get(); directConvRowSum(conv_p, Aint8, inSum, rowSum); int kernel_dim = conv_p.K[0] * conv_p.K[1]; @@ -450,8 +452,6 @@ void fbgemmDirectConv( } } } - fbgemmAlignedFree(inSum); - fbgemmAlignedFree(rowSum); } // transposed conv else { // non-transposed conv assert(false && "non-transposed direct conv not integrated yet."); diff --git a/src/QuantUtilsAvx2.cc b/src/QuantUtilsAvx2.cc index a9ae622795..032c1d2fd9 100644 --- a/src/QuantUtilsAvx2.cc +++ b/src/QuantUtilsAvx2.cc @@ -22,6 +22,7 @@ #include "fbgemm/FbgemmConvert.h" #include "fbgemm/FloatConversion.h" #include "fbgemm/Types.h" +#include "fbgemm/UtilsAvx2.h" namespace fbgemm { @@ -1599,12 +1600,14 @@ void FloatOrHalfToFusedNBitRowwiseQuantizedSBHalfAvx2( (input_columns + NUM_ELEM_PER_BYTE - 1) / NUM_ELEM_PER_BYTE + 2 * sizeof(std::uint16_t); + aligned_unique_ptr input_row_float_for_fp16_owner; float* input_row_float_for_fp16 = nullptr; float min_max_row_float_for_fp16[kRowwiseMinMaxNumCols]; const auto is_valid_rowwise_min_max = (rowwise_min_max != nullptr); if constexpr (std::is_same_v) { - input_row_float_for_fp16 = static_cast( - fbgemmAlignedAlloc(64, input_columns * sizeof(float))); + input_row_float_for_fp16_owner = + makeAlignedUniquePtr(64, input_columns); + input_row_float_for_fp16 = input_row_float_for_fp16_owner.get(); } for (size_t row = 0; row < input_rows; ++row) { @@ -1794,10 +1797,6 @@ void FloatOrHalfToFusedNBitRowwiseQuantizedSBHalfAvx2( } } } // for each row - - if constexpr (std::is_same_v) { - fbgemmAlignedFree(input_row_float_for_fp16); - } } template @@ -1824,12 +1823,14 @@ void FloatOrHalfToFused8BitRowwiseQuantizedSBFloatAvx2( _mm256_set_epi32(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00); const int64_t output_columns = input_columns + 2 * sizeof(float); + aligned_unique_ptr input_row_float_for_fp16_owner; float* input_row_float_for_fp16 = nullptr; float min_max_row_float_for_fp16[kRowwiseMinMaxNumCols]; const auto is_valid_rowwise_min_max = (rowwise_min_max != nullptr); if constexpr (std::is_same_v) { - input_row_float_for_fp16 = static_cast( - fbgemmAlignedAlloc(64, input_columns * sizeof(float))); + input_row_float_for_fp16_owner = + makeAlignedUniquePtr(64, input_columns); + input_row_float_for_fp16 = input_row_float_for_fp16_owner.get(); } for (size_t row = 0; row < input_rows; ++row) { const InputType* input_row = input + row * input_columns; @@ -1957,9 +1958,6 @@ void FloatOrHalfToFused8BitRowwiseQuantizedSBFloatAvx2( std::lrintf((input_row_float[col] - minimum_element) * inverse_scale); } } // for each row - if constexpr (std::is_same_v) { - fbgemmAlignedFree(input_row_float_for_fp16); - } } template diff --git a/src/Utils.cc b/src/Utils.cc index f3fc8ebc8a..7dcd2cfdff 100644 --- a/src/Utils.cc +++ b/src/Utils.cc @@ -25,6 +25,7 @@ #include #include #include +#include "fbgemm/UtilsAvx2.h" #ifdef _OPENMP #include @@ -675,10 +676,11 @@ std::pair radix_sort_parallel( #ifdef _MSC_VER const size_t array_size = static_cast(RDX_HIST_SIZE) * maxthreads; // fixes MSVC error C2131 - auto* const histogram = static_cast( - fbgemm::fbgemmAlignedAlloc(64, array_size * sizeof(int64_t))); - auto* const histogram_ps = static_cast( - fbgemm::fbgemmAlignedAlloc(64, array_size * sizeof(int64_t))); + auto histogram_owner = fbgemm::makeAlignedUniquePtr(64, array_size); + auto* const histogram = histogram_owner.get(); + auto histogram_ps_owner = + fbgemm::makeAlignedUniquePtr(64, array_size); + auto* const histogram_ps = histogram_ps_owner.get(); #else alignas(64) int64_t histogram[RDX_HIST_SIZE * maxthreads]; @@ -719,10 +721,6 @@ std::pair radix_sort_parallel( } } } -#ifdef _MSC_VER - fbgemm::fbgemmAlignedFree(histogram); - fbgemm::fbgemmAlignedFree(histogram_ps); -#endif return ( num_passes % 2 == 0 ? std::pair{inp_key_buf, inp_value_buf} : std::pair{tmp_key_buf, tmp_value_buf});